On 17-Mar-2000, Marc-Andre Lemburg said:

Attached you will find an update of the Unicode implementation.

    The patch is against the current CVS version. I would appreciate
    it if someone with CVS checkin permissions could check the changes
    in.

    The patch contains all bug fixes and patches sent this week and also
    fixes a leak in the codecs code and a bug in the free list code
    for Unicode objects (which only shows up when compiling Python
    with Py_DEBUG; thanks to MarkH for spotting this one).
This commit is contained in:
Barry Warsaw 2000-03-20 16:36:48 +00:00
parent abc411bac8
commit 51ac58039f
9 changed files with 61 additions and 39 deletions

View File

@ -1,8 +1,5 @@
#ifndef Py_UNICODEOBJECT_H #ifndef Py_UNICODEOBJECT_H
#define Py_UNICODEOBJECT_H #define Py_UNICODEOBJECT_H
#ifdef __cplusplus
extern "C" {
#endif
/* /*
@ -109,8 +106,9 @@ typedef unsigned short Py_UNICODE;
/* --- Internal Unicode Operations ---------------------------------------- */ /* --- Internal Unicode Operations ---------------------------------------- */
/* If you want Python to use the compiler's wctype.h functions instead /* If you want Python to use the compiler's wctype.h functions instead
of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS. of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
This reduces the interpreter's code size. */ configure Python using --with-ctype-functions. This reduces the
interpreter's code size. */
#if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS) #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
@ -169,6 +167,10 @@ typedef unsigned short Py_UNICODE;
(!memcmp((string)->str + (offset), (substring)->str,\ (!memcmp((string)->str + (offset), (substring)->str,\
(substring)->length*sizeof(Py_UNICODE))) (substring)->length*sizeof(Py_UNICODE)))
#ifdef __cplusplus
extern "C" {
#endif
/* --- Unicode Type ------------------------------------------------------- */ /* --- Unicode Type ------------------------------------------------------- */
typedef struct { typedef struct {
@ -647,7 +649,7 @@ extern DL_IMPORT(int) PyUnicode_Find(
int direction /* Find direction: +1 forward, -1 backward */ int direction /* Find direction: +1 forward, -1 backward */
); );
/* Count the number of occurances of substr in str[start:end]. */ /* Count the number of occurrences of substr in str[start:end]. */
extern DL_IMPORT(int) PyUnicode_Count( extern DL_IMPORT(int) PyUnicode_Count(
PyObject *str, /* String */ PyObject *str, /* String */
@ -656,7 +658,7 @@ extern DL_IMPORT(int) PyUnicode_Count(
int end /* Stop index */ int end /* Stop index */
); );
/* Replace at most maxcount occurances of substr in str with replstr /* Replace at most maxcount occurrences of substr in str with replstr
and return the resulting Unicode object. */ and return the resulting Unicode object. */
extern DL_IMPORT(PyObject *) PyUnicode_Replace( extern DL_IMPORT(PyObject *) PyUnicode_Replace(

View File

@ -30,13 +30,13 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
import string,codecs,aliases import string,codecs,aliases
_cache = {} _cache = {}
_unkown = '--unkown--' _unknown = '--unknown--'
def search_function(encoding): def search_function(encoding):
# Cache lookup # Cache lookup
entry = _cache.get(encoding,_unkown) entry = _cache.get(encoding,_unknown)
if entry is not _unkown: if entry is not _unknown:
return entry return entry
# Import the module # Import the module

View File

@ -143,6 +143,7 @@ test('translate', 'abc', 'Abc', table)
test('translate', 'xyz', 'xyz', table) test('translate', 'xyz', 'xyz', table)
test('replace', 'one!two!three!', 'one@two!three!', '!', '@', 1) test('replace', 'one!two!three!', 'one@two!three!', '!', '@', 1)
test('replace', 'one!two!three!', 'onetwothree', '!', '')
test('replace', 'one!two!three!', 'one@two@three!', '!', '@', 2) test('replace', 'one!two!three!', 'one@two@three!', '!', '@', 2)
test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 3) test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 3)
test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 4) test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 4)

View File

@ -108,6 +108,7 @@ if 0:
test('translate', u'xyz', u'xyz', table) test('translate', u'xyz', u'xyz', table)
test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1) test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1)
test('replace', u'one!two!three!', u'onetwothree', '!', '')
test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2) test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3) test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3)
test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4) test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4)

View File

@ -743,8 +743,9 @@ For explicit handling of files using Unicode, the standard
stream codecs as available through the codecs module should stream codecs as available through the codecs module should
be used. be used.
XXX There should be a short-cut open(filename,mode,encoding) available which The codecs module should provide a short-cut open(filename,mode,encoding)
also assures that mode contains the 'b' character when needed. available which also assures that mode contains the 'b' character when
needed.
File/Stream Input: File/Stream Input:
@ -810,6 +811,10 @@ Unicode-Mappings:
Introduction to Unicode (a little outdated but still nice to read): Introduction to Unicode (a little outdated but still nice to read):
http://www.nada.kth.se/i18n/ucs/unicode-iso10646-oview.html http://www.nada.kth.se/i18n/ucs/unicode-iso10646-oview.html
For comparison:
Introducing Unicode to ECMAScript --
http://www-4.ibm.com/software/developer/library/internationalization-support.html
Encodings: Encodings:
Overview: Overview:
@ -832,7 +837,7 @@ Encodings:
History of this Proposal: History of this Proposal:
------------------------- -------------------------
1.2: 1.2: Removed POD about codecs.open()
1.1: Added note about comparisons and hash values. Added note about 1.1: Added note about comparisons and hash values. Added note about
case mapping algorithms. Changed stream codecs .read() and case mapping algorithms. Changed stream codecs .read() and
.write() method to match the standard file-like object methods .write() method to match the standard file-like object methods

View File

@ -1054,7 +1054,7 @@ strop_translate(self, args)
strstr replacement for arbitrary blocks of memory. strstr replacement for arbitrary blocks of memory.
Locates the first occurance in the memory pointed to by MEM of the Locates the first occurrence in the memory pointed to by MEM of the
contents of memory pointed to by PAT. Returns the index into MEM if contents of memory pointed to by PAT. Returns the index into MEM if
found, or -1 if not found. If len of PAT is greater than length of found, or -1 if not found. If len of PAT is greater than length of
MEM, the function returns -1. MEM, the function returns -1.

View File

@ -1395,7 +1395,7 @@ string_translate(self, args)
strstr replacement for arbitrary blocks of memory. strstr replacement for arbitrary blocks of memory.
Locates the first occurance in the memory pointed to by MEM of the Locates the first occurrence in the memory pointed to by MEM of the
contents of memory pointed to by PAT. Returns the index into MEM if contents of memory pointed to by PAT. Returns the index into MEM if
found, or -1 if not found. If len of PAT is greater than length of found, or -1 if not found. If len of PAT is greater than length of
MEM, the function returns -1. MEM, the function returns -1.
@ -1578,7 +1578,7 @@ string_replace(self, args)
return NULL; return NULL;
if (sub_len <= 0) { if (sub_len <= 0) {
PyErr_SetString(PyExc_ValueError, "empty replacement string"); PyErr_SetString(PyExc_ValueError, "empty pattern string");
return NULL; return NULL;
} }
new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len); new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len);

View File

@ -83,7 +83,7 @@ Unicode Integration Proposal (see file Misc/unicode.txt).
all objects on the free list having a size less than this all objects on the free list having a size less than this
limit. This reduces malloc() overhead for small Unicode objects. limit. This reduces malloc() overhead for small Unicode objects.
At worse this will result in MAX_UNICODE_FREELIST_SIZE * At worst this will result in MAX_UNICODE_FREELIST_SIZE *
(sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT + (sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
malloc()-overhead) bytes of unused garbage. malloc()-overhead) bytes of unused garbage.
@ -180,7 +180,7 @@ PyUnicodeObject *_PyUnicode_New(int length)
unicode_freelist = *(PyUnicodeObject **)unicode_freelist; unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
unicode_freelist_size--; unicode_freelist_size--;
unicode->ob_type = &PyUnicode_Type; unicode->ob_type = &PyUnicode_Type;
_Py_NewReference(unicode); _Py_NewReference((PyObject *)unicode);
if (unicode->str) { if (unicode->str) {
if (unicode->length < length && if (unicode->length < length &&
_PyUnicode_Resize(unicode, length)) { _PyUnicode_Resize(unicode, length)) {
@ -199,16 +199,19 @@ PyUnicodeObject *_PyUnicode_New(int length)
unicode->str = PyMem_NEW(Py_UNICODE, length + 1); unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
} }
if (!unicode->str) { if (!unicode->str)
PyMem_DEL(unicode); goto onError;
PyErr_NoMemory();
return NULL;
}
unicode->str[length] = 0; unicode->str[length] = 0;
unicode->length = length; unicode->length = length;
unicode->hash = -1; unicode->hash = -1;
unicode->utf8str = NULL; unicode->utf8str = NULL;
return unicode; return unicode;
onError:
_Py_ForgetReference((PyObject *)unicode);
PyMem_DEL(unicode);
PyErr_NoMemory();
return NULL;
} }
static static
@ -224,7 +227,6 @@ void _PyUnicode_Free(register PyUnicodeObject *unicode)
*(PyUnicodeObject **)unicode = unicode_freelist; *(PyUnicodeObject **)unicode = unicode_freelist;
unicode_freelist = unicode; unicode_freelist = unicode;
unicode_freelist_size++; unicode_freelist_size++;
_Py_ForgetReference(unicode);
} }
else { else {
free(unicode->str); free(unicode->str);
@ -489,7 +491,7 @@ int utf8_decoding_error(const char **source,
} }
else { else {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"UTF-8 decoding error; unkown error handling code: %s", "UTF-8 decoding error; unknown error handling code: %s",
errors); errors);
return -1; return -1;
} }
@ -611,7 +613,7 @@ int utf8_encoding_error(const Py_UNICODE **source,
else { else {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"UTF-8 encoding error; " "UTF-8 encoding error; "
"unkown error handling code: %s", "unknown error handling code: %s",
errors); errors);
return -1; return -1;
} }
@ -733,7 +735,7 @@ int utf16_decoding_error(const Py_UNICODE **source,
} }
else { else {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"UTF-16 decoding error; unkown error handling code: %s", "UTF-16 decoding error; unknown error handling code: %s",
errors); errors);
return -1; return -1;
} }
@ -921,7 +923,7 @@ int unicodeescape_decoding_error(const char **source,
else { else {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"Unicode-Escape decoding error; " "Unicode-Escape decoding error; "
"unkown error handling code: %s", "unknown error handling code: %s",
errors); errors);
return -1; return -1;
} }
@ -1051,6 +1053,10 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
*/ */
static const Py_UNICODE *findchar(const Py_UNICODE *s,
int size,
Py_UNICODE ch);
static static
PyObject *unicodeescape_string(const Py_UNICODE *s, PyObject *unicodeescape_string(const Py_UNICODE *s,
int size, int size,
@ -1069,9 +1075,6 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
p = q = PyString_AS_STRING(repr); p = q = PyString_AS_STRING(repr);
if (quotes) { if (quotes) {
static const Py_UNICODE *findchar(const Py_UNICODE *s,
int size,
Py_UNICODE ch);
*p++ = 'u'; *p++ = 'u';
*p++ = (findchar(s, size, '\'') && *p++ = (findchar(s, size, '\'') &&
!findchar(s, size, '"')) ? '"' : '\''; !findchar(s, size, '"')) ? '"' : '\'';
@ -1298,7 +1301,7 @@ int latin1_encoding_error(const Py_UNICODE **source,
else { else {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"Latin-1 encoding error; " "Latin-1 encoding error; "
"unkown error handling code: %s", "unknown error handling code: %s",
errors); errors);
return -1; return -1;
} }
@ -1369,7 +1372,7 @@ int ascii_decoding_error(const char **source,
else { else {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"ASCII decoding error; " "ASCII decoding error; "
"unkown error handling code: %s", "unknown error handling code: %s",
errors); errors);
return -1; return -1;
} }
@ -1431,7 +1434,7 @@ int ascii_encoding_error(const Py_UNICODE **source,
else { else {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"ASCII encoding error; " "ASCII encoding error; "
"unkown error handling code: %s", "unknown error handling code: %s",
errors); errors);
return -1; return -1;
} }
@ -1502,7 +1505,7 @@ int charmap_decoding_error(const char **source,
else { else {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"charmap decoding error; " "charmap decoding error; "
"unkown error handling code: %s", "unknown error handling code: %s",
errors); errors);
return -1; return -1;
} }
@ -1618,7 +1621,7 @@ int charmap_encoding_error(const Py_UNICODE **source,
else { else {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"charmap encoding error; " "charmap encoding error; "
"unkown error handling code: %s", "unknown error handling code: %s",
errors); errors);
return -1; return -1;
} }
@ -1750,7 +1753,7 @@ int translate_error(const Py_UNICODE **source,
else { else {
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"translate error; " "translate error; "
"unkown error handling code: %s", "unknown error handling code: %s",
errors); errors);
return -1; return -1;
} }

View File

@ -93,9 +93,14 @@ PyObject *lowercasestring(const char *string)
PyObject *_PyCodec_Lookup(const char *encoding) PyObject *_PyCodec_Lookup(const char *encoding)
{ {
PyObject *result, *args = NULL, *v; PyObject *result, *args = NULL, *v = NULL;
int i, len; int i, len;
if (_PyCodec_SearchCache == NULL || _PyCodec_SearchPath == NULL) {
PyErr_SetString(PyExc_SystemError,
"codec module not properly initialized");
goto onError;
}
if (!import_encodings_called) if (!import_encodings_called)
import_encodings(); import_encodings();
@ -109,6 +114,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
result = PyDict_GetItem(_PyCodec_SearchCache, v); result = PyDict_GetItem(_PyCodec_SearchCache, v);
if (result != NULL) { if (result != NULL) {
Py_INCREF(result); Py_INCREF(result);
Py_DECREF(v);
return result; return result;
} }
@ -121,6 +127,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
if (args == NULL) if (args == NULL)
goto onError; goto onError;
PyTuple_SET_ITEM(args,0,v); PyTuple_SET_ITEM(args,0,v);
v = NULL;
for (i = 0; i < len; i++) { for (i = 0; i < len; i++) {
PyObject *func; PyObject *func;
@ -146,7 +153,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
if (i == len) { if (i == len) {
/* XXX Perhaps we should cache misses too ? */ /* XXX Perhaps we should cache misses too ? */
PyErr_SetString(PyExc_LookupError, PyErr_SetString(PyExc_LookupError,
"unkown encoding"); "unknown encoding");
goto onError; goto onError;
} }
@ -156,6 +163,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
return result; return result;
onError: onError:
Py_XDECREF(v);
Py_XDECREF(args); Py_XDECREF(args);
return NULL; return NULL;
} }
@ -378,5 +386,7 @@ void _PyCodecRegistry_Init()
void _PyCodecRegistry_Fini() void _PyCodecRegistry_Fini()
{ {
Py_XDECREF(_PyCodec_SearchPath); Py_XDECREF(_PyCodec_SearchPath);
_PyCodec_SearchPath = NULL;
Py_XDECREF(_PyCodec_SearchCache); Py_XDECREF(_PyCodec_SearchCache);
_PyCodec_SearchCache = NULL;
} }