From 51ac58039f62ef9d605974dae32a6ada9c26039b Mon Sep 17 00:00:00 2001
From: Barry Warsaw
Date: Mon, 20 Mar 2000 16:36:48 +0000
Subject: [PATCH] On 17-Mar-2000, Marc-Andre Lemburg said:

Attached you find an update of the Unicode implementation. The patch is
against the current CVS version. I would appreciate it if someone with CVS
checkin permissions could check the changes in. The patch contains all bug
fixes and patches sent this week and also fixes a leak in the codecs code
and a bug in the free list code for Unicode objects (which only shows up
when compiling Python with Py_DEBUG; thanks to MarkH for spotting this one).
---
 Include/unicodeobject.h   | 16 ++++++++------
 Lib/encodings/__init__.py |  6 +++---
 Lib/test/test_string.py   |  1 +
 Lib/test/test_unicode.py  |  1 +
 Misc/unicode.txt          | 11 +++++++---
 Modules/stropmodule.c     |  2 +-
 Objects/stringobject.c    |  4 ++--
 Objects/unicodeobject.c   | 45 +++++++++++++++++++++------------------
 Python/codecs.c           | 14 ++++++++++--
 9 files changed, 61 insertions(+), 39 deletions(-)

diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h
index 37f2b0d3f47..770ecab52c4 100644
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -1,8 +1,5 @@
 #ifndef Py_UNICODEOBJECT_H
 #define Py_UNICODEOBJECT_H
-#ifdef __cplusplus
-extern "C" {
-#endif
 
 /*
 
@@ -109,8 +106,9 @@ typedef unsigned short Py_UNICODE;
 
 /* --- Internal Unicode Operations ---------------------------------------- */
 
 /* If you want Python to use the compiler's wctype.h functions instead
-   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS.
-   This reduces the interpreter's code size. */
+   of the ones supplied with Python, define WANT_WCTYPE_FUNCTIONS or
+   configure Python using --with-ctype-functions. This reduces the
+   interpreter's code size. */
 
 #if defined(HAVE_USABLE_WCHAR_T) && defined(WANT_WCTYPE_FUNCTIONS)
 
@@ -169,6 +167,10 @@ typedef unsigned short Py_UNICODE;
     (!memcmp((string)->str + (offset), (substring)->str,\
               (substring)->length*sizeof(Py_UNICODE)))
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* --- Unicode Type ------------------------------------------------------- */
 
 typedef struct {
@@ -647,7 +649,7 @@ extern DL_IMPORT(int) PyUnicode_Find(
     int direction /* Find direction: +1 forward, -1 backward */
     );
 
-/* Count the number of occurances of substr in str[start:end]. */
+/* Count the number of occurrences of substr in str[start:end]. */
 
 extern DL_IMPORT(int) PyUnicode_Count(
     PyObject *str, /* String */
@@ -656,7 +658,7 @@ extern DL_IMPORT(int) PyUnicode_Count(
     int end /* Stop index */
     );
 
-/* Replace at most maxcount occurances of substr in str with replstr
+/* Replace at most maxcount occurrences of substr in str with replstr
    and return the resulting Unicode object. */
 
 extern DL_IMPORT(PyObject *) PyUnicode_Replace(
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py
index c33b822a5b2..cd5876e7df6 100644
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -30,13 +30,13 @@ Written by Marc-Andre Lemburg (mal@lemburg.com).
 
import string,codecs,aliases _cache = {} -_unkown = '--unkown--' +_unknown = '--unknown--' def search_function(encoding): # Cache lookup - entry = _cache.get(encoding,_unkown) - if entry is not _unkown: + entry = _cache.get(encoding,_unknown) + if entry is not _unknown: return entry # Import the module diff --git a/Lib/test/test_string.py b/Lib/test/test_string.py index bb6d035f9b6..4a3e474cd87 100644 --- a/Lib/test/test_string.py +++ b/Lib/test/test_string.py @@ -143,6 +143,7 @@ test('translate', 'abc', 'Abc', table) test('translate', 'xyz', 'xyz', table) test('replace', 'one!two!three!', 'one@two!three!', '!', '@', 1) +test('replace', 'one!two!three!', 'onetwothree', '!', '') test('replace', 'one!two!three!', 'one@two@three!', '!', '@', 2) test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 3) test('replace', 'one!two!three!', 'one@two@three@', '!', '@', 4) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 64b8ff8bd36..69d4273ace8 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -108,6 +108,7 @@ if 0: test('translate', u'xyz', u'xyz', table) test('replace', u'one!two!three!', u'one@two!three!', u'!', u'@', 1) +test('replace', u'one!two!three!', u'onetwothree', '!', '') test('replace', u'one!two!three!', u'one@two@three!', u'!', u'@', 2) test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 3) test('replace', u'one!two!three!', u'one@two@three@', u'!', u'@', 4) diff --git a/Misc/unicode.txt b/Misc/unicode.txt index b31beefcfcd..9a4832afce8 100644 --- a/Misc/unicode.txt +++ b/Misc/unicode.txt @@ -743,8 +743,9 @@ For explicit handling of files using Unicode, the standard stream codecs as available through the codecs module should be used. -XXX There should be a short-cut open(filename,mode,encoding) available which - also assures that mode contains the 'b' character when needed. +The codecs module should provide a short-cut open(filename,mode,encoding) +available which also assures that mode contains the 'b' character when +needed. File/Stream Input: @@ -810,6 +811,10 @@ Unicode-Mappings: Introduction to Unicode (a little outdated by still nice to read): http://www.nada.kth.se/i18n/ucs/unicode-iso10646-oview.html +For comparison: + Introducing Unicode to ECMAScript -- + http://www-4.ibm.com/software/developer/library/internationalization-support.html + Encodings: Overview: @@ -832,7 +837,7 @@ Encodings: History of this Proposal: ------------------------- -1.2: +1.2: Removed POD about codecs.open() 1.1: Added note about comparisons and hash values. Added note about case mapping algorithms. Changed stream codecs .read() and .write() method to match the standard file-like object methods diff --git a/Modules/stropmodule.c b/Modules/stropmodule.c index a0d8b9a1dc3..4c9ee765389 100644 --- a/Modules/stropmodule.c +++ b/Modules/stropmodule.c @@ -1054,7 +1054,7 @@ strop_translate(self, args) strstr replacement for arbitrary blocks of memory. - Locates the first occurance in the memory pointed to by MEM of the + Locates the first occurrence in the memory pointed to by MEM of the contents of memory pointed to by PAT. Returns the index into MEM if found, or -1 if not found. If len of PAT is greater than length of MEM, the function returns -1. diff --git a/Objects/stringobject.c b/Objects/stringobject.c index 2d404b92bb9..10257f7562d 100644 --- a/Objects/stringobject.c +++ b/Objects/stringobject.c @@ -1395,7 +1395,7 @@ string_translate(self, args) strstr replacement for arbitrary blocks of memory. 
- Locates the first occurance in the memory pointed to by MEM of the + Locates the first occurrence in the memory pointed to by MEM of the contents of memory pointed to by PAT. Returns the index into MEM if found, or -1 if not found. If len of PAT is greater than length of MEM, the function returns -1. @@ -1578,7 +1578,7 @@ string_replace(self, args) return NULL; if (sub_len <= 0) { - PyErr_SetString(PyExc_ValueError, "empty replacement string"); + PyErr_SetString(PyExc_ValueError, "empty pattern string"); return NULL; } new_s = mymemreplace(str,len,sub,sub_len,repl,repl_len,count,&out_len); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index da12da26448..d63165ea05b 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -83,7 +83,7 @@ Unicode Integration Proposal (see file Misc/unicode.txt). all objects on the free list having a size less than this limit. This reduces malloc() overhead for small Unicode objects. - At worse this will result in MAX_UNICODE_FREELIST_SIZE * + At worst this will result in MAX_UNICODE_FREELIST_SIZE * (sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT + malloc()-overhead) bytes of unused garbage. @@ -180,7 +180,7 @@ PyUnicodeObject *_PyUnicode_New(int length) unicode_freelist = *(PyUnicodeObject **)unicode_freelist; unicode_freelist_size--; unicode->ob_type = &PyUnicode_Type; - _Py_NewReference(unicode); + _Py_NewReference((PyObject *)unicode); if (unicode->str) { if (unicode->length < length && _PyUnicode_Resize(unicode, length)) { @@ -199,16 +199,19 @@ PyUnicodeObject *_PyUnicode_New(int length) unicode->str = PyMem_NEW(Py_UNICODE, length + 1); } - if (!unicode->str) { - PyMem_DEL(unicode); - PyErr_NoMemory(); - return NULL; - } + if (!unicode->str) + goto onError; unicode->str[length] = 0; unicode->length = length; unicode->hash = -1; unicode->utf8str = NULL; return unicode; + + onError: + _Py_ForgetReference((PyObject *)unicode); + PyMem_DEL(unicode); + PyErr_NoMemory(); + return NULL; } static @@ -224,7 +227,6 @@ void _PyUnicode_Free(register PyUnicodeObject *unicode) *(PyUnicodeObject **)unicode = unicode_freelist; unicode_freelist = unicode; unicode_freelist_size++; - _Py_ForgetReference(unicode); } else { free(unicode->str); @@ -489,7 +491,7 @@ int utf8_decoding_error(const char **source, } else { PyErr_Format(PyExc_ValueError, - "UTF-8 decoding error; unkown error handling code: %s", + "UTF-8 decoding error; unknown error handling code: %s", errors); return -1; } @@ -611,7 +613,7 @@ int utf8_encoding_error(const Py_UNICODE **source, else { PyErr_Format(PyExc_ValueError, "UTF-8 encoding error; " - "unkown error handling code: %s", + "unknown error handling code: %s", errors); return -1; } @@ -733,7 +735,7 @@ int utf16_decoding_error(const Py_UNICODE **source, } else { PyErr_Format(PyExc_ValueError, - "UTF-16 decoding error; unkown error handling code: %s", + "UTF-16 decoding error; unknown error handling code: %s", errors); return -1; } @@ -921,7 +923,7 @@ int unicodeescape_decoding_error(const char **source, else { PyErr_Format(PyExc_ValueError, "Unicode-Escape decoding error; " - "unkown error handling code: %s", + "unknown error handling code: %s", errors); return -1; } @@ -1051,6 +1053,10 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, */ +static const Py_UNICODE *findchar(const Py_UNICODE *s, + int size, + Py_UNICODE ch); + static PyObject *unicodeescape_string(const Py_UNICODE *s, int size, @@ -1069,9 +1075,6 @@ PyObject *unicodeescape_string(const Py_UNICODE *s, p = q = PyString_AS_STRING(repr); if (quotes) { 
- static const Py_UNICODE *findchar(const Py_UNICODE *s, - int size, - Py_UNICODE ch); *p++ = 'u'; *p++ = (findchar(s, size, '\'') && !findchar(s, size, '"')) ? '"' : '\''; @@ -1298,7 +1301,7 @@ int latin1_encoding_error(const Py_UNICODE **source, else { PyErr_Format(PyExc_ValueError, "Latin-1 encoding error; " - "unkown error handling code: %s", + "unknown error handling code: %s", errors); return -1; } @@ -1369,7 +1372,7 @@ int ascii_decoding_error(const char **source, else { PyErr_Format(PyExc_ValueError, "ASCII decoding error; " - "unkown error handling code: %s", + "unknown error handling code: %s", errors); return -1; } @@ -1431,7 +1434,7 @@ int ascii_encoding_error(const Py_UNICODE **source, else { PyErr_Format(PyExc_ValueError, "ASCII encoding error; " - "unkown error handling code: %s", + "unknown error handling code: %s", errors); return -1; } @@ -1502,7 +1505,7 @@ int charmap_decoding_error(const char **source, else { PyErr_Format(PyExc_ValueError, "charmap decoding error; " - "unkown error handling code: %s", + "unknown error handling code: %s", errors); return -1; } @@ -1618,7 +1621,7 @@ int charmap_encoding_error(const Py_UNICODE **source, else { PyErr_Format(PyExc_ValueError, "charmap encoding error; " - "unkown error handling code: %s", + "unknown error handling code: %s", errors); return -1; } @@ -1750,7 +1753,7 @@ int translate_error(const Py_UNICODE **source, else { PyErr_Format(PyExc_ValueError, "translate error; " - "unkown error handling code: %s", + "unknown error handling code: %s", errors); return -1; } diff --git a/Python/codecs.c b/Python/codecs.c index 5075a20d666..2d493776008 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -93,9 +93,14 @@ PyObject *lowercasestring(const char *string) PyObject *_PyCodec_Lookup(const char *encoding) { - PyObject *result, *args = NULL, *v; + PyObject *result, *args = NULL, *v = NULL; int i, len; + if (_PyCodec_SearchCache == NULL || _PyCodec_SearchPath == NULL) { + PyErr_SetString(PyExc_SystemError, + "codec module not properly initialized"); + goto onError; + } if (!import_encodings_called) import_encodings(); @@ -109,6 +114,7 @@ PyObject *_PyCodec_Lookup(const char *encoding) result = PyDict_GetItem(_PyCodec_SearchCache, v); if (result != NULL) { Py_INCREF(result); + Py_DECREF(v); return result; } @@ -121,6 +127,7 @@ PyObject *_PyCodec_Lookup(const char *encoding) if (args == NULL) goto onError; PyTuple_SET_ITEM(args,0,v); + v = NULL; for (i = 0; i < len; i++) { PyObject *func; @@ -146,7 +153,7 @@ PyObject *_PyCodec_Lookup(const char *encoding) if (i == len) { /* XXX Perhaps we should cache misses too ? */ PyErr_SetString(PyExc_LookupError, - "unkown encoding"); + "unknown encoding"); goto onError; } @@ -156,6 +163,7 @@ PyObject *_PyCodec_Lookup(const char *encoding) return result; onError: + Py_XDECREF(v); Py_XDECREF(args); return NULL; } @@ -378,5 +386,7 @@ void _PyCodecRegistry_Init() void _PyCodecRegistry_Fini() { Py_XDECREF(_PyCodec_SearchPath); + _PyCodec_SearchPath = NULL; Py_XDECREF(_PyCodec_SearchCache); + _PyCodec_SearchCache = NULL; }
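
A usage note on the error handler messages corrected throughout
Objects/unicodeobject.c: the built-in codecs recognize the error handling
codes 'strict', 'ignore' and 'replace', and any other name now reports the
correctly spelled "unknown error handling code" ValueError. The snippet
below is only an illustrative sketch, assuming the unicode() builtin from
the Unicode integration accepts a (string, encoding, errors) signature:

    # Illustrative sketch only -- not part of this patch.
    # 'strict' raises an exception, 'replace' substitutes a replacement
    # character, 'ignore' drops the offending byte; any other code name
    # raises the ValueError whose message is fixed above.
    u = unicode('abc\xff', 'ascii', 'replace')   # bad byte replaced
    u = unicode('abc\xff', 'ascii', 'ignore')    # bad byte dropped
    try:
        unicode('abc\xff', 'ascii', 'bogus')     # unknown handler name
    except ValueError, why:
        # e.g. "ASCII decoding error; unknown error handling code: bogus"
        print why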
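
Similarly, the Misc/unicode.txt change states that the codecs module should
provide an open(filename,mode,encoding) short-cut which makes sure the mode
contains the 'b' character when an encoding is given. The helper below is a
rough sketch of that idea, not code added by this patch; it assumes
codecs.lookup() keeps returning the (encoder, decoder, StreamReader,
StreamWriter) tuple and it only handles plain read or write modes:

    import __builtin__
    import codecs

    def open(filename, mode='rb', encoding=None, errors='strict'):
        # Hypothetical short-cut as described in Misc/unicode.txt.
        if encoding is not None and 'b' not in mode:
            # Stream codecs work on raw bytes, so force binary mode.
            mode = mode + 'b'
        file = __builtin__.open(filename, mode)
        if encoding is None:
            return file
        encode, decode, StreamReader, StreamWriter = codecs.lookup(encoding)
        if 'w' in mode or 'a' in mode:
            return StreamWriter(file, errors)
        # Mixed read/write ('+') modes would need a combined reader/writer
        # wrapper; that is omitted from this sketch.
        return StreamReader(file, errors)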