From 8f825060f1c168b913f2ac299ca48d4e9375f34d Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Fri, 27 Apr 2012 13:55:39 +0200 Subject: [PATCH] Check newly created string consistency using _PyUnicode_CheckConsistency(str, 1) * In debug mode, fill the string data with invalid characters * Simplify also reference counting in PyCodec_BackslashReplaceErrors() and PyCodec_XMLCharRefReplaceErrors() --- Modules/_json.c | 1 + Modules/md5module.c | 1 + Modules/sha1module.c | 1 + Modules/sha256module.c | 1 + Modules/sha512module.c | 1 + Objects/bytesobject.c | 1 + Objects/unicodeobject.c | 27 +++++++++++++++++---------- Python/codecs.c | 10 ++++++---- Python/compile.c | 1 + Python/import.c | 1 + 10 files changed, 31 insertions(+), 14 deletions(-) diff --git a/Modules/_json.c b/Modules/_json.c index 95c658ca7c5..40c2ced5028 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -246,6 +246,7 @@ ascii_escape_unicode(PyObject *pystr) } } output[chars++] = '"'; + assert(_PyUnicode_CheckConsistency(rval, 1)); return rval; } diff --git a/Modules/md5module.c b/Modules/md5module.c index 86f602ebe5e..ee44c4878d1 100644 --- a/Modules/md5module.c +++ b/Modules/md5module.c @@ -397,6 +397,7 @@ MD5_hexdigest(MD5object *self, PyObject *unused) c = (digest[i] & 0xf); hex_digest[j++] = Py_hexdigits[c]; } + assert(_PyUnicode_CheckConsistency(retval, 1)); return retval; } diff --git a/Modules/sha1module.c b/Modules/sha1module.c index 30e5c5018a3..daea8879600 100644 --- a/Modules/sha1module.c +++ b/Modules/sha1module.c @@ -373,6 +373,7 @@ SHA1_hexdigest(SHA1object *self, PyObject *unused) c = (digest[i] & 0xf); hex_digest[j++] = Py_hexdigits[c]; } + assert(_PyUnicode_CheckConsistency(retval, 1)); return retval; } diff --git a/Modules/sha256module.c b/Modules/sha256module.c index f1ef3293666..76d91afda37 100644 --- a/Modules/sha256module.c +++ b/Modules/sha256module.c @@ -466,6 +466,7 @@ SHA256_hexdigest(SHAobject *self, PyObject *unused) c = (digest[i] & 0xf); hex_digest[j++] = Py_hexdigits[c]; } + 
assert(_PyUnicode_CheckConsistency(retval, 1)); return retval; } diff --git a/Modules/sha512module.c b/Modules/sha512module.c index 4f5a1139ee2..88f8a64d062 100644 --- a/Modules/sha512module.c +++ b/Modules/sha512module.c @@ -532,6 +532,7 @@ SHA512_hexdigest(SHAobject *self, PyObject *unused) c = (digest[i] & 0xf); hex_digest[j++] = Py_hexdigits[c]; } + assert(_PyUnicode_CheckConsistency(retval, 1)); return retval; } diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 2e6be431c91..b07be26896f 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -626,6 +626,7 @@ PyBytes_Repr(PyObject *obj, int smartquotes) *p++ = c; } *p++ = quote; + assert(_PyUnicode_CheckConsistency(v, 1)); return v; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 364de90877c..60b0a1fbbd9 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -967,7 +967,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) PyObject *obj; PyCompactUnicodeObject *unicode; void *data; - int kind_state; + enum PyUnicode_Kind kind; int is_sharing, is_ascii; Py_ssize_t char_size; Py_ssize_t struct_size; @@ -986,17 +986,17 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) is_sharing = 0; struct_size = sizeof(PyCompactUnicodeObject); if (maxchar < 128) { - kind_state = PyUnicode_1BYTE_KIND; + kind = PyUnicode_1BYTE_KIND; char_size = 1; is_ascii = 1; struct_size = sizeof(PyASCIIObject); } else if (maxchar < 256) { - kind_state = PyUnicode_1BYTE_KIND; + kind = PyUnicode_1BYTE_KIND; char_size = 1; } else if (maxchar < 65536) { - kind_state = PyUnicode_2BYTE_KIND; + kind = PyUnicode_2BYTE_KIND; char_size = 2; if (sizeof(wchar_t) == 2) is_sharing = 1; @@ -1007,7 +1007,7 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) "invalid maximum character passed to PyUnicode_New"); return NULL; } - kind_state = PyUnicode_4BYTE_KIND; + kind = PyUnicode_4BYTE_KIND; char_size = 4; if (sizeof(wchar_t) == 4) is_sharing = 1; @@ -1041,7 +1041,7 @@ PyUnicode_New(Py_ssize_t size, 
Py_UCS4 maxchar) _PyUnicode_LENGTH(unicode) = size; _PyUnicode_HASH(unicode) = -1; _PyUnicode_STATE(unicode).interned = 0; - _PyUnicode_STATE(unicode).kind = kind_state; + _PyUnicode_STATE(unicode).kind = kind; _PyUnicode_STATE(unicode).compact = 1; _PyUnicode_STATE(unicode).ready = 1; _PyUnicode_STATE(unicode).ascii = is_ascii; @@ -1049,19 +1049,19 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) ((char*)data)[size] = 0; _PyUnicode_WSTR(unicode) = NULL; } - else if (kind_state == PyUnicode_1BYTE_KIND) { + else if (kind == PyUnicode_1BYTE_KIND) { ((char*)data)[size] = 0; _PyUnicode_WSTR(unicode) = NULL; _PyUnicode_WSTR_LENGTH(unicode) = 0; unicode->utf8 = NULL; unicode->utf8_length = 0; - } + } else { unicode->utf8 = NULL; unicode->utf8_length = 0; - if (kind_state == PyUnicode_2BYTE_KIND) + if (kind == PyUnicode_2BYTE_KIND) ((Py_UCS2*)data)[size] = 0; - else /* kind_state == PyUnicode_4BYTE_KIND */ + else /* kind == PyUnicode_4BYTE_KIND */ ((Py_UCS4*)data)[size] = 0; if (is_sharing) { _PyUnicode_WSTR_LENGTH(unicode) = size; @@ -1072,6 +1072,13 @@ PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar) _PyUnicode_WSTR(unicode) = NULL; } } +#ifdef Py_DEBUG + /* Fill the data with invalid characters to detect bugs earlier. + _PyUnicode_CheckConsistency(str, 1) detects invalid characters, + at least for ASCII and UCS-4 strings. U+00FF is invalid in ASCII + and U+FFFFFFFF is an invalid character in Unicode 6.0. 
*/ + memset(data, 0xff, size * kind); +#endif assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0)); return obj; } diff --git a/Python/codecs.c b/Python/codecs.c index 607feea81c3..797a45f5a1c 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -534,6 +534,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) data = PyUnicode_DATA(res); for (i = 0; i < len; ++i) PyUnicode_WRITE(kind, data, i, '?'); + assert(_PyUnicode_CheckConsistency(res, 1)); return Py_BuildValue("(Nn)", res, end); } else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) { @@ -559,6 +560,7 @@ PyObject *PyCodec_ReplaceErrors(PyObject *exc) data = PyUnicode_DATA(res); for (i=0; i < len; i++) PyUnicode_WRITE(kind, data, i, Py_UNICODE_REPLACEMENT_CHARACTER); + assert(_PyUnicode_CheckConsistency(res, 1)); return Py_BuildValue("(Nn)", res, end); } else { @@ -652,8 +654,8 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc) } *outp++ = ';'; } - restuple = Py_BuildValue("(On)", res, end); - Py_DECREF(res); + assert(_PyUnicode_CheckConsistency(res, 1)); + restuple = Py_BuildValue("(Nn)", res, end); Py_DECREF(object); return restuple; } @@ -720,8 +722,8 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) *outp++ = Py_hexdigits[c&0xf]; } - restuple = Py_BuildValue("(On)", res, end); - Py_DECREF(res); + assert(_PyUnicode_CheckConsistency(res, 1)); + restuple = Py_BuildValue("(Nn)", res, end); Py_DECREF(object); return restuple; } diff --git a/Python/compile.c b/Python/compile.c index 79d1d216702..10e9ad27f56 100644 --- a/Python/compile.c +++ b/Python/compile.c @@ -263,6 +263,7 @@ _Py_Mangle(PyObject *privateobj, PyObject *ident) Py_DECREF(result); return NULL; } + assert(_PyUnicode_CheckConsistency(result, 1)); return result; } diff --git a/Python/import.c b/Python/import.c index 8cf10e658c2..103e7de4393 100644 --- a/Python/import.c +++ b/Python/import.c @@ -992,6 +992,7 @@ make_source_pathname(PyObject *path) (j = dot0-right)); PyUnicode_WRITE(kind, data, i+j, 'p'); 
PyUnicode_WRITE(kind, data, i+j+1, 'y'); + assert(_PyUnicode_CheckConsistency(result, 1)); return result; }