From fe226c0d375c221a7ad0c34fda15c5d2c64e0dcc Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Mon, 3 Oct 2011 03:52:20 +0200 Subject: [PATCH] Rewrite PyUnicode_Resize() * Rename _PyUnicode_Resize() to unicode_resize() * unicode_resize() creates a copy if the string cannot be resized instead of failing * Optimize resize_copy() for wstr strings * Disable temporary resize_inplace() --- Objects/unicodeobject.c | 316 +++++++++++++++++++++++++++------------- 1 file changed, 214 insertions(+), 102 deletions(-) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 3b16959f971..230363420dd 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -193,6 +193,8 @@ const unsigned char _Py_ascii_whitespace[] = { 0, 0, 0, 0, 0, 0, 0, 0 }; +static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length); + static PyObject * unicode_encode_call_errorhandler(const char *errors, PyObject **errorHandler,const char *encoding, const char *reason, @@ -320,71 +322,144 @@ Py_LOCAL_INLINE(char *) findchar(void *s, int kind, return NULL; } +static PyObject* +resize_compact(PyObject *unicode, Py_ssize_t length) +{ + Py_ssize_t char_size; + Py_ssize_t struct_size; + Py_ssize_t new_size; + int share_wstr; + + assert(PyUnicode_IS_READY(unicode)); + char_size = PyUnicode_CHARACTER_SIZE(unicode); + if (PyUnicode_IS_COMPACT_ASCII(unicode)) + struct_size = sizeof(PyASCIIObject); + else + struct_size = sizeof(PyCompactUnicodeObject); + share_wstr = (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(unicode)); + + _Py_DEC_REFTOTAL; + _Py_ForgetReference(unicode); + + if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) { + PyErr_NoMemory(); + return NULL; + } + new_size = (struct_size + (length + 1) * char_size); + + unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size); + if (unicode == NULL) { + PyObject_Del(unicode); + PyErr_NoMemory(); + return NULL; + } + _Py_NewReference(unicode); + _PyUnicode_LENGTH(unicode) = length; + if (share_wstr) + _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode); + PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode), + length, 0); + return unicode; +} + static int -unicode_resize(register PyUnicodeObject *unicode, - Py_ssize_t length) +resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length) { void *oldstr; - /* Resizing is only supported for old unicode objects. */ assert(!PyUnicode_IS_COMPACT(unicode)); - assert(_PyUnicode_WSTR(unicode) != NULL); - /* ... and only if they have not been readied yet, because - callees usually rely on the wstr representation when resizing. */ - assert(unicode->data.any == NULL); - - /* Shortcut if there's nothing much to do. */ - if (_PyUnicode_WSTR_LENGTH(unicode) == length) - goto reset; - - /* Resizing shared object (unicode_empty or single character - objects) in-place is not allowed. Use PyUnicode_Resize() - instead ! */ - - if (unicode == unicode_empty || - (_PyUnicode_WSTR_LENGTH(unicode) == 1 && - _PyUnicode_WSTR(unicode)[0] < 256U && - unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) { - PyErr_SetString(PyExc_SystemError, - "can't resize shared str objects"); - return -1; - } - - /* We allocate one more byte to make sure the string is Ux0000 terminated. - The overallocation is also used by fastsearch, which assumes that it's - safe to look at str[length] (without making any assumptions about what - it contains). */ - - oldstr = _PyUnicode_WSTR(unicode); - _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode), - sizeof(Py_UNICODE) * (length + 1)); - if (!_PyUnicode_WSTR(unicode)) { - _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr; - PyErr_NoMemory(); - return -1; - } - _PyUnicode_WSTR(unicode)[length] = 0; - _PyUnicode_WSTR_LENGTH(unicode) = length; - - reset: - if (unicode->data.any != NULL) { - PyObject_FREE(unicode->data.any); - if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != unicode->data.any) { - PyObject_FREE(_PyUnicode_UTF8(unicode)); - } - _PyUnicode_UTF8(unicode) = NULL; - _PyUnicode_UTF8_LENGTH(unicode) = 0; - unicode->data.any = NULL; - _PyUnicode_LENGTH(unicode) = 0; - _PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned; - _PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND; - } + assert(Py_REFCNT(unicode) == 1); _PyUnicode_DIRTY(unicode); + if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) + { + PyObject_DEL(_PyUnicode_UTF8(unicode)); + _PyUnicode_UTF8(unicode) = NULL; + } + + if (PyUnicode_IS_READY(unicode)) { + Py_ssize_t char_size; + Py_ssize_t new_size; + int share_wstr; + void *data; + + data = _PyUnicode_DATA_ANY(unicode); + assert(data != NULL); + char_size = PyUnicode_CHARACTER_SIZE(unicode); + share_wstr = (_PyUnicode_WSTR(unicode) == data); + + if (length > (PY_SSIZE_T_MAX / char_size - 1)) { + PyErr_NoMemory(); + return -1; + } + new_size = (length + 1) * char_size; + + data = (PyObject *)PyObject_REALLOC(data, new_size); + if (data == NULL) { + PyErr_NoMemory(); + return -1; + } + _PyUnicode_DATA_ANY(unicode) = data; + if (share_wstr) + _PyUnicode_WSTR(unicode) = data; + _PyUnicode_LENGTH(unicode) = length; + PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); + if (share_wstr) + return 0; + } + if (_PyUnicode_WSTR(unicode) != NULL) { + assert(_PyUnicode_WSTR(unicode) != NULL); + + oldstr = _PyUnicode_WSTR(unicode); + _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode), + sizeof(Py_UNICODE) * (length + 1)); + if (!_PyUnicode_WSTR(unicode)) { + _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr; + PyErr_NoMemory(); + return -1; + } + _PyUnicode_WSTR(unicode)[length] = 0; + _PyUnicode_WSTR_LENGTH(unicode) = length; + } return 0; } +static PyObject* +resize_copy(PyObject *unicode, Py_ssize_t length) +{ + Py_ssize_t copy_length; + if (PyUnicode_IS_COMPACT(unicode)) { + PyObject *copy; + assert(PyUnicode_IS_READY(unicode)); + + copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode)); + if (copy == NULL) + return NULL; + + copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode)); + if (PyUnicode_CopyCharacters(copy, 0, + unicode, 0, + copy_length) < 0) + { + Py_DECREF(copy); + return NULL; + } + return copy; + } else { + assert(_PyUnicode_WSTR(unicode) != NULL); + assert(_PyUnicode_DATA_ANY(unicode) == NULL); + PyUnicodeObject *w = _PyUnicode_New(length); + if (w == NULL) + return NULL; + copy_length = _PyUnicode_WSTR_LENGTH(unicode); + copy_length = Py_MIN(copy_length, length); + Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode), + copy_length); + return (PyObject*)w; + } +} + /* We allocate one more byte to make sure the string is Ux0000 terminated; some code (e.g. new_identifier) relies on that. @@ -690,7 +765,6 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end, assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + _PyUnicode_GET_LENGTH(unicode))); - return 0; } #endif @@ -1044,50 +1118,84 @@ unicode_dealloc(register PyUnicodeObject *unicode) } static int -_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) +unicode_resizable(PyObject *unicode) { - register PyUnicodeObject *v; - - /* Argument checks */ - if (unicode == NULL) { - PyErr_BadInternalCall(); - return -1; - } - v = *unicode; - if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 || - PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) { - PyErr_BadInternalCall(); - return -1; + if (Py_REFCNT(unicode) != 1) + return 0; + if (PyUnicode_CHECK_INTERNED(unicode)) + return 0; + if (unicode == unicode_empty) + return 0; + if (PyUnicode_WSTR_LENGTH(unicode) == 1) { + Py_UCS4 ch; + if (PyUnicode_IS_COMPACT(unicode)) + ch = PyUnicode_READ_CHAR(unicode, 0); + else + ch = _PyUnicode_WSTR(unicode)[0]; + if (ch < 256 && unicode_latin1[ch] == unicode) + return 0; } + /* FIXME: reenable resize_inplace */ + if (!PyUnicode_IS_COMPACT(unicode)) + return 0; + return 1; +} - /* Resizing unicode_empty and single character objects is not - possible since these are being shared. - The same goes for new-representation unicode objects or objects which - have already been readied. - For these, we simply return a fresh copy with the same Unicode content. - */ - if ((_PyUnicode_WSTR_LENGTH(v) != length && - (v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) || - PyUnicode_IS_COMPACT(v) || v->data.any) { - PyUnicodeObject *w = _PyUnicode_New(length); - if (w == NULL) +static int +unicode_resize(PyObject **p_unicode, Py_ssize_t length) +{ + PyObject *unicode; + Py_ssize_t old_length; + + assert(p_unicode != NULL); + unicode = *p_unicode; + + assert(unicode != NULL); + assert(PyUnicode_Check(unicode)); + assert(0 <= length); + + if (!PyUnicode_IS_COMPACT(unicode) && !PyUnicode_IS_READY(unicode)) + old_length = PyUnicode_WSTR_LENGTH(unicode); + else + old_length = PyUnicode_GET_LENGTH(unicode); + if (old_length == length) + return 0; + + /* FIXME: really create a new object? */ + if (!unicode_resizable(unicode)) { + PyObject *copy = resize_copy(unicode, length); + if (copy == NULL) return -1; - Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v), - length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v)); - Py_DECREF(*unicode); - *unicode = w; + Py_DECREF(*p_unicode); + *p_unicode = copy; return 0; } - /* Note that we don't have to modify *unicode for unshared Unicode - objects, since we can modify them in-place. */ - return unicode_resize(v, length); + if (PyUnicode_IS_COMPACT(unicode)) { + *p_unicode = resize_compact(unicode, length); + if (*p_unicode == NULL) + return -1; + return 0; + } else + return resize_inplace((PyUnicodeObject*)unicode, length); } int -PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) +PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) { - return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); + PyObject *unicode; + if (p_unicode == NULL) { + PyErr_BadInternalCall(); + return -1; + } + unicode = *p_unicode; + if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0 + || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) + { + PyErr_BadInternalCall(); + return -1; + } + return unicode_resize(p_unicode, length); } static PyObject* @@ -3085,7 +3193,7 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, if (requiredsize > outsize) { if (requiredsize<2*outsize) requiredsize = 2*outsize; - if (_PyUnicode_Resize(output, requiredsize) < 0) + if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0) goto onError; *outptr = PyUnicode_AS_UNICODE(*output) + *outpos; } @@ -3375,7 +3483,7 @@ utf7Error: } } - if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) + if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) goto onError; Py_XDECREF(errorHandler); @@ -3944,7 +4052,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, /* Adjust length and ready string when it contained errors and is of the old resizable kind. */ if (kind == PyUnicode_WCHAR_KIND) { - if (_PyUnicode_Resize(&unicode, i) < 0 || + if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 || PyUnicode_READY(unicode) == -1) goto onError; } @@ -4449,7 +4557,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s, *consumed = (const char *)q-starts; /* Adjust length */ - if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) + if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) goto onError; Py_XDECREF(errorHandler); @@ -4847,7 +4955,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s, *consumed = (const char *)q-starts; /* Adjust length */ - if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) + if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) goto onError; Py_XDECREF(errorHandler); @@ -5304,9 +5412,13 @@ PyUnicode_DecodeUnicodeEscape(const char *s, /* Ensure the length prediction worked in case of ASCII strings */ assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length); - if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 || - PyUnicode_READY(v) == -1)) - goto onError; + if (kind == PyUnicode_WCHAR_KIND) + { + if (PyUnicode_Resize((PyObject**)&v, i) < 0) + goto onError; + if (PyUnicode_READY(v) == -1) + goto onError; + } Py_XDECREF(errorHandler); Py_XDECREF(exc); return (PyObject *)v; @@ -5602,7 +5714,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, nextByte: ; } - if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) + if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc); @@ -5790,7 +5902,7 @@ _PyUnicode_DecodeUnicodeInternal(const char *s, } } - if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) + if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc); @@ -6216,7 +6328,7 @@ PyUnicode_DecodeASCII(const char *s, } } if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) - if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) + if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc); @@ -6343,7 +6455,7 @@ decode_mbcs(PyUnicodeObject **v, else { /* Extend unicode object */ n = PyUnicode_GET_SIZE(*v); - if (_PyUnicode_Resize(v, n + usize) < 0) + if (PyUnicode_Resize(v, n + usize) < 0) return -1; } @@ -6682,7 +6794,7 @@ PyUnicode_DecodeCharmap(const char *s, (targetsize << 2); extrachars += needed; /* XXX overflow detection missing */ - if (_PyUnicode_Resize(&v, + if (PyUnicode_Resize((PyObject**)&v, PyUnicode_GET_SIZE(v) + needed) < 0) { Py_DECREF(x); goto onError; @@ -6709,7 +6821,7 @@ PyUnicode_DecodeCharmap(const char *s, } } if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) - if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) + if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0) goto onError; Py_XDECREF(errorHandler); Py_XDECREF(exc);