diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 54db9aaddf0..c4c8d8d6409 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1138,142 +1138,104 @@ onError: return NULL; } -/* Not used anymore, now that the encoder supports UTF-16 - surrogates. */ -#if 0 -static -int utf8_encoding_error(const Py_UNICODE **source, - char **dest, - const char *errors, - const char *details) -{ - if ((errors == NULL) || - (strcmp(errors,"strict") == 0)) { - PyErr_Format(PyExc_UnicodeError, - "UTF-8 encoding error: %.400s", - details); - return -1; - } - else if (strcmp(errors,"ignore") == 0) { - return 0; - } - else if (strcmp(errors,"replace") == 0) { - **dest = '?'; - (*dest)++; - return 0; - } - else { - PyErr_Format(PyExc_ValueError, - "UTF-8 encoding error; " - "unknown error handling code: %.400s", - errors); - return -1; - } -} -#endif - -/* Allocation strategy: we default to Latin-1, then do one resize - whenever we hit an order boundary. The assumption is that - characters from higher orders usually occur often enough to warrant - this. +/* Allocation strategy: if the string is short, convert into a stack buffer + and allocate exactly as much space needed at the end. Else allocate the + maximum possible needed (4 result bytes per Unicode character), and return + the excess memory at the end. */ - PyObject * PyUnicode_EncodeUTF8(const Py_UNICODE *s, int size, const char *errors) { - PyObject *v; - char *p; - int len; - int i = 0; - long overalloc = 2; - int nallocated; /* overalloc * size; PyString_ adds one more for \0 */ +#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */ - /* Short-cut for empty strings */ - if (size == 0) - return PyString_FromStringAndSize(NULL, 0); + int i; /* index into s of next input byte */ + PyObject *v; /* result string object */ + char *p; /* next free byte in output buffer */ + int nallocated; /* number of result bytes allocated */ + int nneeded; /* number of result bytes needed */ + char stackbuf[MAX_SHORT_UNICHARS * 4]; - nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int); - v = PyString_FromStringAndSize(NULL, nallocated); - if (v == NULL) - return NULL; + assert(s != NULL); + assert(size >= 0); - p = PyString_AS_STRING(v); + if (size <= MAX_SHORT_UNICHARS) { + /* Write into the stack buffer; nallocated can't overflow. + * At the end, we'll allocate exactly as much heap space as it + * turns out we need. + */ + nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int); + v = NULL; /* will allocate after we're done */ + p = stackbuf; + } + else { + /* Overallocate on the heap, and give the excess back at the end. */ + nallocated = size * 4; + if (nallocated / 4 != size) /* overflow! */ + return PyErr_NoMemory(); + v = PyString_FromStringAndSize(NULL, nallocated); + if (v == NULL) + return NULL; + p = PyString_AS_STRING(v); + } - while (i < size) { + for (i = 0; i < size;) { Py_UCS4 ch = s[i++]; if (ch < 0x80) - /* Encode ASCII */ + /* Encode ASCII */ *p++ = (char) ch; else if (ch < 0x0800) { - /* Encode Latin-1 */ + /* Encode Latin-1 */ *p++ = (char)(0xc0 | (ch >> 6)); *p++ = (char)(0x80 | (ch & 0x3f)); } - else { - /* Encode UCS2 Unicode ordinals */ - if (ch < 0x10000) { - - /* Special case: check for high surrogate */ - if (0xD800 <= ch && ch <= 0xDBFF && i != size) { - Py_UCS4 ch2 = s[i]; - /* Check for low surrogate and combine the two to - form a UCS4 value */ - if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { + /* Encode UCS2 Unicode ordinals */ + if (ch < 0x10000) { + /* Special case: check for high surrogate */ + if (0xD800 <= ch && ch <= 0xDBFF && i != size) { + Py_UCS4 ch2 = s[i]; + /* Check for low surrogate and combine the two to + form a UCS4 value */ + if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000; - i++; - goto encodeUCS4; + i++; + goto encodeUCS4; } - /* Fall through: handles isolated high surrogates */ + /* Fall through: handles isolated high surrogates */ } - - if (overalloc < 3) { - len = Py_SAFE_DOWNCAST(p-PyString_AS_STRING(v), long, int); - assert(len <= nallocated); - overalloc = 3; - nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int); - if (_PyString_Resize(&v, nallocated)) - goto onError; - p = PyString_AS_STRING(v) + len; - } *p++ = (char)(0xe0 | (ch >> 12)); - *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *p++ = (char)(0x80 | (ch & 0x3f)); - continue; - } - - /* Encode UCS4 Unicode ordinals */ - encodeUCS4: - if (overalloc < 4) { - len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int); - assert(len <= nallocated); - overalloc = 4; - nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int); - if (_PyString_Resize(&v, nallocated)) - goto onError; - p = PyString_AS_STRING(v) + len; - } - *p++ = (char)(0xf0 | (ch >> 18)); - *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); - *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); - *p++ = (char)(0x80 | (ch & 0x3f)); - } + *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *p++ = (char)(0x80 | (ch & 0x3f)); + continue; + } +encodeUCS4: + /* Encode UCS4 Unicode ordinals */ + *p++ = (char)(0xf0 | (ch >> 18)); + *p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); + *p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); + *p++ = (char)(0x80 | (ch & 0x3f)); + } } - *p = '\0'; - len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int); - assert(len <= nallocated); - if (_PyString_Resize(&v, len)) - goto onError; + if (v == NULL) { + /* This was stack allocated. */ + nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int); + assert(nneeded <= nallocated); + v = PyString_FromStringAndSize(stackbuf, nneeded); + } + else { + /* Cut back to size actually needed. */ + nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int); + assert(nneeded <= nallocated); + _PyString_Resize(&v, nneeded); + } return v; - onError: - Py_DECREF(v); - return NULL; +#undef MAX_SHORT_UNICHARS } PyObject *PyUnicode_AsUTF8String(PyObject *unicode)