SF patch 549375: Compromise PyUnicode_EncodeUTF8
This implements ideas from Marc-Andre, Martin, Guido and me on Python-Dev. "Short" Unicode strings are encoded into a "big enough" stack buffer, then exactly as much string space as they turn out to need is allocated at the end. This should have speed benefits akin to Martin's "measure once, allocate once" strategy, but without needing a distinct measuring pass. "Long" Unicode strings allocate as much heap space as they could possibly need (4 x # Unicode chars), and do a realloc at the end to return the untouched excess. Since the overallocation is likely to be substantial, this shouldn't burden the platform realloc with unusably small excess blocks. Also simplified uses of the PyString_xyz functions. Also added a release-build check that 4*size doesn't overflow a C int. Sooner or later, that's going to happen.
This commit is contained in:
parent
73364e64e5
commit
602f740bc2
|
@ -1138,70 +1138,49 @@ onError:
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Not used anymore, now that the encoder supports UTF-16
|
/* Allocation strategy: if the string is short, convert into a stack buffer
|
||||||
surrogates. */
|
and allocate exactly as much space needed at the end. Else allocate the
|
||||||
#if 0
|
maximum possible needed (4 result bytes per Unicode character), and return
|
||||||
static
|
the excess memory at the end.
|
||||||
int utf8_encoding_error(const Py_UNICODE **source,
|
|
||||||
char **dest,
|
|
||||||
const char *errors,
|
|
||||||
const char *details)
|
|
||||||
{
|
|
||||||
if ((errors == NULL) ||
|
|
||||||
(strcmp(errors,"strict") == 0)) {
|
|
||||||
PyErr_Format(PyExc_UnicodeError,
|
|
||||||
"UTF-8 encoding error: %.400s",
|
|
||||||
details);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
else if (strcmp(errors,"ignore") == 0) {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
else if (strcmp(errors,"replace") == 0) {
|
|
||||||
**dest = '?';
|
|
||||||
(*dest)++;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
PyErr_Format(PyExc_ValueError,
|
|
||||||
"UTF-8 encoding error; "
|
|
||||||
"unknown error handling code: %.400s",
|
|
||||||
errors);
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Allocation strategy: we default to Latin-1, then do one resize
|
|
||||||
whenever we hit an order boundary. The assumption is that
|
|
||||||
characters from higher orders usually occur often enough to warrant
|
|
||||||
this.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
int size,
|
int size,
|
||||||
const char *errors)
|
const char *errors)
|
||||||
{
|
{
|
||||||
PyObject *v;
|
#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
|
||||||
char *p;
|
|
||||||
int len;
|
|
||||||
int i = 0;
|
|
||||||
long overalloc = 2;
|
|
||||||
int nallocated; /* overalloc * size; PyString_ adds one more for \0 */
|
|
||||||
|
|
||||||
/* Short-cut for empty strings */
|
int i; /* index into s of next input byte */
|
||||||
if (size == 0)
|
PyObject *v; /* result string object */
|
||||||
return PyString_FromStringAndSize(NULL, 0);
|
char *p; /* next free byte in output buffer */
|
||||||
|
int nallocated; /* number of result bytes allocated */
|
||||||
|
int nneeded; /* number of result bytes needed */
|
||||||
|
char stackbuf[MAX_SHORT_UNICHARS * 4];
|
||||||
|
|
||||||
nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
|
assert(s != NULL);
|
||||||
|
assert(size >= 0);
|
||||||
|
|
||||||
|
if (size <= MAX_SHORT_UNICHARS) {
|
||||||
|
/* Write into the stack buffer; nallocated can't overflow.
|
||||||
|
* At the end, we'll allocate exactly as much heap space as it
|
||||||
|
* turns out we need.
|
||||||
|
*/
|
||||||
|
nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
|
||||||
|
v = NULL; /* will allocate after we're done */
|
||||||
|
p = stackbuf;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/* Overallocate on the heap, and give the excess back at the end. */
|
||||||
|
nallocated = size * 4;
|
||||||
|
if (nallocated / 4 != size) /* overflow! */
|
||||||
|
return PyErr_NoMemory();
|
||||||
v = PyString_FromStringAndSize(NULL, nallocated);
|
v = PyString_FromStringAndSize(NULL, nallocated);
|
||||||
if (v == NULL)
|
if (v == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
p = PyString_AS_STRING(v);
|
p = PyString_AS_STRING(v);
|
||||||
|
}
|
||||||
|
|
||||||
while (i < size) {
|
for (i = 0; i < size;) {
|
||||||
Py_UCS4 ch = s[i++];
|
Py_UCS4 ch = s[i++];
|
||||||
|
|
||||||
if (ch < 0x80)
|
if (ch < 0x80)
|
||||||
|
@ -1213,11 +1192,9 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
*p++ = (char)(0xc0 | (ch >> 6));
|
*p++ = (char)(0xc0 | (ch >> 6));
|
||||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||||
}
|
}
|
||||||
|
|
||||||
else {
|
else {
|
||||||
/* Encode UCS2 Unicode ordinals */
|
/* Encode UCS2 Unicode ordinals */
|
||||||
if (ch < 0x10000) {
|
if (ch < 0x10000) {
|
||||||
|
|
||||||
/* Special case: check for high surrogate */
|
/* Special case: check for high surrogate */
|
||||||
if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
|
if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
|
||||||
Py_UCS4 ch2 = s[i];
|
Py_UCS4 ch2 = s[i];
|
||||||
|
@ -1230,33 +1207,13 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
}
|
}
|
||||||
/* Fall through: handles isolated high surrogates */
|
/* Fall through: handles isolated high surrogates */
|
||||||
}
|
}
|
||||||
|
|
||||||
if (overalloc < 3) {
|
|
||||||
len = Py_SAFE_DOWNCAST(p-PyString_AS_STRING(v), long, int);
|
|
||||||
assert(len <= nallocated);
|
|
||||||
overalloc = 3;
|
|
||||||
nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
|
|
||||||
if (_PyString_Resize(&v, nallocated))
|
|
||||||
goto onError;
|
|
||||||
p = PyString_AS_STRING(v) + len;
|
|
||||||
}
|
|
||||||
*p++ = (char)(0xe0 | (ch >> 12));
|
*p++ = (char)(0xe0 | (ch >> 12));
|
||||||
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
||||||
*p++ = (char)(0x80 | (ch & 0x3f));
|
*p++ = (char)(0x80 | (ch & 0x3f));
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
encodeUCS4:
|
||||||
/* Encode UCS4 Unicode ordinals */
|
/* Encode UCS4 Unicode ordinals */
|
||||||
encodeUCS4:
|
|
||||||
if (overalloc < 4) {
|
|
||||||
len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
|
|
||||||
assert(len <= nallocated);
|
|
||||||
overalloc = 4;
|
|
||||||
nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
|
|
||||||
if (_PyString_Resize(&v, nallocated))
|
|
||||||
goto onError;
|
|
||||||
p = PyString_AS_STRING(v) + len;
|
|
||||||
}
|
|
||||||
*p++ = (char)(0xf0 | (ch >> 18));
|
*p++ = (char)(0xf0 | (ch >> 18));
|
||||||
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
|
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
|
||||||
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
|
||||||
|
@ -1264,16 +1221,21 @@ PyUnicode_EncodeUTF8(const Py_UNICODE *s,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
*p = '\0';
|
if (v == NULL) {
|
||||||
len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
|
/* This was stack allocated. */
|
||||||
assert(len <= nallocated);
|
nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
|
||||||
if (_PyString_Resize(&v, len))
|
assert(nneeded <= nallocated);
|
||||||
goto onError;
|
v = PyString_FromStringAndSize(stackbuf, nneeded);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/* Cut back to size actually needed. */
|
||||||
|
nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
|
||||||
|
assert(nneeded <= nallocated);
|
||||||
|
_PyString_Resize(&v, nneeded);
|
||||||
|
}
|
||||||
return v;
|
return v;
|
||||||
|
|
||||||
onError:
|
#undef MAX_SHORT_UNICHARS
|
||||||
Py_DECREF(v);
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
|
PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
|
||||||
|
|
Loading…
Reference in New Issue