SF patch 549375: Compromise PyUnicode_EncodeUTF8

This implements ideas from Marc-Andre, Martin, Guido and me on Python-Dev. "Short" Unicode strings are encoded into a "big enough" stack buffer, then exactly as much string space as they turn out to need is allocated at the end. This should have speed benefits akin to Martin's "measure once, allocate once" strategy, but without needing a distinct measuring pass. "Long" Unicode strings allocate as much heap space as they could possibly need (4 x # Unicode chars), and do a realloc at the end to return the untouched excess. Since the overallocation is likely to be substantial, this shouldn't burden the platform realloc with unusably small excess blocks. Also simplified uses of the PyString_xyz functions. Also added a release-build check that 4*size doesn't overflow a C int. Sooner or later, that's going to happen.
2002-04-27 18:03:26 +00:00 · 2002-04-27 18:03:26 +00:00 · 602f740bc2
parent 73364e64e5
commit 602f740bc2
1 changed files with 70 additions and 108 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1138,142 +1138,104 @@ onError:
    return NULL;
 }
-/* Not used anymore, now that the encoder supports UTF-16
+/* Allocation strategy:  if the string is short, convert into a stack buffer
-   surrogates. */
+   and allocate exactly as much space needed at the end.  Else allocate the
-#if 0
+   maximum possible needed (4 result bytes per Unicode character), and return
-static
+   the excess memory at the end.
 int utf8_encoding_error(const Py_UNICODE **source,
 			char **dest,
 			const char *errors,
 			const char *details) 
 {
    if ((errors == NULL) ||
 	(strcmp(errors,"strict") == 0)) {
 	PyErr_Format(PyExc_UnicodeError,
 		     "UTF-8 encoding error: %.400s",
 		     details);
 	return -1;
    }
    else if (strcmp(errors,"ignore") == 0) {
 	return 0;
    }
    else if (strcmp(errors,"replace") == 0) {
 	**dest = '?';
 	(*dest)++;
 	return 0;
    }
    else {
 	PyErr_Format(PyExc_ValueError,
 		     "UTF-8 encoding error; "
 		     "unknown error handling code: %.400s",
 		     errors);
 	return -1;
    }
 }
 #endif
 /* Allocation strategy: we default to Latin-1, then do one resize
   whenever we hit an order boundary. The assumption is that
   characters from higher orders usually occur often enough to warrant
   this.
 */
 PyObject *
 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
 		     int size,
 		     const char *errors)
 {
-    PyObject *v;
+#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
    char *p;
    int len;
    int i = 0;
    long overalloc = 2;
    int nallocated;  /* overalloc * size; PyString_ adds one more for \0 */
-    /* Short-cut for empty strings */
+    int i;              /* index into s of next input byte */
-    if (size == 0)
+    PyObject *v;        /* result string object */
-	return PyString_FromStringAndSize(NULL, 0);
+    char *p;            /* next free byte in output buffer */
    int nallocated;     /* number of result bytes allocated */
    int nneeded;        /* number of result bytes needed */
    char stackbuf[MAX_SHORT_UNICHARS * 4];
-    nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
+    assert(s != NULL);
-    v = PyString_FromStringAndSize(NULL, nallocated);
+    assert(size >= 0);
    if (v == NULL)
        return NULL;
-    p = PyString_AS_STRING(v);
+    if (size <= MAX_SHORT_UNICHARS) {
        /* Write into the stack buffer; nallocated can't overflow.
         * At the end, we'll allocate exactly as much heap space as it
         * turns out we need.
         */
        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
        v = NULL;   /* will allocate after we're done */
        p = stackbuf;
    }
    else {
        /* Overallocate on the heap, and give the excess back at the end. */
        nallocated = size * 4;
        if (nallocated / 4 != size)  /* overflow! */
            return PyErr_NoMemory();
        v = PyString_FromStringAndSize(NULL, nallocated);
        if (v == NULL)
            return NULL;
        p = PyString_AS_STRING(v);
    }
-    while (i < size) {
+    for (i = 0; i < size;) {
        Py_UCS4 ch = s[i++];
        if (ch < 0x80)
-	    /* Encode ASCII */
+            /* Encode ASCII */
            *p++ = (char) ch;
        else if (ch < 0x0800) {
-	    /* Encode Latin-1 */
+            /* Encode Latin-1 */
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
        else {
-	    /* Encode UCS2 Unicode ordinals */
+            /* Encode UCS2 Unicode ordinals */
-	    if (ch < 0x10000) {
+            if (ch < 0x10000) {
-
+                /* Special case: check for high surrogate */
-		/* Special case: check for high surrogate */
+                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
-		if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
+                    Py_UCS4 ch2 = s[i];
-		    Py_UCS4 ch2 = s[i];
+                    /* Check for low surrogate and combine the two to
-		    /* Check for low surrogate and combine the two to
+                       form a UCS4 value */
-		       form a UCS4 value */
+                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
 		    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
-			i++;
+                        i++;
-			goto encodeUCS4;
+                        goto encodeUCS4;
                    }
-		    /* Fall through: handles isolated high surrogates */
+                    /* Fall through: handles isolated high surrogates */
                }
 		if (overalloc < 3) {
 		    len = Py_SAFE_DOWNCAST(p-PyString_AS_STRING(v), long, int);
                    assert(len <= nallocated);
 		    overalloc = 3;
                    nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
 		    if (_PyString_Resize(&v, nallocated))
 			goto onError;
 		    p = PyString_AS_STRING(v) + len;
 		}
                *p++ = (char)(0xe0 | (ch >> 12));
-		*p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
-		*p++ = (char)(0x80 | (ch & 0x3f));
+                *p++ = (char)(0x80 | (ch & 0x3f));
-		continue;
+                continue;
-	    }
+    	    }
-
+encodeUCS4:
-	    /* Encode UCS4 Unicode ordinals */
+            /* Encode UCS4 Unicode ordinals */
-	encodeUCS4:
+            *p++ = (char)(0xf0 | (ch >> 18));
-	    if (overalloc < 4) {
+            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
-                len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
+            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
-                assert(len <= nallocated);
+            *p++ = (char)(0x80 | (ch & 0x3f));
-		overalloc = 4;
+        }
                nallocated = Py_SAFE_DOWNCAST(overalloc * size, long, int);
 		if (_PyString_Resize(&v, nallocated))
 		    goto onError;
 		p = PyString_AS_STRING(v) + len;
 	    }
 	    *p++ = (char)(0xf0 | (ch >> 18));
 	    *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
 	    *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
 	    *p++ = (char)(0x80 | (ch & 0x3f));
 	}
    }
-    *p = '\0';
+    if (v == NULL) {
-    len = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
+        /* This was stack allocated. */
-    assert(len <= nallocated);
+        nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
-    if (_PyString_Resize(&v, len))
+        assert(nneeded <= nallocated);
-	goto onError;
+        v = PyString_FromStringAndSize(stackbuf, nneeded);
    }
    else {
    	/* Cut back to size actually needed. */
        nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
        assert(nneeded <= nallocated);
        _PyString_Resize(&v, nneeded);
    }
    return v;
- onError:
+#undef MAX_SHORT_UNICHARS
    Py_DECREF(v);
    return NULL;
 }
 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)