- Patch #1541585: fix buffer overrun when performing repr() on a unicode string in a build with wide unicode (UCS-4) support.

I will forward-port this to 2.6. Can someone backport it to 2.4?
2006-08-21 22:13:11 +00:00 · 2006-08-21 22:13:11 +00:00 · 19c35bba5d
parent bebdc9e52c
commit 19c35bba5d
4 changed files with 37 additions and 12 deletions
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -92,6 +92,10 @@ class UnicodeTest(
                "\\xfe\\xff'")
            testrepr = repr(u''.join(map(unichr, xrange(256))))
            self.assertEqual(testrepr, latin1repr)
            # Test repr works on wide unicode escapes without overflow.
            self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096),
                             repr(u"\U00010000" * 39 + u"\uffff" * 4096))
    def test_count(self):
        string_tests.CommonTest.test_count(self)
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -365,6 +365,7 @@ Detlef Lannert
 Soren Larsen
 Piers Lauder
 Ben Laurie
 Simon Law
 Chris Lawrence
 Christopher Lee
 Inyeol Lee
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -9,6 +9,13 @@ What's New in Python 2.5?
 *Release date: XX-SEP-2006*
 Core and builtins
 -----------------
 - Patch #1541585: fix buffer overrun when performing repr() on
  a unicode string in a build with wide unicode (UCS-4) support.
 Library
 -------
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -2040,7 +2040,28 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
    static const char *hexdigit = "0123456789abcdef";
-    repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
+    /* Initial allocation is based on the longest-possible unichr
       escape.
       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
       unichr, so in this case it's the longest unichr escape. In
       narrow (UTF-16) builds this is five chars per source unichr
       since there are two unichrs in the surrogate pair, so in narrow
       (UTF-16) builds it's not the longest unichr escape.
       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
       so in the narrow (UTF-16) build case it's the longest unichr
       escape.
    */
    repr = PyString_FromStringAndSize(NULL,
        2
 #ifdef Py_UNICODE_WIDE
        + 10*size
 #else
        + 6*size
 #endif
        + 1);
    if (repr == NULL)
        return NULL;
@@ -2065,15 +2086,6 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
 #ifdef Py_UNICODE_WIDE
        /* Map 21-bit characters to '\U00xxxxxx' */
        else if (ch >= 0x10000) {
 	    Py_ssize_t offset = p - PyString_AS_STRING(repr);
 	    /* Resize the string if necessary */
 	    if (offset + 12 > PyString_GET_SIZE(repr)) {
 		if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
 		    return NULL;
 		p = PyString_AS_STRING(repr) + offset;
 	    }
            *p++ = '\\';
            *p++ = 'U';
            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
@@ -2086,8 +2098,8 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
            *p++ = hexdigit[ch & 0x0000000F];
 	    continue;
        }
-#endif
+#else
-	/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
+	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
 	else if (ch >= 0xD800 && ch < 0xDC00) {
 	    Py_UNICODE ch2;
 	    Py_UCS4 ucs;
@@ -2112,6 +2124,7 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
 	    s--;
 	    size++;
 	}
 #endif
        /* Map 16-bit characters to '\uxxxx' */
        if (ch >= 256) {