#1477: ur'\U0010FFFF' raised in narrow unicode builds.

Corrected the raw-unicode-escape codec to use UTF-16 surrogates in this case, just like the unicode-escape codec.
2008-03-23 09:55:29 +00:00 · 2008-03-23 09:55:29 +00:00 · 9a0d3462fc
parent 61854332b9
commit 9a0d3462fc
3 changed files with 63 additions and 6 deletions
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -736,12 +736,25 @@ class UnicodeTest(
        print >>out, u'def\n'
    def test_ucs4(self):
        # Round-trip characters outside the Basic Multilingual Plane through
        # the 'raw-unicode-escape' codec.
        #
        # This test is intentionally NOT skipped when
        # sys.maxunicode == 0xFFFF: issue #1477 is specifically about narrow
        # builds, where the codec must now produce/consume UTF-16 surrogate
        # pairs for \Uxxxxxxxx escapes.  Returning early on narrow builds
        # would leave the fixed code path completely untested.

        # unicode -> escaped bytes -> unicode
        x = u'\U00100000'
        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
        self.assertEqual(x, y)

        # escaped bytes -> unicode -> escaped bytes, near the top of the
        # valid range ...
        y = r'\U00100000'
        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
        self.assertEqual(x, y)
        # ... and at the first code point above the BMP.
        y = r'\U00010000'
        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
        self.assertEqual(x, y)

        # Values above 0x10FFFF must be rejected, and the reported error
        # span must cover the whole 10-byte escape (backslash, 'U' and the
        # eight hex digits).
        try:
            '\U11111111'.decode("raw-unicode-escape")
        except UnicodeDecodeError as e:
            self.assertEqual(e.start, 0)
            self.assertEqual(e.end, 10)
        else:
            self.fail("Should have raised UnicodeDecodeError")
    def test_conversion(self):
        # Make sure __unicode__() works properly
        class Foo0:
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -12,6 +12,12 @@ What's New in Python 2.6 alpha 2?
 Core and builtins
 -----------------
 - Issue #1477: With narrow Unicode builds, the unicode escape sequence
  \Uxxxxxxxx did not accept values outside the Basic Multilingual Plane.  This
  affected raw unicode literals and the 'raw-unicode-escape' codec.  Now
  UTF-16 surrogates are generated in this case, like normal unicode literals
  and the 'unicode-escape' codec.
 - Issue #2348: add Py3k warning for file.softspace.
 - Issue #2346/#2347: add Py3k warnings for __methods__ and __members__.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -3088,8 +3088,22 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
 	    else
 		x += 10 + c - 'A';
 	}
-#ifndef Py_UNICODE_WIDE
+        if (x <= 0xffff)
-        if (x > 0x10000) {
+                /* UCS-2 character */
                *p++ = (Py_UNICODE) x;
        else if (x <= 0x10ffff) {
                /* UCS-4 character. Either store directly, or as
                   surrogate pair. */
 #ifdef Py_UNICODE_WIDE
                *p++ = (Py_UNICODE) x;
 #else
                x -= 0x10000L;
                *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
                *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
 #endif
        } else {
            endinpos = s-starts;
            outpos = p-PyUnicode_AS_UNICODE(v);
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
@ -3097,8 +3111,6 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
 		    (PyObject **)&v, &outpos, &p))
 		    goto onError;
        }
 #endif
 	*p++ = x;
 	nextByte:
 	;
    }
@ -3152,6 +3164,32 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
            *p++ = hexdigit[ch & 15];
        }
        else
 #else
 	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
 	if (ch >= 0xD800 && ch < 0xDC00) {
 	    Py_UNICODE ch2;
 	    Py_UCS4 ucs;
 	    ch2 = *s++;
 	    size--;
 	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
 		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
 		*p++ = '\\';
 		*p++ = 'U';
 		*p++ = hexdigit[(ucs >> 28) & 0xf];
 		*p++ = hexdigit[(ucs >> 24) & 0xf];
 		*p++ = hexdigit[(ucs >> 20) & 0xf];
 		*p++ = hexdigit[(ucs >> 16) & 0xf];
 		*p++ = hexdigit[(ucs >> 12) & 0xf];
 		*p++ = hexdigit[(ucs >> 8) & 0xf];
 		*p++ = hexdigit[(ucs >> 4) & 0xf];
 		*p++ = hexdigit[ucs & 0xf];
 		continue;
 	    }
 	    /* Fall through: isolated surrogates are copied as-is */
 	    s--;
 	    size++;
 	}
 #endif
 	/* Map 16-bit characters to '\uxxxx' */
 	if (ch >= 256) {