#1477: ur'\U0010FFFF' raised in narrow unicode builds.
Corrected the raw-unicode-escape codec to use UTF-16 surrogates in this case, just like the unicode-escape codec.
This commit is contained in:
parent
61854332b9
commit
9a0d3462fc
|
@ -736,12 +736,25 @@ class UnicodeTest(
|
||||||
print >>out, u'def\n'
|
print >>out, u'def\n'
|
||||||
|
|
||||||
def test_ucs4(self):
|
def test_ucs4(self):
|
||||||
if sys.maxunicode == 0xFFFF:
|
|
||||||
return
|
|
||||||
x = u'\U00100000'
|
x = u'\U00100000'
|
||||||
y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
|
y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
|
||||||
self.assertEqual(x, y)
|
self.assertEqual(x, y)
|
||||||
|
|
||||||
|
y = r'\U00100000'
|
||||||
|
x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
|
||||||
|
self.assertEqual(x, y)
|
||||||
|
y = r'\U00010000'
|
||||||
|
x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
|
||||||
|
self.assertEqual(x, y)
|
||||||
|
|
||||||
|
try:
|
||||||
|
'\U11111111'.decode("raw-unicode-escape")
|
||||||
|
except UnicodeDecodeError as e:
|
||||||
|
self.assertEqual(e.start, 0)
|
||||||
|
self.assertEqual(e.end, 10)
|
||||||
|
else:
|
||||||
|
self.fail("Should have raised UnicodeDecodeError")
|
||||||
|
|
||||||
def test_conversion(self):
|
def test_conversion(self):
|
||||||
# Make sure __unicode__() works properly
|
# Make sure __unicode__() works properly
|
||||||
class Foo0:
|
class Foo0:
|
||||||
|
|
|
@ -12,6 +12,12 @@ What's New in Python 2.6 alpha 2?
|
||||||
Core and builtins
|
Core and builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #1477: With narrow Unicode builds, the unicode escape sequence
|
||||||
|
\Uxxxxxxxx did not accept values outside the Basic Multilingual Plane. This
|
||||||
|
affected raw unicode literals and the 'raw-unicode-escape' codec. Now
|
||||||
|
UTF-16 surrogates are generated in this case, like normal unicode literals
|
||||||
|
and the 'unicode-escape' codec.
|
||||||
|
|
||||||
- Issue #2348: add Py3k warning for file.softspace.
|
- Issue #2348: add Py3k warning for file.softspace.
|
||||||
|
|
||||||
- Issue #2346/#2347: add Py3k warnings for __methods__ and __members__.
|
- Issue #2346/#2347: add Py3k warnings for __methods__ and __members__.
|
||||||
|
|
|
@ -3088,8 +3088,22 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
||||||
else
|
else
|
||||||
x += 10 + c - 'A';
|
x += 10 + c - 'A';
|
||||||
}
|
}
|
||||||
#ifndef Py_UNICODE_WIDE
|
if (x <= 0xffff)
|
||||||
if (x > 0x10000) {
|
/* UCS-2 character */
|
||||||
|
*p++ = (Py_UNICODE) x;
|
||||||
|
else if (x <= 0x10ffff) {
|
||||||
|
/* UCS-4 character. Either store directly, or as
|
||||||
|
surrogate pair. */
|
||||||
|
#ifdef Py_UNICODE_WIDE
|
||||||
|
*p++ = (Py_UNICODE) x;
|
||||||
|
#else
|
||||||
|
x -= 0x10000L;
|
||||||
|
*p++ = 0xD800 + (Py_UNICODE) (x >> 10);
|
||||||
|
*p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
|
endinpos = s-starts;
|
||||||
|
outpos = p-PyUnicode_AS_UNICODE(v);
|
||||||
if (unicode_decode_call_errorhandler(
|
if (unicode_decode_call_errorhandler(
|
||||||
errors, &errorHandler,
|
errors, &errorHandler,
|
||||||
"rawunicodeescape", "\\Uxxxxxxxx out of range",
|
"rawunicodeescape", "\\Uxxxxxxxx out of range",
|
||||||
|
@ -3097,8 +3111,6 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
||||||
(PyObject **)&v, &outpos, &p))
|
(PyObject **)&v, &outpos, &p))
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
*p++ = x;
|
|
||||||
nextByte:
|
nextByte:
|
||||||
;
|
;
|
||||||
}
|
}
|
||||||
|
@ -3152,6 +3164,32 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
|
||||||
*p++ = hexdigit[ch & 15];
|
*p++ = hexdigit[ch & 15];
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
#else
|
||||||
|
/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
|
||||||
|
if (ch >= 0xD800 && ch < 0xDC00) {
|
||||||
|
Py_UNICODE ch2;
|
||||||
|
Py_UCS4 ucs;
|
||||||
|
|
||||||
|
ch2 = *s++;
|
||||||
|
size--;
|
||||||
|
if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
|
||||||
|
ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
|
||||||
|
*p++ = '\\';
|
||||||
|
*p++ = 'U';
|
||||||
|
*p++ = hexdigit[(ucs >> 28) & 0xf];
|
||||||
|
*p++ = hexdigit[(ucs >> 24) & 0xf];
|
||||||
|
*p++ = hexdigit[(ucs >> 20) & 0xf];
|
||||||
|
*p++ = hexdigit[(ucs >> 16) & 0xf];
|
||||||
|
*p++ = hexdigit[(ucs >> 12) & 0xf];
|
||||||
|
*p++ = hexdigit[(ucs >> 8) & 0xf];
|
||||||
|
*p++ = hexdigit[(ucs >> 4) & 0xf];
|
||||||
|
*p++ = hexdigit[ucs & 0xf];
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
/* Fall through: isolated surrogates are copied as-is */
|
||||||
|
s--;
|
||||||
|
size++;
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
/* Map 16-bit characters to '\uxxxx' */
|
/* Map 16-bit characters to '\uxxxx' */
|
||||||
if (ch >= 256) {
|
if (ch >= 256) {
|
||||||
|
|
Loading…
Reference in New Issue