From e4a189274f3d88d64d5238bf340cec96eff4e5e0 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 9 Sep 2010 20:30:23 +0000 Subject: [PATCH] Issue #9804: ascii() now always represents unicode surrogate pairs as a single `\UXXXXXXXX`, regardless of whether the character is printable or not. Also, the "backslashreplace" error handler now joins surrogate pairs into a single character on UCS-2 builds. --- Lib/test/test_builtin.py | 22 ++++++++++++++++++++ Lib/test/test_codeccallbacks.py | 36 +++++++++++++++++++++++---------- Misc/NEWS | 5 +++++ Python/codecs.c | 26 ++++++++++++++++++------ 4 files changed, 72 insertions(+), 17 deletions(-) diff --git a/Lib/test/test_builtin.py b/Lib/test/test_builtin.py index 4e09ca57061..35b652bd92c 100644 --- a/Lib/test/test_builtin.py +++ b/Lib/test/test_builtin.py @@ -179,6 +179,28 @@ class BuiltinTest(unittest.TestCase): a = {} a[0] = a self.assertEqual(ascii(a), '{0: {...}}') + # Advanced checks for unicode strings + def _check_uni(s): + self.assertEqual(ascii(s), repr(s)) + _check_uni("'") + _check_uni('"') + _check_uni('"\'') + _check_uni('\0') + _check_uni('\r\n\t .') + # Unprintable non-ASCII characters + _check_uni('\x85') + _check_uni('\u1fff') + _check_uni('\U00012fff') + # Lone surrogates + _check_uni('\ud800') + _check_uni('\udfff') + # Issue #9804: surrogates should be joined even for printable + # wide characters (UCS-2 builds). + self.assertEqual(ascii('\U0001d121'), "'\\U0001d121'") + # All together + s = "'\0\"\n\r\t abcd\x85é\U00012fff\uD800\U0001D121xxx." + self.assertEqual(ascii(s), + r"""'\'\x00"\n\r\t abcd\x85\xe9\U00012fff\ud800\U0001d121xxx.'""") def test_neg(self): x = -sys.maxsize-1 diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py index 82782b5db36..6105fc02fa6 100644 --- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -577,17 +577,31 @@ class CodecCallbackTest(unittest.TestCase): UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")), ("\\uffff", 1) ) - if sys.maxunicode>0xffff: - self.assertEquals( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\U00010000", 0, 1, "ouch")), - ("\\U00010000", 1) - ) - self.assertEquals( - codecs.backslashreplace_errors( - UnicodeEncodeError("ascii", "\U0010ffff", 0, 1, "ouch")), - ("\\U0010ffff", 1) - ) + # 1 on UCS-4 builds, 2 on UCS-2 + len_wide = len("\U00010000") + self.assertEquals( + codecs.backslashreplace_errors( + UnicodeEncodeError("ascii", "\U00010000", + 0, len_wide, "ouch")), + ("\\U00010000", len_wide) + ) + self.assertEquals( + codecs.backslashreplace_errors( + UnicodeEncodeError("ascii", "\U0010ffff", + 0, len_wide, "ouch")), + ("\\U0010ffff", len_wide) + ) + # Lone surrogates (regardless of unicode width) + self.assertEquals( + codecs.backslashreplace_errors( + UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")), + ("\\ud800", 1) + ) + self.assertEquals( + codecs.backslashreplace_errors( + UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")), + ("\\udfff", 1) + ) def test_badhandlerresults(self): results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) diff --git a/Misc/NEWS b/Misc/NEWS index eedea781f30..437b9764f69 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,11 @@ What's New in Python 3.2 Alpha 3? Core and Builtins ----------------- +- Issue #9804: ascii() now always represents unicode surrogate pairs as + a single ``\UXXXXXXXX``, regardless of whether the character is printable + or not. Also, the "backslashreplace" error handler now joins surrogate + pairs into a single character on UCS-2 builds. + - Issue #9757: memoryview objects get a release() method to release the underlying buffer (previously this was only done when deallocating the memoryview), and gain support for the context management protocol. diff --git a/Python/codecs.c b/Python/codecs.c index 04487a216c2..45d99291f11 100644 --- a/Python/codecs.c +++ b/Python/codecs.c @@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = { PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) { +#ifndef Py_UNICODE_WIDE +#define IS_SURROGATE_PAIR(p, end) \ + (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \ + *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF) +#else +#define IS_SURROGATE_PAIR(p, end) 0 +#endif if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { PyObject *restuple; PyObject *object; @@ -702,7 +709,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) else #endif if (*p >= 0x100) { - ressize += 1+1+4; + if (IS_SURROGATE_PAIR(p, startp+end)) { + ressize += 1+1+8; + ++p; + } + else + ressize += 1+1+4; } else ressize += 1+1+2; @@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) return NULL; for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < startp+end; ++p) { - Py_UNICODE c = *p; + Py_UCS4 c = (Py_UCS4) *p; *outp++ = '\\'; -#ifdef Py_UNICODE_WIDE + if (IS_SURROGATE_PAIR(p, startp+end)) { + c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000; + ++p; + } if (c >= 0x00010000) { *outp++ = 'U'; *outp++ = hexdigits[(c>>28)&0xf]; @@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) *outp++ = hexdigits[(c>>12)&0xf]; *outp++ = hexdigits[(c>>8)&0xf]; } - else -#endif - if (c >= 0x100) { + else if (c >= 0x100) { *outp++ = 'u'; *outp++ = hexdigits[(c>>12)&0xf]; *outp++ = hexdigits[(c>>8)&0xf]; @@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) wrong_exception_type(exc); return NULL; } +#undef IS_SURROGATE_PAIR } /* This handler is declared static until someone demonstrates