From f413b80806bb7d077a1611610273dab6d916908d Mon Sep 17 00:00:00 2001 From: Benjamin Peterson Date: Fri, 12 Aug 2011 22:17:18 -0500 Subject: [PATCH] in narrow builds, make sure to test codepoints as identifier characters (closes #12732) This fixes the use of Unicode identifiers outside the BMP in narrow builds. --- Lib/test/test_pep3131.py | 3 +++ Lib/test/test_unicode.py | 1 + Misc/NEWS | 3 +++ Objects/unicodeobject.c | 31 +++++++++++++++++++++++-------- 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_pep3131.py b/Lib/test/test_pep3131.py index 9d5f217165f..ed7558a7eec 100644 --- a/Lib/test/test_pep3131.py +++ b/Lib/test/test_pep3131.py @@ -8,9 +8,12 @@ class PEP3131Test(unittest.TestCase): ä = 1 µ = 2 # this is a compatibility character 蟒 = 3 + 𝔘𝔫𝔦𝔠𝔬𝔡𝔢 = 4 self.assertEqual(getattr(T, "\xe4"), 1) self.assertEqual(getattr(T, "\u03bc"), 2) self.assertEqual(getattr(T, '\u87d2'), 3) + v = getattr(T, "\U0001d518\U0001d52b\U0001d526\U0001d520\U0001d52c\U0001d521\U0001d522") + self.assertEqual(v, 4) def test_invalid(self): try: diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 55aaba6d3b4..09cf48f10b2 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -404,6 +404,7 @@ class UnicodeTest(string_tests.CommonTest, self.assertTrue("bc".isidentifier()) self.assertTrue("b_".isidentifier()) self.assertTrue("µ".isidentifier()) + self.assertTrue("𝔘𝔫𝔦𝔠𝔬𝔡𝔢".isidentifier()) self.assertFalse(" ".isidentifier()) self.assertFalse("[".isidentifier()) diff --git a/Misc/NEWS b/Misc/NEWS index c9a0522cfc7..354d09a070c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,9 @@ What's New in Python 3.2.2? Core and Builtins ----------------- +- Issue #12732: In narrow unicode builds, allow Unicode identifiers which fall + outside the BMP. + - Issue #11603: Fix a crash when __str__ is rebound as __repr__. Patch by Andreas Stührk. 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 75da0e7e8d2..a48b8b41b13 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -7972,14 +7972,30 @@ unicode_isnumeric(PyUnicodeObject *self) return PyBool_FromLong(1); } +static Py_UCS4 +decode_ucs4(const Py_UNICODE *s, Py_ssize_t *i, Py_ssize_t size) +{ + Py_UCS4 ch; + assert(*i < size); + ch = s[(*i)++]; +#ifndef Py_UNICODE_WIDE + if ((ch & 0xfffffc00) == 0xd800 && + *i < size + && (s[*i] & 0xFFFFFC00) == 0xDC00) + ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00; +#endif + return ch; +} + int PyUnicode_IsIdentifier(PyObject *self) { - register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); - register const Py_UNICODE *e; + Py_ssize_t i = 0, size = PyUnicode_GET_SIZE(self); + Py_UCS4 first; + const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); /* Special case for empty strings */ - if (PyUnicode_GET_SIZE(self) == 0) + if (!size) return 0; /* PEP 3131 says that the first character must be in @@ -7990,14 +8006,13 @@ PyUnicode_IsIdentifier(PyObject *self) definition of XID_Start and XID_Continue, it is sufficient to check just for these, except that _ must be allowed as starting an identifier. */ - if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */) + first = decode_ucs4(p, &i, size); + if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) return 0; - e = p + PyUnicode_GET_SIZE(self); - for (p++; p < e; p++) { - if (!_PyUnicode_IsXidContinue(*p)) + while (i < size) + if (!_PyUnicode_IsXidContinue(decode_ucs4(p, &i, size))) return 0; - } return 1; }