From 93e7afc5d9b248e264da5dbe97659bf37cb8eb15 Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Mon, 22 Aug 2011 14:08:38 +0300 Subject: [PATCH] #9200: The str.is* methods now work with strings that contain non-BMP characters even in narrow Unicode builds. --- Lib/test/test_unicode.py | 158 ++++++++++++++++++++++++++++++++++++++- Misc/NEWS | 3 + Objects/unicodeobject.c | 101 +++++++++++++++---------- 3 files changed, 220 insertions(+), 42 deletions(-) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 09cf48f10b2..2858c8968d5 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -345,26 +345,69 @@ class UnicodeTest(string_tests.CommonTest, def test_islower(self): string_tests.MixinStrUnicodeUserStringTest.test_islower(self) self.checkequalnofix(False, '\u1FFc', 'islower') + # non-BMP, uppercase + self.assertFalse('\U00010401'.islower()) + self.assertFalse('\U00010427'.islower()) + # non-BMP, lowercase + self.assertTrue('\U00010429'.islower()) + self.assertTrue('\U0001044E'.islower()) + # non-BMP, non-cased + self.assertFalse('\U0001F40D'.islower()) + self.assertFalse('\U0001F46F'.islower()) def test_isupper(self): string_tests.MixinStrUnicodeUserStringTest.test_isupper(self) if not sys.platform.startswith('java'): self.checkequalnofix(False, '\u1FFc', 'isupper') + # non-BMP, uppercase + self.assertTrue('\U00010401'.isupper()) + self.assertTrue('\U00010427'.isupper()) + # non-BMP, lowercase + self.assertFalse('\U00010429'.isupper()) + self.assertFalse('\U0001044E'.isupper()) + # non-BMP, non-cased + self.assertFalse('\U0001F40D'.isupper()) + self.assertFalse('\U0001F46F'.isupper()) def test_istitle(self): - string_tests.MixinStrUnicodeUserStringTest.test_title(self) + string_tests.MixinStrUnicodeUserStringTest.test_istitle(self) self.checkequalnofix(True, '\u1FFc', 'istitle') self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle') + # non-BMP, uppercase + lowercase + self.assertTrue('\U00010401\U00010429'.istitle()) + 
self.assertTrue('\U00010427\U0001044E'.istitle()) + # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6 + for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']: + self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch)) + def test_isspace(self): string_tests.MixinStrUnicodeUserStringTest.test_isspace(self) self.checkequalnofix(True, '\u2000', 'isspace') self.checkequalnofix(True, '\u200a', 'isspace') self.checkequalnofix(False, '\u2014', 'isspace') + # apparently there are no non-BMP space chars in Unicode 6 + for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', + '\U0001F40D', '\U0001F46F']: + self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch)) + + def test_isalnum(self): + string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self) + for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', + '\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']: + self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch)) def test_isalpha(self): string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self) self.checkequalnofix(True, '\u1FFc', 'isalpha') + # non-BMP, cased + self.assertTrue('\U00010401'.isalpha()) + self.assertTrue('\U00010427'.isalpha()) + self.assertTrue('\U00010429'.isalpha()) + self.assertTrue('\U0001044E'.isalpha()) + # non-BMP, non-cased + self.assertFalse('\U0001F40D'.isalpha()) + self.assertFalse('\U0001F46F'.isalpha()) def test_isdecimal(self): self.checkequalnofix(False, '', 'isdecimal') @@ -378,12 +421,24 @@ class UnicodeTest(string_tests.CommonTest, self.checkraises(TypeError, 'abc', 'isdecimal', 42) + for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', + '\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']: + self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch)) + for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']: + self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch)) + def test_isdigit(self): 
string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self) self.checkequalnofix(True, '\u2460', 'isdigit') self.checkequalnofix(False, '\xbc', 'isdigit') self.checkequalnofix(True, '\u0660', 'isdigit') + for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', + '\U0001F40D', '\U0001F46F', '\U00011065']: + self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch)) + for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']: + self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch)) + def test_isnumeric(self): self.checkequalnofix(False, '', 'isnumeric') self.checkequalnofix(False, 'a', 'isnumeric') @@ -396,6 +451,13 @@ class UnicodeTest(string_tests.CommonTest, self.assertRaises(TypeError, "abc".isnumeric, 42) + for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E', + '\U0001F40D', '\U0001F46F']: + self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch)) + for ch in ['\U00011065', '\U0001D7F6', '\U00011066', + '\U000104A0', '\U0001F107']: + self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch)) + def test_isidentifier(self): self.assertTrue("a".isidentifier()) self.assertTrue("Z".isidentifier()) @@ -423,6 +485,100 @@ class UnicodeTest(string_tests.CommonTest, # single surrogate character self.assertFalse("\ud800".isprintable()) + self.assertTrue('\U0001F46F'.isprintable()) + self.assertFalse('\U000E0020'.isprintable()) + + def test_surrogates(self): + for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800', + 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'): + self.assertTrue(s.islower()) + self.assertFalse(s.isupper()) + self.assertFalse(s.istitle()) + for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800', + 'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'): + self.assertFalse(s.islower()) + self.assertTrue(s.isupper()) + self.assertTrue(s.istitle()) + + for meth_name in ('islower', 'isupper', 'istitle'): + meth = getattr(str, meth_name) + for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'): + self.assertFalse(meth(s), '%a.%s() is 
False' % (s, meth_name)) + + for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace', + 'isdecimal', 'isnumeric', + 'isidentifier', 'isprintable'): + meth = getattr(str, meth_name) + for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF', + 'a\uD800b\uDFFF', 'a\uDFFFb\uD800', + 'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'): + self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name)) + + + @unittest.skipIf(sys.maxunicode == 65535, 'test requires wide build') + def test_lower(self): + string_tests.CommonTest.test_lower(self) + self.assertEqual('\U00010427'.lower(), '\U0001044F') + self.assertEqual('\U00010427\U00010427'.lower(), + '\U0001044F\U0001044F') + self.assertEqual('\U00010427\U0001044F'.lower(), + '\U0001044F\U0001044F') + self.assertEqual('X\U00010427x\U0001044F'.lower(), + 'x\U0001044Fx\U0001044F') + + @unittest.skipIf(sys.maxunicode == 65535, 'test requires wide build') + def test_upper(self): + string_tests.CommonTest.test_upper(self) + self.assertEqual('\U0001044F'.upper(), '\U00010427') + self.assertEqual('\U0001044F\U0001044F'.upper(), + '\U00010427\U00010427') + self.assertEqual('\U00010427\U0001044F'.upper(), + '\U00010427\U00010427') + self.assertEqual('X\U00010427x\U0001044F'.upper(), + 'X\U00010427X\U00010427') + + @unittest.skipIf(sys.maxunicode == 65535, 'test requires wide build') + def test_capitalize(self): + string_tests.CommonTest.test_capitalize(self) + self.assertEqual('\U0001044F'.capitalize(), '\U00010427') + self.assertEqual('\U0001044F\U0001044F'.capitalize(), + '\U00010427\U0001044F') + self.assertEqual('\U00010427\U0001044F'.capitalize(), + '\U00010427\U0001044F') + self.assertEqual('\U0001044F\U00010427'.capitalize(), + '\U00010427\U0001044F') + self.assertEqual('X\U00010427x\U0001044F'.capitalize(), + 'X\U0001044Fx\U0001044F') + + @unittest.skipIf(sys.maxunicode == 65535, 'test requires wide build') + def test_title(self): + string_tests.MixinStrUnicodeUserStringTest.test_title(self) + 
self.assertEqual('\U0001044F'.title(), '\U00010427') + self.assertEqual('\U0001044F\U0001044F'.title(), + '\U00010427\U0001044F') + self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(), + '\U00010427\U0001044F \U00010427\U0001044F') + self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(), + '\U00010427\U0001044F \U00010427\U0001044F') + self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(), + '\U00010427\U0001044F \U00010427\U0001044F') + self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(), + 'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F') + + @unittest.skipIf(sys.maxunicode == 65535, 'test requires wide build') + def test_swapcase(self): + string_tests.CommonTest.test_swapcase(self) + self.assertEqual('\U0001044F'.swapcase(), '\U00010427') + self.assertEqual('\U00010427'.swapcase(), '\U0001044F') + self.assertEqual('\U0001044F\U0001044F'.swapcase(), + '\U00010427\U00010427') + self.assertEqual('\U00010427\U0001044F'.swapcase(), + '\U0001044F\U00010427') + self.assertEqual('\U0001044F\U00010427'.swapcase(), + '\U00010427\U0001044F') + self.assertEqual('X\U00010427x\U0001044F'.swapcase(), + 'x\U0001044FX\U00010427') + def test_contains(self): # Testing Unicode contains method self.assertIn('a', 'abdb') diff --git a/Misc/NEWS b/Misc/NEWS index da0e67f666c..6673d20477c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,9 @@ What's New in Python 3.2.3? Core and Builtins ----------------- +- Issue #9200: The str.is* methods now work with strings that contain non-BMP + characters even in narrow Unicode builds. + - Issue #12791: Break reference cycles early when a generator exits with an exception. 
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 77f8dd5a810..8567a9f2db1 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6442,6 +6442,37 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, start = 0; \ } +/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed + * to by 'ptr', possibly combining surrogate pairs on narrow builds. + * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character + * that should be returned and 'end' pointing to the end of the buffer. + * ('end' is used on narrow builds to detect a lone surrogate at the + * end of the buffer that should be returned unchanged.) + * The ptr and end arguments should be side-effect free and ptr must be an lvalue. + * The type of the returned char is always Py_UCS4. + * + * Note: the macro advances ptr to the next char, so it might have side-effects + * (especially if used with other macros). + */ + +/* helper macros used by _Py_UNICODE_NEXT */ +#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= ch && ch <= 0xDBFF) +#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= ch && ch <= 0xDFFF) +/* Join two surrogate characters and return a single Py_UCS4 value. */ +#define _Py_UNICODE_JOIN_SURROGATES(high, low) \ + (((((Py_UCS4)(high) & 0x03FF) << 10) | \ + ((Py_UCS4)(low) & 0x03FF)) + 0x10000) + +#ifdef Py_UNICODE_WIDE +#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++ +#else +#define _Py_UNICODE_NEXT(ptr, end) \ + (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \ + _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? 
\ + ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \ + (Py_UCS4)*(ptr)++) +#endif + Py_ssize_t PyUnicode_Count(PyObject *str, PyObject *substr, Py_ssize_t start, @@ -7705,8 +7736,8 @@ unicode_islower(PyUnicodeObject *self) e = p + PyUnicode_GET_SIZE(self); cased = 0; - for (; p < e; p++) { - register const Py_UNICODE ch = *p; + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) return PyBool_FromLong(0); @@ -7739,8 +7770,8 @@ unicode_isupper(PyUnicodeObject *self) e = p + PyUnicode_GET_SIZE(self); cased = 0; - for (; p < e; p++) { - register const Py_UNICODE ch = *p; + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch)) return PyBool_FromLong(0); @@ -7777,8 +7808,8 @@ unicode_istitle(PyUnicodeObject *self) e = p + PyUnicode_GET_SIZE(self); cased = 0; previous_is_cased = 0; - for (; p < e; p++) { - register const Py_UNICODE ch = *p; + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) { if (previous_is_cased) @@ -7820,8 +7851,9 @@ unicode_isspace(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISSPACE(*p)) + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); + if (!Py_UNICODE_ISSPACE(ch)) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7849,8 +7881,8 @@ unicode_isalpha(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISALPHA(*p)) + while (p < e) { + if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e))) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7878,8 +7910,9 @@ unicode_isalnum(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISALNUM(*p)) + while (p < e) { + const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e); + 
if (!Py_UNICODE_ISALNUM(ch)) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7907,8 +7940,8 @@ unicode_isdecimal(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISDECIMAL(*p)) + while (p < e) { + if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e))) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7936,8 +7969,8 @@ unicode_isdigit(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISDIGIT(*p)) + while (p < e) { + if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e))) return PyBool_FromLong(0); } return PyBool_FromLong(1); @@ -7965,37 +7998,22 @@ unicode_isnumeric(PyUnicodeObject *self) return PyBool_FromLong(0); e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISNUMERIC(*p)) + while (p < e) { + if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e))) return PyBool_FromLong(0); } return PyBool_FromLong(1); } -static Py_UCS4 -decode_ucs4(const Py_UNICODE *s, Py_ssize_t *i, Py_ssize_t size) -{ - Py_UCS4 ch; - assert(*i < size); - ch = s[(*i)++]; -#ifndef Py_UNICODE_WIDE - if ((ch & 0xfffffc00) == 0xd800 && - *i < size - && (s[*i] & 0xFFFFFC00) == 0xDC00) - ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00; -#endif - return ch; -} - int PyUnicode_IsIdentifier(PyObject *self) { - Py_ssize_t i = 0, size = PyUnicode_GET_SIZE(self); - Py_UCS4 first; const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self); + const Py_UNICODE *e; + Py_UCS4 first; /* Special case for empty strings */ - if (!size) + if (PyUnicode_GET_SIZE(self) == 0) return 0; /* PEP 3131 says that the first character must be in @@ -8006,12 +8024,13 @@ PyUnicode_IsIdentifier(PyObject *self) definition of XID_Start and XID_Continue, it is sufficient to check just for these, except that _ must be allowed as starting an identifier. 
*/ - first = decode_ucs4(p, &i, size); + e = p + PyUnicode_GET_SIZE(self); + first = _Py_UNICODE_NEXT(p, e); if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */) return 0; - while (i < size) - if (!_PyUnicode_IsXidContinue(decode_ucs4(p, &i, size))) + while (p < e) + if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e))) return 0; return 1; } @@ -8046,8 +8065,8 @@ unicode_isprintable(PyObject *self) } e = p + PyUnicode_GET_SIZE(self); - for (; p < e; p++) { - if (!Py_UNICODE_ISPRINTABLE(*p)) { + while (p < e) { + if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) { Py_RETURN_FALSE; } }