mirror of https://github.com/python/cpython
#9200: merge with 3.2.
This commit is contained in:
commit
6f2a683a0c
|
@ -345,26 +345,69 @@ class UnicodeTest(string_tests.CommonTest,
|
|||
def test_islower(self):
|
||||
string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
|
||||
self.checkequalnofix(False, '\u1FFc', 'islower')
|
||||
# non-BMP, uppercase
|
||||
self.assertFalse('\U00010401'.islower())
|
||||
self.assertFalse('\U00010427'.islower())
|
||||
# non-BMP, lowercase
|
||||
self.assertTrue('\U00010429'.islower())
|
||||
self.assertTrue('\U0001044E'.islower())
|
||||
# non-BMP, non-cased
|
||||
self.assertFalse('\U0001F40D'.islower())
|
||||
self.assertFalse('\U0001F46F'.islower())
|
||||
|
||||
def test_isupper(self):
|
||||
string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
|
||||
if not sys.platform.startswith('java'):
|
||||
self.checkequalnofix(False, '\u1FFc', 'isupper')
|
||||
# non-BMP, uppercase
|
||||
self.assertTrue('\U00010401'.isupper())
|
||||
self.assertTrue('\U00010427'.isupper())
|
||||
# non-BMP, lowercase
|
||||
self.assertFalse('\U00010429'.isupper())
|
||||
self.assertFalse('\U0001044E'.isupper())
|
||||
# non-BMP, non-cased
|
||||
self.assertFalse('\U0001F40D'.isupper())
|
||||
self.assertFalse('\U0001F46F'.isupper())
|
||||
|
||||
def test_istitle(self):
|
||||
string_tests.MixinStrUnicodeUserStringTest.test_title(self)
|
||||
string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
|
||||
self.checkequalnofix(True, '\u1FFc', 'istitle')
|
||||
self.checkequalnofix(True, 'Greek \u1FFcitlecases ...', 'istitle')
|
||||
|
||||
# non-BMP, uppercase + lowercase
|
||||
self.assertTrue('\U00010401\U00010429'.istitle())
|
||||
self.assertTrue('\U00010427\U0001044E'.istitle())
|
||||
# apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
|
||||
for ch in ['\U00010429', '\U0001044E', '\U0001F40D', '\U0001F46F']:
|
||||
self.assertFalse(ch.istitle(), '{!a} is not title'.format(ch))
|
||||
|
||||
def test_isspace(self):
|
||||
string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
|
||||
self.checkequalnofix(True, '\u2000', 'isspace')
|
||||
self.checkequalnofix(True, '\u200a', 'isspace')
|
||||
self.checkequalnofix(False, '\u2014', 'isspace')
|
||||
# apparently there are no non-BMP space chars in Unicode 6
|
||||
for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
|
||||
'\U0001F40D', '\U0001F46F']:
|
||||
self.assertFalse(ch.isspace(), '{!a} is not space.'.format(ch))
|
||||
|
||||
def test_isalnum(self):
|
||||
string_tests.MixinStrUnicodeUserStringTest.test_isalnum(self)
|
||||
for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
|
||||
'\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
|
||||
self.assertTrue(ch.isalnum(), '{!a} is alnum.'.format(ch))
|
||||
|
||||
def test_isalpha(self):
|
||||
string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
|
||||
self.checkequalnofix(True, '\u1FFc', 'isalpha')
|
||||
# non-BMP, cased
|
||||
self.assertTrue('\U00010401'.isalpha())
|
||||
self.assertTrue('\U00010427'.isalpha())
|
||||
self.assertTrue('\U00010429'.isalpha())
|
||||
self.assertTrue('\U0001044E'.isalpha())
|
||||
# non-BMP, non-cased
|
||||
self.assertFalse('\U0001F40D'.isalpha())
|
||||
self.assertFalse('\U0001F46F'.isalpha())
|
||||
|
||||
def test_isdecimal(self):
|
||||
self.checkequalnofix(False, '', 'isdecimal')
|
||||
|
@ -378,12 +421,24 @@ class UnicodeTest(string_tests.CommonTest,
|
|||
|
||||
self.checkraises(TypeError, 'abc', 'isdecimal', 42)
|
||||
|
||||
for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
|
||||
'\U0001F40D', '\U0001F46F', '\U00011065', '\U0001F107']:
|
||||
self.assertFalse(ch.isdecimal(), '{!a} is not decimal.'.format(ch))
|
||||
for ch in ['\U0001D7F6', '\U00011066', '\U000104A0']:
|
||||
self.assertTrue(ch.isdecimal(), '{!a} is decimal.'.format(ch))
|
||||
|
||||
def test_isdigit(self):
|
||||
string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
|
||||
self.checkequalnofix(True, '\u2460', 'isdigit')
|
||||
self.checkequalnofix(False, '\xbc', 'isdigit')
|
||||
self.checkequalnofix(True, '\u0660', 'isdigit')
|
||||
|
||||
for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
|
||||
'\U0001F40D', '\U0001F46F', '\U00011065']:
|
||||
self.assertFalse(ch.isdigit(), '{!a} is not a digit.'.format(ch))
|
||||
for ch in ['\U0001D7F6', '\U00011066', '\U000104A0', '\U0001F107']:
|
||||
self.assertTrue(ch.isdigit(), '{!a} is a digit.'.format(ch))
|
||||
|
||||
def test_isnumeric(self):
|
||||
self.checkequalnofix(False, '', 'isnumeric')
|
||||
self.checkequalnofix(False, 'a', 'isnumeric')
|
||||
|
@ -396,6 +451,13 @@ class UnicodeTest(string_tests.CommonTest,
|
|||
|
||||
self.assertRaises(TypeError, "abc".isnumeric, 42)
|
||||
|
||||
for ch in ['\U00010401', '\U00010427', '\U00010429', '\U0001044E',
|
||||
'\U0001F40D', '\U0001F46F']:
|
||||
self.assertFalse(ch.isnumeric(), '{!a} is not numeric.'.format(ch))
|
||||
for ch in ['\U00011065', '\U0001D7F6', '\U00011066',
|
||||
'\U000104A0', '\U0001F107']:
|
||||
self.assertTrue(ch.isnumeric(), '{!a} is numeric.'.format(ch))
|
||||
|
||||
def test_isidentifier(self):
|
||||
self.assertTrue("a".isidentifier())
|
||||
self.assertTrue("Z".isidentifier())
|
||||
|
@ -423,6 +485,100 @@ class UnicodeTest(string_tests.CommonTest,
|
|||
# single surrogate character
|
||||
self.assertFalse("\ud800".isprintable())
|
||||
|
||||
self.assertTrue('\U0001F46F'.isprintable())
|
||||
self.assertFalse('\U000E0020'.isprintable())
|
||||
|
||||
def test_surrogates(self):
|
||||
for s in ('a\uD800b\uDFFF', 'a\uDFFFb\uD800',
|
||||
'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
|
||||
self.assertTrue(s.islower())
|
||||
self.assertFalse(s.isupper())
|
||||
self.assertFalse(s.istitle())
|
||||
for s in ('A\uD800B\uDFFF', 'A\uDFFFB\uD800',
|
||||
'A\uD800B\uDFFFA', 'A\uDFFFB\uD800A'):
|
||||
self.assertFalse(s.islower())
|
||||
self.assertTrue(s.isupper())
|
||||
self.assertTrue(s.istitle())
|
||||
|
||||
for meth_name in ('islower', 'isupper', 'istitle'):
|
||||
meth = getattr(str, meth_name)
|
||||
for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF'):
|
||||
self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
|
||||
|
||||
for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
|
||||
'isdecimal', 'isnumeric',
|
||||
'isidentifier', 'isprintable'):
|
||||
meth = getattr(str, meth_name)
|
||||
for s in ('\uD800', '\uDFFF', '\uD800\uD800', '\uDFFF\uDFFF',
|
||||
'a\uD800b\uDFFF', 'a\uDFFFb\uD800',
|
||||
'a\uD800b\uDFFFa', 'a\uDFFFb\uD800a'):
|
||||
self.assertFalse(meth(s), '%a.%s() is False' % (s, meth_name))
|
||||
|
||||
|
||||
@unittest.skipIf(sys.maxunicode == 65535, 'test requires wide build')
|
||||
def test_lower(self):
|
||||
string_tests.CommonTest.test_lower(self)
|
||||
self.assertEqual('\U00010427'.lower(), '\U0001044F')
|
||||
self.assertEqual('\U00010427\U00010427'.lower(),
|
||||
'\U0001044F\U0001044F')
|
||||
self.assertEqual('\U00010427\U0001044F'.lower(),
|
||||
'\U0001044F\U0001044F')
|
||||
self.assertEqual('X\U00010427x\U0001044F'.lower(),
|
||||
'x\U0001044Fx\U0001044F')
|
||||
|
||||
@unittest.skipIf(sys.maxunicode == 65535, 'test requires wide build')
|
||||
def test_upper(self):
|
||||
string_tests.CommonTest.test_upper(self)
|
||||
self.assertEqual('\U0001044F'.upper(), '\U00010427')
|
||||
self.assertEqual('\U0001044F\U0001044F'.upper(),
|
||||
'\U00010427\U00010427')
|
||||
self.assertEqual('\U00010427\U0001044F'.upper(),
|
||||
'\U00010427\U00010427')
|
||||
self.assertEqual('X\U00010427x\U0001044F'.upper(),
|
||||
'X\U00010427X\U00010427')
|
||||
|
||||
@unittest.skipIf(sys.maxunicode == 65535, 'test requires wide build')
|
||||
def test_capitalize(self):
|
||||
string_tests.CommonTest.test_capitalize(self)
|
||||
self.assertEqual('\U0001044F'.capitalize(), '\U00010427')
|
||||
self.assertEqual('\U0001044F\U0001044F'.capitalize(),
|
||||
'\U00010427\U0001044F')
|
||||
self.assertEqual('\U00010427\U0001044F'.capitalize(),
|
||||
'\U00010427\U0001044F')
|
||||
self.assertEqual('\U0001044F\U00010427'.capitalize(),
|
||||
'\U00010427\U0001044F')
|
||||
self.assertEqual('X\U00010427x\U0001044F'.capitalize(),
|
||||
'X\U0001044Fx\U0001044F')
|
||||
|
||||
@unittest.skipIf(sys.maxunicode == 65535, 'test requires wide build')
|
||||
def test_title(self):
|
||||
string_tests.MixinStrUnicodeUserStringTest.test_title(self)
|
||||
self.assertEqual('\U0001044F'.title(), '\U00010427')
|
||||
self.assertEqual('\U0001044F\U0001044F'.title(),
|
||||
'\U00010427\U0001044F')
|
||||
self.assertEqual('\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
|
||||
'\U00010427\U0001044F \U00010427\U0001044F')
|
||||
self.assertEqual('\U00010427\U0001044F \U00010427\U0001044F'.title(),
|
||||
'\U00010427\U0001044F \U00010427\U0001044F')
|
||||
self.assertEqual('\U0001044F\U00010427 \U0001044F\U00010427'.title(),
|
||||
'\U00010427\U0001044F \U00010427\U0001044F')
|
||||
self.assertEqual('X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
|
||||
'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
|
||||
|
||||
@unittest.skipIf(sys.maxunicode == 65535, 'test requires wide build')
|
||||
def test_swapcase(self):
|
||||
string_tests.CommonTest.test_swapcase(self)
|
||||
self.assertEqual('\U0001044F'.swapcase(), '\U00010427')
|
||||
self.assertEqual('\U00010427'.swapcase(), '\U0001044F')
|
||||
self.assertEqual('\U0001044F\U0001044F'.swapcase(),
|
||||
'\U00010427\U00010427')
|
||||
self.assertEqual('\U00010427\U0001044F'.swapcase(),
|
||||
'\U0001044F\U00010427')
|
||||
self.assertEqual('\U0001044F\U00010427'.swapcase(),
|
||||
'\U00010427\U0001044F')
|
||||
self.assertEqual('X\U00010427x\U0001044F'.swapcase(),
|
||||
'x\U0001044FX\U00010427')
|
||||
|
||||
def test_contains(self):
|
||||
# Testing Unicode contains method
|
||||
self.assertIn('a', 'abdb')
|
||||
|
|
|
@ -10,6 +10,9 @@ What's New in Python 3.3 Alpha 1?
|
|||
Core and Builtins
|
||||
-----------------
|
||||
|
||||
- Issue #9200: The str.is* methods now work with strings that contain non-BMP
|
||||
characters even in narrow Unicode builds.
|
||||
|
||||
- Issue #12791: Break reference cycles early when a generator exits with
|
||||
an exception.
|
||||
|
||||
|
|
|
@ -6514,6 +6514,28 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
|
|||
start = 0; \
|
||||
}
|
||||
|
||||
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
|
||||
* to by 'ptr', possibly combining surrogate pairs on narrow builds.
|
||||
* 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
|
||||
* that should be returned and 'end' pointing to the end of the buffer.
|
||||
* ('end' is used on narrow builds to detect a lone surrogate at the
|
||||
* end of the buffer that should be returned unchanged.)
|
||||
* The ptr and end arguments should be side-effect free and ptr must be an lvalue.
|
||||
* The type of the returned char is always Py_UCS4.
|
||||
*
|
||||
* Note: the macro advances ptr to the next char, so it might have side-effects
|
||||
* (especially if used with other macros).
|
||||
*/
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
#define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
|
||||
#else
|
||||
#define _Py_UNICODE_NEXT(ptr, end) \
|
||||
(((Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \
|
||||
Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? \
|
||||
((ptr) += 2,Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
|
||||
(Py_UCS4)*(ptr)++)
|
||||
#endif
|
||||
|
||||
Py_ssize_t
|
||||
PyUnicode_Count(PyObject *str,
|
||||
PyObject *substr,
|
||||
|
@ -7777,8 +7799,8 @@ unicode_islower(PyUnicodeObject *self)
|
|||
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
cased = 0;
|
||||
for (; p < e; p++) {
|
||||
register const Py_UNICODE ch = *p;
|
||||
while (p < e) {
|
||||
const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
|
||||
|
||||
if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
|
||||
return PyBool_FromLong(0);
|
||||
|
@ -7811,8 +7833,8 @@ unicode_isupper(PyUnicodeObject *self)
|
|||
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
cased = 0;
|
||||
for (; p < e; p++) {
|
||||
register const Py_UNICODE ch = *p;
|
||||
while (p < e) {
|
||||
const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
|
||||
|
||||
if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
|
||||
return PyBool_FromLong(0);
|
||||
|
@ -7849,8 +7871,8 @@ unicode_istitle(PyUnicodeObject *self)
|
|||
e = p + PyUnicode_GET_SIZE(self);
|
||||
cased = 0;
|
||||
previous_is_cased = 0;
|
||||
for (; p < e; p++) {
|
||||
register const Py_UNICODE ch = *p;
|
||||
while (p < e) {
|
||||
const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
|
||||
|
||||
if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
|
||||
if (previous_is_cased)
|
||||
|
@ -7892,8 +7914,9 @@ unicode_isspace(PyUnicodeObject *self)
|
|||
return PyBool_FromLong(0);
|
||||
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
for (; p < e; p++) {
|
||||
if (!Py_UNICODE_ISSPACE(*p))
|
||||
while (p < e) {
|
||||
const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
|
||||
if (!Py_UNICODE_ISSPACE(ch))
|
||||
return PyBool_FromLong(0);
|
||||
}
|
||||
return PyBool_FromLong(1);
|
||||
|
@ -7921,8 +7944,8 @@ unicode_isalpha(PyUnicodeObject *self)
|
|||
return PyBool_FromLong(0);
|
||||
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
for (; p < e; p++) {
|
||||
if (!Py_UNICODE_ISALPHA(*p))
|
||||
while (p < e) {
|
||||
if (!Py_UNICODE_ISALPHA(_Py_UNICODE_NEXT(p, e)))
|
||||
return PyBool_FromLong(0);
|
||||
}
|
||||
return PyBool_FromLong(1);
|
||||
|
@ -7950,8 +7973,9 @@ unicode_isalnum(PyUnicodeObject *self)
|
|||
return PyBool_FromLong(0);
|
||||
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
for (; p < e; p++) {
|
||||
if (!Py_UNICODE_ISALNUM(*p))
|
||||
while (p < e) {
|
||||
const Py_UCS4 ch = _Py_UNICODE_NEXT(p, e);
|
||||
if (!Py_UNICODE_ISALNUM(ch))
|
||||
return PyBool_FromLong(0);
|
||||
}
|
||||
return PyBool_FromLong(1);
|
||||
|
@ -7979,8 +8003,8 @@ unicode_isdecimal(PyUnicodeObject *self)
|
|||
return PyBool_FromLong(0);
|
||||
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
for (; p < e; p++) {
|
||||
if (!Py_UNICODE_ISDECIMAL(*p))
|
||||
while (p < e) {
|
||||
if (!Py_UNICODE_ISDECIMAL(_Py_UNICODE_NEXT(p, e)))
|
||||
return PyBool_FromLong(0);
|
||||
}
|
||||
return PyBool_FromLong(1);
|
||||
|
@ -8008,8 +8032,8 @@ unicode_isdigit(PyUnicodeObject *self)
|
|||
return PyBool_FromLong(0);
|
||||
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
for (; p < e; p++) {
|
||||
if (!Py_UNICODE_ISDIGIT(*p))
|
||||
while (p < e) {
|
||||
if (!Py_UNICODE_ISDIGIT(_Py_UNICODE_NEXT(p, e)))
|
||||
return PyBool_FromLong(0);
|
||||
}
|
||||
return PyBool_FromLong(1);
|
||||
|
@ -8037,37 +8061,22 @@ unicode_isnumeric(PyUnicodeObject *self)
|
|||
return PyBool_FromLong(0);
|
||||
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
for (; p < e; p++) {
|
||||
if (!Py_UNICODE_ISNUMERIC(*p))
|
||||
while (p < e) {
|
||||
if (!Py_UNICODE_ISNUMERIC(_Py_UNICODE_NEXT(p, e)))
|
||||
return PyBool_FromLong(0);
|
||||
}
|
||||
return PyBool_FromLong(1);
|
||||
}
|
||||
|
||||
static Py_UCS4
|
||||
decode_ucs4(const Py_UNICODE *s, Py_ssize_t *i, Py_ssize_t size)
|
||||
{
|
||||
Py_UCS4 ch;
|
||||
assert(*i < size);
|
||||
ch = s[(*i)++];
|
||||
#ifndef Py_UNICODE_WIDE
|
||||
if ((ch & 0xfffffc00) == 0xd800 &&
|
||||
*i < size
|
||||
&& (s[*i] & 0xFFFFFC00) == 0xDC00)
|
||||
ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00;
|
||||
#endif
|
||||
return ch;
|
||||
}
|
||||
|
||||
int
|
||||
PyUnicode_IsIdentifier(PyObject *self)
|
||||
{
|
||||
Py_ssize_t i = 0, size = PyUnicode_GET_SIZE(self);
|
||||
Py_UCS4 first;
|
||||
const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
|
||||
const Py_UNICODE *e;
|
||||
Py_UCS4 first;
|
||||
|
||||
/* Special case for empty strings */
|
||||
if (!size)
|
||||
if (PyUnicode_GET_SIZE(self) == 0)
|
||||
return 0;
|
||||
|
||||
/* PEP 3131 says that the first character must be in
|
||||
|
@ -8078,12 +8087,13 @@ PyUnicode_IsIdentifier(PyObject *self)
|
|||
definition of XID_Start and XID_Continue, it is sufficient
|
||||
to check just for these, except that _ must be allowed
|
||||
as starting an identifier. */
|
||||
first = decode_ucs4(p, &i, size);
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
first = _Py_UNICODE_NEXT(p, e);
|
||||
if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
|
||||
return 0;
|
||||
|
||||
while (i < size)
|
||||
if (!_PyUnicode_IsXidContinue(decode_ucs4(p, &i, size)))
|
||||
while (p < e)
|
||||
if (!_PyUnicode_IsXidContinue(_Py_UNICODE_NEXT(p, e)))
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
@ -8118,8 +8128,8 @@ unicode_isprintable(PyObject *self)
|
|||
}
|
||||
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
for (; p < e; p++) {
|
||||
if (!Py_UNICODE_ISPRINTABLE(*p)) {
|
||||
while (p < e) {
|
||||
if (!Py_UNICODE_ISPRINTABLE(_Py_UNICODE_NEXT(p, e))) {
|
||||
Py_RETURN_FALSE;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue