bpo-40596: Fix str.isidentifier() for non-canonicalized strings containing non-BMP characters on Windows. (GH-20053)

This commit is contained in:
Serhiy Storchaka 2020-05-12 16:18:00 +03:00 committed by GitHub
parent 7c6e970775
commit 5650e76f63
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 31 additions and 4 deletions

View File

@ -720,6 +720,13 @@ class UnicodeTest(string_tests.CommonTest,
self.assertFalse("©".isidentifier())
self.assertFalse("0".isidentifier())
@support.cpython_only
def test_isidentifier_legacy(self):
    """Check str.isidentifier() on a non-BMP identifier in both forms.

    The same text is checked twice: once as an ordinary str object and
    once as the object returned by _testcapi.unicode_legacy_string()
    (presumably the legacy, non-canonical wchar_t-based representation;
    confirm against the _testcapi helper).
    """
    # Imported locally: _testcapi is a CPython-only test helper module.
    import _testcapi

    # Every character here lies outside the Basic Multilingual Plane.
    non_bmp_name = '𝖀𝖓𝖎𝖈𝖔𝖉𝖊'
    self.assertTrue(non_bmp_name.isidentifier())

    legacy_name = _testcapi.unicode_legacy_string(non_bmp_name)
    self.assertTrue(legacy_name.isidentifier())
def test_isprintable(self):
self.assertTrue("".isprintable())
self.assertTrue(" ".isprintable())

View File

@ -0,0 +1,2 @@
Fixed :meth:`str.isidentifier` for non-canonicalized strings containing
non-BMP characters on Windows.

View File

@ -12356,20 +12356,38 @@ PyUnicode_IsIdentifier(PyObject *self)
return len && i == len;
}
else {
Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
if (len == 0) {
/* an empty string is not a valid identifier */
return 0;
}
const wchar_t *wstr = _PyUnicode_WSTR(self);
Py_UCS4 ch = wstr[0];
Py_UCS4 ch = wstr[i++];
#if SIZEOF_WCHAR_T == 2
if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
&& i < len
&& Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
{
ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
i++;
}
#endif
if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
return 0;
}
for (i = 1; i < len; i++) {
ch = wstr[i];
while (i < len) {
ch = wstr[i++];
#if SIZEOF_WCHAR_T == 2
if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
&& i < len
&& Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
{
ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
i++;
}
#endif
if (!_PyUnicode_IsXidContinue(ch)) {
return 0;
}