bpo-40596: Fix str.isidentifier() for non-canonicalized strings containing non-BMP characters on Windows. (GH-20053)
This commit is contained in:
parent
7c6e970775
commit
5650e76f63
|
@@ -720,6 +720,13 @@ class UnicodeTest(string_tests.CommonTest,
|
||||||
self.assertFalse("©".isidentifier())
|
self.assertFalse("©".isidentifier())
|
||||||
self.assertFalse("0".isidentifier())
|
self.assertFalse("0".isidentifier())
|
||||||
|
|
||||||
|
@support.cpython_only
|
||||||
|
def test_isidentifier_legacy(self):
|
||||||
|
import _testcapi
|
||||||
|
u = '𝖀𝖓𝖎𝖈𝖔𝖉𝖊'
|
||||||
|
self.assertTrue(u.isidentifier())
|
||||||
|
self.assertTrue(_testcapi.unicode_legacy_string(u).isidentifier())
|
||||||
|
|
||||||
def test_isprintable(self):
|
def test_isprintable(self):
|
||||||
self.assertTrue("".isprintable())
|
self.assertTrue("".isprintable())
|
||||||
self.assertTrue(" ".isprintable())
|
self.assertTrue(" ".isprintable())
|
||||||
|
|
|
@@ -0,0 +1,2 @@
|
||||||
|
Fixed :meth:`str.isidentifier` for non-canonicalized strings containing
|
||||||
|
non-BMP characters on Windows.
|
|
@@ -12356,20 +12356,38 @@ PyUnicode_IsIdentifier(PyObject *self)
|
||||||
return len && i == len;
|
return len && i == len;
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
Py_ssize_t i, len = PyUnicode_GET_SIZE(self);
|
Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
|
||||||
if (len == 0) {
|
if (len == 0) {
|
||||||
/* an empty string is not a valid identifier */
|
/* an empty string is not a valid identifier */
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
const wchar_t *wstr = _PyUnicode_WSTR(self);
|
const wchar_t *wstr = _PyUnicode_WSTR(self);
|
||||||
Py_UCS4 ch = wstr[0];
|
Py_UCS4 ch = wstr[i++];
|
||||||
|
#if SIZEOF_WCHAR_T == 2
|
||||||
|
if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
|
||||||
|
&& i < len
|
||||||
|
&& Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
|
||||||
|
{
|
||||||
|
ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
|
if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (i = 1; i < len; i++) {
|
while (i < len) {
|
||||||
ch = wstr[i];
|
ch = wstr[i++];
|
||||||
|
#if SIZEOF_WCHAR_T == 2
|
||||||
|
if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
|
||||||
|
&& i < len
|
||||||
|
&& Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
|
||||||
|
{
|
||||||
|
ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
if (!_PyUnicode_IsXidContinue(ch)) {
|
if (!_PyUnicode_IsXidContinue(ch)) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue