mirror of https://github.com/python/cpython
bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 (GH-1958)
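The Hangul composition check used the wrong boundaries: the second character (vowel) was tested against the closed range [0x1161, 0x1176] but must lie in [0x1161, 0x1176), and the third character (trailing consonant) was tested against [0x11A7, 0x11C3] but must lie in (0x11A7, 0x11C3). As a result, U+1176, U+11A7 and U+11C3 were wrongly composed into Hangul syllables.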
This commit is contained in:
parent
ceeef10cdb
commit
d134809cd3
|
@ -208,6 +208,19 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
|
||||||
b = 'C\u0338' * 20 + '\xC7'
|
b = 'C\u0338' * 20 + '\xC7'
|
||||||
self.assertEqual(self.db.normalize('NFC', a), b)
|
self.assertEqual(self.db.normalize('NFC', a), b)
|
||||||
|
|
||||||
|
def test_issue29456(self):
|
||||||
|
# Fix #29456
|
||||||
|
u1176_str_a = '\u1100\u1176\u11a8'
|
||||||
|
u1176_str_b = '\u1100\u1176\u11a8'
|
||||||
|
u11a7_str_a = '\u1100\u1175\u11a7'
|
||||||
|
u11a7_str_b = '\uae30\u11a7'
|
||||||
|
u11c3_str_a = '\u1100\u1175\u11c3'
|
||||||
|
u11c3_str_b = '\uae30\u11c3'
|
||||||
|
self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
|
||||||
|
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
|
||||||
|
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
|
||||||
|
|
||||||
|
|
||||||
def test_east_asian_width(self):
|
def test_east_asian_width(self):
|
||||||
eaw = self.db.east_asian_width
|
eaw = self.db.east_asian_width
|
||||||
self.assertRaises(TypeError, eaw, b'a')
|
self.assertRaises(TypeError, eaw, b'a')
|
||||||
|
|
|
@ -1800,6 +1800,7 @@ Jason Yeo
|
||||||
EungJun Yi
|
EungJun Yi
|
||||||
Bob Yodlowski
|
Bob Yodlowski
|
||||||
Danny Yoo
|
Danny Yoo
|
||||||
|
Wonsup Yoon
|
||||||
Rory Yorke
|
Rory Yorke
|
||||||
George Yoshida
|
George Yoshida
|
||||||
Kazuhiro Yoshida
|
Kazuhiro Yoshida
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
Fix bugs in Hangul normalization: U+1176, U+11A7 and U+11C3 are no longer composed into syllables.
|
|
@ -681,15 +681,19 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
|
||||||
if (LBase <= code && code < (LBase+LCount) &&
|
if (LBase <= code && code < (LBase+LCount) &&
|
||||||
i + 1 < len &&
|
i + 1 < len &&
|
||||||
VBase <= PyUnicode_READ(kind, data, i+1) &&
|
VBase <= PyUnicode_READ(kind, data, i+1) &&
|
||||||
PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
|
PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
|
||||||
|
/* check L character is a modern leading consonant (0x1100 ~ 0x1112)
|
||||||
|
and V character is a modern vowel (0x1161 ~ 0x1175). */
|
||||||
int LIndex, VIndex;
|
int LIndex, VIndex;
|
||||||
LIndex = code - LBase;
|
LIndex = code - LBase;
|
||||||
VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
|
VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
|
||||||
code = SBase + (LIndex*VCount+VIndex)*TCount;
|
code = SBase + (LIndex*VCount+VIndex)*TCount;
|
||||||
i+=2;
|
i+=2;
|
||||||
if (i < len &&
|
if (i < len &&
|
||||||
TBase <= PyUnicode_READ(kind, data, i) &&
|
TBase < PyUnicode_READ(kind, data, i) &&
|
||||||
PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
|
PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
|
||||||
|
/* check T character is a modern trailing consonant
|
||||||
|
(0x11A8 ~ 0x11C2). */
|
||||||
code += PyUnicode_READ(kind, data, i)-TBase;
|
code += PyUnicode_READ(kind, data, i)-TBase;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue