mirror of https://github.com/python/cpython
GH-96172 fix unicodedata.east_asian_width being wrong on unassigned code points (#96207)
commit 9c197bc8bf
parent c1581a928c
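A quick way to see what this commit is about (an illustration, not part of the diff): a code point with no record of its own used to decode the shared default table entry to 'F'; with this change it reports the UAX #11 default 'N'. U+0530 is one of the unassigned code points exercised by the new test below.

import unicodedata

assert unicodedata.name('\u0530', None) is None    # U+0530 is unassigned
print(unicodedata.east_asian_width('\u0530'))      # 'N' with this fix; previously decoded to 'F'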
@@ -71,7 +71,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):

     # Update this if the database changes. Make sure to do a full rebuild
     # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = '98d602e1f69d5c5bb8a5910c40bbbad4e18e8370'
+    expectedchecksum = '4975f3ec0acd4a62465d18c9bf8519b1964181f6'

     @requires_resource('cpu')
     def test_function_checksum(self):
@@ -90,6 +90,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
                 self.db.decomposition(char),
                 str(self.db.mirrored(char)),
                 str(self.db.combining(char)),
+                unicodedata.east_asian_width(char),
             ]
             h.update(''.join(data).encode("ascii"))
         result = h.hexdigest()
@@ -220,6 +221,23 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
         self.assertEqual(eaw('\u2010'), 'A')
         self.assertEqual(eaw('\U00020000'), 'W')

+    def test_east_asian_width_unassigned(self):
+        eaw = self.db.east_asian_width
+        # unassigned
+        for char in '\u0530\u0ece\u10c6\u20fc\uaaca\U000107bd\U000115f2':
+            self.assertEqual(eaw(char), 'N')
+            self.assertIs(self.db.name(char, None), None)
+
+        # unassigned but reserved for CJK
+        for char in '\uFA6E\uFADA\U0002A6E0\U0002FA20\U0003134B\U0003FFFD':
+            self.assertEqual(eaw(char), 'W')
+            self.assertIs(self.db.name(char, None), None)
+
+        # private use areas
+        for char in '\uE000\uF800\U000F0000\U000FFFEE\U00100000\U0010FFF0':
+            self.assertEqual(eaw(char), 'A')
+            self.assertIs(self.db.name(char, None), None)
+
     def test_east_asian_width_9_0_changes(self):
         self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
         self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
@@ -0,0 +1,3 @@
+Fix a bug in ``unicodedata``: ``east_asian_width`` used to return the wrong
+value for unassigned characters, and for yet-unassigned but reserved
+characters.
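The corrected defaults can be cross-checked interactively; the sample code points below come straight from the new test case, and the expected values require a build that already contains this change (illustration only, not part of the patch).

import unicodedata

samples = [
    ('\u10c6', 'N'),        # unassigned, not listed in EastAsianWidth.txt
    ('\U0002A6E0', 'W'),    # unassigned but reserved for CJK
    ('\uE000', 'A'),        # private use area
]
for char, expected in samples:
    assert unicodedata.name(char, None) is None            # no name in the UCD
    assert unicodedata.east_asian_width(char) == expected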
File diff suppressed because it is too large.
@@ -77,7 +77,8 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
     "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
     "ON", "LRI", "RLI", "FSI", "PDI" ]

-EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]
+# "N" needs to be the first entry, see the comment in makeunicodedata
+EASTASIANWIDTH_NAMES = [ "N", "H", "W", "Na", "A", "F" ]

 MANDATORY_LINE_BREAKS = [ "BK", "CR", "LF", "NL" ]
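The reordering matters because code points without a record of their own all share the first, all-zero table entry, so whatever name sits at index 0 of EASTASIANWIDTH_NAMES is the effective default width. A schematic illustration of that decoding step (not the generated C lookup itself):

dummy = (0, 0, 0, 0, 0, 0)    # (category, combining, bidi, mirrored, east_asian_width, quickcheck)

OLD_NAMES = ["F", "H", "W", "Na", "A", "N"]
NEW_NAMES = ["N", "H", "W", "Na", "A", "F"]

eaw_field = dummy[4]          # east_asian_width index stored in the shared default record
print(OLD_NAMES[eaw_field])   # 'F' -- the old, wrong default for unassigned code points
print(NEW_NAMES[eaw_field])   # 'N' -- the UAX #11 default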
@@ -135,6 +136,14 @@ def maketables(trace=0):

 def makeunicodedata(unicode, trace):

+    # the default value of east_asian_width is "N", for unassigned code points
+    # not mentioned in EastAsianWidth.txt
+    # in addition there are some reserved but unassigned code points in CJK
+    # ranges that are classified as "W". code points in private use areas
+    # have a width of "A". both of these have entries in
+    # EastAsianWidth.txt
+    # see https://unicode.org/reports/tr11/#Unassigned
+    assert EASTASIANWIDTH_NAMES[0] == "N"
     dummy = (0, 0, 0, 0, 0, 0)
     table = [dummy]
     cache = {0: dummy}
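The defaults this comment block describes come from UAX #11 (the URL above). A rough sketch of that rule follows; the authoritative data is EastAsianWidth.txt itself, and the ranges below are quoted from the report rather than from this patch, so treat them as illustrative.

def default_east_asian_width(cp):
    """UAX #11 default for code points with no explicit EastAsianWidth.txt entry (sketch)."""
    if (0xE000 <= cp <= 0xF8FF              # Private Use Area
            or 0xF0000 <= cp <= 0xFFFFD     # Plane 15 private use
            or 0x100000 <= cp <= 0x10FFFD): # Plane 16 private use
        return "A"
    if (0x3400 <= cp <= 0x4DBF              # CJK Unified Ideographs Extension A
            or 0x4E00 <= cp <= 0x9FFF       # CJK Unified Ideographs
            or 0xF900 <= cp <= 0xFAFF       # CJK Compatibility Ideographs
            or 0x20000 <= cp <= 0x2FFFD     # Plane 2
            or 0x30000 <= cp <= 0x3FFFD):   # Plane 3
        return "W"
    return "N"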
@@ -160,12 +169,20 @@ def makeunicodedata(unicode, trace):
                 category, combining, bidirectional, mirrored, eastasianwidth,
                 normalizationquickcheck
                 )
-            # add entry to index and item tables
-            i = cache.get(item)
-            if i is None:
-                cache[item] = i = len(table)
-                table.append(item)
-            index[char] = i
+        elif unicode.widths[char] is not None:
+            # an unassigned but reserved character, with a known
+            # east_asian_width
+            eastasianwidth = EASTASIANWIDTH_NAMES.index(unicode.widths[char])
+            item = (0, 0, 0, 0, eastasianwidth, 0)
+        else:
+            continue
+
+        # add entry to index and item tables
+        i = cache.get(item)
+        if i is None:
+            cache[item] = i = len(table)
+            table.append(item)
+        index[char] = i

     # 2) decomposition data
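For a code point that takes the new elif branch (unassigned, but carrying a width in EastAsianWidth.txt), the emitted record is all zeros except for the width field, and it then goes through the same cache/index deduplication as ordinary records. A small hypothetical walk-through using U+2A6E0 from the new test:

EASTASIANWIDTH_NAMES = ["N", "H", "W", "Na", "A", "F"]

# unicode.widths[0x2A6E0] would be "W" (a reserved CJK range entry), so:
eastasianwidth = EASTASIANWIDTH_NAMES.index("W")    # == 2
item = (0, 0, 0, 0, eastasianwidth, 0)              # the record stored for U+2A6E0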
@@ -1085,6 +1102,7 @@ class UnicodeData:
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].east_asian_width = widths[i]
+        self.widths = widths

         for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
             if table[char]: