mirror of https://github.com/python/cpython
bpo-37760: Convert from length-18 lists to a dataclass, in makeunicodedata. (GH-15265)
Now the fields have names! Much easier to keep straight as a reader than the elements of an 18-tuple. Runs about 10-15% slower: from 10.8s to 12.3s, on my laptop. Fortunately that's perfectly fine for this maintenance script.
parent 5e9caeec76
commit a65678c5c9
Misc/NEWS.d news entry (new file):

@@ -0,0 +1,6 @@
+The :file:`Tools/unicode/makeunicodedata.py` script, which is used for
+converting information from the Unicode Character Database into generated
+code and data used by the methods of :class:`str` and by the
+:mod:`unicodedata` module, now handles each character's data as a
+``dataclass`` with named attributes, rather than a length-18 list of
+different fields.
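
A sketch of the change's shape (editor's illustration, not code from the commit; `Rec` is a trimmed stand-in for the real 18-field record):

    import dataclasses

    # Before: each character's data was a bare list, indexed by position;
    # the reader had to remember that index 2 means General_Category.
    record_as_list = ["0041", "LATIN CAPITAL LETTER A", "Lu", "0", "L"]
    category = record_as_list[2]

    # After: the same data as a dataclass with named attributes.
    @dataclasses.dataclass
    class Rec:
        codepoint: str
        name: str
        general_category: str
        canonical_combining_class: str
        bidi_class: str

    record = Rec(*record_as_list)
    assert record.general_category == category == "Lu"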
Tools/unicode/makeunicodedata.py:

@@ -26,13 +26,14 @@
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
 
+import dataclasses
 import os
 import sys
 import zipfile
 
 from functools import partial
 from textwrap import dedent
-from typing import Iterator, List, Tuple
+from typing import Iterator, List, Optional, Set, Tuple
 
 SCRIPT = sys.argv[0]
 VERSION = "3.3"

@@ -148,12 +149,12 @@ def makeunicodedata(unicode, trace):
         record = unicode.table[char]
         if record:
             # extract database properties
-            category = CATEGORY_NAMES.index(record[2])
-            combining = int(record[3])
-            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
-            mirrored = record[9] == "Y"
-            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
-            normalizationquickcheck = record[17]
+            category = CATEGORY_NAMES.index(record.general_category)
+            combining = int(record.canonical_combining_class)
+            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
+            mirrored = record.bidi_mirrored == "Y"
+            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
+            normalizationquickcheck = record.quick_check
             item = (
                 category, combining, bidirectional, mirrored, eastasianwidth,
                 normalizationquickcheck

@@ -179,8 +180,8 @@ def makeunicodedata(unicode, trace):
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
-            if record[5]:
-                decomp = record[5].split()
+            if record.decomposition_type:
+                decomp = record.decomposition_type.split()
                 if len(decomp) > 19:
                     raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                 # prefix

@@ -200,7 +201,7 @@ def makeunicodedata(unicode, trace):
                 # Collect NFC pairs
                 if not prefix and len(decomp) == 3 and \
                    char not in unicode.exclusions and \
-                   unicode.table[decomp[1]][3] == "0":
+                   unicode.table[decomp[1]].canonical_combining_class == "0":
                     p, l, r = decomp
                     comp_first[l] = 1
                     comp_last[r] = 1

@@ -404,9 +405,9 @@ def makeunicodetype(unicode, trace):
         record = unicode.table[char]
         if record:
             # extract database properties
-            category = record[2]
-            bidirectional = record[4]
-            properties = record[16]
+            category = record.general_category
+            bidirectional = record.bidi_class
+            properties = record.binary_properties
             flags = 0
             if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                 flags |= ALPHA_MASK

@@ -434,16 +435,16 @@ def makeunicodetype(unicode, trace):
                 flags |= CASE_IGNORABLE_MASK
             sc = unicode.special_casing.get(char)
             cf = unicode.case_folding.get(char, [char])
-            if record[12]:
-                upper = int(record[12], 16)
+            if record.simple_uppercase_mapping:
+                upper = int(record.simple_uppercase_mapping, 16)
             else:
                 upper = char
-            if record[13]:
-                lower = int(record[13], 16)
+            if record.simple_lowercase_mapping:
+                lower = int(record.simple_lowercase_mapping, 16)
             else:
                 lower = char
-            if record[14]:
-                title = int(record[14], 16)
+            if record.simple_titlecase_mapping:
+                title = int(record.simple_titlecase_mapping, 16)
             else:
                 title = upper
             if sc is None and cf != [lower]:

@@ -480,16 +481,16 @@ def makeunicodetype(unicode, trace):
                 extra_casing.extend(sc[1])
             # decimal digit, integer digit
             decimal = 0
-            if record[6]:
+            if record.decomposition_mapping:
                 flags |= DECIMAL_MASK
-                decimal = int(record[6])
+                decimal = int(record.decomposition_mapping)
             digit = 0
-            if record[7]:
+            if record.numeric_type:
                 flags |= DIGIT_MASK
-                digit = int(record[7])
-            if record[8]:
+                digit = int(record.numeric_type)
+            if record.numeric_value:
                 flags |= NUMERIC_MASK
-                numeric.setdefault(record[8], []).append(char)
+                numeric.setdefault(record.numeric_value, []).append(char)
             item = (
                 upper, lower, title, decimal, digit, flags
                 )

@@ -609,7 +610,7 @@ def makeunicodename(unicode, trace):
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
-            name = record[1].strip()
+            name = record.name.strip()
             if name and name[0] != "<":
                 names[char] = name + chr(0)
 
@@ -719,7 +720,7 @@ def makeunicodename(unicode, trace):
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
-            name = record[1].strip()
+            name = record.name.strip()
             if name and name[0] != "<":
                 data.append((name, char))
 
@@ -819,31 +820,27 @@ def merge_old_version(version, new, old):
                 continue
             # check characters that differ
             if old.table[i] != new.table[i]:
-                for k in range(len(old.table[i])):
-                    if old.table[i][k] != new.table[i][k]:
-                        value = old.table[i][k]
+                for k, field in enumerate(dataclasses.fields(UcdRecord)):
+                    value = getattr(old.table[i], field.name)
+                    new_value = getattr(new.table[i], field.name)
+                    if value != new_value:
                         if k == 1 and i in PUA_15:
                             # the name is not set in the old.table, but in the
                             # new.table we are using it for aliases and named seq
                             assert value == ''
                         elif k == 2:
-                            #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                             category_changes[i] = CATEGORY_NAMES.index(value)
                         elif k == 4:
-                            #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                             bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                         elif k == 5:
-                            #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                             # We assume that all normalization changes are in 1:1 mappings
                             assert " " not in value
                             normalization_changes.append((i, value))
                         elif k == 6:
-                            #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                             # we only support changes where the old value is a single digit
                             assert value in "0123456789"
                             decimal_changes[i] = int(value)
                         elif k == 8:
-                            # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                             # Since 0 encodes "no change", the old value is better not 0
                             if not value:
                                 numeric_changes[i] = -1
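
The rewritten loop keeps the old numeric indices meaningful because `dataclasses.fields()` yields fields in declaration order, which matches the old list layout. A self-contained sketch of the idiom (two-field stand-in, not the real UcdRecord):

    import dataclasses

    @dataclasses.dataclass
    class Rec:
        name: str
        general_category: str

    old_rec, new_rec = Rec("A", "Lu"), Rec("A", "Ll")
    for k, field in enumerate(dataclasses.fields(Rec)):
        value = getattr(old_rec, field.name)
        new_value = getattr(new_rec, field.name)
        if value != new_value:
            # k plays the same role the list index used to play
            print(k, field.name, value, "->", new_value)  # 1 general_category Lu -> Ll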
@@ -952,6 +949,45 @@ class UcdFile:
             yield char, rest
 
 
+@dataclasses.dataclass
+class UcdRecord:
+    # 15 fields from UnicodeData.txt .  See:
+    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
+    codepoint: str
+    name: str
+    general_category: str
+    canonical_combining_class: str
+    bidi_class: str
+    decomposition_type: str
+    decomposition_mapping: str
+    numeric_type: str
+    numeric_value: str
+    bidi_mirrored: str
+    unicode_1_name: str  # obsolete
+    iso_comment: str  # obsolete
+    simple_uppercase_mapping: str
+    simple_lowercase_mapping: str
+    simple_titlecase_mapping: str
+
+    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
+    east_asian_width: Optional[str]
+
+    # Binary properties, as a set of those that are true.
+    # Taken from multiple files:
+    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
+    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
+    binary_properties: Set[str]
+
+    # The Quick_Check properties related to normalization:
+    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
+    # We store them as a bitmask.
+    quick_check: int
+
+
+def from_row(row: List[str]) -> UcdRecord:
+    return UcdRecord(*row, None, set(), 0)
+
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
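
To see how `from_row` lines up with its input, here is an illustrative call (it assumes the `UcdRecord` and `from_row` definitions from the hunk above; the row mimics the UnicodeData.txt line for U+0041, pre-split on ';'):

    row = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;".split(";")
    assert len(row) == 15  # the 15 string fields of UnicodeData.txt
    rec = from_row(row)    # appends east_asian_width=None, binary_properties=set(), quick_check=0
    assert rec.general_category == "Lu"
    assert rec.simple_lowercase_mapping == "0061"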
@@ -959,18 +995,14 @@ class UcdFile:
 # load a unicode-data file from disk
 
 class UnicodeData:
-    # Record structure:
-    # [ID, name, category, combining, bidi, decomp, (6)
-    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
-    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
-    #  derived-props] (17)
+    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned
 
     def __init__(self, version, cjk_check=True):
         self.changed = []
         table = [None] * 0x110000
         for s in UcdFile(UNICODE_DATA, version):
             char = int(s[0], 16)
-            table[char] = s
+            table[char] = from_row(s)
 
         cjk_ranges_found = []

@@ -982,19 +1014,17 @@ class UnicodeData:
             # https://www.unicode.org/reports/tr44/#Code_Point_Ranges
             s = table[i]
             if s:
-                if s[1][-6:] == "First>":
-                    s[1] = ""
-                    field = s
-                elif s[1][-5:] == "Last>":
-                    if s[1].startswith("<CJK Ideograph"):
+                if s.name[-6:] == "First>":
+                    s.name = ""
+                    field = dataclasses.astuple(s)[:15]
+                elif s.name[-5:] == "Last>":
+                    if s.name.startswith("<CJK Ideograph"):
                         cjk_ranges_found.append((field[0],
-                                                 s[0]))
-                    s[1] = ""
+                                                 s.codepoint))
+                    s.name = ""
                     field = None
             elif field:
-                f2 = field[:]
-                f2[0] = "%X" % i
-                table[i] = f2
+                table[i] = from_row(('%X' % i,) + field[1:])
 
         if cjk_check and cjk_ranges != cjk_ranges_found:
             raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
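
The `dataclasses.astuple(s)[:15]` above freezes the shared UnicodeData.txt fields of a `<..., First>` record into a tuple template, from which every code point in the range is cloned. A self-contained sketch with a two-field stand-in:

    import dataclasses

    @dataclasses.dataclass
    class Rec:
        codepoint: str
        name: str

    first = Rec("3400", "")             # a "<CJK Ideograph, First>" row, name blanked
    field = dataclasses.astuple(first)  # ("3400", "") -- an immutable template
    # each code point in the range reuses the template with its own codepoint:
    clone = Rec(*(("%X" % 0x3401,) + field[1:]))
    assert clone == Rec("3401", "")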
@@ -1015,7 +1045,7 @@ class UnicodeData:
             char = int(char, 16)
             self.aliases.append((name, char))
             # also store the name in the PUA 1
-            self.table[pua_index][1] = name
+            self.table[pua_index].name = name
             pua_index += 1
         assert pua_index - NAME_ALIASES_START == len(self.aliases)
 
@@ -1034,7 +1064,7 @@ class UnicodeData:
                 "the NamedSequence struct and in unicodedata_lookup")
             self.named_sequences.append((name, chars))
             # also store these in the PUA 1
-            self.table[pua_index][1] = name
+            self.table[pua_index].name = name
             pua_index += 1
         assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
 
@@ -1049,23 +1079,19 @@ class UnicodeData:
 
         for i in range(0, 0x110000):
             if table[i] is not None:
-                table[i].append(widths[i])
-
-        for i in range(0, 0x110000):
-            if table[i] is not None:
-                table[i].append(set())
+                table[i].east_asian_width = widths[i]
 
         for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
             if table[char]:
                 # Some properties (e.g. Default_Ignorable_Code_Point)
                 # apply to unassigned code points; ignore them
-                table[char][-1].add(p)
+                table[char].binary_properties.add(p)
 
         for char_range, value in UcdFile(LINE_BREAK, version):
             if value not in MANDATORY_LINE_BREAKS:
                 continue
             for char in expand_range(char_range):
-                table[char][-1].add('Line_Break')
+                table[char].binary_properties.add('Line_Break')
 
         # We only want the quickcheck properties
         # Format: NF?_QC; Y(es)/N(o)/M(aybe)

@@ -1087,7 +1113,7 @@ class UnicodeData:
                 quickchecks[char] |= quickcheck
         for i in range(0, 0x110000):
             if table[i] is not None:
-                table[i].append(quickchecks[i])
+                table[i].quick_check = quickchecks[i]
 
         with open_data(UNIHAN, version) as file:
             zip = zipfile.ZipFile(file)
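
For the `quick_check` bitmask assigned above, the script packs two bits per normalization form (the packing loop itself is outside this diff). An assumption-level sketch of that encoding, with a hypothetical `pack` helper:

    # Maybe -> 1, No -> 2 (Yes stays 0), shifted two bits per form.
    qc_order = ['NFD_QC', 'NFKD_QC', 'NFC_QC', 'NFKC_QC']

    def pack(answers):  # e.g. answers = {'NFC_QC': 'N'}
        mask = 0
        for prop, answer in answers.items():
            mask |= ('MN'.index(answer) + 1) << (qc_order.index(prop) * 2)
        return mask

    assert pack({'NFC_QC': 'N'}) == 2 << 4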
@@ -1106,7 +1132,7 @@ class UnicodeData:
                 i = int(code[2:], 16)
                 # Patch the numeric field
                 if table[i] is not None:
-                    table[i][8] = value
+                    table[i].numeric_value = value
 
         sc = self.special_casing = {}
         for data in UcdFile(SPECIAL_CASING, version):