bpo-37760: Factor out the basic UCD parsing logic of makeunicodedata. (GH-15130)

There were 10 copies of this, and almost as many distinct versions of
exactly how it was written.  They're all implementing the same
standard.  Pull them out to the top, so the more interesting logic
that remains becomes easier to read.
This commit is contained in:
Greg Price 2019-08-12 22:20:56 -07:00 committed by Benjamin Peterson
parent 66a34d35e4
commit ef2af1ad44
1 changed file with 108 additions and 132 deletions

View File

@ -30,8 +30,9 @@ import os
import sys
import zipfile
from textwrap import dedent
from functools import partial
from textwrap import dedent
from typing import *
SCRIPT = sys.argv[0]
VERSION = "3.3"
@ -903,6 +904,32 @@ def open_data(template, version):
return open(local, 'rb')
class UcdFile:
    """A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    """

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        """Yield each data record as a list of stripped `;`-separated fields.

        Comments (everything after a `#`) and blank lines are skipped,
        as the UCD format conventions specify.
        """
        with open_data(self.template, self.version) as stream:
            for raw in stream:
                # Drop any trailing comment, then surrounding whitespace.
                data = raw.partition('#')[0].strip()
                if data:
                    yield [piece.strip() for piece in data.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB
@ -922,12 +949,7 @@ class UnicodeData:
cjk_check=True):
self.changed = []
table = [None] * 0x110000
with open_data(UNICODE_DATA, version) as file:
while 1:
s = file.readline()
if not s:
break
s = s.strip().split(";")
for s in UcdFile(UNICODE_DATA, version):
char = int(s[0], 16)
table[char] = s
@ -968,12 +990,7 @@ class UnicodeData:
# in order to take advantage of the compression and lookup
# algorithms used for the other characters
pua_index = NAME_ALIASES_START
with open_data(NAME_ALIASES, version) as file:
for s in file:
s = s.strip()
if not s or s.startswith('#'):
continue
char, name, abbrev = s.split(';')
for char, name, abbrev in UcdFile(NAME_ALIASES, version):
char = int(char, 16)
self.aliases.append((name, char))
# also store the name in the PUA 1
@ -988,12 +1005,7 @@ class UnicodeData:
assert pua_index < NAMED_SEQUENCES_START
pua_index = NAMED_SEQUENCES_START
with open_data(NAMED_SEQUENCES, version) as file:
for s in file:
s = s.strip()
if not s or s.startswith('#'):
continue
name, chars = s.split(';')
for name, chars in UcdFile(NAMED_SEQUENCES, version):
chars = tuple(int(char, 16) for char in chars.split())
# check that the structure defined in makeunicodename is OK
assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
@ -1006,25 +1018,12 @@ class UnicodeData:
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
self.exclusions = {}
with open_data(COMPOSITION_EXCLUSIONS, version) as file:
for s in file:
s = s.strip()
if not s:
continue
if s[0] == '#':
continue
char = int(s.split()[0],16)
for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
char = int(char, 16)
self.exclusions[char] = 1
widths = [None] * 0x110000
with open_data(EASTASIAN_WIDTH, version) as file:
for s in file:
s = s.strip()
if not s:
continue
if s[0] == '#':
continue
s = s.split()[0].split(';')
for s in UcdFile(EASTASIAN_WIDTH, version):
if '..' in s[0]:
first, last = [int(c, 16) for c in s[0].split('..')]
chars = list(range(first, last+1))
@ -1041,15 +1040,7 @@ class UnicodeData:
if table[i] is not None:
table[i].append(set())
with open_data(DERIVED_CORE_PROPERTIES, version) as file:
for s in file:
s = s.split('#', 1)[0].strip()
if not s:
continue
r, p = s.split(";")
r = r.strip()
p = p.strip()
for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version):
if ".." in r:
first, last = [int(c, 16) for c in r.split('..')]
chars = list(range(first, last+1))
@ -1061,10 +1052,7 @@ class UnicodeData:
# apply to unassigned code points; ignore them
table[char][-1].add(p)
with open_data(LINE_BREAK, version) as file:
for s in file:
s = s.partition('#')[0]
s = [i.strip() for i in s.split(';')]
for s in UcdFile(LINE_BREAK, version):
if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
continue
if '..' not in s[0]:
@ -1083,11 +1071,7 @@ class UnicodeData:
# for older versions, and no delta records will be created.
quickchecks = [0] * 0x110000
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
for s in file:
if '#' in s:
s = s[:s.index('#')]
s = [i.strip() for i in s.split(';')]
for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
if len(s) < 2 or s[1] not in qc_order:
continue
quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
@ -1122,13 +1106,9 @@ class UnicodeData:
# Patch the numeric field
if table[i] is not None:
table[i][8] = value
sc = self.special_casing = {}
with open_data(SPECIAL_CASING, version) as file:
for s in file:
s = s[:-1].split('#', 1)[0]
if not s:
continue
data = s.split("; ")
for data in UcdFile(SPECIAL_CASING, version):
if data[4]:
# We ignore all conditionals (since they depend on
# languages) except for one, which is hardcoded. See
@ -1139,14 +1119,10 @@ class UnicodeData:
title = [int(char, 16) for char in data[2].split()]
upper = [int(char, 16) for char in data[3].split()]
sc[c] = (lower, title, upper)
cf = self.case_folding = {}
if version != '3.2.0':
with open_data(CASE_FOLDING, version) as file:
for s in file:
s = s[:-1].split('#', 1)[0]
if not s:
continue
data = s.split("; ")
for data in UcdFile(CASE_FOLDING, version):
if data[1] in "CF":
c = int(data[0], 16)
cf[c] = [int(char, 16) for char in data[2].split()]