Clean up and reduce visual clutter in the makeunicode.py script. (GH-7558)
This commit is contained in:
parent 56624a99a9
commit faa2948654
@@ -31,6 +31,7 @@ import sys
 import zipfile
 
 from textwrap import dedent
+from functools import partial
 
 SCRIPT = sys.argv[0]
 VERSION = "3.3"
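Aside (not part of the diff): functools.partial is the one new import, and the rest of the commit leans on it to bind print's file argument a single time. A minimal sketch of the idiom, with a throwaway filename invented for illustration:

    from functools import partial

    with open("demo.h", "w") as fp:
        fprint = partial(print, file=fp)      # every call now writes to fp
        fprint("/* generated file */")
        fprint("#define ANSWER", 42)          # prints: #define ANSWER 42

fprint keeps print's full calling convention (multiple arguments, sep, end), so the generator's call sites only lose the repeated ", file=fp" noise.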
@@ -106,11 +107,11 @@ cjk_ranges = [
     ('2CEB0', '2EBE0'),
 ]
 
+
 def maketables(trace=0):
 
     print("--- Reading", UNICODE_DATA % "", "...")
 
-    version = ""
     unicode = UnicodeData(UNIDATA_VERSION)
 
     print(len(list(filter(None, unicode.table))), "characters")
@@ -125,6 +126,7 @@ def maketables(trace=0):
     makeunicodedata(unicode, trace)
     makeunicodetype(unicode, trace)
 
+
 # --------------------------------------------------------------------
 # unicode character properties
 
@@ -258,124 +260,125 @@ def makeunicodedata(unicode, trace):
 
     print("--- Writing", FILE, "...")
 
-    fp = open(FILE, "w")
-    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
-    print(file=fp)
-    print('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION, file=fp)
-    print("/* a list of unique database records */", file=fp)
-    print("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {", file=fp)
-    for item in table:
-        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
-    print("};", file=fp)
-    print(file=fp)
+    with open(FILE, "w") as fp:
+        fprint = partial(print, file=fp)
 
-    print("/* Reindexing of NFC first characters. */", file=fp)
-    print("#define TOTAL_FIRST",total_first, file=fp)
-    print("#define TOTAL_LAST",total_last, file=fp)
-    print("struct reindex{int start;short count,index;};", file=fp)
-    print("static struct reindex nfc_first[] = {", file=fp)
-    for start,end in comp_first_ranges:
-        print("    { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
-    print("    {0,0,0}", file=fp)
-    print("};\n", file=fp)
-    print("static struct reindex nfc_last[] = {", file=fp)
-    for start,end in comp_last_ranges:
-        print("    { %d, %d, %d}," % (start,end-start,comp_last[start]), file=fp)
-    print("    {0,0,0}", file=fp)
-    print("};\n", file=fp)
+        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
+        fprint()
+        fprint('#define UNIDATA_VERSION "%s"' % UNIDATA_VERSION)
+        fprint("/* a list of unique database records */")
+        fprint("const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {")
+        for item in table:
+            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
+        fprint("};")
+        fprint()
+
+        # FIXME: <fl> the following tables could be made static, and
+        # the support code moved into unicodedatabase.c
+        fprint("/* Reindexing of NFC first characters. */")
+        fprint("#define TOTAL_FIRST",total_first)
+        fprint("#define TOTAL_LAST",total_last)
+        fprint("struct reindex{int start;short count,index;};")
+        fprint("static struct reindex nfc_first[] = {")
+        for start,end in comp_first_ranges:
+            fprint("    { %d, %d, %d}," % (start,end-start,comp_first[start]))
+        fprint("    {0,0,0}")
+        fprint("};\n")
+        fprint("static struct reindex nfc_last[] = {")
+        for start,end in comp_last_ranges:
+            fprint("    { %d, %d, %d}," % (start,end-start,comp_last[start]))
+        fprint("    {0,0,0}")
+        fprint("};\n")
 
-    print("/* string literals */", file=fp)
-    print("const char *_PyUnicode_CategoryNames[] = {", file=fp)
-    for name in CATEGORY_NAMES:
-        print("    \"%s\"," % name, file=fp)
-    print("    NULL", file=fp)
-    print("};", file=fp)
-    # FIXME: <fl> the following tables could be made static, and
-    # the support code moved into unicodedatabase.c
-
-    print("const char *_PyUnicode_BidirectionalNames[] = {", file=fp)
-    for name in BIDIRECTIONAL_NAMES:
-        print("    \"%s\"," % name, file=fp)
-    print("    NULL", file=fp)
-    print("};", file=fp)
+        fprint("/* string literals */")
+        fprint("const char *_PyUnicode_CategoryNames[] = {")
+        for name in CATEGORY_NAMES:
+            fprint("    \"%s\"," % name)
+        fprint("    NULL")
+        fprint("};")
 
-    print("const char *_PyUnicode_EastAsianWidthNames[] = {", file=fp)
-    for name in EASTASIANWIDTH_NAMES:
-        print("    \"%s\"," % name, file=fp)
-    print("    NULL", file=fp)
-    print("};", file=fp)
+        fprint("const char *_PyUnicode_BidirectionalNames[] = {")
+        for name in BIDIRECTIONAL_NAMES:
+            fprint("    \"%s\"," % name)
+        fprint("    NULL")
+        fprint("};")
 
-    print("static const char *decomp_prefix[] = {", file=fp)
-    for name in decomp_prefix:
-        print("    \"%s\"," % name, file=fp)
-    print("    NULL", file=fp)
-    print("};", file=fp)
+        fprint("const char *_PyUnicode_EastAsianWidthNames[] = {")
+        for name in EASTASIANWIDTH_NAMES:
+            fprint("    \"%s\"," % name)
+        fprint("    NULL")
+        fprint("};")
 
-    # split record index table
-    index1, index2, shift = splitbins(index, trace)
+        fprint("static const char *decomp_prefix[] = {")
+        for name in decomp_prefix:
+            fprint("    \"%s\"," % name)
+        fprint("    NULL")
+        fprint("};")
 
-    print("/* index tables for the database records */", file=fp)
-    print("#define SHIFT", shift, file=fp)
-    Array("index1", index1).dump(fp, trace)
-    Array("index2", index2).dump(fp, trace)
-
-    # split decomposition index table
-    index1, index2, shift = splitbins(decomp_index, trace)
-
-    print("/* decomposition data */", file=fp)
-    Array("decomp_data", decomp_data).dump(fp, trace)
-
-    print("/* index tables for the decomposition data */", file=fp)
-    print("#define DECOMP_SHIFT", shift, file=fp)
-    Array("decomp_index1", index1).dump(fp, trace)
-    Array("decomp_index2", index2).dump(fp, trace)
-
-    index, index2, shift = splitbins(comp_data, trace)
-    print("/* NFC pairs */", file=fp)
-    print("#define COMP_SHIFT", shift, file=fp)
-    Array("comp_index", index).dump(fp, trace)
-    Array("comp_data", index2).dump(fp, trace)
-
-    # Generate delta tables for old versions
-    for version, table, normalization in unicode.changed:
-        cversion = version.replace(".","_")
-        records = [table[0]]
-        cache = {table[0]:0}
-        index = [0] * len(table)
-        for i, record in enumerate(table):
-            try:
-                index[i] = cache[record]
-            except KeyError:
-                index[i] = cache[record] = len(records)
-                records.append(record)
-        # split record index table
-        index1, index2, shift = splitbins(index, trace)
-        print("static const change_record change_records_%s[] = {" % cversion, file=fp)
-        for record in records:
-            print("    { %s }," % ", ".join(map(str,record)), file=fp)
-        print("};", file=fp)
-        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
-        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
-        print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
-        print("{", file=fp)
-        print("    int index;", file=fp)
-        print("    if (n >= 0x110000) index = 0;", file=fp)
-        print("    else {", file=fp)
-        print("        index = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
-        print("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
-              (cversion, shift, ((1<<shift)-1)), file=fp)
-        print("    }", file=fp)
-        print("    return change_records_%s+index;" % cversion, file=fp)
-        print("}\n", file=fp)
-        print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
-        print("{", file=fp)
-        print("    switch(n) {", file=fp)
-        for k, v in normalization:
-            print("    case %s: return 0x%s;" % (hex(k), v), file=fp)
-        print("    default: return 0;", file=fp)
-        print("    }\n}\n", file=fp)
-
-    fp.close()
+        # split record index table
+        index1, index2, shift = splitbins(index, trace)
+
+        fprint("/* index tables for the database records */")
+        fprint("#define SHIFT", shift)
+        Array("index1", index1).dump(fp, trace)
+        Array("index2", index2).dump(fp, trace)
+
+        # split decomposition index table
+        index1, index2, shift = splitbins(decomp_index, trace)
+
+        fprint("/* decomposition data */")
+        Array("decomp_data", decomp_data).dump(fp, trace)
+
+        fprint("/* index tables for the decomposition data */")
+        fprint("#define DECOMP_SHIFT", shift)
+        Array("decomp_index1", index1).dump(fp, trace)
+        Array("decomp_index2", index2).dump(fp, trace)
+
+        index, index2, shift = splitbins(comp_data, trace)
+        fprint("/* NFC pairs */")
+        fprint("#define COMP_SHIFT", shift)
+        Array("comp_index", index).dump(fp, trace)
+        Array("comp_data", index2).dump(fp, trace)
+
+        # Generate delta tables for old versions
+        for version, table, normalization in unicode.changed:
+            cversion = version.replace(".","_")
+            records = [table[0]]
+            cache = {table[0]:0}
+            index = [0] * len(table)
+            for i, record in enumerate(table):
+                try:
+                    index[i] = cache[record]
+                except KeyError:
+                    index[i] = cache[record] = len(records)
+                    records.append(record)
+            index1, index2, shift = splitbins(index, trace)
+            fprint("static const change_record change_records_%s[] = {" % cversion)
+            for record in records:
+                fprint("    { %s }," % ", ".join(map(str,record)))
+            fprint("};")
+            Array("changes_%s_index" % cversion, index1).dump(fp, trace)
+            Array("changes_%s_data" % cversion, index2).dump(fp, trace)
+            fprint("static const change_record* get_change_%s(Py_UCS4 n)" % cversion)
+            fprint("{")
+            fprint("    int index;")
+            fprint("    if (n >= 0x110000) index = 0;")
+            fprint("    else {")
+            fprint("        index = changes_%s_index[n>>%d];" % (cversion, shift))
+            fprint("        index = changes_%s_data[(index<<%d)+(n & %d)];" % \
+                   (cversion, shift, ((1<<shift)-1)))
+            fprint("    }")
+            fprint("    return change_records_%s+index;" % cversion)
+            fprint("}\n")
+            fprint("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion)
+            fprint("{")
+            fprint("    switch(n) {")
+            for k, v in normalization:
+                fprint("    case %s: return 0x%s;" % (hex(k), v))
+            fprint("    default: return 0;")
+            fprint("    }\n}\n")
 
+
 # --------------------------------------------------------------------
 # unicode character properties
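The hunk above makes two mechanical changes: every print(..., file=fp) becomes fprint(...), and the manual fp = open(...) ... fp.close() pairing becomes a with block, so the generated header is closed even if table writing raises. A reduced before/after sketch (toy filename, not the script's FILE):

    # before: fp stays open if an exception interrupts the writes
    fp = open("out.h", "w")
    print("/* header */", file=fp)
    fp.close()

    # after: the context manager closes fp on success or failure
    with open("out.h", "w") as fp:
        print("/* header */", file=fp)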
@@ -404,7 +407,6 @@ def makeunicodetype(unicode, trace):
             bidirectional = record[4]
             properties = record[16]
             flags = 0
-            delta = True
             if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                 flags |= ALPHA_MASK
             if "Lowercase" in properties:
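Context for the mask constants above: each character's properties are packed into one integer whose bits record independent booleans, so the generator ORs masks together and the C code later tests single bits. A hedged sketch with invented mask values (the script defines its own constants):

    ALPHA_MASK = 0x01                            # illustrative values only
    LOWER_MASK = 0x02

    category, properties = "Ll", {"Lowercase"}   # sample record fields
    flags = 0
    if category in ("Lm", "Lt", "Lu", "Ll", "Lo"):
        flags |= ALPHA_MASK
    if "Lowercase" in properties:
        flags |= LOWER_MASK
    assert flags & ALPHA_MASK and flags & LOWER_MASK   # both bits set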
@@ -505,90 +507,91 @@ def makeunicodetype(unicode, trace):
 
     print("--- Writing", FILE, "...")
 
-    fp = open(FILE, "w")
-    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
-    print(file=fp)
-    print("/* a list of unique character type descriptors */", file=fp)
-    print("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {", file=fp)
-    for item in table:
-        print("    {%d, %d, %d, %d, %d, %d}," % item, file=fp)
-    print("};", file=fp)
-    print(file=fp)
+    with open(FILE, "w") as fp:
+        fprint = partial(print, file=fp)
 
-    print("/* extended case mappings */", file=fp)
-    print(file=fp)
-    print("const Py_UCS4 _PyUnicode_ExtendedCase[] = {", file=fp)
-    for c in extra_casing:
-        print("    %d," % c, file=fp)
-    print("};", file=fp)
-    print(file=fp)
+        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
+        fprint()
+        fprint("/* a list of unique character type descriptors */")
+        fprint("const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {")
+        for item in table:
+            fprint("    {%d, %d, %d, %d, %d, %d}," % item)
+        fprint("};")
+        fprint()
 
-    # split decomposition index table
-    index1, index2, shift = splitbins(index, trace)
+        fprint("/* extended case mappings */")
+        fprint()
+        fprint("const Py_UCS4 _PyUnicode_ExtendedCase[] = {")
+        for c in extra_casing:
+            fprint("    %d," % c)
+        fprint("};")
+        fprint()
 
-    print("/* type indexes */", file=fp)
-    print("#define SHIFT", shift, file=fp)
-    Array("index1", index1).dump(fp, trace)
-    Array("index2", index2).dump(fp, trace)
+        # split decomposition index table
+        index1, index2, shift = splitbins(index, trace)
 
-    # Generate code for _PyUnicode_ToNumeric()
-    numeric_items = sorted(numeric.items())
-    print('/* Returns the numeric value as double for Unicode characters', file=fp)
-    print(' * having this property, -1.0 otherwise.', file=fp)
-    print(' */', file=fp)
-    print('double _PyUnicode_ToNumeric(Py_UCS4 ch)', file=fp)
-    print('{', file=fp)
-    print('    switch (ch) {', file=fp)
-    for value, codepoints in numeric_items:
-        # Turn text into float literals
-        parts = value.split('/')
-        parts = [repr(float(part)) for part in parts]
-        value = '/'.join(parts)
+        fprint("/* type indexes */")
+        fprint("#define SHIFT", shift)
+        Array("index1", index1).dump(fp, trace)
+        Array("index2", index2).dump(fp, trace)
 
-        codepoints.sort()
-        for codepoint in codepoints:
-            print('    case 0x%04X:' % (codepoint,), file=fp)
-        print('        return (double) %s;' % (value,), file=fp)
-    print('    }', file=fp)
-    print('    return -1.0;', file=fp)
-    print('}', file=fp)
-    print(file=fp)
+        # Generate code for _PyUnicode_ToNumeric()
+        numeric_items = sorted(numeric.items())
+        fprint('/* Returns the numeric value as double for Unicode characters')
+        fprint(' * having this property, -1.0 otherwise.')
+        fprint(' */')
+        fprint('double _PyUnicode_ToNumeric(Py_UCS4 ch)')
+        fprint('{')
+        fprint('    switch (ch) {')
+        for value, codepoints in numeric_items:
+            # Turn text into float literals
+            parts = value.split('/')
+            parts = [repr(float(part)) for part in parts]
+            value = '/'.join(parts)
 
-    # Generate code for _PyUnicode_IsWhitespace()
-    print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
-    print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
-    print(" */", file=fp)
-    print('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)', file=fp)
-    print('{', file=fp)
-    print('    switch (ch) {', file=fp)
+            codepoints.sort()
+            for codepoint in codepoints:
+                fprint('    case 0x%04X:' % (codepoint,))
+            fprint('        return (double) %s;' % (value,))
+        fprint('    }')
+        fprint('    return -1.0;')
+        fprint('}')
+        fprint()
 
-    for codepoint in sorted(spaces):
-        print('    case 0x%04X:' % (codepoint,), file=fp)
-        print('        return 1;', file=fp)
+        # Generate code for _PyUnicode_IsWhitespace()
+        fprint("/* Returns 1 for Unicode characters having the bidirectional")
+        fprint(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.")
+        fprint(" */")
+        fprint('int _PyUnicode_IsWhitespace(const Py_UCS4 ch)')
+        fprint('{')
+        fprint('    switch (ch) {')
 
-    print('    }', file=fp)
-    print('    return 0;', file=fp)
-    print('}', file=fp)
-    print(file=fp)
+        for codepoint in sorted(spaces):
+            fprint('    case 0x%04X:' % (codepoint,))
+            fprint('        return 1;')
 
-    # Generate code for _PyUnicode_IsLinebreak()
-    print("/* Returns 1 for Unicode characters having the line break", file=fp)
-    print(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional", file=fp)
-    print(" * type 'B', 0 otherwise.", file=fp)
-    print(" */", file=fp)
-    print('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)', file=fp)
-    print('{', file=fp)
-    print('    switch (ch) {', file=fp)
-    for codepoint in sorted(linebreaks):
-        print('    case 0x%04X:' % (codepoint,), file=fp)
-        print('        return 1;', file=fp)
+        fprint('    }')
+        fprint('    return 0;')
+        fprint('}')
+        fprint()
 
-    print('    }', file=fp)
-    print('    return 0;', file=fp)
-    print('}', file=fp)
-    print(file=fp)
+        # Generate code for _PyUnicode_IsLinebreak()
+        fprint("/* Returns 1 for Unicode characters having the line break")
+        fprint(" * property 'BK', 'CR', 'LF' or 'NL' or having bidirectional")
+        fprint(" * type 'B', 0 otherwise.")
+        fprint(" */")
+        fprint('int _PyUnicode_IsLinebreak(const Py_UCS4 ch)')
+        fprint('{')
+        fprint('    switch (ch) {')
+        for codepoint in sorted(linebreaks):
+            fprint('    case 0x%04X:' % (codepoint,))
+            fprint('        return 1;')
 
+        fprint('    }')
+        fprint('    return 0;')
+        fprint('}')
+        fprint()
 
-    fp.close()
 
 # --------------------------------------------------------------------
 # unicode name database
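The _PyUnicode_ToNumeric part of the hunk above shows the script's core trick: compiling a Python mapping down to a C switch statement. A condensed, self-contained sketch of that loop, with a made-up two-entry mapping and an invented C function name:

    numeric = {"1/2": [0x00BD], "20": [0x3039]}   # value -> codepoints (invented)

    print("double to_numeric(unsigned int ch)")
    print("{")
    print("    switch (ch) {")
    for value, codepoints in sorted(numeric.items()):
        # turn "1/2" into "1.0/2.0" so C performs float division
        value = "/".join(repr(float(part)) for part in value.split("/"))
        for codepoint in sorted(codepoints):
            print("    case 0x%04X:" % codepoint)
        print("        return (double) %s;" % value)
    print("    }")
    print("    return -1.0;")
    print("}")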
@@ -727,63 +730,63 @@ def makeunicodename(unicode, trace):
 
     print("--- Writing", FILE, "...")
 
-    fp = open(FILE, "w")
-    print("/* this file was generated by %s %s */" % (SCRIPT, VERSION), file=fp)
-    print(file=fp)
-    print("#define NAME_MAXLEN", 256, file=fp)
-    print(file=fp)
-    print("/* lexicon */", file=fp)
-    Array("lexicon", lexicon).dump(fp, trace)
-    Array("lexicon_offset", lexicon_offset).dump(fp, trace)
+    with open(FILE, "w") as fp:
+        fprint = partial(print, file=fp)
 
-    # split decomposition index table
-    offset1, offset2, shift = splitbins(phrasebook_offset, trace)
+        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
+        fprint()
+        fprint("#define NAME_MAXLEN", 256)
+        fprint()
+        fprint("/* lexicon */")
+        Array("lexicon", lexicon).dump(fp, trace)
+        Array("lexicon_offset", lexicon_offset).dump(fp, trace)
 
-    print("/* code->name phrasebook */", file=fp)
-    print("#define phrasebook_shift", shift, file=fp)
-    print("#define phrasebook_short", short, file=fp)
+        # split decomposition index table
+        offset1, offset2, shift = splitbins(phrasebook_offset, trace)
 
-    Array("phrasebook", phrasebook).dump(fp, trace)
-    Array("phrasebook_offset1", offset1).dump(fp, trace)
-    Array("phrasebook_offset2", offset2).dump(fp, trace)
+        fprint("/* code->name phrasebook */")
+        fprint("#define phrasebook_shift", shift)
+        fprint("#define phrasebook_short", short)
 
-    print("/* name->code dictionary */", file=fp)
-    codehash.dump(fp, trace)
+        Array("phrasebook", phrasebook).dump(fp, trace)
+        Array("phrasebook_offset1", offset1).dump(fp, trace)
+        Array("phrasebook_offset2", offset2).dump(fp, trace)
 
-    print(file=fp)
-    print('static const unsigned int aliases_start = %#x;' %
-          NAME_ALIASES_START, file=fp)
-    print('static const unsigned int aliases_end = %#x;' %
-          (NAME_ALIASES_START + len(unicode.aliases)), file=fp)
+        fprint("/* name->code dictionary */")
+        codehash.dump(fp, trace)
 
-    print('static const unsigned int name_aliases[] = {', file=fp)
-    for name, codepoint in unicode.aliases:
-        print('    0x%04X,' % codepoint, file=fp)
-    print('};', file=fp)
+        fprint()
+        fprint('static const unsigned int aliases_start = %#x;' %
+               NAME_ALIASES_START)
+        fprint('static const unsigned int aliases_end = %#x;' %
+               (NAME_ALIASES_START + len(unicode.aliases)))
 
-    # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
-    # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
-    # sequences or sequences with non-BMP chars are added.
-    # unicodedata_lookup should be adapted too.
-    print(dedent("""
-        typedef struct NamedSequence {
-            int seqlen;
-            Py_UCS2 seq[4];
-        } named_sequence;
-        """), file=fp)
+        fprint('static const unsigned int name_aliases[] = {')
+        for name, codepoint in unicode.aliases:
+            fprint('    0x%04X,' % codepoint)
+        fprint('};')
 
-    print('static const unsigned int named_sequences_start = %#x;' %
-          NAMED_SEQUENCES_START, file=fp)
-    print('static const unsigned int named_sequences_end = %#x;' %
-          (NAMED_SEQUENCES_START + len(unicode.named_sequences)), file=fp)
+        # In Unicode 6.0.0, the sequences contain at most 4 BMP chars,
+        # so we are using Py_UCS2 seq[4]. This needs to be updated if longer
+        # sequences or sequences with non-BMP chars are added.
+        # unicodedata_lookup should be adapted too.
+        fprint(dedent("""
+            typedef struct NamedSequence {
+                int seqlen;
+                Py_UCS2 seq[4];
+            } named_sequence;
+            """))
 
-    print('static const named_sequence named_sequences[] = {', file=fp)
-    for name, sequence in unicode.named_sequences:
-        seq_str = ', '.join('0x%04X' % cp for cp in sequence)
-        print('    {%d, {%s}},' % (len(sequence), seq_str), file=fp)
-    print('};', file=fp)
+        fprint('static const unsigned int named_sequences_start = %#x;' %
+               NAMED_SEQUENCES_START)
+        fprint('static const unsigned int named_sequences_end = %#x;' %
+               (NAMED_SEQUENCES_START + len(unicode.named_sequences)))
 
-    fp.close()
+        fprint('static const named_sequence named_sequences[] = {')
+        for name, sequence in unicode.named_sequences:
+            seq_str = ', '.join('0x%04X' % cp for cp in sequence)
+            fprint('    {%d, {%s}},' % (len(sequence), seq_str))
+        fprint('};')
 
 
 def merge_old_version(version, new, old):
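One detail in the hunk above deserves a note: the named_sequence struct is emitted through textwrap.dedent, which strips the whitespace common to all lines, letting the embedded C snippet sit at natural Python indentation. A small sketch of the behavior, with an invented struct name:

    from textwrap import dedent

    snippet = dedent("""
        typedef struct Demo {
            int length;
        } demo;
    """)
    print(snippet)   # common leading indent removed, inner indent kept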
@@ -882,6 +885,7 @@ def merge_old_version(version, new, old):
                                               numeric_changes)),
                             normalization_changes))
 
+
 def open_data(template, version):
     local = template % ('-'+version,)
     if not os.path.exists(local):
@@ -898,6 +902,7 @@ def open_data(template, version):
         # Unihan.zip
         return open(local, 'rb')
 
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
@@ -1150,6 +1155,7 @@ class UnicodeData:
         # restrict character range to ISO Latin 1
         self.chars = list(range(256))
 
+
 # hash table tools
 
 # this is a straight-forward reimplementation of Python's built-in
@@ -1165,6 +1171,7 @@ def myhash(s, magic):
             h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
     return h
 
+
 SIZES = [
     (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
     (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
@@ -1172,6 +1179,7 @@ SIZES = [
     (2097152,5), (4194304,3), (8388608,33), (16777216,27)
 ]
 
+
 class Hash:
     def __init__(self, name, data, magic):
         # turn a (key, value) list into a static hash table structure
@@ -1202,7 +1210,7 @@ class Hash:
             if v is None:
                 table[i] = value
                 continue
-            incr = (h ^ (h >> 3)) & mask;
+            incr = (h ^ (h >> 3)) & mask
             if not incr:
                 incr = mask
             while 1:
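The one-character change above drops a stray trailing semicolon: Python parses "incr = ... & mask;" as a statement followed by an empty statement, so the fix is purely cosmetic. The line itself derives the probe step for the class's open-addressing table. A toy version of the collision loop, under the same power-of-two sizing assumption and without the polynomial rehash the real class applies on long probe chains:

    size = 8                     # must be a power of two
    mask = size - 1
    table = [None] * size

    def insert(h, value):
        i = (~h) & mask          # first probe slot
        if table[i] is None:
            table[i] = value
            return
        incr = (h ^ (h >> 3)) & mask
        if not incr:
            incr = mask          # the step must be non-zero
        while table[i] is not None:
            i = (i + incr) & mask
        table[i] = value

    insert(0x2A, "first")
    insert(0x2A, "collides")     # same hash, lands in a different slot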
@@ -1236,6 +1244,7 @@ class Hash:
         file.write("#define %s_size %d\n" % (self.name, self.size))
         file.write("#define %s_poly %d\n" % (self.name, self.poly))
 
+
 # stuff to deal with arrays of unsigned integers
 
 class Array:
@@ -1270,6 +1279,7 @@ class Array:
             file.write(s.rstrip() + "\n")
         file.write("};\n\n")
 
+
 def getsize(data):
     # return smallest possible integer size for the given array
     maxdata = max(data)
@@ -1280,6 +1290,7 @@ def getsize(data):
     else:
         return 4
 
+
 def splitbins(t, trace=0):
     """t, trace=0 -> (t1, t2, shift). Split a table to save space.
 
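getsize, whose final branch appears above, picks the narrowest C integer width that can hold every value in a table; Array.dump uses it so the emitted arrays take 1, 2, or 4 bytes per entry. A sketch of the whole function, reconstructed from context with the usual byte/short thresholds, plus expected results:

    def getsize(data):
        # return smallest possible integer size for the given array
        maxdata = max(data)
        if maxdata < 256:
            return 1             # fits an unsigned char
        elif maxdata < 65536:
            return 2             # fits an unsigned short
        else:
            return 4             # falls back to a 32-bit type

    assert getsize([0, 200]) == 1
    assert getsize([0, 60000]) == 2
    assert getsize([0, 70000]) == 4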
@@ -1299,8 +1310,8 @@ def splitbins(t, trace=0):
         def dump(t1, t2, shift, bytes):
             print("%d+%d bins at shift %d; %d bytes" % (
                 len(t1), len(t2), shift, bytes), file=sys.stderr)
-        print("Size of original table:", len(t)*getsize(t), \
-              "bytes", file=sys.stderr)
+        print("Size of original table:", len(t)*getsize(t), "bytes",
+              file=sys.stderr)
     n = len(t)-1    # last valid index
     maxshift = 0    # the most we can shift n and still have something left
     if n > 0:
@@ -1341,5 +1352,6 @@ def splitbins(t, trace=0):
         assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
     return best
 
+
 if __name__ == "__main__":
     maketables(1)
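The assert in this last hunk states splitbins' whole contract: the flat table t is recoverable from the two smaller tables with one shift and one mask. A self-contained sketch of that two-level scheme, with an invented 3-bit shift (the real function searches for the cheapest one):

    shift = 3                            # invented for the example
    mask = (1 << shift) - 1
    t = [n % 5 for n in range(64)]       # repetitive data, like the real tables

    # chop t into 2**shift-sized chunks, storing each distinct chunk once
    chunks, t1, t2 = {}, [], []
    for start in range(0, len(t), 1 << shift):
        chunk = tuple(t[start:start + (1 << shift)])
        if chunk not in chunks:
            chunks[chunk] = len(t2) >> shift
            t2.extend(chunk)
        t1.append(chunks[chunk])

    for i in range(len(t)):
        assert t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]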