#
# genmap_tchinese.py: Traditional Chinese Codecs Map Generator
#
# Original Author: Hye-Shik Chang
#
# Provenance: added as Tools/unicode/genmap_tchinese.py by commit
# 733e15f1707ddec502a69c8c324c77e02ca11fa9 (Davide Rizzo, 2022-06-11,
# "gh-84508: tool to generate cjk traditional chinese mappings (gh-93272)").
# The same commit added the banner
#     // AUTO-GENERATED FILE FROM genmap_tchinese.py: DO NOT EDIT
# to Modules/cjkcodecs/mappings_hk.h and Modules/cjkcodecs/mappings_tw.h.
# This script writes files named mappings_tw.h and mappings_hk.h into the
# current working directory.
#
import os

from genmap_support import (
    BufferedFiller,
    DecodeMapWriter,
    EncodeMapWriter,
    loadmap,
    open_mapping_file,
    print_autogen,
)


# ranges for (lead byte, follower byte)
BIG5_C1 = (0xa1, 0xfe)
BIG5_C2 = (0x40, 0xfe)
BIG5HKSCS_C1 = (0x87, 0xfe)
BIG5HKSCS_C2 = (0x40, 0xfe)

MAPPINGS_BIG5 = 'https://unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/OTHER/BIG5.TXT'
MAPPINGS_CP950 = 'https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP950.TXT'

HKSCS_VERSION = '2004'
# The files for HKSCS mappings are available under a restrictive license.
# Users of the script need to download the files from the HKSARG CCLI website:
MAPPINGS_HKSCS = f'https://www.ccli.gov.hk/en/archive/terms_hkscs-{HKSCS_VERSION}-big5-iso.html'

# big5 mapping fix: use the cp950 mapping for these characters as the file
# provided by unicode.org doesn't define a mapping. See notes in BIG5.txt.
# Since U+5341, U+5345, U+FF0F, U+FF3C already have a big5 mapping, no
# roundtrip compatibility is guaranteed for those.
# Each entry is (big5 code, unicode code point).
_BIG5_DECODE_FIXES = (
    (0xA15A, 0x2574),
    (0xA1C3, 0xFFE3),
    (0xA1C5, 0x02CD),
    (0xA1FE, 0xFF0F),
    (0xA240, 0xFF3C),
    (0xA2CC, 0x5341),
    (0xA2CE, 0x5345),
)


def bh2s(code):
    """Return the linear index of a two-byte Big5-HKSCS *code*.

    Codes are numbered row by row: the lead byte selects the row (starting
    at BIG5HKSCS_C1[0]) and the follower byte the column (starting at
    BIG5HKSCS_C2[0]), so 0x8740 maps to 0.
    """
    lead, follower = split_bytes(code)
    row_width = BIG5HKSCS_C2[1] - BIG5HKSCS_C2[0] + 1
    return (lead - BIG5HKSCS_C1[0]) * row_width + (follower - BIG5HKSCS_C2[0])


def split_bytes(code):
    """Split 0xABCD into 0xAB, 0xCD.

    The first element is everything above the low byte (``code >> 8``), so
    it is only a single byte when *code* fits in 16 bits.
    """
    return code >> 8, code & 0xff


def parse_hkscs_map(fo):
    """Parse an HKSCS mapping file into ``[(big5_code, code_points), ...]``.

    *fo* is a seekable text file object; it is rewound before reading.
    ``code_points`` is a tuple of ints with one element for a plain mapping
    and several for a sequence.  Lines that do not have exactly four
    whitespace-separated columns, or whose first/last column is not
    hexadecimal, are skipped silently (this is how headers and blank lines
    are ignored).
    """
    fo.seek(0, 0)
    table = []
    for line in fo:
        line = line.split('#', 1)[0].strip()
        # We expect 4 columns in supported HKSCS files:
        # [1999]: unsupported
        # [2001]: unsupported
        # [2004]: Big-5; iso10646-1:1993; iso10646-1:2000; iso10646:2003+amd1
        # [2008]: Big-5; iso10646-1:1993; iso10646-1:2000; iso10646:2003+amd6
        # [2016]: not supported here--uses a json file instead
        #
        # In both supported cases, we only need the first and last column:
        #  * Big-5 is a hex string (always 4 digits)
        #  * iso10646:2003 is either a hex string (4 or 5 digits) or a sequence
        #    of hex strings like: `<XXXX,YYYY>`
        columns = line.split()
        if len(columns) != 4:
            continue
        hkscs_col, uni_col = columns[0], columns[3]
        try:
            hkscs = int(hkscs_col, 16)
            seq = tuple(int(cp, 16) for cp in uni_col.strip('<>').split(','))
        except ValueError:
            continue
        table.append((hkscs, seq))
    return table


def make_hkscs_map(table):
    """Build the HKSCS decode/encode maps from a parsed mapping *table*.

    *table* is the list returned by parse_hkscs_map().

    Returns a 4-tuple ``(decode_map, encode_map_bmp, encode_map_notbmp,
    is_bmp_map)``:

    * decode_map: ``{lead byte: {follower byte: code point & 0xffff}}``,
      built from single-code-point entries only.
    * encode_map_bmp / encode_map_notbmp: ``{high byte: {low byte: value}}``
      keyed by the two bytes of ``code point & 0xffff`` for plane 0 and
      plane 2 respectively.  ``value`` is a Big-5 code, or a tuple of Big-5
      codes when several table entries start with that code point.
    * is_bmp_map: ``{bh2s(code): 1}``.  Despite the name, an entry is added
      for code points in plane 2 (i.e. *not* in the BMP).

    Raises ValueError if a code point lies outside planes 0 and 2.
    """
    decode_map = {}
    encode_map_bmp, encode_map_notbmp = {}, {}
    is_bmp_map = {}
    beginnings = {}
    single_cp_table = []
    # Determine multi-codepoint sequences, and sequence beginnings that encode
    # multiple multibyte (i.e. Big-5) codes.
    for mbcode, cp_seq in table:
        cp, *_ = cp_seq
        if len(cp_seq) == 1:
            single_cp_table.append((mbcode, cp))
        beginnings.setdefault(cp, []).append(mbcode)
    # Decode table only cares about single code points (no sequences) currently
    for mbcode, cp in single_cp_table:
        b1, b2 = split_bytes(mbcode)
        decode_map.setdefault(b1, {})[b2] = cp & 0xffff
    # Encode table needs to mark code points beginning a sequence as tuples.
    for cp, mbcodes in beginnings.items():
        plane = cp >> 16
        if plane == 0:
            encode_map = encode_map_bmp
        elif plane == 2:
            encode_map = encode_map_notbmp
            # NOTE(review): only the first Big-5 code of the code point is
            # flagged; confirm no plane-2 code point has several codes.
            is_bmp_map[bh2s(mbcodes[0])] = 1
        else:
            # A real exception rather than `assert`: asserts are stripped
            # under -O, which would leave `encode_map` unbound or pointing
            # at the map chosen for the previous code point.
            raise ValueError(
                'only plane 0 (BMP) and plane 2 (SIP) allowed, '
                f'got U+{cp:04X}'
            )
        if len(mbcodes) == 1:
            encode_value = mbcodes[0]
        else:
            encode_value = tuple(mbcodes)
        uni_b1, uni_b2 = split_bytes(cp & 0xffff)
        encode_map.setdefault(uni_b1, {})[uni_b2] = encode_value

    return decode_map, encode_map_bmp, encode_map_notbmp, is_bmp_map


def _invert_decode_map(decode_map):
    """Return the encode map ``{uni >> 8: {uni & 0xff: c1 << 8 | c2}}``.

    When several multibyte codes decode to the same code point, the one met
    first while iterating *decode_map* is kept.
    """
    encode_map = {}
    for c1, row in decode_map.items():
        for c2, code in row.items():
            uni_b1, uni_b2 = split_bytes(code)
            encode_map.setdefault(uni_b1, {}).setdefault(uni_b2, c1 << 8 | c2)
    return encode_map


def _drop_entries_shared_with(ext_map, base_map):
    """Delete from *ext_map* (in place) every entry equal to *base_map*'s.

    Both maps are two-level dicts.  Rows of *ext_map* that become empty are
    left in place.
    """
    for c1, row in ext_map.items():
        base_row = base_map.get(c1)
        if base_row is None:
            continue
        # Copy the items: entries are deleted from `row` while scanning it.
        for c2, code in list(row.items()):
            if c2 in base_row and base_row[c2] == code:
                del row[c2]


def load_big5_map():
    """Load BIG5.TXT and return ``(big5decmap, big5encmap)``.

    The decode map is patched with _BIG5_DECODE_FIXES before the encode map
    is derived from it; the encode map is then adjusted so the duplicated
    characters encode to their original Big5 codes.
    """
    mapfile = open_mapping_file('python-mappings/BIG5.txt', MAPPINGS_BIG5)
    with mapfile:
        big5decmap = loadmap(mapfile)
    for bcode, ucode in _BIG5_DECODE_FIXES:
        b1, b2 = split_bytes(bcode)
        big5decmap[b1][b2] = ucode
    # encoding map
    big5encmap = _invert_decode_map(big5decmap)
    # fix unicode->big5 priority for the above-mentioned duplicate characters
    big5encmap[0xFF][0x0F] = 0xA241
    big5encmap[0xFF][0x3C] = 0xA242
    big5encmap[0x53][0x41] = 0xA451
    big5encmap[0x53][0x45] = 0xA4CA

    return big5decmap, big5encmap


def load_cp950_map():
    """Load CP950.TXT and return ``(cp950decmap, cp950encmap)``."""
    mapfile = open_mapping_file('python-mappings/CP950.TXT', MAPPINGS_CP950)
    with mapfile:
        cp950decmap = loadmap(mapfile)
    cp950encmap = _invert_decode_map(cp950decmap)
    # fix unicode->big5 duplicated mapping priority
    cp950encmap[0x53][0x41] = 0xA451
    cp950encmap[0x53][0x45] = 0xA4CA
    return cp950decmap, cp950encmap


def main_tw():
    """Generate mappings_tw.h (Big5 and CP950 extension tables)."""
    big5decmap, big5encmap = load_big5_map()
    cp950decmap, cp950encmap = load_cp950_map()

    # CP950 extends Big5, and the codec can use the Big5 lookup tables
    # for most entries. So the CP950 tables should only include entries
    # that are not in Big5:
    _drop_entries_shared_with(cp950encmap, big5encmap)
    _drop_entries_shared_with(cp950decmap, big5decmap)

    with open('mappings_tw.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))
        write_big5_maps(fp, 'BIG5', 'big5', big5decmap, big5encmap)
        write_big5_maps(fp, 'CP950', 'cp950ext', cp950decmap, cp950encmap)


def write_big5_maps(fp, display_name, table_name, decode_map, encode_map):
    """Write one decode table and one encode table named *table_name* to *fp*.

    *display_name* is only used in the progress messages printed to stdout.
    """
    print(f'Generating {display_name} decode map...')
    writer = DecodeMapWriter(fp, table_name, decode_map)
    writer.update_decode_map(BIG5_C1, BIG5_C2)
    writer.generate()
    print(f'Generating {display_name} encode map...')
    writer = EncodeMapWriter(fp, table_name, encode_map)
    writer.generate()


class HintsWriter:
    """Write ``<prefix>_phint_<start>`` bit arrays built from *isbmpmap*.

    *isbmpmap* maps bh2s() indexes to 0/1 flags (see make_hkscs_map()).
    """

    filler_class = BufferedFiller

    def __init__(self, fp, prefix, isbmpmap):
        self.fp = fp
        self.prefix = prefix
        self.isbmpmap = isbmpmap
        self.filler = self.filler_class()

    def fillhints(self, hintfrom, hintto):
        """Emit the flags for indexes *hintfrom*..*hintto* as a C array.

        Eight consecutive indexes are packed per byte, lowest index in the
        least significant bit; indexes missing from the map count as 0.
        The last byte may cover indexes past *hintto*.
        """
        name = f'{self.prefix}_phint_{hintfrom}'
        self.fp.write(f'static const unsigned char {name}[] = {{\n')
        for msbcode in range(hintfrom, hintto + 1, 8):
            v = 0
            for c in range(msbcode, msbcode + 8):
                v |= self.isbmpmap.get(c, 0) << (c - msbcode)
            self.filler.write(f'{v},')
        self.filler.printout(self.fp)
        self.fp.write('};\n\n')


def main_hkscs():
    """Generate mappings_hk.h (Big5-HKSCS tables and plane hints)."""
    filename = f'python-mappings/hkscs-{HKSCS_VERSION}-big5-iso.txt'
    with open_mapping_file(filename, MAPPINGS_HKSCS) as f:
        table = parse_hkscs_map(f)
    hkscsdecmap, hkscsencmap_bmp, hkscsencmap_nonbmp, isbmpmap = (
        make_hkscs_map(table)
    )
    with open('mappings_hk.h', 'w') as fp:
        print('Generating BIG5HKSCS decode map...')
        print_autogen(fp, os.path.basename(__file__))
        writer = DecodeMapWriter(fp, 'big5hkscs', hkscsdecmap)
        writer.update_decode_map(BIG5HKSCS_C1, BIG5HKSCS_C2)
        writer.generate()

        print('Generating BIG5HKSCS decode map Unicode plane hints...')
        writer = HintsWriter(fp, 'big5hkscs', isbmpmap)
        writer.fillhints(bh2s(0x8740), bh2s(0xa0fe))
        writer.fillhints(bh2s(0xc6a1), bh2s(0xc8fe))
        writer.fillhints(bh2s(0xf9d6), bh2s(0xfefe))

        print('Generating BIG5HKSCS encode map (BMP)...')
        writer = EncodeMapWriter(fp, 'big5hkscs_bmp', hkscsencmap_bmp)
        writer.generate()

        print('Generating BIG5HKSCS encode map (non-BMP)...')
        writer = EncodeMapWriter(fp, 'big5hkscs_nonbmp', hkscsencmap_nonbmp)
        writer.generate()


if __name__ == '__main__':
    main_tw()
    main_hkscs()