# cpython/Tools/unicode/genmap_schinese.py

#
# genmap_schinese.py: Simplified Chinese Codecs Map Generator
#
# Original Author:  Hye-Shik Chang <perky@FreeBSD.org>
# Modified Author:  Dong-hee Na <donghee.na92@gmail.com>
#
import os
import re

from genmap_support import *


# (low, high) byte bounds handed pairwise to DecodeMapWriter.update_decode_map()
# in main().  NOTE(review): C1/C2 presumably denote the first and second byte
# of a two-byte sequence, with both bounds inclusive -- confirm against
# genmap_support.

# GB2312 decode map.
GB2312_C1 = (0x21, 0x7e)
GB2312_C2 = (0x21, 0x7e)

# GBK extension decode map ("gbkext"), registered as two regions.
GBKL1_C1 = (0x81, 0xa8)
GBKL1_C2 = (0x40, 0xfe)
GBKL2_C1 = (0xa9, 0xfe)
GBKL2_C2 = (0x40, 0xa0)

# GB18030 extension decode map ("gb18030ext"), registered as five regions.
GB18030EXTP1_C1 = (0xa1, 0xa9)
GB18030EXTP1_C2 = (0x40, 0xfe)
GB18030EXTP2_C1 = (0xaa, 0xaf)
GB18030EXTP2_C2 = (0xa1, 0xfe)
GB18030EXTP3_C1 = (0xd7, 0xd7)
GB18030EXTP3_C2 = (0xfa, 0xfe)
GB18030EXTP4_C1 = (0xf8, 0xfd)
GB18030EXTP4_C2 = (0xa1, 0xfe)
GB18030EXTP5_C1 = (0xfe, 0xfe)
GB18030EXTP5_C2 = (0x50, 0xfe)

# Source URLs passed to open_mapping_file() next to each local mapping path.
MAPPINGS_GB2312 = 'http://people.freebsd.org/~perky/i18n/GB2312.TXT'
MAPPINGS_CP936 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT'
MAPPINGS_GB18030 = 'http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/gb-18030-2000.xml'

# Matches one <a u="XXXX" b="HH HH ..."/> element, capturing the four
# upper-case hex digits of the Unicode code point and the space-separated
# upper-case hex bytes of the native sequence.
re_gb18030ass = re.compile('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')


def parse_gb18030map(fo):
    """Parse GB18030 XML mapping text into a two-byte decode map.

    Args:
        fo: Readable text object; its entire content is consumed with
            ``read()`` and scanned for ``<a u=".." b=".."/>`` elements.

    Returns:
        A ``(decmap, unilinear)`` tuple.  ``decmap[b1][b2]`` is the Unicode
        code point of every entry whose native sequence is exactly two
        bytes.  ``unilinear`` is the ascending list of code points in
        0x0000-0xFFFF, surrogates (0xD800-0xDFFF) excluded, that no entry
        maps to a sequence of two bytes or fewer.

    Raises:
        KeyError: If an entry of two bytes or fewer names a surrogate code
            point, or a code point already consumed by an earlier such
            entry.
    """
    # Every BMP code point outside the surrogate area starts out as
    # "not covered by a short sequence".
    unmapped = set(range(0xd800)) | set(range(0xe000, 0x10000))
    decmap = {}
    for uni_hex, native_hex in re_gb18030ass.findall(fo.read()):
        # The regex guarantees plain hex digits, so int(..., 16) is an exact
        # and safe replacement for evaluating '0x...' source text.
        uni = int(uni_hex, 16)
        native = [int(byte_hex, 16) for byte_hex in native_hex.split()]
        if len(native) <= 2:
            unmapped.remove(uni)
        if len(native) == 2: # we can decode algorithmically for 1 or 4 bytes
            decmap.setdefault(native[0], {})[native[1]] = uni
    return decmap, sorted(unmapped)

def _add_encode_entries(encmap, decmap):
    """Fold ``decmap[c1][c2] -> code`` into ``encmap[code >> 8][code & 0xff]``.

    The stored value is ``c1 << 8 | c2``.  An entry already present for the
    same code point is overwritten.
    """
    for c1, row in decmap.items():
        for c2, code in row.items():
            encmap.setdefault(code >> 8, {})[code & 0xff] = c1 << 8 | c2


def _linear_ranges(codepoints):
    """Group ascending code points into ``[first, last, base]`` runs.

    ``base`` is the index within *codepoints* of the run's first member.
    """
    ranges = []
    for linear, uni in enumerate(codepoints):
        if ranges and uni == ranges[-1][1] + 1:
            ranges[-1][1] = uni
        else:
            ranges.append([uni, uni, linear])
    return ranges


def main():
    """Generate ``mappings_cn.h`` in the current working directory.

    Loads the GB2312, CP936 and GB18030 mapping files, reduces them to
    mutually exclusive decode maps, derives the matching encode maps and
    writes every table, followed by the GB18030 <-> Unicode BMP linear range
    table, to the header file.

    Raises:
        KeyError: If a CP936 entry is missing from the GB18030 two-byte map,
            or a GB2312 entry (with both bytes OR-ed with 0x80) is missing
            from the CP936 map.
    """
    print("Loading Mapping File...")
    # NOTE(review): the objects returned by open_mapping_file() are never
    # closed here; if they are plain file objects, wrapping them in `with`
    # would be cleaner -- confirm against genmap_support.
    gb2312map = open_mapping_file('python-mappings/GB2312.TXT', MAPPINGS_GB2312)
    cp936map = open_mapping_file('python-mappings/CP936.TXT', MAPPINGS_CP936)
    gb18030map = open_mapping_file('python-mappings/gb-18030-2000.xml', MAPPINGS_GB18030)

    gb18030decmap, gb18030unilinear = parse_gb18030map(gb18030map)
    gbkdecmap = loadmap(cp936map)
    gb2312decmap = loadmap(gb2312map)

    # Drop from the GB18030 map everything CP936 already provides, so only
    # the GB18030-specific extension entries remain.
    for c1, m in gbkdecmap.items():
        for c2 in m:
            del gb18030decmap[c1][c2]
            if not gb18030decmap[c1]:
                del gb18030decmap[c1]

    # Drop from the GBK map every entry that GB2312 (with the high bit set on
    # both bytes) maps to the same code point, so only the GBK-specific
    # entries remain.
    for c1, m in gb2312decmap.items():
        for c2, code in m.items():
            gbkc1, gbkc2 = c1 | 0x80, c2 | 0x80
            if gbkdecmap[gbkc1][gbkc2] == code:
                del gbkdecmap[gbkc1][gbkc2]
                if not gbkdecmap[gbkc1]:
                    del gbkdecmap[gbkc1]

    # Order matters: GB2312 entries (MSB unset) are added after the GBK ones
    # (MSB set) and therefore win when both map the same code point.
    gb2312_gbkencmap, gb18030encmap = {}, {}
    _add_encode_entries(gb2312_gbkencmap, gbkdecmap)     # MSB set
    _add_encode_entries(gb2312_gbkencmap, gb2312decmap)  # MSB unset
    _add_encode_entries(gb18030encmap, gb18030decmap)

    # Explicit list instead of building the constant names for eval().
    gb18030ext_regions = (
        (GB18030EXTP1_C1, GB18030EXTP1_C2),
        (GB18030EXTP2_C1, GB18030EXTP2_C2),
        (GB18030EXTP3_C1, GB18030EXTP3_C2),
        (GB18030EXTP4_C1, GB18030EXTP4_C2),
        (GB18030EXTP5_C1, GB18030EXTP5_C2),
    )

    with open('mappings_cn.h', 'w') as fp:
        print_autogen(fp, os.path.basename(__file__))

        print("Generating GB2312 decode map...")
        writer = DecodeMapWriter(fp, "gb2312", gb2312decmap)
        writer.update_decode_map(GB2312_C1, GB2312_C2)
        writer.generate()

        print("Generating GBK decode map...")
        writer = DecodeMapWriter(fp, "gbkext", gbkdecmap)
        writer.update_decode_map(GBKL1_C1, GBKL1_C2)
        writer.update_decode_map(GBKL2_C1, GBKL2_C2)
        writer.generate()

        print("Generating GB2312 && GBK encode map...")
        writer = EncodeMapWriter(fp, "gbcommon", gb2312_gbkencmap)
        writer.generate()

        print("Generating GB18030 extension decode map...")
        writer = DecodeMapWriter(fp, "gb18030ext", gb18030decmap)
        for c1_range, c2_range in gb18030ext_regions:
            writer.update_decode_map(c1_range, c2_range)

        writer.generate()

        print("Generating GB18030 extension encode map...")
        writer = EncodeMapWriter(fp, "gb18030ext", gb18030encmap)
        writer.generate()

        print("Generating GB18030 Unicode BMP Mapping Ranges...")
        fp.write("""
static const struct _gb18030_to_unibmp_ranges {
    Py_UCS4   first, last;
    DBCHAR       base;
} gb18030_to_unibmp_ranges[] = {
""")

        filler = BufferedFiller()
        for first, last, base in _linear_ranges(gb18030unilinear):
            filler.write('{', str(first), ',', str(last), ',', str(base), '},')

        # Terminating entry: its base is one past the last linear index, which
        # is simply the number of linearly mapped code points.
        filler.write('{', '0,', '0,', str(len(gb18030unilinear)), '}', '};')
        filler.printout(fp)

    print("Done!")


if __name__ == '__main__':
    main()
bpo-40328: Add tool for generating cjk mapping headers (GH-19602) 2020-04-29 14:34:24 -03:00			`#`
			`# genmap_schinese.py: Simplified Chinese Codecs Map Generator`
			`#`
			`# Original Author: Hye-Shik Chang <perky@FreeBSD.org>`
			`# Modified Author: Dong-hee Na <donghee.na92@gmail.com>`
			`#`
			`import os`
			`import re`

			`from genmap_support import *`


			`GB2312_C1 = (0x21, 0x7e)`
			`GB2312_C2 = (0x21, 0x7e)`
			`GBKL1_C1 = (0x81, 0xa8)`
			`GBKL1_C2 = (0x40, 0xfe)`
			`GBKL2_C1 = (0xa9, 0xfe)`
			`GBKL2_C2 = (0x40, 0xa0)`
			`GB18030EXTP1_C1 = (0xa1, 0xa9)`
			`GB18030EXTP1_C2 = (0x40, 0xfe)`
			`GB18030EXTP2_C1 = (0xaa, 0xaf)`
			`GB18030EXTP2_C2 = (0xa1, 0xfe)`
			`GB18030EXTP3_C1 = (0xd7, 0xd7)`
			`GB18030EXTP3_C2 = (0xfa, 0xfe)`
			`GB18030EXTP4_C1 = (0xf8, 0xfd)`
			`GB18030EXTP4_C2 = (0xa1, 0xfe)`
			`GB18030EXTP5_C1 = (0xfe, 0xfe)`
			`GB18030EXTP5_C2 = (0x50, 0xfe)`

			`MAPPINGS_GB2312 = 'http://people.freebsd.org/~perky/i18n/GB2312.TXT'`
			`MAPPINGS_CP936 = 'http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP936.TXT'`
			`MAPPINGS_GB18030 = 'http://oss.software.ibm.com/cvs/icu/~checkout~/charset/data/xml/gb-18030-2000.xml'`

			`re_gb18030ass = re.compile('<a u="([A-F0-9]{4})" b="([0-9A-F ]+)"/>')`


			`def parse_gb18030map(fo):`
			`m, gbuni = {}, {}`
			`for i in range(65536):`
			`if i < 0xd800 or i > 0xdfff: # exclude unicode surrogate area`
			`gbuni[i] = None`
			`for uni, native in re_gb18030ass.findall(fo.read()):`
			`uni = eval('0x'+uni)`
			`native = [eval('0x'+u) for u in native.split()]`
			`if len(native) <= 2:`
			`del gbuni[uni]`
			`if len(native) == 2: # we can decode algorithmically for 1 or 4 bytes`
			`m.setdefault(native[0], {})`
			`m[native[0]][native[1]] = uni`
			`gbuni = [k for k in gbuni.keys()]`
			`gbuni.sort()`
			`return m, gbuni`

			`def main():`
			`print("Loading Mapping File...")`
			`gb2312map = open_mapping_file('python-mappings/GB2312.TXT', MAPPINGS_GB2312)`
			`cp936map = open_mapping_file('python-mappings/CP936.TXT', MAPPINGS_CP936)`
			`gb18030map = open_mapping_file('python-mappings/gb-18030-2000.xml', MAPPINGS_GB18030)`

			`gb18030decmap, gb18030unilinear = parse_gb18030map(gb18030map)`
			`gbkdecmap = loadmap(cp936map)`
			`gb2312decmap = loadmap(gb2312map)`
			`difmap = {}`
			`for c1, m in gbkdecmap.items():`
			`for c2, code in m.items():`
			`del gb18030decmap[c1][c2]`
			`if not gb18030decmap[c1]:`
			`del gb18030decmap[c1]`
			`for c1, m in gb2312decmap.items():`
			`for c2, code in m.items():`
			`gbkc1, gbkc2 = c1 \| 0x80, c2 \| 0x80`
			`if gbkdecmap[gbkc1][gbkc2] == code:`
			`del gbkdecmap[gbkc1][gbkc2]`
			`if not gbkdecmap[gbkc1]:`
			`del gbkdecmap[gbkc1]`

			`gb2312_gbkencmap, gb18030encmap = {}, {}`
			`for c1, m in gbkdecmap.items():`
			`for c2, code in m.items():`
			`gb2312_gbkencmap.setdefault(code >> 8, {})`
			`gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 \| c2 # MSB set`
			`for c1, m in gb2312decmap.items():`
			`for c2, code in m.items():`
			`gb2312_gbkencmap.setdefault(code >> 8, {})`
			`gb2312_gbkencmap[code >> 8][code & 0xff] = c1 << 8 \| c2 # MSB unset`
			`for c1, m in gb18030decmap.items():`
			`for c2, code in m.items():`
			`gb18030encmap.setdefault(code >> 8, {})`
			`gb18030encmap[code >> 8][code & 0xff] = c1 << 8 \| c2`

			`with open('mappings_cn.h', 'w') as fp:`
			`print_autogen(fp, os.path.basename(__file__))`

			`print("Generating GB2312 decode map...")`
			`writer = DecodeMapWriter(fp, "gb2312", gb2312decmap)`
			`writer.update_decode_map(GB2312_C1, GB2312_C2)`
			`writer.generate()`

			`print("Generating GBK decode map...")`
			`writer = DecodeMapWriter(fp, "gbkext", gbkdecmap)`
			`writer.update_decode_map(GBKL1_C1, GBKL1_C2)`
			`writer.update_decode_map(GBKL2_C1, GBKL2_C2)`
			`writer.generate()`

			`print("Generating GB2312 && GBK encode map...")`
			`writer = EncodeMapWriter(fp, "gbcommon", gb2312_gbkencmap)`
			`writer.generate()`

			`print("Generating GB18030 extension decode map...")`
			`writer = DecodeMapWriter(fp, "gb18030ext", gb18030decmap)`
			`for i in range(1, 6):`
			`writer.update_decode_map(eval("GB18030EXTP%d_C1" % i), eval("GB18030EXTP%d_C2" % i))`

			`writer.generate()`

			`print("Generating GB18030 extension encode map...")`
			`writer = EncodeMapWriter(fp, "gb18030ext", gb18030encmap)`
			`writer.generate()`

			`print("Generating GB18030 Unicode BMP Mapping Ranges...")`
			`ranges = [[-1, -1, -1]]`
			`gblinnum = 0`
			`fp.write("""`
			`static const struct _gb18030_to_unibmp_ranges {`
			`Py_UCS4 first, last;`
			`DBCHAR base;`
			`} gb18030_to_unibmp_ranges[] = {`
			`""")`

			`for uni in gb18030unilinear:`
			`if uni == ranges[-1][1] + 1:`
			`ranges[-1][1] = uni`
			`else:`
			`ranges.append([uni, uni, gblinnum])`
			`gblinnum += 1`

			`filler = BufferedFiller()`
			`for first, last, base in ranges[1:]:`
			`filler.write('{', str(first), ',', str(last), ',', str(base), '},')`

			`filler.write('{', '0,', '0,', str(`
			`ranges[-1][2] + ranges[-1][1] - ranges[-1][0] + 1), '}', '};')`
			`filler.printout(fp)`

			`print("Done!")`


			`if __name__ == '__main__':`
			`main()`