Upgrade to Unicode 6.0.0.

makeunicodedata.py: download all data files from unicode.org, and switch to extracting Unihan data from the zip file. Read linebreakprops and derivednormalizationprops even for old versions, even though they are not used in delta records. test_unicode.py: U+11000 is now assigned, so use U+14000 instead.
2010-10-11 22:42:28 +00:00 · 2010-10-11 22:42:28 +00:00 · baecd7243a
parent e8930228c7
commit baecd7243a
6 changed files with 21711 additions and 19847 deletions
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -1349,7 +1349,7 @@ class UnicodeTest(string_tests.CommonTest,
    def test_printable_repr(self):
        self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
-        self.assertEqual(repr('\U00011000'), "'\\U00011000'")     # nonprintable
+        self.assertEqual(repr('\U00014000'), "'\\U00014000'")     # nonprintable
    def test_expandtabs_overflows_gracefully(self):
        # This test only affects 32-bit platforms because expandtabs can only take
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@ -21,7 +21,7 @@ errors = 'surrogatepass'
 class UnicodeMethodsTest(unittest.TestCase):
    # update this, if the database changes
-    expectedchecksum = '4504dffd035baea02c5b9de82bebc3d65e0e0baf'
+    expectedchecksum = '21b90f1aed00081b81ca7942b22196af090015a0'
    def test_method_checksum(self):
        h = hashlib.sha1()
@ -80,7 +80,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
 class UnicodeFunctionsTest(UnicodeDatabaseTest):
    # update this, if the database changes
-    expectedchecksum = 'e89a6380093a00a7685ac7b92e7367d737fcb79b'
+    expectedchecksum = 'c23dfc0b5eaf3ca2aad32d733de96bb182ccda50'
    def test_function_checksum(self):
        data = []
        h = hashlib.sha1()
--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
--- a/Modules/unicodename_db.h
+++ b/Modules/unicodename_db.h
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -25,17 +25,17 @@
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
-import sys
+import sys, os, zipfile
 SCRIPT = sys.argv[0]
 VERSION = "3.2"
 # The Unicode Database
-UNIDATA_VERSION = "5.2.0"
+UNIDATA_VERSION = "6.0.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
-UNIHAN = "Unihan%s.txt"
+UNIHAN = "Unihan%s.zip"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
@ -75,23 +75,13 @@ def maketables(trace=0):
    print("--- Reading", UNICODE_DATA % "", "...")
    version = ""
-    unicode = UnicodeData(UNICODE_DATA % version,
+    unicode = UnicodeData(UNIDATA_VERSION)
                          COMPOSITION_EXCLUSIONS % version,
                          EASTASIAN_WIDTH % version,
                          UNIHAN % version,
                          DERIVED_CORE_PROPERTIES % version,
                          DERIVEDNORMALIZATION_PROPS % version,
                          LINE_BREAK % version)
    print(len(list(filter(None, unicode.table))), "characters")
    for version in old_versions:
        print("--- Reading", UNICODE_DATA % ("-"+version), "...")
-        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
+        old_unicode = UnicodeData(version)
                                  COMPOSITION_EXCLUSIONS % ("-"+version),
                                  EASTASIAN_WIDTH % ("-"+version),
                                  UNIHAN % ("-"+version),
                                  DERIVED_CORE_PROPERTIES % ("-"+version))
        print(len(list(filter(None, old_unicode.table))), "characters")
        merge_old_version(version, unicode, old_unicode)
@ -771,6 +761,10 @@ def merge_old_version(version, new, old):
                    elif k == 16:
                        # derived property changes; not yet
                        pass
                    elif k == 17:
                        # normalization quickchecks are not performed
                        # for older versions
                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
@ -779,6 +773,21 @@ def merge_old_version(version, new, old):
                                     numeric_changes)),
                        normalization_changes))
 def open_data(template, version):
    """Open the data file named by *template* for the given *version*.

    The local file name is *template* with ``'-' + version`` substituted
    for its ``%s`` placeholder.  If no file of that name exists, it is
    first fetched from www.unicode.org and stored under that name.

    Files whose name ends in ``.txt`` are returned opened as UTF-8 text;
    anything else is returned opened in binary mode.
    """
    cached_name = template % ('-' + version,)
    if not os.path.exists(cached_name):
        import urllib.request
        if version != '3.2.0':
            url_pattern = 'http://www.unicode.org/Public/%s/ucd/' + template
            source_url = url_pattern % (version, '')
        else:
            # irregular url structure
            source_url = 'http://www.unicode.org/Public/3.2-Update/' + cached_name
        urllib.request.urlretrieve(source_url, filename=cached_name)
    if not cached_name.endswith('.txt'):
        # Unihan.zip
        return open(cached_name, 'rb')
    return open(cached_name, encoding='utf-8')
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
@ -793,11 +802,11 @@ class UnicodeData:
    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
    #  derived-props] (17)
-    def __init__(self, filename, exclusions, eastasianwidth, unihan,
+    def __init__(self, version,
-                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
+                 linebreakprops=False,
                 expand=1):
        self.changed = []
-        file = open(filename)
+        file = open_data(UNICODE_DATA, version)
        table = [None] * 0x110000
        while 1:
            s = file.readline()
@ -825,11 +834,11 @@ class UnicodeData:
                    table[i] = f2
        # public attributes
-        self.filename = filename
+        self.filename = UNICODE_DATA % ''
        self.table = table
        self.chars = list(range(0x110000)) # unicode 3.2
-        file = open(exclusions)
+        file = open_data(COMPOSITION_EXCLUSIONS, version)
        self.exclusions = {}
        for s in file:
            s = s.strip()
@ -841,7 +850,7 @@ class UnicodeData:
            self.exclusions[char] = 1
        widths = [None] * 0x110000
-        for s in open(eastasianwidth):
+        for s in open_data(EASTASIAN_WIDTH, version):
            s = s.strip()
            if not s:
                continue
@ -862,7 +871,7 @@ class UnicodeData:
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(set())
-        for s in open(derivedprops):
+        for s in open_data(DERIVED_CORE_PROPERTIES, version):
            s = s.split('#', 1)[0].strip()
            if not s:
                continue
@ -881,43 +890,53 @@ class UnicodeData:
                    # apply to unassigned code points; ignore them
                    table[char][-1].add(p)
-        if linebreakprops:
+        for s in open_data(LINE_BREAK, version):
-            for s in open(linebreakprops):
+            s = s.partition('#')[0]
-                s = s.partition('#')[0]
+            s = [i.strip() for i in s.split(';')]
-                s = [i.strip() for i in s.split(';')]
+            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
-                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                continue
-                    continue
+            if '..' not in s[0]:
-                if '..' not in s[0]:
+                first = last = int(s[0], 16)
-                    first = last = int(s[0], 16)
+            else:
-                else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
-                    first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
-                for char in range(first, last+1):
+                table[char][-1].add('Line_Break')
                    table[char][-1].add('Line_Break')
-        if derivednormalizationprops:
+        # We only want the quickcheck properties
-            quickchecks = [0] * 0x110000 # default is Yes
+        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
-            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+        # Yes is the default, hence only N and M occur
-            for s in open(derivednormalizationprops):
+        # In 3.2.0, the format was different (NF?_NO)
-                if '#' in s:
+        # The parsing will incorrectly determine these as
-                    s = s[:s.index('#')]
+        # "yes", however, unicodedata.c will not perform quickchecks
-                s = [i.strip() for i in s.split(';')]
+        # for older versions, and no delta records will be created.
-                if len(s) < 2 or s[1] not in qc_order:
+        quickchecks = [0] * 0x110000
-                    continue
+        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
-                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+        for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
-                quickcheck_shift = qc_order.index(s[1])*2
+            if '#' in s:
-                quickcheck <<= quickcheck_shift
+                s = s[:s.index('#')]
-                if '..' not in s[0]:
+            s = [i.strip() for i in s.split(';')]
-                    first = last = int(s[0], 16)
+            if len(s) < 2 or s[1] not in qc_order:
-                else:
+                continue
-                    first, last = [int(c, 16) for c in s[0].split('..')]
+            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
-                for char in range(first, last+1):
+            quickcheck_shift = qc_order.index(s[1])*2
-                    assert not (quickchecks[char]>>quickcheck_shift)&3
+            quickcheck <<= quickcheck_shift
-                    quickchecks[char] |= quickcheck
+            if '..' not in s[0]:
-            for i in range(0, 0x110000):
+                first = last = int(s[0], 16)
-                if table[i] is not None:
+            else:
-                    table[i].append(quickchecks[i])
+                first, last = [int(c, 16) for c in s[0].split('..')]
            for char in range(first, last+1):
                assert not (quickchecks[char]>>quickcheck_shift)&3
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
                table[i].append(quickchecks[i])
-        for line in open(unihan, encoding='utf-8'):
+        zip = zipfile.ZipFile(open_data(UNIHAN, version))
        if version == '3.2.0':
            data = zip.open('Unihan-3.2.0.txt').read()
        else:
            data = zip.open('Unihan_NumericValues.txt').read()
        for line in data.decode("utf-8").splitlines():
            if not line.startswith('U+'):
                continue
            code, tag, value = line.split(None, 3)[:3]