update to Unicode 6.1

2012-02-20 22:24:29 -05:00 · 2012-02-20 22:24:29 -05:00 · 71f660e00f
parent 16fa2a1097
commit 71f660e00f
7 changed files with 24819 additions and 23329 deletions
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -21,7 +21,7 @@ errors = 'surrogatepass'
 class UnicodeMethodsTest(unittest.TestCase):

    # update this, if the database changes
-    expectedchecksum = 'df0b3ca6785a070b21f837b227dbdbdff3c2e921'
+    expectedchecksum = 'bf7a78f1a532421b5033600102e23a92044dbba9'

    def test_method_checksum(self):
        h = hashlib.sha1()
@@ -80,7 +80,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
 class UnicodeFunctionsTest(UnicodeDatabaseTest):

    # update this, if the database changes
-    expectedchecksum = 'c23dfc0b5eaf3ca2aad32d733de96bb182ccda50'
+    expectedchecksum = '17fe2f12b788e4fff5479b469c4404bb6ecf841f'
    def test_function_checksum(self):
        data = []
        h = hashlib.sha1()
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------

+- Upgrade Unicode data to Unicode 6.1.
+
 - Issue #14040: Remove rarely used file name suffixes for C extensions
  (under POSIX mainly).

--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -921,7 +921,7 @@ is_unified_ideograph(Py_UCS4 code)
 {
    return
        (0x3400 <= code && code <= 0x4DB5)   || /* CJK Ideograph Extension A */
-        (0x4E00 <= code && code <= 0x9FCB)   || /* CJK Ideograph */
+        (0x4E00 <= code && code <= 0x9FCC)   || /* CJK Ideograph */
        (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
        (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
        (0x2B740 <= code && code <= 0x2B81D);   /* CJK Ideograph Extension D */
--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
--- a/Modules/unicodename_db.h
+++ b/Modules/unicodename_db.h
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -38,7 +38,7 @@ SCRIPT = sys.argv[0]
 VERSION = "3.2"

 # The Unicode Database
-UNIDATA_VERSION = "6.0.0"
+UNIDATA_VERSION = "6.1.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@@ -58,7 +58,7 @@ PUA_16 = range(0x100000, 0x10FFFE)

 # we use this ranges of PUA_15 to store name aliases and named sequences
 NAME_ALIASES_START = 0xF0000
-NAMED_SEQUENCES_START = 0xF0100
+NAMED_SEQUENCES_START = 0xF0200

 old_versions = ["3.2.0"]

@@ -95,7 +95,7 @@ EXTENDED_CASE_MASK = 0x4000
 # these ranges need to match unicodedata.c:is_unified_ideograph
 cjk_ranges = [
    ('3400', '4DB5'),
-    ('4E00', '9FCB'),
+    ('4E00', '9FCC'),
    ('20000', '2A6D6'),
    ('2A700', '2B734'),
    ('2B740', '2B81D')
@@ -958,7 +958,7 @@ class UnicodeData:
                    s = s.strip()
                    if not s or s.startswith('#'):
                        continue
-                    char, name = s.split(';')
+                    char, name, abbrev = s.split(';')
                    char = int(char, 16)
                    self.aliases.append((name, char))
                    # also store the name in the PUA 1
@@ -971,6 +971,7 @@ class UnicodeData:
            # in order to take advantage of the compression and lookup
            # algorithms used for the other characters.

+            assert pua_index < NAMED_SEQUENCES_START
            pua_index = NAMED_SEQUENCES_START
            with open_data(NAMED_SEQUENCES, version) as file:
                for s in file: