Merged revisions 66362 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk ........ r66362 | martin.v.loewis | 2008-09-10 15:38:12 +0200 (Mi, 10 Sep 2008) | 3 lines Issue #3811: The Unicode database was updated to 5.1. Reviewed by Fredrik Lundh and Marc-Andre Lemburg. ........
2008-09-10 14:08:48 +00:00 · 2008-09-10 14:08:48 +00:00 · 93cbca33f2
parent 1009d39187
commit 93cbca33f2
8 changed files with 19135 additions and 15886 deletions
--- a/Doc/library/unicodedata.rst
+++ b/Doc/library/unicodedata.rst
@ -16,11 +16,11 @@

 This module provides access to the Unicode Character Database which defines
 character properties for all Unicode characters. The data in this database is
-based on the :file:`UnicodeData.txt` file version 4.1.0 which is publicly
+based on the :file:`UnicodeData.txt` file version 5.1.0 which is publicly
 available from ftp://ftp.unicode.org/.

 The module uses the same names and symbols as defined by the UnicodeData File
-Format 4.1.0 (see http://www.unicode.org/Public/4.1.0/ucd/UCD.html).  It defines
+Format 5.1.0 (see http://www.unicode.org/Public/5.1.0/ucd/UCD.html).  It defines
 the following functions:


--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@ -16,7 +16,7 @@ encoding = 'utf-8'
 class UnicodeMethodsTest(unittest.TestCase):

    # update this, if the database changes
-    expectedchecksum = 'c198ed264497f108434b3f576d4107237221cc8a'
+    expectedchecksum = 'aef99984a58c8e1e5363a3175f2ff9608599a93e'

    def test_method_checksum(self):
        h = hashlib.sha1()
@ -75,7 +75,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
 class UnicodeFunctionsTest(UnicodeDatabaseTest):

    # update this, if the database changes
-    expectedchecksum = '4e389f97e9f88b8b7ab743121fd643089116f9f2'
+    expectedchecksum = '3136d5afd787dc2bcb1bdcac95e385349fbebbca'

    def test_function_checksum(self):
        data = []
@ -226,6 +226,16 @@ class UnicodeMiscTest(UnicodeDatabaseTest):
    def test_bug_1704793(self):
        self.assertEquals(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')

+    def test_ucd_510(self):
+        import unicodedata
+        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
+        self.assert_(unicodedata.mirrored("\u0f3a"))
+        self.assert_(not unicodedata.ucd_3_2_0.mirrored("\u0f3a"))
+        # Also, we now have two ways of representing
+        # the upper-case mapping: as delta, or as absolute value
+        self.assert_("a".upper()=='A')
+        self.assert_("\u1d79".upper()=='\ua77d')
+
 def test_main():
    test.support.run_unittest(
        UnicodeMiscTest,
--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@ -1,8 +1,8 @@
 /* ------------------------------------------------------------------------

-   unicodedata -- Provides access to the Unicode 4.1 data base.
+   unicodedata -- Provides access to the Unicode 5.1 data base.

-   Data was extracted from the Unicode 4.1 UnicodeData.txt file.
+   Data was extracted from the Unicode 5.1 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
@ -34,6 +34,7 @@ typedef struct change_record {
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
+    const unsigned char mirrored_changed;
    const int numeric_changed;
 } change_record;

@ -355,6 +356,8 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
        const change_record *old = get_old_record(self, c);
        if (old->category_changed == 0)
            index = 0; /* unassigned */
+        else if (old->mirrored_changed != 0xFF)
+            index = old->mirrored_changed;
    }
    return PyLong_FromLong(index);
 }
@ -1179,11 +1182,11 @@ PyDoc_STRVAR(unicodedata_docstring,
 "This module provides access to the Unicode Character Database which\n\
 defines character properties for all Unicode characters. The data in\n\
 this database is based on the UnicodeData.txt file version\n\
-4.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
+5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
 \n\
 The module uses the same names and symbols as defined by the\n\
-UnicodeData File Format 4.1.0 (see\n\
-http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
+UnicodeData File Format 5.1.0 (see\n\
+http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");


 static struct PyModuleDef unicodedatamodule = {
--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
--- a/Modules/unicodename_db.h
+++ b/Modules/unicodename_db.h
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@ -22,6 +22,7 @@
 #define XID_START_MASK 0x100
 #define XID_CONTINUE_MASK 0x200
 #define PRINTABLE_MASK 0x400
+#define NODELTA_MASK 0x800

 typedef struct {
    const Py_UNICODE upper;
@ -85,6 +86,9 @@ Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
    else
 	delta = ctype->upper;

+    if (ctype->flags & NODELTA_MASK)
+	return delta;
+
    if (delta >= 32768)
 	    delta -= 65536;

@ -767,6 +771,8 @@ Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
    int delta = ctype->upper;
+    if (ctype->flags & NODELTA_MASK)
+	return delta;
    if (delta >= 32768)
 	    delta -= 65536;
    return ch + delta;
@ -779,6 +785,8 @@ Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
 {
    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
    int delta = ctype->lower;
+    if (ctype->flags & NODELTA_MASK)
+	return delta;
    if (delta >= 32768)
 	    delta -= 65536;
    return ch + delta;
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -28,10 +28,10 @@
 import sys

 SCRIPT = sys.argv[0]
-VERSION = "2.5"
+VERSION = "2.6"

 # The Unicode Database
-UNIDATA_VERSION = "4.1.0"
+UNIDATA_VERSION = "5.1.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@ -62,6 +62,7 @@ UPPER_MASK = 0x80
 XID_START_MASK = 0x100
 XID_CONTINUE_MASK = 0x200
 PRINTABLE_MASK = 0x400
+NODELTA_MASK = 0x800

 def maketables(trace=0):

@ -361,6 +362,7 @@ def makeunicodetype(unicode, trace):
            bidirectional = record[4]
            properties = record[16]
            flags = 0
+            delta = True
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
            if category == "Ll":
@ -379,25 +381,36 @@ def makeunicodetype(unicode, trace):
                flags |= XID_START_MASK
            if "XID_Continue" in properties:
                flags |= XID_CONTINUE_MASK
-            # use delta predictor for upper/lower/title
+            # use delta predictor for upper/lower/title if it fits
            if record[12]:
                upper = int(record[12], 16) - char
-                assert -32768 <= upper <= 32767
+                if  -32768 <= upper <= 32767 and delta:
                    upper = upper & 0xffff
+                else:
+                    upper += char
+                    delta = False
            else:
                upper = 0
            if record[13]:
                lower = int(record[13], 16) - char
-                assert -32768 <= lower <= 32767
+                if -32768 <= lower <= 32767 and delta:
                    lower = lower & 0xffff
+                else:
+                    lower += char
+                    delta = False
            else:
                lower = 0
            if record[14]:
                title = int(record[14], 16) - char
-                assert -32768 <= lower <= 32767
+                if -32768 <= title <= 32767 and delta:  # range-check the title delta itself, not `lower`
                    title = title & 0xffff
+                else:
+                    title += char
+                    delta = False
            else:
                title = 0
+            if not delta:
+                flags |= NODELTA_MASK
            # decimal digit, integer digit
            decimal = 0
            if record[6]:
@ -626,6 +639,7 @@ def merge_old_version(version, new, old):
    bidir_changes = [0xFF]*0x110000
    category_changes = [0xFF]*0x110000
    decimal_changes = [0xFF]*0x110000
+    mirrored_changes = [0xFF]*0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
@ -672,6 +686,11 @@ def merge_old_version(version, new, old):
                        else:
                            assert re.match("^[0-9]+$", value)
                            numeric_changes[i] = int(value)
+                    elif k == 9:
+                        if value == 'Y':
+                            mirrored_changes[i] = '1'
+                        else:
+                            mirrored_changes[i] = '0'
                    elif k == 11:
                        # change to ISO comment, ignore
                        pass
@ -691,7 +710,8 @@ def merge_old_version(version, new, old):
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
    new.changed.append((version, list(zip(bidir_changes, category_changes,
-                                     decimal_changes, numeric_changes)),
+                                     decimal_changes, mirrored_changes,
+                                     numeric_changes)),
                        normalization_changes))