Update Unicode database to Unicode 4.1.

2006-03-09 23:38:20 +00:00 · 2006-03-09 23:38:20 +00:00 · 480f1bb67b
parent e2b4677253
commit 480f1bb67b
12 changed files with 17302 additions and 13365 deletions
--- a/Doc/lib/libunicodedata.tex
+++ b/Doc/lib/libunicodedata.tex
@ -14,11 +14,11 @@
 This module provides access to the Unicode Character Database which
 defines character properties for all Unicode characters. The data in
 this database is based on the \file{UnicodeData.txt} file version
-3.2.0 which is publically available from \url{ftp://ftp.unicode.org/}.
+4.1.0 which is publicly available from \url{ftp://ftp.unicode.org/}.

 The module uses the same names and symbols as defined by the
-UnicodeData File Format 3.2.0 (see
-\url{http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html}).  It
+UnicodeData File Format 4.1.0 (see
+\url{http://www.unicode.org/Public/4.1-Update/UnicodeData-4.1.0.html}).  It
 defines the following functions:

 \begin{funcdesc}{lookup}{name}
@ -130,3 +130,12 @@ The version of the Unicode database used in this module.

 \versionadded{2.3}
 \end{datadesc}
+
+\begin{datadesc}{db_3_2_0}
+This is an object that has the same methods as the entire
+module, but uses the Unicode database version 3.2 instead,
+for applications that require this specific version of
+the Unicode database (such as IDNA).
+
+\versionadded{2.5}
+\end{datadesc}
--- a/Include/ucnhash.h
+++ b/Include/ucnhash.h
@ -14,12 +14,14 @@ typedef struct {
    int size;

    /* Get name for a given character code.  Returns non-zero if
-       success, zero if not.  Does not set Python exceptions. */
-    int (*getname)(Py_UCS4 code, char* buffer, int buflen);
+       success, zero if not.  Does not set Python exceptions. 
+       If self is NULL, data come from the default version of the database.
+       If it is not NULL, it should be a unicodedata.db_X_Y_Z object */
+    int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen);

    /* Get character code for a given name.  Same error handling
       as for getname. */
-    int (*getcode)(const char* name, int namelen, Py_UCS4* code);
+    int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code);

 } _PyUnicode_Name_CAPI;

--- a/Lib/encodings/idna.py
+++ b/Lib/encodings/idna.py
@ -1,6 +1,7 @@
 # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)

-import stringprep, unicodedata, re, codecs
+import stringprep, re, codecs
+from unicodedata import db_3_2_0 as unicodedata

 # IDNA section 3.1
 dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
--- a/Lib/stringprep.py
+++ b/Lib/stringprep.py
@ -5,7 +5,7 @@ There are two kinds of tables: sets, for which a member test is provided,
 and mappings, for which a mapping function is provided.
 """

-import unicodedata
+from unicodedata import db_3_2_0 as unicodedata

 assert unicodedata.unidata_version == '3.2.0'

--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@ -16,7 +16,7 @@ encoding = 'utf-8'
 class UnicodeMethodsTest(unittest.TestCase):

    # update this, if the database changes
-    expectedchecksum = 'a37276dc2c158bef6dfd908ad34525c97180fad9'
+    expectedchecksum = 'a6555cd209d960dcfa17bfdce0c96d91cfa9a9ba'

    def test_method_checksum(self):
        h = sha.sha()
@ -75,7 +75,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
 class UnicodeFunctionsTest(UnicodeDatabaseTest):

    # update this, if the database changes
-    expectedchecksum = 'cfe20a967a450ebc82ca68c3e4eed344164e11af'
+    expectedchecksum = 'b45b79f3203ee1a896d9b5655484adaff5d4964b'

    def test_function_checksum(self):
        data = []
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -279,6 +279,10 @@ Core and builtins
 Extension Modules
 -----------------

+- The unicodedata module was updated to the 4.1 version of the Unicode
+  database. The 3.2 version is still available as unicodedata.db_3_2_0
+  for applications that require this specific version (such as IDNA).
+
 - The timing module is no longer built by default.  It was deprecated
  in PEP 4 in Python 2.0 or earlier.

--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@ -14,6 +14,7 @@

 #include "Python.h"
 #include "ucnhash.h"
+#include "structmember.h"

 /* character properties */

@ -28,6 +29,14 @@ typedef struct {
 						   _PyUnicode_EastAsianWidth */
 } _PyUnicode_DatabaseRecord;

+/* Per-character delta against an older Unicode database version.  Tables
+   of these records are emitted by Tools/unicode/makeunicodedata.py. */
+typedef struct change_record {
+    /* sequence of fields should be the same as in merge_old_version */
+    /* In the three unsigned char fields, 0xFF means "no change". */
+    const unsigned char bidir_changed;
+    /* 0 here means the character is unassigned in the old version. */
+    const unsigned char category_changed;
+    const unsigned char decimal_changed;
+    /* 0 means "no change", -1 means "did not have a numeric value". */
+    const int numeric_changed;
+} change_record;
+
 /* data file generated by Tools/unicode/makeunicodedata.py */
 #include "unicodedata_db.h"

@ -51,6 +60,85 @@ _getrecord(PyUnicodeObject* v)
    return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
 }

+/* ------------- Previous-version API ------------------------------------- */
+/* Python object giving access to an older version of the database,
+   expressed as deltas against the current tables. */
+typedef struct previous_version {
+    PyObject_HEAD
+    /* Version string; exposed to Python as the unidata_version member. */
+    const char *name;
+    /* Returns the change record for a code point. */
+    const change_record* (*getrecord)(Py_UCS4);
+    /* Returns the old single-character decomposition of a code point,
+       or 0 if normalization of that code point is unchanged. */
+    Py_UCS4 (*normalization)(Py_UCS4);
+} PreviousDBVersion;
+
+/* Look up the change record of code point v; self must be a
+   PreviousDBVersion (the cast is unchecked). */
+#define get_old_record(self, v)    ((((PreviousDBVersion*)self)->getrecord)(v))
+
+/* Forward declaration */
+static PyMethodDef unicodedata_functions[];
+
+/* Attributes of DB objects: unidata_version is a read-only view of the
+   name field. */
+static PyMemberDef DB_members[] = {
+	{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
+        {NULL}
+};
+
+/* Type object for previous-version database objects ("unicodedata.DB").
+   tp_methods is the module's own function table, so the same C functions
+   are reachable as methods of a DB instance; they test self to decide
+   whether old-version deltas apply. */
+static PyTypeObject Xxo_Type = {
+	/* The ob_type field must be initialized in the module init function
+	 * to be portable to Windows without using C++. */
+	PyObject_HEAD_INIT(NULL)
+	0,			/*ob_size*/
+	"unicodedata.DB",		/*tp_name*/
+	sizeof(PreviousDBVersion),	/*tp_basicsize*/
+	0,			/*tp_itemsize*/
+	/* methods */
+	(destructor)PyObject_Del, /*tp_dealloc*/
+	0,			/*tp_print*/
+	0,                      /*tp_getattr*/
+	0,			/*tp_setattr*/
+	0,			/*tp_compare*/
+	0,			/*tp_repr*/
+	0,			/*tp_as_number*/
+	0,			/*tp_as_sequence*/
+	0,			/*tp_as_mapping*/
+	0,			/*tp_hash*/
+        0,                      /*tp_call*/
+        0,                      /*tp_str*/
+        PyObject_GenericGetAttr,/*tp_getattro*/
+        0,                      /*tp_setattro*/
+        0,                      /*tp_as_buffer*/
+        Py_TPFLAGS_DEFAULT,     /*tp_flags*/
+        0,                      /*tp_doc*/
+        0,                      /*tp_traverse*/
+        0,                      /*tp_clear*/
+        0,                      /*tp_richcompare*/
+        0,                      /*tp_weaklistoffset*/
+        0,                      /*tp_iter*/
+        0,                      /*tp_iternext*/
+        unicodedata_functions,  /*tp_methods*/
+        DB_members,             /*tp_members*/
+        0,                      /*tp_getset*/
+        0,                      /*tp_base*/
+        0,                      /*tp_dict*/
+        0,                      /*tp_descr_get*/
+        0,                      /*tp_descr_set*/
+        0,                      /*tp_dictoffset*/
+        0,                      /*tp_init*/
+        0,                      /*tp_alloc*/
+        0,                      /*tp_new*/
+        0,                      /*tp_free*/
+        0,                      /*tp_is_gc*/
+};
+
+/* Create a DB object for an older database version.  name is stored as
+   given (not copied); getrecord and normalization are the generated
+   lookup functions for that version.  Returns a new reference, or NULL
+   if allocation fails. */
+static PyObject*
+new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
+                     Py_UCS4 (*normalization)(Py_UCS4))
+{
+	PreviousDBVersion *db = PyObject_New(PreviousDBVersion, &Xxo_Type);
+
+	if (db != NULL) {
+		db->name = name;
+		db->getrecord = getrecord;
+		db->normalization = normalization;
+	}
+	/* On allocation failure this is simply NULL. */
+	return (PyObject*)db;
+}
+
 /* --- Module API --------------------------------------------------------- */

 PyDoc_STRVAR(unicodedata_decimal__doc__,
@ -65,6 +153,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
 {
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
+    int have_old = 0;
    long rc;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
@ -74,7 +163,22 @@ unicodedata_decimal(PyObject *self, PyObject *args)
 			"need a single Unicode character as parameter");
        return NULL;
    }
-    rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
+
+    if (self) {
+        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        if (old->category_changed == 0) {
+            /* unassigned */
+            have_old = 1;
+            rc = -1;
+        } 
+        else if (old->decimal_changed != 0xFF) {
+            have_old = 1;
+            rc = old->decimal_changed;
+        }
+    }
+
+    if (!have_old)
+        rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
 	if (defobj == NULL) {
 	    PyErr_SetString(PyExc_ValueError,
@ -136,6 +240,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
 {
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
+    int have_old = 0;
    double rc;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
@ -145,7 +250,22 @@ unicodedata_numeric(PyObject *self, PyObject *args)
 			"need a single Unicode character as parameter");
 	return NULL;
    }
-    rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
+
+    /* A non-NULL self is a previous-version DB object: consult its delta
+       record before falling back to the current database. */
+    if (self) {
+        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        if (old->category_changed == 0) {
+            /* unassigned */
+            have_old = 1;
+            rc = -1;
+        } 
+        /* Use the numeric delta, not the decimal one: in numeric_changed
+           0 means "no change" and -1 means the character had no numeric
+           value in the old version (handled by the rc < 0 check below). */
+        else if (old->numeric_changed != 0) {
+            have_old = 1;
+            rc = old->numeric_changed;
+        }
+    }
+
+    if (!have_old)
+        rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
    if (rc < 0) {
 	if (defobj == NULL) {
 	    PyErr_SetString(PyExc_ValueError, "not a numeric character");
@ -180,6 +300,11 @@ unicodedata_category(PyObject *self, PyObject *args)
 	return NULL;
    }
    index = (int) _getrecord(v)->category;
+    if (self) {
+        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        if (old->category_changed != 0xFF)
+            index = old->category_changed;
+    }
    return PyString_FromString(_PyUnicode_CategoryNames[index]);
 }

@ -205,6 +330,13 @@ unicodedata_bidirectional(PyObject *self, PyObject *args)
 	return NULL;
    }
    index = (int) _getrecord(v)->bidirectional;
+    if (self) {
+        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        if (old->category_changed == 0)
+            index = 0; /* unassigned */
+        else if (old->bidir_changed != 0xFF)
+            index = old->bidir_changed;
+    }
    return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
 }

@ -219,6 +351,7 @@ static PyObject *
 unicodedata_combining(PyObject *self, PyObject *args)
 {
    PyUnicodeObject *v;
+    int index;

    if (!PyArg_ParseTuple(args, "O!:combining",
 			  &PyUnicode_Type, &v))
@ -228,7 +361,13 @@ unicodedata_combining(PyObject *self, PyObject *args)
 			"need a single Unicode character as parameter");
 	return NULL;
    }
-    return PyInt_FromLong((int) _getrecord(v)->combining);
+    index = (int) _getrecord(v)->combining;
+    if (self) {
+        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        if (old->category_changed == 0)
+            index = 0; /* unassigned */
+    }
+    return PyInt_FromLong(index);
 }

 PyDoc_STRVAR(unicodedata_mirrored__doc__,
@ -242,6 +381,7 @@ static PyObject *
 unicodedata_mirrored(PyObject *self, PyObject *args)
 {
    PyUnicodeObject *v;
+    int index;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
 			  &PyUnicode_Type, &v))
@ -251,7 +391,13 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
 			"need a single Unicode character as parameter");
 	return NULL;
    }
-    return PyInt_FromLong((int) _getrecord(v)->mirrored);
+    index = (int) _getrecord(v)->mirrored;
+    if (self) {
+        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        if (old->category_changed == 0)
+            index = 0; /* unassigned */
+    }
+    return PyInt_FromLong(index);
 }

 PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
@ -275,6 +421,11 @@ unicodedata_east_asian_width(PyObject *self, PyObject *args)
 	return NULL;
    }
    index = (int) _getrecord(v)->east_asian_width;
+    if (self) {
+        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        if (old->category_changed == 0)
+            index = 0; /* unassigned */
+    }
    return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
 }

@ -303,6 +454,12 @@ unicodedata_decomposition(PyObject *self, PyObject *args)

    code = (int) *PyUnicode_AS_UNICODE(v);

+    if (self) {
+        const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
+        if (old->category_changed == 0)
+            return PyString_FromString(""); /* unassigned */
+    }
+
    if (code < 0 || code >= 0x110000)
        index = 0;
    else {
@ -337,11 +494,14 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
 }

 void
-get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
+get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
 {
    if (code >= 0x110000) {
        *index = 0;
-    } 
+    } else if (self && get_old_record(self, code)->category_changed==0) {
+        /* unassigned in old version */
+        *index = 0;
+    }
    else {
        *index = decomp_index1[(code>>DECOMP_SHIFT)];
        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
@ -367,7 +527,7 @@ get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
 #define SCount  (LCount*NCount)

 static PyObject*
-nfd_nfkd(PyObject *input, int k)
+nfd_nfkd(PyObject *self, PyObject *input, int k)
 {
    PyObject *result;
    Py_UNICODE *i, *end, *o;
@ -416,8 +576,17 @@ nfd_nfkd(PyObject *input, int k)
                }
                continue;
            }
-            /* Other decompoistions. */
-            get_decomp_record(code, &index, &prefix, &count);
+            /* normalization changes */
+            if (self) {
+                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
+                if (value != 0) {
+                    stack[stackptr++] = value;
+                    continue;
+                }
+            }
+
+            /* Other decompositions. */
+            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
@ -467,7 +636,7 @@ nfd_nfkd(PyObject *input, int k)
 }

 static int
-find_nfc_index(struct reindex* nfc, Py_UNICODE code)
+find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
 {
    int index;
    for (index = 0; nfc[index].start; index++) {
@ -483,7 +652,7 @@ find_nfc_index(struct reindex* nfc, Py_UNICODE code)
 }

 static PyObject*
-nfc_nfkc(PyObject *input, int k)
+nfc_nfkc(PyObject *self, PyObject *input, int k)
 {
    PyObject *result;
    Py_UNICODE *i, *i1, *o, *end;
@ -492,7 +661,7 @@ nfc_nfkc(PyObject *input, int k)
    Py_UNICODE *skipped[20];
    int cskipped = 0;

-    result = nfd_nfkd(input, k);
+    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;

@ -536,7 +705,7 @@ nfc_nfkc(PyObject *input, int k)
          continue;
      }

-      f = find_nfc_index(nfc_first, *i);
+      f = find_nfc_index(self, nfc_first, *i);
      if (f == -1) {
          *o++ = *i++;
          continue;
@ -551,7 +720,7 @@ nfc_nfkc(PyObject *input, int k)
              i1++;
              continue;
          }
-          l = find_nfc_index(nfc_last, *i1);
+          l = find_nfc_index(self, nfc_last, *i1);
          /* *i1 cannot be combined with *i. If *i1
             is a starter, we don't need to look further.
             Otherwise, record the combining class. */
@ -575,7 +744,7 @@ nfc_nfkc(PyObject *input, int k)
          /* Mark the second character unused. */
          skipped[cskipped++] = i1;
          i1++;
-          f = find_nfc_index(nfc_first, *i);
+          f = find_nfc_index(self, nfc_first, *i);
          if (f == -1)
              break;
      }
@ -610,13 +779,13 @@ unicodedata_normalize(PyObject *self, PyObject *args)
    }

    if (strcmp(form, "NFC") == 0)
-        return nfc_nfkc(input, 0);
+        return nfc_nfkc(self, input, 0);
    if (strcmp(form, "NFKC") == 0)
-        return nfc_nfkc(input, 1);
+        return nfc_nfkc(self, input, 1);
    if (strcmp(form, "NFD") == 0)
-        return nfd_nfkd(input, 0);
+        return nfd_nfkd(self, input, 0);
    if (strcmp(form, "NFKD") == 0)
-        return nfd_nfkd(input, 1);
+        return nfd_nfkd(self, input, 1);
    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
    return NULL;
 }
@ -686,7 +855,7 @@ is_unified_ideograph(Py_UCS4 code)
 }

 static int
-_getucname(Py_UCS4 code, char* buffer, int buflen)
+_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
 {
    int offset;
    int i;
@ -726,6 +895,15 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
    if (code >= 0x110000)
        return 0;

+    if (self) {
+        const change_record *old = get_old_record(self, code);
+        if (old->category_changed == 0) {
+            /* unassigned */
+            return 0;
+        } 
+    }
+
+
    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
@ -768,12 +946,12 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
 }

 static int
-_cmpname(int code, const char* name, int namelen)
+_cmpname(PyObject *self, int code, const char* name, int namelen)
 {
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN];
-    if (!_getucname(code, buffer, sizeof(buffer)))
+    if (!_getucname(self, code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (toupper(name[i]) != buffer[i])
@ -803,7 +981,7 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
 }

 static int
-_getcode(const char* name, int namelen, Py_UCS4* code)
+_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
 {
    unsigned int h, v;
    unsigned int mask = code_size-1;
@ -860,7 +1038,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
    v = code_hash[i];
    if (!v)
        return 0;
-    if (_cmpname(v, name, namelen)) {
+    if (_cmpname(self, v, name, namelen)) {
        *code = v;
        return 1;
    }
@ -872,7 +1050,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
        v = code_hash[i];
        if (!v)
            return 0;
-        if (_cmpname(v, name, namelen)) {
+        if (_cmpname(self, v, name, namelen)) {
            *code = v;
            return 1;
        }
@ -914,8 +1092,8 @@ unicodedata_name(PyObject* self, PyObject* args)
 	return NULL;
    }

-    if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
-                             name, sizeof(name))) {
+    if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
+                    name, sizeof(name))) {
 	if (defobj == NULL) {
 	    PyErr_SetString(PyExc_ValueError, "no such name");
            return NULL;
@ -947,7 +1125,7 @@ unicodedata_lookup(PyObject* self, PyObject* args)
    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

-    if (!_getcode(name, namelen, &code)) {
+    if (!_getcode(self, name, namelen, &code)) {
        char fmt[] = "undefined character name '%s'";
        char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
        sprintf(buf, fmt, name);
@ -985,6 +1163,8 @@ static PyMethodDef unicodedata_functions[] = {
    {NULL, NULL}		/* sentinel */
 };

+
+
 PyDoc_STRVAR(unicodedata_docstring,
 "This module provides access to the Unicode Character Database which\n\
 defines character properties for all Unicode characters. The data in\n\
@ -1007,6 +1187,11 @@ initunicodedata(void)

    PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);

+    /* Previous versions */
+    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
+    if (v != NULL)
+        PyModule_AddObject(m, "db_3_2_0", v);
+
    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    if (v != NULL)
--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
--- a/Modules/unicodename_db.h
+++ b/Modules/unicodename_db.h
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1898,7 +1898,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
                    /* found a name.  look it up in the unicode database */
                    message = "unknown Unicode character name";
                    s++;
-                    if (ucnhash_CAPI->getcode(start, (int)(s-start-1), &chr))
+                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
                        goto store;
                }
            }
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@ -26,13 +26,15 @@
 import sys

 SCRIPT = sys.argv[0]
-VERSION = "2.3"
+VERSION = "2.5"

 # The Unicode Database
-UNIDATA_VERSION = "3.2.0"
-UNICODE_DATA = "UnicodeData.txt"
-COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
-EASTASIAN_WIDTH = "EastAsianWidth.txt"
+UNIDATA_VERSION = "4.1.0"
+UNICODE_DATA = "UnicodeData%s.txt"
+COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
+EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
+
+old_versions = ["3.2.0"]

 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
    "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@ -57,13 +59,23 @@ UPPER_MASK = 0x80

 def maketables(trace=0):

-    print "--- Reading", UNICODE_DATA, "..."
+    print "--- Reading", UNICODE_DATA % "", "..."

-    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS,
-                          EASTASIAN_WIDTH)
+    version = ""
+    unicode = UnicodeData(UNICODE_DATA % version,
+                          COMPOSITION_EXCLUSIONS % version,
+                          EASTASIAN_WIDTH % version)

    print len(filter(None, unicode.table)), "characters"

+    for version in old_versions:
+        print "--- Reading", UNICODE_DATA % ("-"+version), "..."
+        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
+                                  COMPOSITION_EXCLUSIONS % ("-"+version),
+                                  EASTASIAN_WIDTH % ("-"+version))
+        print len(filter(None, old_unicode.table)), "characters"
+        merge_old_version(version, unicode, old_unicode)
+
    makeunicodename(unicode, trace)
    makeunicodedata(unicode, trace)
    makeunicodetype(unicode, trace)
@ -119,6 +131,8 @@ def makeunicodedata(unicode, trace):
        if record:
            if record[5]:
                decomp = record[5].split()
+                if len(decomp) > 19:
+                    raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
                # prefix
                if decomp[0][0] == "<":
                    prefix = decomp.pop(0)
@ -278,6 +292,44 @@ def makeunicodedata(unicode, trace):
    Array("comp_index", index).dump(fp, trace)
    Array("comp_data", index2).dump(fp, trace)

+    # Generate delta tables for old versions
+    for version, table, normalization in unicode.changed:
+        cversion = version.replace(".","_")
+        records = [table[0]]
+        cache = {table[0]:0}
+        index = [0] * len(table)
+        for i, record in enumerate(table):
+            try:
+                index[i] = cache[record]
+            except KeyError:
+                index[i] = cache[record] = len(records)
+                records.append(record)
+        index1, index2, shift = splitbins(index, trace)
+        print >>fp, "static const change_record change_records_%s[] = {" % cversion
+        for record in records:
+            print >>fp, "\t{ %s }," % ", ".join(map(str,record))
+        print >>fp, "};"
+        Array("changes_%s_index" % cversion, index1).dump(fp, trace)
+        Array("changes_%s_data" % cversion, index2).dump(fp, trace)
+        print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
+        print >>fp, "{"
+        print >>fp, "\tint index;"
+        print >>fp, "\tif (n >= 0x110000) index = 0;"
+        print >>fp, "\telse {"
+        print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
+        print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
+              (cversion, shift, ((1<<shift)-1))
+        print >>fp, "\t}"
+        print >>fp, "\treturn change_records_%s+index;" % cversion
+        print >>fp, "}\n"
+        print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
+        print >>fp, "{"
+        print >>fp, "\tswitch(n) {"
+        for k, v in normalization:
+            print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
+        print >>fp, "\tdefault: return 0;"
+        print >>fp, "\t}\n}\n"
+
    fp.close()

 # --------------------------------------------------------------------
@ -540,6 +592,82 @@ def makeunicodename(unicode, trace):

    fp.close()

+
+def merge_old_version(version, new, old):
+    # Changes to exclusion file not implemented yet
+    if old.exclusions != new.exclusions:
+        raise NotImplementedError, "exclusions differ"
+
+    # In these change records, 0xFF means "no change"
+    bidir_changes = [0xFF]*0x110000
+    category_changes = [0xFF]*0x110000
+    decimal_changes = [0xFF]*0x110000
+    # In numeric data, 0 means "no change",
+    # -1 means "did not have a numeric value
+    numeric_changes = [0] * 0x110000
+    # normalization_changes is a list of key-value pairs
+    normalization_changes = []
+    for i in range(0x110000):
+        if new.table[i] is None:
+            # Characters unassigned in the new version ought to
+            # be unassigned in the old one
+            assert old.table[i] is None
+            continue
+        # check characters unassigned in the old version
+        if old.table[i] is None:
+            # category 0 is "unassigned"
+            category_changes[i] = 0
+            continue
+        # check characters that differ
+        if old.table[i] != new.table[i]:
+            for k in range(len(old.table[i])):
+                if old.table[i][k] != new.table[i][k]:
+                    value = old.table[i][k]
+                    if k == 2:
+                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
+                        category_changes[i] = CATEGORY_NAMES.index(value)
+                    elif k == 4:
+                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
+                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
+                    elif k == 5:
+                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
+                        # We assume that all normalization changes are in 1:1 mappings
+                        assert " " not in value
+                        normalization_changes.append((i, value))
+                    elif k == 6:
+                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
+                        # we only support changes where the old value is a single digit
+                        assert value in "0123456789"
+                        decimal_changes[i] = int(value)
+                    elif k == 8:
+                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
+                        # Since 0 encodes "no change", the old value is better not 0
+                        assert value != "0" and value != "-1"
+                        if not value:
+                            numeric_changes[i] = -1
+                        else:
+                            assert re.match("^[0-9]+$", value)
+                            numeric_changes[i] = int(value)
+                    elif k == 11:
+                        # change to ISO comment, ignore
+                        pass
+                    elif k == 12:
+                        # change to simple uppercase mapping; ignore
+                        pass
+                    elif k == 13:
+                        # change to simple lowercase mapping; ignore
+                        pass
+                    elif k == 14:
+                        # change to simple titlecase mapping; ignore
+                        pass
+                    else:
+                        class Difference(Exception):pass
+                        raise Difference, (hex(i), k, old.table[i], new.table[i])
+    new.changed.append((version, zip(bidir_changes, category_changes,
+                                     decimal_changes, numeric_changes),
+                        normalization_changes))
+    
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
@ -551,6 +679,7 @@ import sys
 class UnicodeData:

    def __init__(self, filename, exclusions, eastasianwidth, expand=1):
+        self.changed = []
        file = open(filename)
        table = [None] * 0x110000
        while 1:
@ -569,13 +698,14 @@ class UnicodeData:
                if s:
                    if s[1][-6:] == "First>":
                        s[1] = ""
-                        field = s[:]
+                        field = s
                    elif s[1][-5:] == "Last>":
                        s[1] = ""
                        field = None
                elif field:
-                    field[0] = hex(i)
-                    table[i] = field
+                    f2 = field[:]
+                    f2[0] = "%X" % i
+                    table[i] = f2

        # public attributes
        self.filename = filename