Update Unicode database to Unicode 4.1.

This commit is contained in:
Martin v. Löwis 2006-03-09 23:38:20 +00:00
parent e2b4677253
commit 480f1bb67b
12 changed files with 17302 additions and 13365 deletions

View File

@ -14,11 +14,11 @@
This module provides access to the Unicode Character Database which This module provides access to the Unicode Character Database which
defines character properties for all Unicode characters. The data in defines character properties for all Unicode characters. The data in
this database is based on the \file{UnicodeData.txt} file version this database is based on the \file{UnicodeData.txt} file version
3.2.0 which is publicly available from \url{ftp://ftp.unicode.org/}. 4.1.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
The module uses the same names and symbols as defined by the The module uses the same names and symbols as defined by the
UnicodeData File Format 3.2.0 (see UnicodeData File Format 4.1.0 (see
\url{http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html}). It \url{http://www.unicode.org/Public/4.1-Update/UnicodeData-4.1.0.html}). It
defines the following functions: defines the following functions:
\begin{funcdesc}{lookup}{name} \begin{funcdesc}{lookup}{name}
@ -130,3 +130,12 @@ The version of the Unicode database used in this module.
\versionadded{2.3} \versionadded{2.3}
\end{datadesc} \end{datadesc}
\begin{datadesc}{db_3_2_0}
This is an object that has the same methods as the entire
module, but uses the Unicode database version 3.2 instead,
for applications that require this specific version of
the Unicode database (such as IDNA).
\versionadded{2.5}
\end{datadesc}

View File

@ -14,12 +14,14 @@ typedef struct {
int size; int size;
/* Get name for a given character code. Returns non-zero if /* Get name for a given character code. Returns non-zero if
success, zero if not. Does not set Python exceptions. */ success, zero if not. Does not set Python exceptions.
int (*getname)(Py_UCS4 code, char* buffer, int buflen); If self is NULL, data come from the default version of the database.
If it is not NULL, it should be a unicodedata.db_X_Y_Z object */
int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen);
/* Get character code for a given name. Same error handling /* Get character code for a given name. Same error handling
as for getname. */ as for getname. */
int (*getcode)(const char* name, int namelen, Py_UCS4* code); int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code);
} _PyUnicode_Name_CAPI; } _PyUnicode_Name_CAPI;

View File

@ -1,6 +1,7 @@
# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
import stringprep, unicodedata, re, codecs import stringprep, re, codecs
from unicodedata import db_3_2_0 as unicodedata
# IDNA section 3.1 # IDNA section 3.1
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]") dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

View File

@ -5,7 +5,7 @@ There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided. and mappings, for which a mapping function is provided.
""" """
import unicodedata from unicodedata import db_3_2_0 as unicodedata
assert unicodedata.unidata_version == '3.2.0' assert unicodedata.unidata_version == '3.2.0'

View File

@ -16,7 +16,7 @@ encoding = 'utf-8'
class UnicodeMethodsTest(unittest.TestCase): class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes # update this, if the database changes
expectedchecksum = 'a37276dc2c158bef6dfd908ad34525c97180fad9' expectedchecksum = 'a6555cd209d960dcfa17bfdce0c96d91cfa9a9ba'
def test_method_checksum(self): def test_method_checksum(self):
h = sha.sha() h = sha.sha()
@ -75,7 +75,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
class UnicodeFunctionsTest(UnicodeDatabaseTest): class UnicodeFunctionsTest(UnicodeDatabaseTest):
# update this, if the database changes # update this, if the database changes
expectedchecksum = 'cfe20a967a450ebc82ca68c3e4eed344164e11af' expectedchecksum = 'b45b79f3203ee1a896d9b5655484adaff5d4964b'
def test_function_checksum(self): def test_function_checksum(self):
data = [] data = []

View File

@ -279,6 +279,10 @@ Core and builtins
Extension Modules Extension Modules
----------------- -----------------
- The unicodedata module was updated to the 4.1 version of the Unicode
database. The 3.2 version is still available as unicodedata.db_3_2_0
for applications that require this specific version (such as IDNA).
- The timing module is no longer built by default. It was deprecated - The timing module is no longer built by default. It was deprecated
in PEP 4 in Python 2.0 or earlier. in PEP 4 in Python 2.0 or earlier.

View File

@ -14,6 +14,7 @@
#include "Python.h" #include "Python.h"
#include "ucnhash.h" #include "ucnhash.h"
#include "structmember.h"
/* character properties */ /* character properties */
@ -28,6 +29,14 @@ typedef struct {
_PyUnicode_EastAsianWidth */ _PyUnicode_EastAsianWidth */
} _PyUnicode_DatabaseRecord; } _PyUnicode_DatabaseRecord;
typedef struct change_record {
/* sequence of fields should be the same as in merge_old_version */
const unsigned char bidir_changed;
const unsigned char category_changed;
const unsigned char decimal_changed;
const int numeric_changed;
} change_record;
/* data file generated by Tools/unicode/makeunicodedata.py */ /* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h" #include "unicodedata_db.h"
@ -51,6 +60,85 @@ _getrecord(PyUnicodeObject* v)
return _getrecord_ex(*PyUnicode_AS_UNICODE(v)); return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
} }
/* ------------- Previous-version API ------------------------------------- */
/* Python object standing for an older version of the Unicode database.
   It stores the callbacks used to map current data back to that version. */
typedef struct previous_version {
PyObject_HEAD
/* version string; exposed to Python as the unidata_version member */
const char *name;
/* returns the change record for a code point */
const change_record* (*getrecord)(Py_UCS4);
/* returns the code point to substitute during normalization in this
   version, or 0 when there is no substitution */
Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;
/* Look up the change record for code point v through the PreviousDBVersion
   object self.  self is dereferenced unconditionally, so callers test it
   for NULL before using this macro. */
#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
/* Forward declaration */
static PyMethodDef unicodedata_functions[];
/* Attributes of a PreviousDBVersion object: unidata_version is a read-only
   view of the stored version string. */
static PyMemberDef DB_members[] = {
{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
{NULL}
};
/* Type object for PreviousDBVersion instances ("unicodedata.DB").
   Its method table is unicodedata_functions, i.e. the same functions the
   module itself exposes, and its members come from DB_members.  Slots are
   filled positionally, so the order of the entries below must not change. */
static PyTypeObject Xxo_Type = {
/* The ob_type field must be initialized in the module init function
* to be portable to Windows without using C++. */
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"unicodedata.DB", /*tp_name*/
sizeof(PreviousDBVersion), /*tp_basicsize*/
0, /*tp_itemsize*/
/* methods */
(destructor)PyObject_Del, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash*/
0, /*tp_call*/
0, /*tp_str*/
PyObject_GenericGetAttr,/*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT, /*tp_flags*/
0, /*tp_doc*/
0, /*tp_traverse*/
0, /*tp_clear*/
0, /*tp_richcompare*/
0, /*tp_weaklistoffset*/
0, /*tp_iter*/
0, /*tp_iternext*/
/* instances share the module-level function table */
unicodedata_functions, /*tp_methods*/
DB_members, /*tp_members*/
0, /*tp_getset*/
0, /*tp_base*/
0, /*tp_dict*/
0, /*tp_descr_get*/
0, /*tp_descr_set*/
0, /*tp_dictoffset*/
0, /*tp_init*/
0, /*tp_alloc*/
0, /*tp_new*/
0, /*tp_free*/
0, /*tp_is_gc*/
};
/* Create a PreviousDBVersion object for the database version called `name`.
   `getrecord` and `normalization` are stored as given and later used to
   look up per-character differences for that version.
   Returns a new reference, or NULL if the object could not be allocated. */
static PyObject*
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *db = PyObject_New(PreviousDBVersion, &Xxo_Type);

    if (db != NULL) {
        db->name = name;
        db->getrecord = getrecord;
        db->normalization = normalization;
    }
    /* On allocation failure this is simply a NULL PyObject pointer. */
    return (PyObject*)db;
}
/* --- Module API --------------------------------------------------------- */ /* --- Module API --------------------------------------------------------- */
PyDoc_STRVAR(unicodedata_decimal__doc__, PyDoc_STRVAR(unicodedata_decimal__doc__,
@ -65,6 +153,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
{ {
PyUnicodeObject *v; PyUnicodeObject *v;
PyObject *defobj = NULL; PyObject *defobj = NULL;
int have_old = 0;
long rc; long rc;
if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj)) if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
@ -74,7 +163,22 @@ unicodedata_decimal(PyObject *self, PyObject *args)
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
return NULL; return NULL;
} }
rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
if (self) {
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
if (old->category_changed == 0) {
/* unassigned */
have_old = 1;
rc = -1;
}
else if (old->decimal_changed != 0xFF) {
have_old = 1;
rc = old->decimal_changed;
}
}
if (!have_old)
rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
if (rc < 0) { if (rc < 0) {
if (defobj == NULL) { if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
@ -136,6 +240,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
{ {
PyUnicodeObject *v; PyUnicodeObject *v;
PyObject *defobj = NULL; PyObject *defobj = NULL;
int have_old = 0;
double rc; double rc;
if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj)) if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
@ -145,7 +250,22 @@ unicodedata_numeric(PyObject *self, PyObject *args)
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
return NULL; return NULL;
} }
rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
if (self) {
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
if (old->category_changed == 0) {
/* unassigned */
have_old = 1;
rc = -1;
}
else if (old->decimal_changed != 0xFF) {
have_old = 1;
rc = old->decimal_changed;
}
}
if (!have_old)
rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
if (rc < 0) { if (rc < 0) {
if (defobj == NULL) { if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError, "not a numeric character"); PyErr_SetString(PyExc_ValueError, "not a numeric character");
@ -180,6 +300,11 @@ unicodedata_category(PyObject *self, PyObject *args)
return NULL; return NULL;
} }
index = (int) _getrecord(v)->category; index = (int) _getrecord(v)->category;
if (self) {
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
if (old->category_changed != 0xFF)
index = old->category_changed;
}
return PyString_FromString(_PyUnicode_CategoryNames[index]); return PyString_FromString(_PyUnicode_CategoryNames[index]);
} }
@ -205,6 +330,13 @@ unicodedata_bidirectional(PyObject *self, PyObject *args)
return NULL; return NULL;
} }
index = (int) _getrecord(v)->bidirectional; index = (int) _getrecord(v)->bidirectional;
if (self) {
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
if (old->category_changed == 0)
index = 0; /* unassigned */
else if (old->bidir_changed != 0xFF)
index = old->bidir_changed;
}
return PyString_FromString(_PyUnicode_BidirectionalNames[index]); return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
} }
@ -219,6 +351,7 @@ static PyObject *
unicodedata_combining(PyObject *self, PyObject *args) unicodedata_combining(PyObject *self, PyObject *args)
{ {
PyUnicodeObject *v; PyUnicodeObject *v;
int index;
if (!PyArg_ParseTuple(args, "O!:combining", if (!PyArg_ParseTuple(args, "O!:combining",
&PyUnicode_Type, &v)) &PyUnicode_Type, &v))
@ -228,7 +361,13 @@ unicodedata_combining(PyObject *self, PyObject *args)
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
return NULL; return NULL;
} }
return PyInt_FromLong((int) _getrecord(v)->combining); index = (int) _getrecord(v)->combining;
if (self) {
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
if (old->category_changed == 0)
index = 0; /* unassigned */
}
return PyInt_FromLong(index);
} }
PyDoc_STRVAR(unicodedata_mirrored__doc__, PyDoc_STRVAR(unicodedata_mirrored__doc__,
@ -242,6 +381,7 @@ static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args) unicodedata_mirrored(PyObject *self, PyObject *args)
{ {
PyUnicodeObject *v; PyUnicodeObject *v;
int index;
if (!PyArg_ParseTuple(args, "O!:mirrored", if (!PyArg_ParseTuple(args, "O!:mirrored",
&PyUnicode_Type, &v)) &PyUnicode_Type, &v))
@ -251,7 +391,13 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
return NULL; return NULL;
} }
return PyInt_FromLong((int) _getrecord(v)->mirrored); index = (int) _getrecord(v)->mirrored;
if (self) {
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
if (old->category_changed == 0)
index = 0; /* unassigned */
}
return PyInt_FromLong(index);
} }
PyDoc_STRVAR(unicodedata_east_asian_width__doc__, PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
@ -275,6 +421,11 @@ unicodedata_east_asian_width(PyObject *self, PyObject *args)
return NULL; return NULL;
} }
index = (int) _getrecord(v)->east_asian_width; index = (int) _getrecord(v)->east_asian_width;
if (self) {
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
if (old->category_changed == 0)
index = 0; /* unassigned */
}
return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]); return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
} }
@ -303,6 +454,12 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
code = (int) *PyUnicode_AS_UNICODE(v); code = (int) *PyUnicode_AS_UNICODE(v);
if (self) {
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
if (old->category_changed == 0)
return PyString_FromString(""); /* unassigned */
}
if (code < 0 || code >= 0x110000) if (code < 0 || code >= 0x110000)
index = 0; index = 0;
else { else {
@ -337,11 +494,14 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
} }
void void
get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count) get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{ {
if (code >= 0x110000) { if (code >= 0x110000) {
*index = 0; *index = 0;
} } else if (self && get_old_record(self, code)->category_changed==0) {
/* unassigned in old version */
*index = 0;
}
else { else {
*index = decomp_index1[(code>>DECOMP_SHIFT)]; *index = decomp_index1[(code>>DECOMP_SHIFT)];
*index = decomp_index2[(*index<<DECOMP_SHIFT)+ *index = decomp_index2[(*index<<DECOMP_SHIFT)+
@ -367,7 +527,7 @@ get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
#define SCount (LCount*NCount) #define SCount (LCount*NCount)
static PyObject* static PyObject*
nfd_nfkd(PyObject *input, int k) nfd_nfkd(PyObject *self, PyObject *input, int k)
{ {
PyObject *result; PyObject *result;
Py_UNICODE *i, *end, *o; Py_UNICODE *i, *end, *o;
@ -416,8 +576,17 @@ nfd_nfkd(PyObject *input, int k)
} }
continue; continue;
} }
/* Other decompoistions. */ /* normalization changes */
get_decomp_record(code, &index, &prefix, &count); if (self) {
Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
if (value != 0) {
stack[stackptr++] = value;
continue;
}
}
/* Other decompositions. */
get_decomp_record(self, code, &index, &prefix, &count);
/* Copy character if it is not decomposable, or has a /* Copy character if it is not decomposable, or has a
compatibility decomposition, but we do NFD. */ compatibility decomposition, but we do NFD. */
@ -467,7 +636,7 @@ nfd_nfkd(PyObject *input, int k)
} }
static int static int
find_nfc_index(struct reindex* nfc, Py_UNICODE code) find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
{ {
int index; int index;
for (index = 0; nfc[index].start; index++) { for (index = 0; nfc[index].start; index++) {
@ -483,7 +652,7 @@ find_nfc_index(struct reindex* nfc, Py_UNICODE code)
} }
static PyObject* static PyObject*
nfc_nfkc(PyObject *input, int k) nfc_nfkc(PyObject *self, PyObject *input, int k)
{ {
PyObject *result; PyObject *result;
Py_UNICODE *i, *i1, *o, *end; Py_UNICODE *i, *i1, *o, *end;
@ -492,7 +661,7 @@ nfc_nfkc(PyObject *input, int k)
Py_UNICODE *skipped[20]; Py_UNICODE *skipped[20];
int cskipped = 0; int cskipped = 0;
result = nfd_nfkd(input, k); result = nfd_nfkd(self, input, k);
if (!result) if (!result)
return NULL; return NULL;
@ -536,7 +705,7 @@ nfc_nfkc(PyObject *input, int k)
continue; continue;
} }
f = find_nfc_index(nfc_first, *i); f = find_nfc_index(self, nfc_first, *i);
if (f == -1) { if (f == -1) {
*o++ = *i++; *o++ = *i++;
continue; continue;
@ -551,7 +720,7 @@ nfc_nfkc(PyObject *input, int k)
i1++; i1++;
continue; continue;
} }
l = find_nfc_index(nfc_last, *i1); l = find_nfc_index(self, nfc_last, *i1);
/* *i1 cannot be combined with *i. If *i1 /* *i1 cannot be combined with *i. If *i1
is a starter, we don't need to look further. is a starter, we don't need to look further.
Otherwise, record the combining class. */ Otherwise, record the combining class. */
@ -575,7 +744,7 @@ nfc_nfkc(PyObject *input, int k)
/* Mark the second character unused. */ /* Mark the second character unused. */
skipped[cskipped++] = i1; skipped[cskipped++] = i1;
i1++; i1++;
f = find_nfc_index(nfc_first, *i); f = find_nfc_index(self, nfc_first, *i);
if (f == -1) if (f == -1)
break; break;
} }
@ -610,13 +779,13 @@ unicodedata_normalize(PyObject *self, PyObject *args)
} }
if (strcmp(form, "NFC") == 0) if (strcmp(form, "NFC") == 0)
return nfc_nfkc(input, 0); return nfc_nfkc(self, input, 0);
if (strcmp(form, "NFKC") == 0) if (strcmp(form, "NFKC") == 0)
return nfc_nfkc(input, 1); return nfc_nfkc(self, input, 1);
if (strcmp(form, "NFD") == 0) if (strcmp(form, "NFD") == 0)
return nfd_nfkd(input, 0); return nfd_nfkd(self, input, 0);
if (strcmp(form, "NFKD") == 0) if (strcmp(form, "NFKD") == 0)
return nfd_nfkd(input, 1); return nfd_nfkd(self, input, 1);
PyErr_SetString(PyExc_ValueError, "invalid normalization form"); PyErr_SetString(PyExc_ValueError, "invalid normalization form");
return NULL; return NULL;
} }
@ -686,7 +855,7 @@ is_unified_ideograph(Py_UCS4 code)
} }
static int static int
_getucname(Py_UCS4 code, char* buffer, int buflen) _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
{ {
int offset; int offset;
int i; int i;
@ -726,6 +895,15 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
if (code >= 0x110000) if (code >= 0x110000)
return 0; return 0;
if (self) {
const change_record *old = get_old_record(self, code);
if (old->category_changed == 0) {
/* unassigned */
return 0;
}
}
/* get offset into phrasebook */ /* get offset into phrasebook */
offset = phrasebook_offset1[(code>>phrasebook_shift)]; offset = phrasebook_offset1[(code>>phrasebook_shift)];
offset = phrasebook_offset2[(offset<<phrasebook_shift) + offset = phrasebook_offset2[(offset<<phrasebook_shift) +
@ -768,12 +946,12 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
} }
static int static int
_cmpname(int code, const char* name, int namelen) _cmpname(PyObject *self, int code, const char* name, int namelen)
{ {
/* check if code corresponds to the given name */ /* check if code corresponds to the given name */
int i; int i;
char buffer[NAME_MAXLEN]; char buffer[NAME_MAXLEN];
if (!_getucname(code, buffer, sizeof(buffer))) if (!_getucname(self, code, buffer, sizeof(buffer)))
return 0; return 0;
for (i = 0; i < namelen; i++) { for (i = 0; i < namelen; i++) {
if (toupper(name[i]) != buffer[i]) if (toupper(name[i]) != buffer[i])
@ -803,7 +981,7 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
} }
static int static int
_getcode(const char* name, int namelen, Py_UCS4* code) _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
{ {
unsigned int h, v; unsigned int h, v;
unsigned int mask = code_size-1; unsigned int mask = code_size-1;
@ -860,7 +1038,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
v = code_hash[i]; v = code_hash[i];
if (!v) if (!v)
return 0; return 0;
if (_cmpname(v, name, namelen)) { if (_cmpname(self, v, name, namelen)) {
*code = v; *code = v;
return 1; return 1;
} }
@ -872,7 +1050,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
v = code_hash[i]; v = code_hash[i];
if (!v) if (!v)
return 0; return 0;
if (_cmpname(v, name, namelen)) { if (_cmpname(self, v, name, namelen)) {
*code = v; *code = v;
return 1; return 1;
} }
@ -914,8 +1092,8 @@ unicodedata_name(PyObject* self, PyObject* args)
return NULL; return NULL;
} }
if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v), if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
name, sizeof(name))) { name, sizeof(name))) {
if (defobj == NULL) { if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError, "no such name"); PyErr_SetString(PyExc_ValueError, "no such name");
return NULL; return NULL;
@ -947,7 +1125,7 @@ unicodedata_lookup(PyObject* self, PyObject* args)
if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen)) if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
return NULL; return NULL;
if (!_getcode(name, namelen, &code)) { if (!_getcode(self, name, namelen, &code)) {
char fmt[] = "undefined character name '%s'"; char fmt[] = "undefined character name '%s'";
char *buf = PyMem_MALLOC(sizeof(fmt) + namelen); char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
sprintf(buf, fmt, name); sprintf(buf, fmt, name);
@ -985,6 +1163,8 @@ static PyMethodDef unicodedata_functions[] = {
{NULL, NULL} /* sentinel */ {NULL, NULL} /* sentinel */
}; };
PyDoc_STRVAR(unicodedata_docstring, PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\ "This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\ defines character properties for all Unicode characters. The data in\n\
@ -1007,6 +1187,11 @@ initunicodedata(void)
PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION); PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
/* Previous versions */
v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
if (v != NULL)
PyModule_AddObject(m, "db_3_2_0", v);
/* Export C API */ /* Export C API */
v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL); v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
if (v != NULL) if (v != NULL)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1898,7 +1898,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
/* found a name. look it up in the unicode database */ /* found a name. look it up in the unicode database */
message = "unknown Unicode character name"; message = "unknown Unicode character name";
s++; s++;
if (ucnhash_CAPI->getcode(start, (int)(s-start-1), &chr)) if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
goto store; goto store;
} }
} }

File diff suppressed because it is too large Load Diff

View File

@ -26,13 +26,15 @@
import sys import sys
SCRIPT = sys.argv[0] SCRIPT = sys.argv[0]
VERSION = "2.3" VERSION = "2.5"
# The Unicode Database # The Unicode Database
UNIDATA_VERSION = "3.2.0" UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData.txt" UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth.txt" EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
old_versions = ["3.2.0"]
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@ -57,13 +59,23 @@ UPPER_MASK = 0x80
def maketables(trace=0): def maketables(trace=0):
print "--- Reading", UNICODE_DATA, "..." print "--- Reading", UNICODE_DATA % "", "..."
unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS, version = ""
EASTASIAN_WIDTH) unicode = UnicodeData(UNICODE_DATA % version,
COMPOSITION_EXCLUSIONS % version,
EASTASIAN_WIDTH % version)
print len(filter(None, unicode.table)), "characters" print len(filter(None, unicode.table)), "characters"
for version in old_versions:
print "--- Reading", UNICODE_DATA % ("-"+version), "..."
old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
COMPOSITION_EXCLUSIONS % ("-"+version),
EASTASIAN_WIDTH % ("-"+version))
print len(filter(None, old_unicode.table)), "characters"
merge_old_version(version, unicode, old_unicode)
makeunicodename(unicode, trace) makeunicodename(unicode, trace)
makeunicodedata(unicode, trace) makeunicodedata(unicode, trace)
makeunicodetype(unicode, trace) makeunicodetype(unicode, trace)
@ -119,6 +131,8 @@ def makeunicodedata(unicode, trace):
if record: if record:
if record[5]: if record[5]:
decomp = record[5].split() decomp = record[5].split()
if len(decomp) > 19:
raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
# prefix # prefix
if decomp[0][0] == "<": if decomp[0][0] == "<":
prefix = decomp.pop(0) prefix = decomp.pop(0)
@ -278,6 +292,44 @@ def makeunicodedata(unicode, trace):
Array("comp_index", index).dump(fp, trace) Array("comp_index", index).dump(fp, trace)
Array("comp_data", index2).dump(fp, trace) Array("comp_data", index2).dump(fp, trace)
# Generate delta tables for old versions
for version, table, normalization in unicode.changed:
cversion = version.replace(".","_")
records = [table[0]]
cache = {table[0]:0}
index = [0] * len(table)
for i, record in enumerate(table):
try:
index[i] = cache[record]
except KeyError:
index[i] = cache[record] = len(records)
records.append(record)
index1, index2, shift = splitbins(index, trace)
print >>fp, "static const change_record change_records_%s[] = {" % cversion
for record in records:
print >>fp, "\t{ %s }," % ", ".join(map(str,record))
print >>fp, "};"
Array("changes_%s_index" % cversion, index1).dump(fp, trace)
Array("changes_%s_data" % cversion, index2).dump(fp, trace)
print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
print >>fp, "{"
print >>fp, "\tint index;"
print >>fp, "\tif (n >= 0x110000) index = 0;"
print >>fp, "\telse {"
print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
(cversion, shift, ((1<<shift)-1))
print >>fp, "\t}"
print >>fp, "\treturn change_records_%s+index;" % cversion
print >>fp, "}\n"
print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
print >>fp, "{"
print >>fp, "\tswitch(n) {"
for k, v in normalization:
print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
print >>fp, "\tdefault: return 0;"
print >>fp, "\t}\n}\n"
fp.close() fp.close()
# -------------------------------------------------------------------- # --------------------------------------------------------------------
@ -540,6 +592,82 @@ def makeunicodename(unicode, trace):
fp.close() fp.close()
def merge_old_version(version, new, old):
    """Record how the `old` database differs from `new`.

    `new` and `old` are UnicodeData-like objects providing `exclusions`
    and `table`; `table` holds one record per code point below 0x110000,
    or None where the code point is unassigned.

    The result is appended to `new.changed` as a tuple
    (version, records, normalization_changes).  `records` is a list with
    one (bidir, category, decimal, numeric) tuple per code point, in the
    field order of the C `change_record` struct.  `normalization_changes`
    is a list of (code point, old decomposition) pairs.

    Raises NotImplementedError if the exclusions differ, ValueError if an
    old value cannot be represented in a change record, and Difference
    (an Exception subclass local to this function) if a field changed
    for which no handling exists.
    """

    class Difference(Exception):
        pass

    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF] * 0x110000
    category_changes = [0xFF] * 0x110000
    decimal_changes = [0xFF] * 0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []
    # Fields whose changes are deliberately ignored: ISO comment (11) and
    # the simple uppercase/lowercase/titlecase mappings (12, 13, 14).
    ignored_fields = (11, 12, 13, 14)

    for i in range(0x110000):
        new_record = new.table[i]
        old_record = old.table[i]
        if new_record is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            if old_record is not None:
                raise ValueError(
                    "character %X is assigned in version %s but not in "
                    "the current database" % (i, version))
            continue
        # check characters unassigned in the old version
        if old_record is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old_record == new_record:
            continue
        for k, value in enumerate(old_record):
            if value == new_record[k]:
                continue
            if k == 2:
                category_changes[i] = CATEGORY_NAMES.index(value)
            elif k == 4:
                bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
            elif k == 5:
                # We assume that all normalization changes are in 1:1 mappings
                if " " in value:
                    raise ValueError(
                        "character %X: old decomposition %r is not a "
                        "1:1 mapping" % (i, value))
                normalization_changes.append((i, value))
            elif k == 6:
                # We only support changes where the old value is a single
                # digit.  A plain `value in "0123456789"` is a substring
                # test and would also let "" and runs such as "12" through.
                if len(value) != 1 or value not in "0123456789":
                    raise ValueError(
                        "character %X: old decimal value %r is not a "
                        "single digit" % (i, value))
                decimal_changes[i] = int(value)
            elif k == 8:
                # Since 0 encodes "no change" and -1 encodes "no numeric
                # value", the old value had better be neither of them.
                if value == "0" or value == "-1":
                    raise ValueError(
                        "character %X: old numeric value %r collides with "
                        "a reserved marker" % (i, value))
                if not value:
                    numeric_changes[i] = -1
                elif re.match("^[0-9]+$", value):
                    numeric_changes[i] = int(value)
                else:
                    raise ValueError(
                        "character %X: old numeric value %r is not a "
                        "plain integer" % (i, value))
            elif k in ignored_fields:
                pass
            else:
                raise Difference(hex(i), k, old_record, new_record)

    # list() keeps `records` indexable regardless of whether zip() returns
    # a list or an iterator.
    records = list(zip(bidir_changes, category_changes,
                       decimal_changes, numeric_changes))
    new.changed.append((version, records, normalization_changes))
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# the following support code is taken from the unidb utilities # the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB # Copyright (c) 1999-2000 by Secret Labs AB
@ -551,6 +679,7 @@ import sys
class UnicodeData: class UnicodeData:
def __init__(self, filename, exclusions, eastasianwidth, expand=1): def __init__(self, filename, exclusions, eastasianwidth, expand=1):
self.changed = []
file = open(filename) file = open(filename)
table = [None] * 0x110000 table = [None] * 0x110000
while 1: while 1:
@ -569,13 +698,14 @@ class UnicodeData:
if s: if s:
if s[1][-6:] == "First>": if s[1][-6:] == "First>":
s[1] = "" s[1] = ""
field = s[:] field = s
elif s[1][-5:] == "Last>": elif s[1][-5:] == "Last>":
s[1] = "" s[1] = ""
field = None field = None
elif field: elif field:
field[0] = hex(i) f2 = field[:]
table[i] = field f2[0] = "%X" % i
table[i] = f2
# public attributes # public attributes
self.filename = filename self.filename = filename