Update Unicode database to Unicode 4.1.
This commit is contained in:
parent
e2b4677253
commit
480f1bb67b
|
@ -14,11 +14,11 @@
|
||||||
This module provides access to the Unicode Character Database which
|
This module provides access to the Unicode Character Database which
|
||||||
defines character properties for all Unicode characters. The data in
|
defines character properties for all Unicode characters. The data in
|
||||||
this database is based on the \file{UnicodeData.txt} file version
|
this database is based on the \file{UnicodeData.txt} file version
|
||||||
3.2.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
|
4.1.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
|
||||||
|
|
||||||
The module uses the same names and symbols as defined by the
|
The module uses the same names and symbols as defined by the
|
||||||
UnicodeData File Format 3.2.0 (see
|
UnicodeData File Format 4.1.0 (see
|
||||||
\url{http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html}). It
|
\url{http://www.unicode.org/Public/4.1-Update/UnicodeData-4.1.0.html}). It
|
||||||
defines the following functions:
|
defines the following functions:
|
||||||
|
|
||||||
\begin{funcdesc}{lookup}{name}
|
\begin{funcdesc}{lookup}{name}
|
||||||
|
@ -130,3 +130,12 @@ The version of the Unicode database used in this module.
|
||||||
|
|
||||||
\versionadded{2.3}
|
\versionadded{2.3}
|
||||||
\end{datadesc}
|
\end{datadesc}
|
||||||
|
|
||||||
|
\begin{datadesc}{db_3_2_0}
|
||||||
|
This is an object that has the same methods as the entire
|
||||||
|
module, but uses the Unicode database version 3.2 instead,
|
||||||
|
for applications that require this specific version of
|
||||||
|
the Unicode database (such as IDNA).
|
||||||
|
|
||||||
|
\versionadded{2.5}
|
||||||
|
\end{datadesc}
|
||||||
|
|
|
@ -14,12 +14,14 @@ typedef struct {
|
||||||
int size;
|
int size;
|
||||||
|
|
||||||
/* Get name for a given character code. Returns non-zero if
|
/* Get name for a given character code. Returns non-zero if
|
||||||
success, zero if not. Does not set Python exceptions. */
|
success, zero if not. Does not set Python exceptions.
|
||||||
int (*getname)(Py_UCS4 code, char* buffer, int buflen);
|
If self is NULL, data come from the default version of the database.
|
||||||
|
If it is not NULL, it should be a unicodedata.db_X_Y_Z object */
|
||||||
|
int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen);
|
||||||
|
|
||||||
/* Get character code for a given name. Same error handling
|
/* Get character code for a given name. Same error handling
|
||||||
as for getname. */
|
as for getname. */
|
||||||
int (*getcode)(const char* name, int namelen, Py_UCS4* code);
|
int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code);
|
||||||
|
|
||||||
} _PyUnicode_Name_CAPI;
|
} _PyUnicode_Name_CAPI;
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
|
# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
|
||||||
|
|
||||||
import stringprep, unicodedata, re, codecs
|
import stringprep, re, codecs
|
||||||
|
from unicodedata import db_3_2_0 as unicodedata
|
||||||
|
|
||||||
# IDNA section 3.1
|
# IDNA section 3.1
|
||||||
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
|
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
|
||||||
|
|
|
@ -5,7 +5,7 @@ There are two kinds of tables: sets, for which a member test is provided,
|
||||||
and mappings, for which a mapping function is provided.
|
and mappings, for which a mapping function is provided.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import unicodedata
|
from unicodedata import db_3_2_0 as unicodedata
|
||||||
|
|
||||||
assert unicodedata.unidata_version == '3.2.0'
|
assert unicodedata.unidata_version == '3.2.0'
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,7 @@ encoding = 'utf-8'
|
||||||
class UnicodeMethodsTest(unittest.TestCase):
|
class UnicodeMethodsTest(unittest.TestCase):
|
||||||
|
|
||||||
# update this, if the database changes
|
# update this, if the database changes
|
||||||
expectedchecksum = 'a37276dc2c158bef6dfd908ad34525c97180fad9'
|
expectedchecksum = 'a6555cd209d960dcfa17bfdce0c96d91cfa9a9ba'
|
||||||
|
|
||||||
def test_method_checksum(self):
|
def test_method_checksum(self):
|
||||||
h = sha.sha()
|
h = sha.sha()
|
||||||
|
@ -75,7 +75,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
|
||||||
class UnicodeFunctionsTest(UnicodeDatabaseTest):
|
class UnicodeFunctionsTest(UnicodeDatabaseTest):
|
||||||
|
|
||||||
# update this, if the database changes
|
# update this, if the database changes
|
||||||
expectedchecksum = 'cfe20a967a450ebc82ca68c3e4eed344164e11af'
|
expectedchecksum = 'b45b79f3203ee1a896d9b5655484adaff5d4964b'
|
||||||
|
|
||||||
def test_function_checksum(self):
|
def test_function_checksum(self):
|
||||||
data = []
|
data = []
|
||||||
|
|
|
@ -279,6 +279,10 @@ Core and builtins
|
||||||
Extension Modules
|
Extension Modules
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- The unicodedata module was updated to the 4.1 version of the Unicode
|
||||||
|
database. The 3.2 version is still available as unicodedata.db_3_2_0
|
||||||
|
for applications that require this specific version (such as IDNA).
|
||||||
|
|
||||||
- The timing module is no longer built by default. It was deprecated
|
- The timing module is no longer built by default. It was deprecated
|
||||||
in PEP 4 in Python 2.0 or earlier.
|
in PEP 4 in Python 2.0 or earlier.
|
||||||
|
|
||||||
|
|
|
@ -14,6 +14,7 @@
|
||||||
|
|
||||||
#include "Python.h"
|
#include "Python.h"
|
||||||
#include "ucnhash.h"
|
#include "ucnhash.h"
|
||||||
|
#include "structmember.h"
|
||||||
|
|
||||||
/* character properties */
|
/* character properties */
|
||||||
|
|
||||||
|
@ -28,6 +29,14 @@ typedef struct {
|
||||||
_PyUnicode_EastAsianWidth */
|
_PyUnicode_EastAsianWidth */
|
||||||
} _PyUnicode_DatabaseRecord;
|
} _PyUnicode_DatabaseRecord;
|
||||||
|
|
||||||
|
typedef struct change_record {
|
||||||
|
/* sequence of fields should be the same as in merge_old_version */
|
||||||
|
const unsigned char bidir_changed;
|
||||||
|
const unsigned char category_changed;
|
||||||
|
const unsigned char decimal_changed;
|
||||||
|
const int numeric_changed;
|
||||||
|
} change_record;
|
||||||
|
|
||||||
/* data file generated by Tools/unicode/makeunicodedata.py */
|
/* data file generated by Tools/unicode/makeunicodedata.py */
|
||||||
#include "unicodedata_db.h"
|
#include "unicodedata_db.h"
|
||||||
|
|
||||||
|
@ -51,6 +60,85 @@ _getrecord(PyUnicodeObject* v)
|
||||||
return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
|
return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ------------- Previous-version API ------------------------------------- */
|
||||||
|
typedef struct previous_version {
|
||||||
|
PyObject_HEAD
|
||||||
|
const char *name;
|
||||||
|
const change_record* (*getrecord)(Py_UCS4);
|
||||||
|
Py_UCS4 (*normalization)(Py_UCS4);
|
||||||
|
} PreviousDBVersion;
|
||||||
|
|
||||||
|
#define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
|
||||||
|
|
||||||
|
/* Forward declaration */
|
||||||
|
static PyMethodDef unicodedata_functions[];
|
||||||
|
|
||||||
|
static PyMemberDef DB_members[] = {
|
||||||
|
{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
|
||||||
|
{NULL}
|
||||||
|
};
|
||||||
|
|
||||||
|
static PyTypeObject Xxo_Type = {
|
||||||
|
/* The ob_type field must be initialized in the module init function
|
||||||
|
* to be portable to Windows without using C++. */
|
||||||
|
PyObject_HEAD_INIT(NULL)
|
||||||
|
0, /*ob_size*/
|
||||||
|
"unicodedata.DB", /*tp_name*/
|
||||||
|
sizeof(PreviousDBVersion), /*tp_basicsize*/
|
||||||
|
0, /*tp_itemsize*/
|
||||||
|
/* methods */
|
||||||
|
(destructor)PyObject_Del, /*tp_dealloc*/
|
||||||
|
0, /*tp_print*/
|
||||||
|
0, /*tp_getattr*/
|
||||||
|
0, /*tp_setattr*/
|
||||||
|
0, /*tp_compare*/
|
||||||
|
0, /*tp_repr*/
|
||||||
|
0, /*tp_as_number*/
|
||||||
|
0, /*tp_as_sequence*/
|
||||||
|
0, /*tp_as_mapping*/
|
||||||
|
0, /*tp_hash*/
|
||||||
|
0, /*tp_call*/
|
||||||
|
0, /*tp_str*/
|
||||||
|
PyObject_GenericGetAttr,/*tp_getattro*/
|
||||||
|
0, /*tp_setattro*/
|
||||||
|
0, /*tp_as_buffer*/
|
||||||
|
Py_TPFLAGS_DEFAULT, /*tp_flags*/
|
||||||
|
0, /*tp_doc*/
|
||||||
|
0, /*tp_traverse*/
|
||||||
|
0, /*tp_clear*/
|
||||||
|
0, /*tp_richcompare*/
|
||||||
|
0, /*tp_weaklistoffset*/
|
||||||
|
0, /*tp_iter*/
|
||||||
|
0, /*tp_iternext*/
|
||||||
|
unicodedata_functions, /*tp_methods*/
|
||||||
|
DB_members, /*tp_members*/
|
||||||
|
0, /*tp_getset*/
|
||||||
|
0, /*tp_base*/
|
||||||
|
0, /*tp_dict*/
|
||||||
|
0, /*tp_descr_get*/
|
||||||
|
0, /*tp_descr_set*/
|
||||||
|
0, /*tp_dictoffset*/
|
||||||
|
0, /*tp_init*/
|
||||||
|
0, /*tp_alloc*/
|
||||||
|
0, /*tp_new*/
|
||||||
|
0, /*tp_free*/
|
||||||
|
0, /*tp_is_gc*/
|
||||||
|
};
|
||||||
|
|
||||||
|
static PyObject*
|
||||||
|
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
|
||||||
|
Py_UCS4 (*normalization)(Py_UCS4))
|
||||||
|
{
|
||||||
|
PreviousDBVersion *self;
|
||||||
|
self = PyObject_New(PreviousDBVersion, &Xxo_Type);
|
||||||
|
if (self == NULL)
|
||||||
|
return NULL;
|
||||||
|
self->name = name;
|
||||||
|
self->getrecord = getrecord;
|
||||||
|
self->normalization = normalization;
|
||||||
|
return (PyObject*)self;
|
||||||
|
}
|
||||||
|
|
||||||
/* --- Module API --------------------------------------------------------- */
|
/* --- Module API --------------------------------------------------------- */
|
||||||
|
|
||||||
PyDoc_STRVAR(unicodedata_decimal__doc__,
|
PyDoc_STRVAR(unicodedata_decimal__doc__,
|
||||||
|
@ -65,6 +153,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
|
||||||
{
|
{
|
||||||
PyUnicodeObject *v;
|
PyUnicodeObject *v;
|
||||||
PyObject *defobj = NULL;
|
PyObject *defobj = NULL;
|
||||||
|
int have_old = 0;
|
||||||
long rc;
|
long rc;
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
|
if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
|
||||||
|
@ -74,7 +163,22 @@ unicodedata_decimal(PyObject *self, PyObject *args)
|
||||||
"need a single Unicode character as parameter");
|
"need a single Unicode character as parameter");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
|
|
||||||
|
if (self) {
|
||||||
|
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||||
|
if (old->category_changed == 0) {
|
||||||
|
/* unassigned */
|
||||||
|
have_old = 1;
|
||||||
|
rc = -1;
|
||||||
|
}
|
||||||
|
else if (old->decimal_changed != 0xFF) {
|
||||||
|
have_old = 1;
|
||||||
|
rc = old->decimal_changed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!have_old)
|
||||||
|
rc = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(v));
|
||||||
if (rc < 0) {
|
if (rc < 0) {
|
||||||
if (defobj == NULL) {
|
if (defobj == NULL) {
|
||||||
PyErr_SetString(PyExc_ValueError,
|
PyErr_SetString(PyExc_ValueError,
|
||||||
|
@ -136,6 +240,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
|
||||||
{
|
{
|
||||||
PyUnicodeObject *v;
|
PyUnicodeObject *v;
|
||||||
PyObject *defobj = NULL;
|
PyObject *defobj = NULL;
|
||||||
|
int have_old = 0;
|
||||||
double rc;
|
double rc;
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
|
if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
|
||||||
|
@ -145,7 +250,22 @@ unicodedata_numeric(PyObject *self, PyObject *args)
|
||||||
"need a single Unicode character as parameter");
|
"need a single Unicode character as parameter");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
|
|
||||||
|
if (self) {
|
||||||
|
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||||
|
if (old->category_changed == 0) {
|
||||||
|
/* unassigned */
|
||||||
|
have_old = 1;
|
||||||
|
rc = -1;
|
||||||
|
}
|
||||||
|
else if (old->decimal_changed != 0xFF) {
|
||||||
|
have_old = 1;
|
||||||
|
rc = old->decimal_changed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!have_old)
|
||||||
|
rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
|
||||||
if (rc < 0) {
|
if (rc < 0) {
|
||||||
if (defobj == NULL) {
|
if (defobj == NULL) {
|
||||||
PyErr_SetString(PyExc_ValueError, "not a numeric character");
|
PyErr_SetString(PyExc_ValueError, "not a numeric character");
|
||||||
|
@ -180,6 +300,11 @@ unicodedata_category(PyObject *self, PyObject *args)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
index = (int) _getrecord(v)->category;
|
index = (int) _getrecord(v)->category;
|
||||||
|
if (self) {
|
||||||
|
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||||
|
if (old->category_changed != 0xFF)
|
||||||
|
index = old->category_changed;
|
||||||
|
}
|
||||||
return PyString_FromString(_PyUnicode_CategoryNames[index]);
|
return PyString_FromString(_PyUnicode_CategoryNames[index]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -205,6 +330,13 @@ unicodedata_bidirectional(PyObject *self, PyObject *args)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
index = (int) _getrecord(v)->bidirectional;
|
index = (int) _getrecord(v)->bidirectional;
|
||||||
|
if (self) {
|
||||||
|
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||||
|
if (old->category_changed == 0)
|
||||||
|
index = 0; /* unassigned */
|
||||||
|
else if (old->bidir_changed != 0xFF)
|
||||||
|
index = old->bidir_changed;
|
||||||
|
}
|
||||||
return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
|
return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -219,6 +351,7 @@ static PyObject *
|
||||||
unicodedata_combining(PyObject *self, PyObject *args)
|
unicodedata_combining(PyObject *self, PyObject *args)
|
||||||
{
|
{
|
||||||
PyUnicodeObject *v;
|
PyUnicodeObject *v;
|
||||||
|
int index;
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "O!:combining",
|
if (!PyArg_ParseTuple(args, "O!:combining",
|
||||||
&PyUnicode_Type, &v))
|
&PyUnicode_Type, &v))
|
||||||
|
@ -228,7 +361,13 @@ unicodedata_combining(PyObject *self, PyObject *args)
|
||||||
"need a single Unicode character as parameter");
|
"need a single Unicode character as parameter");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
return PyInt_FromLong((int) _getrecord(v)->combining);
|
index = (int) _getrecord(v)->combining;
|
||||||
|
if (self) {
|
||||||
|
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||||
|
if (old->category_changed == 0)
|
||||||
|
index = 0; /* unassigned */
|
||||||
|
}
|
||||||
|
return PyInt_FromLong(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(unicodedata_mirrored__doc__,
|
PyDoc_STRVAR(unicodedata_mirrored__doc__,
|
||||||
|
@ -242,6 +381,7 @@ static PyObject *
|
||||||
unicodedata_mirrored(PyObject *self, PyObject *args)
|
unicodedata_mirrored(PyObject *self, PyObject *args)
|
||||||
{
|
{
|
||||||
PyUnicodeObject *v;
|
PyUnicodeObject *v;
|
||||||
|
int index;
|
||||||
|
|
||||||
if (!PyArg_ParseTuple(args, "O!:mirrored",
|
if (!PyArg_ParseTuple(args, "O!:mirrored",
|
||||||
&PyUnicode_Type, &v))
|
&PyUnicode_Type, &v))
|
||||||
|
@ -251,7 +391,13 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
|
||||||
"need a single Unicode character as parameter");
|
"need a single Unicode character as parameter");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
return PyInt_FromLong((int) _getrecord(v)->mirrored);
|
index = (int) _getrecord(v)->mirrored;
|
||||||
|
if (self) {
|
||||||
|
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||||
|
if (old->category_changed == 0)
|
||||||
|
index = 0; /* unassigned */
|
||||||
|
}
|
||||||
|
return PyInt_FromLong(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
|
PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
|
||||||
|
@ -275,6 +421,11 @@ unicodedata_east_asian_width(PyObject *self, PyObject *args)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
index = (int) _getrecord(v)->east_asian_width;
|
index = (int) _getrecord(v)->east_asian_width;
|
||||||
|
if (self) {
|
||||||
|
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||||
|
if (old->category_changed == 0)
|
||||||
|
index = 0; /* unassigned */
|
||||||
|
}
|
||||||
return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
|
return PyString_FromString(_PyUnicode_EastAsianWidthNames[index]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -303,6 +454,12 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
|
||||||
|
|
||||||
code = (int) *PyUnicode_AS_UNICODE(v);
|
code = (int) *PyUnicode_AS_UNICODE(v);
|
||||||
|
|
||||||
|
if (self) {
|
||||||
|
const change_record *old = get_old_record(self, *PyUnicode_AS_UNICODE(v));
|
||||||
|
if (old->category_changed == 0)
|
||||||
|
return PyString_FromString(""); /* unassigned */
|
||||||
|
}
|
||||||
|
|
||||||
if (code < 0 || code >= 0x110000)
|
if (code < 0 || code >= 0x110000)
|
||||||
index = 0;
|
index = 0;
|
||||||
else {
|
else {
|
||||||
|
@ -337,11 +494,14 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
|
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
|
||||||
{
|
{
|
||||||
if (code >= 0x110000) {
|
if (code >= 0x110000) {
|
||||||
*index = 0;
|
*index = 0;
|
||||||
}
|
} else if (self && get_old_record(self, code)->category_changed==0) {
|
||||||
|
/* unassigned in old version */
|
||||||
|
*index = 0;
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
*index = decomp_index1[(code>>DECOMP_SHIFT)];
|
*index = decomp_index1[(code>>DECOMP_SHIFT)];
|
||||||
*index = decomp_index2[(*index<<DECOMP_SHIFT)+
|
*index = decomp_index2[(*index<<DECOMP_SHIFT)+
|
||||||
|
@ -367,7 +527,7 @@ get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
|
||||||
#define SCount (LCount*NCount)
|
#define SCount (LCount*NCount)
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
nfd_nfkd(PyObject *input, int k)
|
nfd_nfkd(PyObject *self, PyObject *input, int k)
|
||||||
{
|
{
|
||||||
PyObject *result;
|
PyObject *result;
|
||||||
Py_UNICODE *i, *end, *o;
|
Py_UNICODE *i, *end, *o;
|
||||||
|
@ -416,8 +576,17 @@ nfd_nfkd(PyObject *input, int k)
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* Other decompoistions. */
|
/* normalization changes */
|
||||||
get_decomp_record(code, &index, &prefix, &count);
|
if (self) {
|
||||||
|
Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
|
||||||
|
if (value != 0) {
|
||||||
|
stack[stackptr++] = value;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Other decompositions. */
|
||||||
|
get_decomp_record(self, code, &index, &prefix, &count);
|
||||||
|
|
||||||
/* Copy character if it is not decomposable, or has a
|
/* Copy character if it is not decomposable, or has a
|
||||||
compatibility decomposition, but we do NFD. */
|
compatibility decomposition, but we do NFD. */
|
||||||
|
@ -467,7 +636,7 @@ nfd_nfkd(PyObject *input, int k)
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
find_nfc_index(struct reindex* nfc, Py_UNICODE code)
|
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
|
||||||
{
|
{
|
||||||
int index;
|
int index;
|
||||||
for (index = 0; nfc[index].start; index++) {
|
for (index = 0; nfc[index].start; index++) {
|
||||||
|
@ -483,7 +652,7 @@ find_nfc_index(struct reindex* nfc, Py_UNICODE code)
|
||||||
}
|
}
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
nfc_nfkc(PyObject *input, int k)
|
nfc_nfkc(PyObject *self, PyObject *input, int k)
|
||||||
{
|
{
|
||||||
PyObject *result;
|
PyObject *result;
|
||||||
Py_UNICODE *i, *i1, *o, *end;
|
Py_UNICODE *i, *i1, *o, *end;
|
||||||
|
@ -492,7 +661,7 @@ nfc_nfkc(PyObject *input, int k)
|
||||||
Py_UNICODE *skipped[20];
|
Py_UNICODE *skipped[20];
|
||||||
int cskipped = 0;
|
int cskipped = 0;
|
||||||
|
|
||||||
result = nfd_nfkd(input, k);
|
result = nfd_nfkd(self, input, k);
|
||||||
if (!result)
|
if (!result)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
@ -536,7 +705,7 @@ nfc_nfkc(PyObject *input, int k)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
f = find_nfc_index(nfc_first, *i);
|
f = find_nfc_index(self, nfc_first, *i);
|
||||||
if (f == -1) {
|
if (f == -1) {
|
||||||
*o++ = *i++;
|
*o++ = *i++;
|
||||||
continue;
|
continue;
|
||||||
|
@ -551,7 +720,7 @@ nfc_nfkc(PyObject *input, int k)
|
||||||
i1++;
|
i1++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
l = find_nfc_index(nfc_last, *i1);
|
l = find_nfc_index(self, nfc_last, *i1);
|
||||||
/* *i1 cannot be combined with *i. If *i1
|
/* *i1 cannot be combined with *i. If *i1
|
||||||
is a starter, we don't need to look further.
|
is a starter, we don't need to look further.
|
||||||
Otherwise, record the combining class. */
|
Otherwise, record the combining class. */
|
||||||
|
@ -575,7 +744,7 @@ nfc_nfkc(PyObject *input, int k)
|
||||||
/* Mark the second character unused. */
|
/* Mark the second character unused. */
|
||||||
skipped[cskipped++] = i1;
|
skipped[cskipped++] = i1;
|
||||||
i1++;
|
i1++;
|
||||||
f = find_nfc_index(nfc_first, *i);
|
f = find_nfc_index(self, nfc_first, *i);
|
||||||
if (f == -1)
|
if (f == -1)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -610,13 +779,13 @@ unicodedata_normalize(PyObject *self, PyObject *args)
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strcmp(form, "NFC") == 0)
|
if (strcmp(form, "NFC") == 0)
|
||||||
return nfc_nfkc(input, 0);
|
return nfc_nfkc(self, input, 0);
|
||||||
if (strcmp(form, "NFKC") == 0)
|
if (strcmp(form, "NFKC") == 0)
|
||||||
return nfc_nfkc(input, 1);
|
return nfc_nfkc(self, input, 1);
|
||||||
if (strcmp(form, "NFD") == 0)
|
if (strcmp(form, "NFD") == 0)
|
||||||
return nfd_nfkd(input, 0);
|
return nfd_nfkd(self, input, 0);
|
||||||
if (strcmp(form, "NFKD") == 0)
|
if (strcmp(form, "NFKD") == 0)
|
||||||
return nfd_nfkd(input, 1);
|
return nfd_nfkd(self, input, 1);
|
||||||
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
|
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
@ -686,7 +855,7 @@ is_unified_ideograph(Py_UCS4 code)
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
_getucname(Py_UCS4 code, char* buffer, int buflen)
|
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen)
|
||||||
{
|
{
|
||||||
int offset;
|
int offset;
|
||||||
int i;
|
int i;
|
||||||
|
@ -726,6 +895,15 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
|
||||||
if (code >= 0x110000)
|
if (code >= 0x110000)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
|
if (self) {
|
||||||
|
const change_record *old = get_old_record(self, code);
|
||||||
|
if (old->category_changed == 0) {
|
||||||
|
/* unassigned */
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/* get offset into phrasebook */
|
/* get offset into phrasebook */
|
||||||
offset = phrasebook_offset1[(code>>phrasebook_shift)];
|
offset = phrasebook_offset1[(code>>phrasebook_shift)];
|
||||||
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
|
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
|
||||||
|
@ -768,12 +946,12 @@ _getucname(Py_UCS4 code, char* buffer, int buflen)
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
_cmpname(int code, const char* name, int namelen)
|
_cmpname(PyObject *self, int code, const char* name, int namelen)
|
||||||
{
|
{
|
||||||
/* check if code corresponds to the given name */
|
/* check if code corresponds to the given name */
|
||||||
int i;
|
int i;
|
||||||
char buffer[NAME_MAXLEN];
|
char buffer[NAME_MAXLEN];
|
||||||
if (!_getucname(code, buffer, sizeof(buffer)))
|
if (!_getucname(self, code, buffer, sizeof(buffer)))
|
||||||
return 0;
|
return 0;
|
||||||
for (i = 0; i < namelen; i++) {
|
for (i = 0; i < namelen; i++) {
|
||||||
if (toupper(name[i]) != buffer[i])
|
if (toupper(name[i]) != buffer[i])
|
||||||
|
@ -803,7 +981,7 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
_getcode(const char* name, int namelen, Py_UCS4* code)
|
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code)
|
||||||
{
|
{
|
||||||
unsigned int h, v;
|
unsigned int h, v;
|
||||||
unsigned int mask = code_size-1;
|
unsigned int mask = code_size-1;
|
||||||
|
@ -860,7 +1038,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
|
||||||
v = code_hash[i];
|
v = code_hash[i];
|
||||||
if (!v)
|
if (!v)
|
||||||
return 0;
|
return 0;
|
||||||
if (_cmpname(v, name, namelen)) {
|
if (_cmpname(self, v, name, namelen)) {
|
||||||
*code = v;
|
*code = v;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
@ -872,7 +1050,7 @@ _getcode(const char* name, int namelen, Py_UCS4* code)
|
||||||
v = code_hash[i];
|
v = code_hash[i];
|
||||||
if (!v)
|
if (!v)
|
||||||
return 0;
|
return 0;
|
||||||
if (_cmpname(v, name, namelen)) {
|
if (_cmpname(self, v, name, namelen)) {
|
||||||
*code = v;
|
*code = v;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
@ -914,8 +1092,8 @@ unicodedata_name(PyObject* self, PyObject* args)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(v),
|
if (!_getucname(self, (Py_UCS4) *PyUnicode_AS_UNICODE(v),
|
||||||
name, sizeof(name))) {
|
name, sizeof(name))) {
|
||||||
if (defobj == NULL) {
|
if (defobj == NULL) {
|
||||||
PyErr_SetString(PyExc_ValueError, "no such name");
|
PyErr_SetString(PyExc_ValueError, "no such name");
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -947,7 +1125,7 @@ unicodedata_lookup(PyObject* self, PyObject* args)
|
||||||
if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
|
if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
if (!_getcode(name, namelen, &code)) {
|
if (!_getcode(self, name, namelen, &code)) {
|
||||||
char fmt[] = "undefined character name '%s'";
|
char fmt[] = "undefined character name '%s'";
|
||||||
char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
|
char *buf = PyMem_MALLOC(sizeof(fmt) + namelen);
|
||||||
sprintf(buf, fmt, name);
|
sprintf(buf, fmt, name);
|
||||||
|
@ -985,6 +1163,8 @@ static PyMethodDef unicodedata_functions[] = {
|
||||||
{NULL, NULL} /* sentinel */
|
{NULL, NULL} /* sentinel */
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
PyDoc_STRVAR(unicodedata_docstring,
|
PyDoc_STRVAR(unicodedata_docstring,
|
||||||
"This module provides access to the Unicode Character Database which\n\
|
"This module provides access to the Unicode Character Database which\n\
|
||||||
defines character properties for all Unicode characters. The data in\n\
|
defines character properties for all Unicode characters. The data in\n\
|
||||||
|
@ -1007,6 +1187,11 @@ initunicodedata(void)
|
||||||
|
|
||||||
PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
|
PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
|
||||||
|
|
||||||
|
/* Previous versions */
|
||||||
|
v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
|
||||||
|
if (v != NULL)
|
||||||
|
PyModule_AddObject(m, "db_3_2_0", v);
|
||||||
|
|
||||||
/* Export C API */
|
/* Export C API */
|
||||||
v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
|
v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
|
||||||
if (v != NULL)
|
if (v != NULL)
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
22596
Modules/unicodename_db.h
22596
Modules/unicodename_db.h
File diff suppressed because it is too large
Load Diff
|
@ -1898,7 +1898,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
|
||||||
/* found a name. look it up in the unicode database */
|
/* found a name. look it up in the unicode database */
|
||||||
message = "unknown Unicode character name";
|
message = "unknown Unicode character name";
|
||||||
s++;
|
s++;
|
||||||
if (ucnhash_CAPI->getcode(start, (int)(s-start-1), &chr))
|
if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
|
||||||
goto store;
|
goto store;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -26,13 +26,15 @@
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
SCRIPT = sys.argv[0]
|
SCRIPT = sys.argv[0]
|
||||||
VERSION = "2.3"
|
VERSION = "2.5"
|
||||||
|
|
||||||
# The Unicode Database
|
# The Unicode Database
|
||||||
UNIDATA_VERSION = "3.2.0"
|
UNIDATA_VERSION = "4.1.0"
|
||||||
UNICODE_DATA = "UnicodeData.txt"
|
UNICODE_DATA = "UnicodeData%s.txt"
|
||||||
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"
|
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
|
||||||
EASTASIAN_WIDTH = "EastAsianWidth.txt"
|
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
|
||||||
|
|
||||||
|
old_versions = ["3.2.0"]
|
||||||
|
|
||||||
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
|
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
|
||||||
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
|
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
|
||||||
|
@ -57,13 +59,23 @@ UPPER_MASK = 0x80
|
||||||
|
|
||||||
def maketables(trace=0):
|
def maketables(trace=0):
|
||||||
|
|
||||||
print "--- Reading", UNICODE_DATA, "..."
|
print "--- Reading", UNICODE_DATA % "", "..."
|
||||||
|
|
||||||
unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS,
|
version = ""
|
||||||
EASTASIAN_WIDTH)
|
unicode = UnicodeData(UNICODE_DATA % version,
|
||||||
|
COMPOSITION_EXCLUSIONS % version,
|
||||||
|
EASTASIAN_WIDTH % version)
|
||||||
|
|
||||||
print len(filter(None, unicode.table)), "characters"
|
print len(filter(None, unicode.table)), "characters"
|
||||||
|
|
||||||
|
for version in old_versions:
|
||||||
|
print "--- Reading", UNICODE_DATA % ("-"+version), "..."
|
||||||
|
old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
|
||||||
|
COMPOSITION_EXCLUSIONS % ("-"+version),
|
||||||
|
EASTASIAN_WIDTH % ("-"+version))
|
||||||
|
print len(filter(None, old_unicode.table)), "characters"
|
||||||
|
merge_old_version(version, unicode, old_unicode)
|
||||||
|
|
||||||
makeunicodename(unicode, trace)
|
makeunicodename(unicode, trace)
|
||||||
makeunicodedata(unicode, trace)
|
makeunicodedata(unicode, trace)
|
||||||
makeunicodetype(unicode, trace)
|
makeunicodetype(unicode, trace)
|
||||||
|
@ -119,6 +131,8 @@ def makeunicodedata(unicode, trace):
|
||||||
if record:
|
if record:
|
||||||
if record[5]:
|
if record[5]:
|
||||||
decomp = record[5].split()
|
decomp = record[5].split()
|
||||||
|
if len(decomp) > 19:
|
||||||
|
raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
|
||||||
# prefix
|
# prefix
|
||||||
if decomp[0][0] == "<":
|
if decomp[0][0] == "<":
|
||||||
prefix = decomp.pop(0)
|
prefix = decomp.pop(0)
|
||||||
|
@ -278,6 +292,44 @@ def makeunicodedata(unicode, trace):
|
||||||
Array("comp_index", index).dump(fp, trace)
|
Array("comp_index", index).dump(fp, trace)
|
||||||
Array("comp_data", index2).dump(fp, trace)
|
Array("comp_data", index2).dump(fp, trace)
|
||||||
|
|
||||||
|
# Generate delta tables for old versions
|
||||||
|
for version, table, normalization in unicode.changed:
|
||||||
|
cversion = version.replace(".","_")
|
||||||
|
records = [table[0]]
|
||||||
|
cache = {table[0]:0}
|
||||||
|
index = [0] * len(table)
|
||||||
|
for i, record in enumerate(table):
|
||||||
|
try:
|
||||||
|
index[i] = cache[record]
|
||||||
|
except KeyError:
|
||||||
|
index[i] = cache[record] = len(records)
|
||||||
|
records.append(record)
|
||||||
|
index1, index2, shift = splitbins(index, trace)
|
||||||
|
print >>fp, "static const change_record change_records_%s[] = {" % cversion
|
||||||
|
for record in records:
|
||||||
|
print >>fp, "\t{ %s }," % ", ".join(map(str,record))
|
||||||
|
print >>fp, "};"
|
||||||
|
Array("changes_%s_index" % cversion, index1).dump(fp, trace)
|
||||||
|
Array("changes_%s_data" % cversion, index2).dump(fp, trace)
|
||||||
|
print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
|
||||||
|
print >>fp, "{"
|
||||||
|
print >>fp, "\tint index;"
|
||||||
|
print >>fp, "\tif (n >= 0x110000) index = 0;"
|
||||||
|
print >>fp, "\telse {"
|
||||||
|
print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
|
||||||
|
print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
|
||||||
|
(cversion, shift, ((1<<shift)-1))
|
||||||
|
print >>fp, "\t}"
|
||||||
|
print >>fp, "\treturn change_records_%s+index;" % cversion
|
||||||
|
print >>fp, "}\n"
|
||||||
|
print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
|
||||||
|
print >>fp, "{"
|
||||||
|
print >>fp, "\tswitch(n) {"
|
||||||
|
for k, v in normalization:
|
||||||
|
print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
|
||||||
|
print >>fp, "\tdefault: return 0;"
|
||||||
|
print >>fp, "\t}\n}\n"
|
||||||
|
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
# --------------------------------------------------------------------
|
# --------------------------------------------------------------------
|
||||||
|
@ -540,6 +592,82 @@ def makeunicodename(unicode, trace):
|
||||||
|
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
|
|
||||||
|
def merge_old_version(version, new, old):
    """Record on `new` how the `old` database version differs from it.

    Walks every code point in range(0x110000) and compares the records in
    `old.table` and `new.table`.  The differences are encoded as one
    4-tuple per code point, (bidirectional, category, decimal, numeric),
    plus a list of (code point, old decomposition) pairs, and appended to
    `new.changed` as (version, per_code_point_tuples, normalization_pairs).

    version -- label stored with the change set
    new     -- database object that receives the change set; must provide
               `table`, `exclusions` and `changed`
    old     -- database object for the older version; must provide
               `table` and `exclusions`

    Raises NotImplementedError if the exclusion data differs,
    AssertionError if an old value cannot be represented in the change
    tables, and a local Difference exception if a field changed for which
    no encoding exists.
    """
    # Imported locally so the function does not rely on a module-level
    # import of re.
    import re

    class Difference(Exception):
        """A field differs for which no delta encoding is implemented."""

    # Changes to exclusion file not implemented yet
    if old.exclusions != new.exclusions:
        raise NotImplementedError("exclusions differ")

    # In these change records, 0xFF means "no change"
    bidir_changes = [0xFF] * 0x110000
    category_changes = [0xFF] * 0x110000
    decimal_changes = [0xFF] * 0x110000
    # In numeric data, 0 means "no change",
    # -1 means "did not have a numeric value"
    numeric_changes = [0] * 0x110000
    # normalization_changes is a list of key-value pairs
    normalization_changes = []

    for i in range(0x110000):
        new_record = new.table[i]
        old_record = old.table[i]
        if new_record is None:
            # Characters unassigned in the new version ought to
            # be unassigned in the old one
            if old_record is not None:
                raise AssertionError(
                    "character %X is assigned in the old version only" % i)
            continue
        # check characters unassigned in the old version
        if old_record is None:
            # category 0 is "unassigned"
            category_changes[i] = 0
            continue
        # check characters that differ
        if old_record == new_record:
            continue
        for k in range(len(old_record)):
            value = old_record[k]
            if value == new_record[k]:
                continue
            if k == 2:
                # general category
                category_changes[i] = CATEGORY_NAMES.index(value)
            elif k == 4:
                # bidirectional class
                bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
            elif k == 5:
                # decomposition
                # We assume that all normalization changes are in 1:1 mappings
                if " " in value:
                    raise AssertionError(
                        "character %X: old decomposition %r is not a "
                        "1:1 mapping" % (i, value))
                normalization_changes.append((i, value))
            elif k == 6:
                # decimal digit value
                # We only support changes where the old value is a single
                # digit.  `value in "0123456789"` alone is a substring
                # test: it also accepts "" and digit runs such as "12".
                if len(value) != 1 or value not in "0123456789":
                    raise AssertionError(
                        "character %X: old decimal value %r is not a "
                        "single digit" % (i, value))
                decimal_changes[i] = int(value)
            elif k == 8:
                # numeric value
                if not value:
                    numeric_changes[i] = -1
                else:
                    if not re.match("^[0-9]+$", value):
                        raise AssertionError(
                            "character %X: unsupported old numeric value %r"
                            % (i, value))
                    numeric = int(value)
                    # Since 0 encodes "no change", the old value must not
                    # be 0 (in any spelling, e.g. "00").
                    if numeric == 0:
                        raise AssertionError(
                            "character %X: old numeric value %r collides "
                            "with the 'no change' marker" % (i, value))
                    numeric_changes[i] = numeric
            elif k in (11, 12, 13, 14):
                # 11: ISO comment; 12, 13, 14: simple uppercase, lowercase
                # and titlecase mappings.  Changes to these are ignored.
                pass
            else:
                raise Difference(hex(i), k, old_record, new_record)

    # Materialize the zip: the table is indexed and measured with len()
    # when the delta tables are generated.
    new.changed.append((version,
                        list(zip(bidir_changes, category_changes,
                                 decimal_changes, numeric_changes)),
                        normalization_changes))
|
||||||
|
|
||||||
|
|
||||||
# --------------------------------------------------------------------
|
# --------------------------------------------------------------------
|
||||||
# the following support code is taken from the unidb utilities
|
# the following support code is taken from the unidb utilities
|
||||||
# Copyright (c) 1999-2000 by Secret Labs AB
|
# Copyright (c) 1999-2000 by Secret Labs AB
|
||||||
|
@ -551,6 +679,7 @@ import sys
|
||||||
class UnicodeData:
|
class UnicodeData:
|
||||||
|
|
||||||
def __init__(self, filename, exclusions, eastasianwidth, expand=1):
|
def __init__(self, filename, exclusions, eastasianwidth, expand=1):
|
||||||
|
self.changed = []
|
||||||
file = open(filename)
|
file = open(filename)
|
||||||
table = [None] * 0x110000
|
table = [None] * 0x110000
|
||||||
while 1:
|
while 1:
|
||||||
|
@ -569,13 +698,14 @@ class UnicodeData:
|
||||||
if s:
|
if s:
|
||||||
if s[1][-6:] == "First>":
|
if s[1][-6:] == "First>":
|
||||||
s[1] = ""
|
s[1] = ""
|
||||||
field = s[:]
|
field = s
|
||||||
elif s[1][-5:] == "Last>":
|
elif s[1][-5:] == "Last>":
|
||||||
s[1] = ""
|
s[1] = ""
|
||||||
field = None
|
field = None
|
||||||
elif field:
|
elif field:
|
||||||
field[0] = hex(i)
|
f2 = field[:]
|
||||||
table[i] = field
|
f2[0] = "%X" % i
|
||||||
|
table[i] = f2
|
||||||
|
|
||||||
# public attributes
|
# public attributes
|
||||||
self.filename = filename
|
self.filename = filename
|
||||||
|
|
Loading…
Reference in New Issue