Issue #3811: The Unicode database was updated to 5.1.

Reviewed by Fredrik Lundh and Marc-Andre Lemburg.
This commit is contained in:
Martin v. Löwis 2008-09-10 13:38:12 +00:00
parent 9ba7a309be
commit 24329ba176
9 changed files with 19001 additions and 15733 deletions

View File

@ -16,11 +16,11 @@
This module provides access to the Unicode Character Database which defines This module provides access to the Unicode Character Database which defines
character properties for all Unicode characters. The data in this database is character properties for all Unicode characters. The data in this database is
based on the :file:`UnicodeData.txt` file version 4.1.0 which is publicly based on the :file:`UnicodeData.txt` file version 5.1.0 which is publicly
available from ftp://ftp.unicode.org/. available from ftp://ftp.unicode.org/.
The module uses the same names and symbols as defined by the UnicodeData File The module uses the same names and symbols as defined by the UnicodeData File
Format 4.1.0 (see http://www.unicode.org/Public/4.1.0/ucd/UCD.html). It defines Format 5.1.0 (see http://www.unicode.org/Public/5.1.0/ucd/UCD.html). It defines
the following functions: the following functions:

View File

@ -16,7 +16,7 @@ encoding = 'utf-8'
class UnicodeMethodsTest(unittest.TestCase): class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes # update this, if the database changes
expectedchecksum = 'c198ed264497f108434b3f576d4107237221cc8a' expectedchecksum = 'aef99984a58c8e1e5363a3175f2ff9608599a93e'
def test_method_checksum(self): def test_method_checksum(self):
h = hashlib.sha1() h = hashlib.sha1()
@ -75,7 +75,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
class UnicodeFunctionsTest(UnicodeDatabaseTest): class UnicodeFunctionsTest(UnicodeDatabaseTest):
# update this, if the database changes # update this, if the database changes
expectedchecksum = '4e389f97e9f88b8b7ab743121fd643089116f9f2' expectedchecksum = '3136d5afd787dc2bcb1bdcac95e385349fbebbca'
def test_function_checksum(self): def test_function_checksum(self):
data = [] data = []
@ -225,6 +225,16 @@ class UnicodeMiscTest(UnicodeDatabaseTest):
def test_bug_1704793(self): def test_bug_1704793(self):
self.assertEquals(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346') self.assertEquals(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346')
def test_ucd_510(self):
# Regression checks for the upgrade of the Unicode database to 5.1.0
# (issue #3811): one property that differs from the bundled 3.2.0
# snapshot, and both encodings of the upper-case mapping.
import unicodedata
# In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
# (U+0F3A is reported as mirrored by the current database, but not by
# the frozen ucd_3_2_0 view).
self.assert_(unicodedata.mirrored(u"\u0f3a"))
self.assert_(not unicodedata.ucd_3_2_0.mirrored(u"\u0f3a"))
# Also, we now have two ways of representing
# the upper-case mapping: as delta, or as absolute value
# u"a" -> u"A" is a small offset (-32) that fits the signed 16-bit delta;
# U+1D79 -> U+A77D is an offset of 0x8A04 (35332), which exceeds 32767 and
# therefore has to be stored as an absolute code point.
self.assert_(u"a".upper()==u'A')
self.assert_(u"\u1d79".upper()==u'\ua77d')
def test_main(): def test_main():
test.test_support.run_unittest( test.test_support.run_unittest(
UnicodeMiscTest, UnicodeMiscTest,

View File

@ -68,6 +68,8 @@ C-API
Library Library
------- -------
- Issue #3811: The Unicode database was updated to 5.1.
- Issue #3809: Fixed spurious 'test.blah' file left behind by test_logging. - Issue #3809: Fixed spurious 'test.blah' file left behind by test_logging.
- Issue #3781: Clean up the API for warnings.catch_warnings() by having it - Issue #3781: Clean up the API for warnings.catch_warnings() by having it

View File

@ -1,8 +1,8 @@
/* ------------------------------------------------------------------------ /* ------------------------------------------------------------------------
unicodedata -- Provides access to the Unicode 4.1 database. unicodedata -- Provides access to the Unicode 5.1 database.
Data was extracted from the Unicode 4.1 UnicodeData.txt file. Data was extracted from the Unicode 5.1 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com). Written by Marc-Andre Lemburg (mal@lemburg.com).
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com) Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
@ -34,6 +34,7 @@ typedef struct change_record {
const unsigned char bidir_changed; const unsigned char bidir_changed;
const unsigned char category_changed; const unsigned char category_changed;
const unsigned char decimal_changed; const unsigned char decimal_changed;
const unsigned char mirrored_changed;
const int numeric_changed; const int numeric_changed;
} change_record; } change_record;
@ -354,6 +355,8 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
const change_record *old = get_old_record(self, c); const change_record *old = get_old_record(self, c);
if (old->category_changed == 0) if (old->category_changed == 0)
index = 0; /* unassigned */ index = 0; /* unassigned */
else if (old->mirrored_changed != 0xFF)
index = old->mirrored_changed;
} }
return PyInt_FromLong(index); return PyInt_FromLong(index);
} }
@ -1177,11 +1180,11 @@ PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\ "This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\ defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\ this database is based on the UnicodeData.txt file version\n\
4.1.0 which is publically available from ftp://ftp.unicode.org/.\n\ 5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
\n\ \n\
The module uses the same names and symbols as defined by the\n\ The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 4.1.0 (see\n\ UnicodeData File Format 5.1.0 (see\n\
http://www.unicode.org/Public/4.1.0/ucd/UCD.html)."); http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
PyMODINIT_FUNC PyMODINIT_FUNC
initunicodedata(void) initunicodedata(void)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -19,6 +19,7 @@
#define SPACE_MASK 0x20 #define SPACE_MASK 0x20
#define TITLE_MASK 0x40 #define TITLE_MASK 0x40
#define UPPER_MASK 0x80 #define UPPER_MASK 0x80
#define NODELTA_MASK 0x100
typedef struct { typedef struct {
const Py_UNICODE upper; const Py_UNICODE upper;
@ -82,6 +83,9 @@ Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
else else
delta = ctype->upper; delta = ctype->upper;
if (ctype->flags & NODELTA_MASK)
return delta;
if (delta >= 32768) if (delta >= 32768)
delta -= 65536; delta -= 65536;
@ -724,6 +728,8 @@ Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
{ {
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
int delta = ctype->upper; int delta = ctype->upper;
if (ctype->flags & NODELTA_MASK)
return delta;
if (delta >= 32768) if (delta >= 32768)
delta -= 65536; delta -= 65536;
return ch + delta; return ch + delta;
@ -736,6 +742,8 @@ Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
{ {
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch); const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
int delta = ctype->lower; int delta = ctype->lower;
if (ctype->flags & NODELTA_MASK)
return delta;
if (delta >= 32768) if (delta >= 32768)
delta -= 65536; delta -= 65536;
return ch + delta; return ch + delta;

File diff suppressed because it is too large Load Diff

View File

@ -27,10 +27,10 @@
import sys import sys
SCRIPT = sys.argv[0] SCRIPT = sys.argv[0]
VERSION = "2.5" VERSION = "2.6"
# The Unicode Database # The Unicode Database
UNIDATA_VERSION = "4.1.0" UNIDATA_VERSION = "5.1.0"
UNICODE_DATA = "UnicodeData%s.txt" UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt" EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@ -57,6 +57,7 @@ LINEBREAK_MASK = 0x10
SPACE_MASK = 0x20 SPACE_MASK = 0x20
TITLE_MASK = 0x40 TITLE_MASK = 0x40
UPPER_MASK = 0x80 UPPER_MASK = 0x80
NODELTA_MASK = 0x100
def maketables(trace=0): def maketables(trace=0):
@ -355,6 +356,7 @@ def makeunicodetype(unicode, trace):
category = record[2] category = record[2]
bidirectional = record[4] bidirectional = record[4]
flags = 0 flags = 0
delta = True
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]: if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
flags |= ALPHA_MASK flags |= ALPHA_MASK
if category == "Ll": if category == "Ll":
@ -367,25 +369,36 @@ def makeunicodetype(unicode, trace):
flags |= TITLE_MASK flags |= TITLE_MASK
if category == "Lu": if category == "Lu":
flags |= UPPER_MASK flags |= UPPER_MASK
# use delta predictor for upper/lower/title # use delta predictor for upper/lower/title if it fits
if record[12]: if record[12]:
upper = int(record[12], 16) - char upper = int(record[12], 16) - char
assert -32768 <= upper <= 32767 if -32768 <= upper <= 32767 and delta:
upper = upper & 0xffff upper = upper & 0xffff
else:
upper += char
delta = False
else: else:
upper = 0 upper = 0
if record[13]: if record[13]:
lower = int(record[13], 16) - char lower = int(record[13], 16) - char
assert -32768 <= lower <= 32767 if -32768 <= lower <= 32767 and delta:
lower = lower & 0xffff lower = lower & 0xffff
else:
lower += char
delta = False
else: else:
lower = 0 lower = 0
if record[14]: if record[14]:
title = int(record[14], 16) - char title = int(record[14], 16) - char
assert -32768 <= lower <= 32767 if -32768 <= lower <= 32767 and delta:
title = title & 0xffff title = title & 0xffff
else:
title += char
delta = False
else: else:
title = 0 title = 0
if not delta:
flags |= NODELTA_MASK
# decimal digit, integer digit # decimal digit, integer digit
decimal = 0 decimal = 0
if record[6]: if record[6]:
@ -603,6 +616,7 @@ def merge_old_version(version, new, old):
bidir_changes = [0xFF]*0x110000 bidir_changes = [0xFF]*0x110000
category_changes = [0xFF]*0x110000 category_changes = [0xFF]*0x110000
decimal_changes = [0xFF]*0x110000 decimal_changes = [0xFF]*0x110000
mirrored_changes = [0xFF]*0x110000
# In numeric data, 0 means "no change", # In numeric data, 0 means "no change",
# -1 means "did not have a numeric value" # -1 means "did not have a numeric value"
numeric_changes = [0] * 0x110000 numeric_changes = [0] * 0x110000
@ -649,6 +663,11 @@ def merge_old_version(version, new, old):
else: else:
assert re.match("^[0-9]+$", value) assert re.match("^[0-9]+$", value)
numeric_changes[i] = int(value) numeric_changes[i] = int(value)
elif k == 9:
if value == 'Y':
mirrored_changes[i] = '1'
else:
mirrored_changes[i] = '0'
elif k == 11: elif k == 11:
# change to ISO comment, ignore # change to ISO comment, ignore
pass pass
@ -665,7 +684,8 @@ def merge_old_version(version, new, old):
class Difference(Exception):pass class Difference(Exception):pass
raise Difference, (hex(i), k, old.table[i], new.table[i]) raise Difference, (hex(i), k, old.table[i], new.table[i])
new.changed.append((version, zip(bidir_changes, category_changes, new.changed.append((version, zip(bidir_changes, category_changes,
decimal_changes, numeric_changes), decimal_changes, mirrored_changes,
numeric_changes),
normalization_changes)) normalization_changes))