Merged revisions 66362 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk ........ r66362 | martin.v.loewis | 2008-09-10 15:38:12 +0200 (Mi, 10 Sep 2008) | 3 lines Issue #3811: The Unicode database was updated to 5.1. Reviewed by Fredrik Lundh and Marc-Andre Lemburg. ........
This commit is contained in:
parent
1009d39187
commit
93cbca33f2
|
@ -16,11 +16,11 @@
|
|||
|
||||
This module provides access to the Unicode Character Database which defines
|
||||
character properties for all Unicode characters. The data in this database is
|
||||
based on the :file:`UnicodeData.txt` file version 4.1.0 which is publicly
|
||||
based on the :file:`UnicodeData.txt` file version 5.1.0 which is publicly
|
||||
available from ftp://ftp.unicode.org/.
|
||||
|
||||
The module uses the same names and symbols as defined by the UnicodeData File
|
||||
Format 4.1.0 (see http://www.unicode.org/Public/4.1.0/ucd/UCD.html). It defines
|
||||
Format 5.1.0 (see http://www.unicode.org/Public/5.1.0/ucd/UCD.html). It defines
|
||||
the following functions:
|
||||
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@ encoding = 'utf-8'
|
|||
class UnicodeMethodsTest(unittest.TestCase):
|
||||
|
||||
# update this, if the database changes
|
||||
expectedchecksum = 'c198ed264497f108434b3f576d4107237221cc8a'
|
||||
expectedchecksum = 'aef99984a58c8e1e5363a3175f2ff9608599a93e'
|
||||
|
||||
def test_method_checksum(self):
|
||||
h = hashlib.sha1()
|
||||
|
@ -75,7 +75,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
|
|||
class UnicodeFunctionsTest(UnicodeDatabaseTest):
|
||||
|
||||
# update this, if the database changes
|
||||
expectedchecksum = '4e389f97e9f88b8b7ab743121fd643089116f9f2'
|
||||
expectedchecksum = '3136d5afd787dc2bcb1bdcac95e385349fbebbca'
|
||||
|
||||
def test_function_checksum(self):
|
||||
data = []
|
||||
|
@ -226,6 +226,16 @@ class UnicodeMiscTest(UnicodeDatabaseTest):
|
|||
def test_bug_1704793(self):
|
||||
self.assertEquals(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')
|
||||
|
||||
def test_ucd_510(self):
|
||||
import unicodedata
|
||||
# In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
|
||||
self.assert_(unicodedata.mirrored("\u0f3a"))
|
||||
self.assert_(not unicodedata.ucd_3_2_0.mirrored("\u0f3a"))
|
||||
# Also, we now have two ways of representing
|
||||
# the upper-case mapping: as delta, or as absolute value
|
||||
self.assert_("a".upper()=='A')
|
||||
self.assert_("\u1d79".upper()=='\ua77d')
|
||||
|
||||
def test_main():
|
||||
test.support.run_unittest(
|
||||
UnicodeMiscTest,
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
/* ------------------------------------------------------------------------
|
||||
|
||||
unicodedata -- Provides access to the Unicode 4.1 data base.
|
||||
unicodedata -- Provides access to the Unicode 5.1 data base.
|
||||
|
||||
Data was extracted from the Unicode 4.1 UnicodeData.txt file.
|
||||
Data was extracted from the Unicode 5.1 UnicodeData.txt file.
|
||||
|
||||
Written by Marc-Andre Lemburg (mal@lemburg.com).
|
||||
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
|
||||
|
@ -34,6 +34,7 @@ typedef struct change_record {
|
|||
const unsigned char bidir_changed;
|
||||
const unsigned char category_changed;
|
||||
const unsigned char decimal_changed;
|
||||
const unsigned char mirrored_changed;
|
||||
const int numeric_changed;
|
||||
} change_record;
|
||||
|
||||
|
@ -355,6 +356,8 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
|
|||
const change_record *old = get_old_record(self, c);
|
||||
if (old->category_changed == 0)
|
||||
index = 0; /* unassigned */
|
||||
else if (old->mirrored_changed != 0xFF)
|
||||
index = old->mirrored_changed;
|
||||
}
|
||||
return PyLong_FromLong(index);
|
||||
}
|
||||
|
@ -1179,11 +1182,11 @@ PyDoc_STRVAR(unicodedata_docstring,
|
|||
"This module provides access to the Unicode Character Database which\n\
|
||||
defines character properties for all Unicode characters. The data in\n\
|
||||
this database is based on the UnicodeData.txt file version\n\
|
||||
4.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
|
||||
5.1.0 which is publicly available from ftp://ftp.unicode.org/.\n\
|
||||
\n\
|
||||
The module uses the same names and symbols as defined by the\n\
|
||||
UnicodeData File Format 4.1.0 (see\n\
|
||||
http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
|
||||
UnicodeData File Format 5.1.0 (see\n\
|
||||
http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
|
||||
|
||||
|
||||
static struct PyModuleDef unicodedatamodule = {
|
||||
|
|
File diff suppressed because it is too large
Load Diff
26759
Modules/unicodename_db.h
26759
Modules/unicodename_db.h
File diff suppressed because it is too large
Load Diff
|
@ -22,6 +22,7 @@
|
|||
#define XID_START_MASK 0x100
|
||||
#define XID_CONTINUE_MASK 0x200
|
||||
#define PRINTABLE_MASK 0x400
|
||||
#define NODELTA_MASK 0x800
|
||||
|
||||
typedef struct {
|
||||
const Py_UNICODE upper;
|
||||
|
@ -85,6 +86,9 @@ Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
|
|||
else
|
||||
delta = ctype->upper;
|
||||
|
||||
if (ctype->flags & NODELTA_MASK)
|
||||
return delta;
|
||||
|
||||
if (delta >= 32768)
|
||||
delta -= 65536;
|
||||
|
||||
|
@ -767,6 +771,8 @@ Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
|
|||
{
|
||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||
int delta = ctype->upper;
|
||||
if (ctype->flags & NODELTA_MASK)
|
||||
return delta;
|
||||
if (delta >= 32768)
|
||||
delta -= 65536;
|
||||
return ch + delta;
|
||||
|
@ -779,6 +785,8 @@ Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
|
|||
{
|
||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||
int delta = ctype->lower;
|
||||
if (ctype->flags & NODELTA_MASK)
|
||||
return delta;
|
||||
if (delta >= 32768)
|
||||
delta -= 65536;
|
||||
return ch + delta;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -28,10 +28,10 @@
|
|||
import sys
|
||||
|
||||
SCRIPT = sys.argv[0]
|
||||
VERSION = "2.5"
|
||||
VERSION = "2.6"
|
||||
|
||||
# The Unicode Database
|
||||
UNIDATA_VERSION = "4.1.0"
|
||||
UNIDATA_VERSION = "5.1.0"
|
||||
UNICODE_DATA = "UnicodeData%s.txt"
|
||||
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
|
||||
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
|
||||
|
@ -62,6 +62,7 @@ UPPER_MASK = 0x80
|
|||
XID_START_MASK = 0x100
|
||||
XID_CONTINUE_MASK = 0x200
|
||||
PRINTABLE_MASK = 0x400
|
||||
NODELTA_MASK = 0x800
|
||||
|
||||
def maketables(trace=0):
|
||||
|
||||
|
@ -361,6 +362,7 @@ def makeunicodetype(unicode, trace):
|
|||
bidirectional = record[4]
|
||||
properties = record[16]
|
||||
flags = 0
|
||||
delta = True
|
||||
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
|
||||
flags |= ALPHA_MASK
|
||||
if category == "Ll":
|
||||
|
@ -379,25 +381,36 @@ def makeunicodetype(unicode, trace):
|
|||
flags |= XID_START_MASK
|
||||
if "XID_Continue" in properties:
|
||||
flags |= XID_CONTINUE_MASK
|
||||
# use delta predictor for upper/lower/title
|
||||
# use delta predictor for upper/lower/title if it fits
|
||||
if record[12]:
|
||||
upper = int(record[12], 16) - char
|
||||
assert -32768 <= upper <= 32767
|
||||
if -32768 <= upper <= 32767 and delta:
|
||||
upper = upper & 0xffff
|
||||
else:
|
||||
upper += char
|
||||
delta = False
|
||||
else:
|
||||
upper = 0
|
||||
if record[13]:
|
||||
lower = int(record[13], 16) - char
|
||||
assert -32768 <= lower <= 32767
|
||||
if -32768 <= lower <= 32767 and delta:
|
||||
lower = lower & 0xffff
|
||||
else:
|
||||
lower += char
|
||||
delta = False
|
||||
else:
|
||||
lower = 0
|
||||
if record[14]:
|
||||
title = int(record[14], 16) - char
|
||||
assert -32768 <= lower <= 32767
|
||||
if -32768 <= title <= 32767 and delta:
|
||||
title = title & 0xffff
|
||||
else:
|
||||
title += char
|
||||
delta = False
|
||||
else:
|
||||
title = 0
|
||||
if not delta:
|
||||
flags |= NODELTA_MASK
|
||||
# decimal digit, integer digit
|
||||
decimal = 0
|
||||
if record[6]:
|
||||
|
@ -626,6 +639,7 @@ def merge_old_version(version, new, old):
|
|||
bidir_changes = [0xFF]*0x110000
|
||||
category_changes = [0xFF]*0x110000
|
||||
decimal_changes = [0xFF]*0x110000
|
||||
mirrored_changes = [0xFF]*0x110000
|
||||
# In numeric data, 0 means "no change",
|
||||
# -1 means "did not have a numeric value"
|
||||
numeric_changes = [0] * 0x110000
|
||||
|
@ -672,6 +686,11 @@ def merge_old_version(version, new, old):
|
|||
else:
|
||||
assert re.match("^[0-9]+$", value)
|
||||
numeric_changes[i] = int(value)
|
||||
elif k == 9:
|
||||
if value == 'Y':
|
||||
mirrored_changes[i] = '1'
|
||||
else:
|
||||
mirrored_changes[i] = '0'
|
||||
elif k == 11:
|
||||
# change to ISO comment, ignore
|
||||
pass
|
||||
|
@ -691,7 +710,8 @@ def merge_old_version(version, new, old):
|
|||
class Difference(Exception):pass
|
||||
raise Difference(hex(i), k, old.table[i], new.table[i])
|
||||
new.changed.append((version, list(zip(bidir_changes, category_changes,
|
||||
decimal_changes, numeric_changes)),
|
||||
decimal_changes, mirrored_changes,
|
||||
numeric_changes)),
|
||||
normalization_changes))
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue