Merged revisions 66362 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r66362 | martin.v.loewis | 2008-09-10 15:38:12 +0200 (Mi, 10 Sep 2008) | 3 lines

  Issue #3811: The Unicode database was updated to 5.1.
  Reviewed by Fredrik Lundh and Marc-Andre Lemburg.
........
This commit is contained in:
Martin v. Löwis 2008-09-10 14:08:48 +00:00
parent 1009d39187
commit 93cbca33f2
8 changed files with 19135 additions and 15886 deletions

View File

@ -16,11 +16,11 @@
This module provides access to the Unicode Character Database which defines
character properties for all Unicode characters. The data in this database is
based on the :file:`UnicodeData.txt` file version 4.1.0 which is publicly
based on the :file:`UnicodeData.txt` file version 5.1.0 which is publicly
available from ftp://ftp.unicode.org/.
The module uses the same names and symbols as defined by the UnicodeData File
Format 4.1.0 (see http://www.unicode.org/Public/4.1.0/ucd/UCD.html). It defines
Format 5.1.0 (see http://www.unicode.org/Public/5.1.0/ucd/UCD.html). It defines
the following functions:

View File

@ -16,7 +16,7 @@ encoding = 'utf-8'
class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes
expectedchecksum = 'c198ed264497f108434b3f576d4107237221cc8a'
expectedchecksum = 'aef99984a58c8e1e5363a3175f2ff9608599a93e'
def test_method_checksum(self):
h = hashlib.sha1()
@ -75,7 +75,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
class UnicodeFunctionsTest(UnicodeDatabaseTest):
# update this, if the database changes
expectedchecksum = '4e389f97e9f88b8b7ab743121fd643089116f9f2'
expectedchecksum = '3136d5afd787dc2bcb1bdcac95e385349fbebbca'
def test_function_checksum(self):
data = []
@ -226,6 +226,16 @@ class UnicodeMiscTest(UnicodeDatabaseTest):
def test_bug_1704793(self):
self.assertEquals(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')
def test_ucd_510(self):
import unicodedata
# In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
self.assert_(unicodedata.mirrored("\u0f3a"))
self.assert_(not unicodedata.ucd_3_2_0.mirrored("\u0f3a"))
# Also, we now have two ways of representing
# the upper-case mapping: as delta, or as absolute value
self.assert_("a".upper()=='A')
self.assert_("\u1d79".upper()=='\ua77d')
def test_main():
test.support.run_unittest(
UnicodeMiscTest,

View File

@ -1,8 +1,8 @@
/* ------------------------------------------------------------------------
unicodedata -- Provides access to the Unicode 4.1 data base.
unicodedata -- Provides access to the Unicode 5.1 data base.
Data was extracted from the Unicode 4.1 UnicodeData.txt file.
Data was extracted from the Unicode 5.1 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com).
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
@ -34,6 +34,7 @@ typedef struct change_record {
const unsigned char bidir_changed;
const unsigned char category_changed;
const unsigned char decimal_changed;
const unsigned char mirrored_changed;
const int numeric_changed;
} change_record;
@ -355,6 +356,8 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
const change_record *old = get_old_record(self, c);
if (old->category_changed == 0)
index = 0; /* unassigned */
else if (old->mirrored_changed != 0xFF)
index = old->mirrored_changed;
}
return PyLong_FromLong(index);
}
@ -1179,11 +1182,11 @@ PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
4.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format 4.1.0 (see\n\
http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
UnicodeData File Format 5.1.0 (see\n\
http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
static struct PyModuleDef unicodedatamodule = {

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -22,6 +22,7 @@
#define XID_START_MASK 0x100
#define XID_CONTINUE_MASK 0x200
#define PRINTABLE_MASK 0x400
#define NODELTA_MASK 0x800
typedef struct {
const Py_UNICODE upper;
@ -85,6 +86,9 @@ Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
else
delta = ctype->upper;
if (ctype->flags & NODELTA_MASK)
return delta;
if (delta >= 32768)
delta -= 65536;
@ -767,6 +771,8 @@ Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
int delta = ctype->upper;
if (ctype->flags & NODELTA_MASK)
return delta;
if (delta >= 32768)
delta -= 65536;
return ch + delta;
@ -779,6 +785,8 @@ Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
int delta = ctype->lower;
if (ctype->flags & NODELTA_MASK)
return delta;
if (delta >= 32768)
delta -= 65536;
return ch + delta;

File diff suppressed because it is too large Load Diff

View File

@ -28,10 +28,10 @@
import sys
SCRIPT = sys.argv[0]
VERSION = "2.5"
VERSION = "2.6"
# The Unicode Database
UNIDATA_VERSION = "4.1.0"
UNIDATA_VERSION = "5.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@ -62,6 +62,7 @@ UPPER_MASK = 0x80
XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800
def maketables(trace=0):
@ -361,6 +362,7 @@ def makeunicodetype(unicode, trace):
bidirectional = record[4]
properties = record[16]
flags = 0
delta = True
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
flags |= ALPHA_MASK
if category == "Ll":
@ -379,25 +381,36 @@ def makeunicodetype(unicode, trace):
flags |= XID_START_MASK
if "XID_Continue" in properties:
flags |= XID_CONTINUE_MASK
# use delta predictor for upper/lower/title
# use delta predictor for upper/lower/title if it fits
if record[12]:
upper = int(record[12], 16) - char
assert -32768 <= upper <= 32767
if -32768 <= upper <= 32767 and delta:
upper = upper & 0xffff
else:
upper += char
delta = False
else:
upper = 0
if record[13]:
lower = int(record[13], 16) - char
assert -32768 <= lower <= 32767
if -32768 <= lower <= 32767 and delta:
lower = lower & 0xffff
else:
lower += char
delta = False
else:
lower = 0
if record[14]:
title = int(record[14], 16) - char
assert -32768 <= lower <= 32767
# Range-check the *title* delta (the previous check tested `lower`, a
# copy-paste slip that let an out-of-range title delta be masked to
# 16 bits and silently stored as a wrong mapping).
# NOTE(review): `delta` is shared by the upper/lower/title branches; a
# field that was already stored as a delta (or defaulted to 0) before
# `delta` is cleared keeps that form even though NODELTA_MASK is then
# set for the whole record -- verify against the C readers.
if -32768 <= title <= 32767 and delta:
    title = title & 0xffff
else:
    # Does not fit in a signed 16-bit delta: store the absolute code point.
    title += char
    delta = False
else:
title = 0
if not delta:
flags |= NODELTA_MASK
# decimal digit, integer digit
decimal = 0
if record[6]:
@ -626,6 +639,7 @@ def merge_old_version(version, new, old):
bidir_changes = [0xFF]*0x110000
category_changes = [0xFF]*0x110000
decimal_changes = [0xFF]*0x110000
mirrored_changes = [0xFF]*0x110000
# In numeric data, 0 means "no change",
# -1 means "did not have a numeric value"
numeric_changes = [0] * 0x110000
@ -672,6 +686,11 @@ def merge_old_version(version, new, old):
else:
assert re.match("^[0-9]+$", value)
numeric_changes[i] = int(value)
elif k == 9:
if value == 'Y':
mirrored_changes[i] = '1'
else:
mirrored_changes[i] = '0'
elif k == 11:
# change to ISO comment, ignore
pass
@ -691,7 +710,8 @@ def merge_old_version(version, new, old):
class Difference(Exception):pass
raise Difference(hex(i), k, old.table[i], new.table[i])
new.changed.append((version, list(zip(bidir_changes, category_changes,
decimal_changes, numeric_changes)),
decimal_changes, mirrored_changes,
numeric_changes)),
normalization_changes))