Issue #3811: The Unicode database was updated to 5.1.
Reviewed by Fredrik Lundh and Marc-Andre Lemburg.
This commit is contained in:
parent
9ba7a309be
commit
24329ba176
|
@ -16,11 +16,11 @@
|
||||||
|
|
||||||
This module provides access to the Unicode Character Database which defines
|
This module provides access to the Unicode Character Database which defines
|
||||||
character properties for all Unicode characters. The data in this database is
|
character properties for all Unicode characters. The data in this database is
|
||||||
based on the :file:`UnicodeData.txt` file version 4.1.0 which is publicly
|
based on the :file:`UnicodeData.txt` file version 5.1.0 which is publicly
|
||||||
available from ftp://ftp.unicode.org/.
|
available from ftp://ftp.unicode.org/.
|
||||||
|
|
||||||
The module uses the same names and symbols as defined by the UnicodeData File
|
The module uses the same names and symbols as defined by the UnicodeData File
|
||||||
Format 4.1.0 (see http://www.unicode.org/Public/4.1.0/ucd/UCD.html). It defines
|
Format 5.1.0 (see http://www.unicode.org/Public/5.1.0/ucd/UCD.html). It defines
|
||||||
the following functions:
|
the following functions:
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,7 @@ encoding = 'utf-8'
|
||||||
class UnicodeMethodsTest(unittest.TestCase):
|
class UnicodeMethodsTest(unittest.TestCase):
|
||||||
|
|
||||||
# update this, if the database changes
|
# update this, if the database changes
|
||||||
expectedchecksum = 'c198ed264497f108434b3f576d4107237221cc8a'
|
expectedchecksum = 'aef99984a58c8e1e5363a3175f2ff9608599a93e'
|
||||||
|
|
||||||
def test_method_checksum(self):
|
def test_method_checksum(self):
|
||||||
h = hashlib.sha1()
|
h = hashlib.sha1()
|
||||||
|
@ -75,7 +75,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
|
||||||
class UnicodeFunctionsTest(UnicodeDatabaseTest):
|
class UnicodeFunctionsTest(UnicodeDatabaseTest):
|
||||||
|
|
||||||
# update this, if the database changes
|
# update this, if the database changes
|
||||||
expectedchecksum = '4e389f97e9f88b8b7ab743121fd643089116f9f2'
|
expectedchecksum = '3136d5afd787dc2bcb1bdcac95e385349fbebbca'
|
||||||
|
|
||||||
def test_function_checksum(self):
|
def test_function_checksum(self):
|
||||||
data = []
|
data = []
|
||||||
|
@ -225,6 +225,16 @@ class UnicodeMiscTest(UnicodeDatabaseTest):
|
||||||
def test_bug_1704793(self):
|
def test_bug_1704793(self):
|
||||||
self.assertEquals(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346')
|
self.assertEquals(self.db.lookup("GOTHIC LETTER FAIHU"), u'\U00010346')
|
||||||
|
|
||||||
|
def test_ucd_510(self):
|
||||||
|
import unicodedata
|
||||||
|
# In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
|
||||||
|
self.assert_(unicodedata.mirrored(u"\u0f3a"))
|
||||||
|
self.assert_(not unicodedata.ucd_3_2_0.mirrored(u"\u0f3a"))
|
||||||
|
# Also, we now have two ways of representing
|
||||||
|
# the upper-case mapping: as delta, or as absolute value
|
||||||
|
self.assert_(u"a".upper()==u'A')
|
||||||
|
self.assert_(u"\u1d79".upper()==u'\ua77d')
|
||||||
|
|
||||||
def test_main():
|
def test_main():
|
||||||
test.test_support.run_unittest(
|
test.test_support.run_unittest(
|
||||||
UnicodeMiscTest,
|
UnicodeMiscTest,
|
||||||
|
|
|
@ -68,6 +68,8 @@ C-API
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #3811: The Unicode database was updated to 5.1.
|
||||||
|
|
||||||
- Issue #3809: Fixed spurious 'test.blah' file left behind by test_logging.
|
- Issue #3809: Fixed spurious 'test.blah' file left behind by test_logging.
|
||||||
|
|
||||||
- Issue #3781: Clean up the API for warnings.catch_warnings() by having it
|
- Issue #3781: Clean up the API for warnings.catch_warnings() by having it
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
/* ------------------------------------------------------------------------
|
/* ------------------------------------------------------------------------
|
||||||
|
|
||||||
unicodedata -- Provides access to the Unicode 4.1 data base.
|
unicodedata -- Provides access to the Unicode 5.1 data base.
|
||||||
|
|
||||||
Data was extracted from the Unicode 4.1 UnicodeData.txt file.
|
Data was extracted from the Unicode 5.1 UnicodeData.txt file.
|
||||||
|
|
||||||
Written by Marc-Andre Lemburg (mal@lemburg.com).
|
Written by Marc-Andre Lemburg (mal@lemburg.com).
|
||||||
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
|
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
|
||||||
|
@ -34,6 +34,7 @@ typedef struct change_record {
|
||||||
const unsigned char bidir_changed;
|
const unsigned char bidir_changed;
|
||||||
const unsigned char category_changed;
|
const unsigned char category_changed;
|
||||||
const unsigned char decimal_changed;
|
const unsigned char decimal_changed;
|
||||||
|
const unsigned char mirrored_changed;
|
||||||
const int numeric_changed;
|
const int numeric_changed;
|
||||||
} change_record;
|
} change_record;
|
||||||
|
|
||||||
|
@ -354,6 +355,8 @@ unicodedata_mirrored(PyObject *self, PyObject *args)
|
||||||
const change_record *old = get_old_record(self, c);
|
const change_record *old = get_old_record(self, c);
|
||||||
if (old->category_changed == 0)
|
if (old->category_changed == 0)
|
||||||
index = 0; /* unassigned */
|
index = 0; /* unassigned */
|
||||||
|
else if (old->mirrored_changed != 0xFF)
|
||||||
|
index = old->mirrored_changed;
|
||||||
}
|
}
|
||||||
return PyInt_FromLong(index);
|
return PyInt_FromLong(index);
|
||||||
}
|
}
|
||||||
|
@ -1177,11 +1180,11 @@ PyDoc_STRVAR(unicodedata_docstring,
|
||||||
"This module provides access to the Unicode Character Database which\n\
|
"This module provides access to the Unicode Character Database which\n\
|
||||||
defines character properties for all Unicode characters. The data in\n\
|
defines character properties for all Unicode characters. The data in\n\
|
||||||
this database is based on the UnicodeData.txt file version\n\
|
this database is based on the UnicodeData.txt file version\n\
|
||||||
4.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
|
5.1.0 which is publically available from ftp://ftp.unicode.org/.\n\
|
||||||
\n\
|
\n\
|
||||||
The module uses the same names and symbols as defined by the\n\
|
The module uses the same names and symbols as defined by the\n\
|
||||||
UnicodeData File Format 4.1.0 (see\n\
|
UnicodeData File Format 5.1.0 (see\n\
|
||||||
http://www.unicode.org/Public/4.1.0/ucd/UCD.html).");
|
http://www.unicode.org/Public/5.1.0/ucd/UCD.html).");
|
||||||
|
|
||||||
PyMODINIT_FUNC
|
PyMODINIT_FUNC
|
||||||
initunicodedata(void)
|
initunicodedata(void)
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
26480
Modules/unicodename_db.h
26480
Modules/unicodename_db.h
File diff suppressed because it is too large
Load Diff
|
@ -19,6 +19,7 @@
|
||||||
#define SPACE_MASK 0x20
|
#define SPACE_MASK 0x20
|
||||||
#define TITLE_MASK 0x40
|
#define TITLE_MASK 0x40
|
||||||
#define UPPER_MASK 0x80
|
#define UPPER_MASK 0x80
|
||||||
|
#define NODELTA_MASK 0x100
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
const Py_UNICODE upper;
|
const Py_UNICODE upper;
|
||||||
|
@ -82,6 +83,9 @@ Py_UNICODE _PyUnicode_ToTitlecase(register Py_UNICODE ch)
|
||||||
else
|
else
|
||||||
delta = ctype->upper;
|
delta = ctype->upper;
|
||||||
|
|
||||||
|
if (ctype->flags & NODELTA_MASK)
|
||||||
|
return delta;
|
||||||
|
|
||||||
if (delta >= 32768)
|
if (delta >= 32768)
|
||||||
delta -= 65536;
|
delta -= 65536;
|
||||||
|
|
||||||
|
@ -724,6 +728,8 @@ Py_UNICODE _PyUnicode_ToUppercase(Py_UNICODE ch)
|
||||||
{
|
{
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
int delta = ctype->upper;
|
int delta = ctype->upper;
|
||||||
|
if (ctype->flags & NODELTA_MASK)
|
||||||
|
return delta;
|
||||||
if (delta >= 32768)
|
if (delta >= 32768)
|
||||||
delta -= 65536;
|
delta -= 65536;
|
||||||
return ch + delta;
|
return ch + delta;
|
||||||
|
@ -736,6 +742,8 @@ Py_UNICODE _PyUnicode_ToLowercase(Py_UNICODE ch)
|
||||||
{
|
{
|
||||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||||
int delta = ctype->lower;
|
int delta = ctype->lower;
|
||||||
|
if (ctype->flags & NODELTA_MASK)
|
||||||
|
return delta;
|
||||||
if (delta >= 32768)
|
if (delta >= 32768)
|
||||||
delta -= 65536;
|
delta -= 65536;
|
||||||
return ch + delta;
|
return ch + delta;
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -27,10 +27,10 @@
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
SCRIPT = sys.argv[0]
|
SCRIPT = sys.argv[0]
|
||||||
VERSION = "2.5"
|
VERSION = "2.6"
|
||||||
|
|
||||||
# The Unicode Database
|
# The Unicode Database
|
||||||
UNIDATA_VERSION = "4.1.0"
|
UNIDATA_VERSION = "5.1.0"
|
||||||
UNICODE_DATA = "UnicodeData%s.txt"
|
UNICODE_DATA = "UnicodeData%s.txt"
|
||||||
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
|
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
|
||||||
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
|
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
|
||||||
|
@ -57,6 +57,7 @@ LINEBREAK_MASK = 0x10
|
||||||
SPACE_MASK = 0x20
|
SPACE_MASK = 0x20
|
||||||
TITLE_MASK = 0x40
|
TITLE_MASK = 0x40
|
||||||
UPPER_MASK = 0x80
|
UPPER_MASK = 0x80
|
||||||
|
NODELTA_MASK = 0x100
|
||||||
|
|
||||||
def maketables(trace=0):
|
def maketables(trace=0):
|
||||||
|
|
||||||
|
@ -355,6 +356,7 @@ def makeunicodetype(unicode, trace):
|
||||||
category = record[2]
|
category = record[2]
|
||||||
bidirectional = record[4]
|
bidirectional = record[4]
|
||||||
flags = 0
|
flags = 0
|
||||||
|
delta = True
|
||||||
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
|
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
|
||||||
flags |= ALPHA_MASK
|
flags |= ALPHA_MASK
|
||||||
if category == "Ll":
|
if category == "Ll":
|
||||||
|
@ -367,25 +369,36 @@ def makeunicodetype(unicode, trace):
|
||||||
flags |= TITLE_MASK
|
flags |= TITLE_MASK
|
||||||
if category == "Lu":
|
if category == "Lu":
|
||||||
flags |= UPPER_MASK
|
flags |= UPPER_MASK
|
||||||
# use delta predictor for upper/lower/title
|
# use delta predictor for upper/lower/title if it fits
|
||||||
if record[12]:
|
if record[12]:
|
||||||
upper = int(record[12], 16) - char
|
upper = int(record[12], 16) - char
|
||||||
assert -32768 <= upper <= 32767
|
if -32768 <= upper <= 32767 and delta:
|
||||||
upper = upper & 0xffff
|
upper = upper & 0xffff
|
||||||
|
else:
|
||||||
|
upper += char
|
||||||
|
delta = False
|
||||||
else:
|
else:
|
||||||
upper = 0
|
upper = 0
|
||||||
if record[13]:
|
if record[13]:
|
||||||
lower = int(record[13], 16) - char
|
lower = int(record[13], 16) - char
|
||||||
assert -32768 <= lower <= 32767
|
if -32768 <= lower <= 32767 and delta:
|
||||||
lower = lower & 0xffff
|
lower = lower & 0xffff
|
||||||
|
else:
|
||||||
|
lower += char
|
||||||
|
delta = False
|
||||||
else:
|
else:
|
||||||
lower = 0
|
lower = 0
|
||||||
if record[14]:
|
if record[14]:
|
||||||
title = int(record[14], 16) - char
|
title = int(record[14], 16) - char
|
||||||
assert -32768 <= lower <= 32767
|
if -32768 <= lower <= 32767 and delta:
|
||||||
title = title & 0xffff
|
title = title & 0xffff
|
||||||
|
else:
|
||||||
|
title += char
|
||||||
|
delta = False
|
||||||
else:
|
else:
|
||||||
title = 0
|
title = 0
|
||||||
|
if not delta:
|
||||||
|
flags |= NODELTA_MASK
|
||||||
# decimal digit, integer digit
|
# decimal digit, integer digit
|
||||||
decimal = 0
|
decimal = 0
|
||||||
if record[6]:
|
if record[6]:
|
||||||
|
@ -603,6 +616,7 @@ def merge_old_version(version, new, old):
|
||||||
bidir_changes = [0xFF]*0x110000
|
bidir_changes = [0xFF]*0x110000
|
||||||
category_changes = [0xFF]*0x110000
|
category_changes = [0xFF]*0x110000
|
||||||
decimal_changes = [0xFF]*0x110000
|
decimal_changes = [0xFF]*0x110000
|
||||||
|
mirrored_changes = [0xFF]*0x110000
|
||||||
# In numeric data, 0 means "no change",
|
# In numeric data, 0 means "no change",
|
||||||
# -1 means "did not have a numeric value
|
# -1 means "did not have a numeric value
|
||||||
numeric_changes = [0] * 0x110000
|
numeric_changes = [0] * 0x110000
|
||||||
|
@ -649,6 +663,11 @@ def merge_old_version(version, new, old):
|
||||||
else:
|
else:
|
||||||
assert re.match("^[0-9]+$", value)
|
assert re.match("^[0-9]+$", value)
|
||||||
numeric_changes[i] = int(value)
|
numeric_changes[i] = int(value)
|
||||||
|
elif k == 9:
|
||||||
|
if value == 'Y':
|
||||||
|
mirrored_changes[i] = '1'
|
||||||
|
else:
|
||||||
|
mirrored_changes[i] = '0'
|
||||||
elif k == 11:
|
elif k == 11:
|
||||||
# change to ISO comment, ignore
|
# change to ISO comment, ignore
|
||||||
pass
|
pass
|
||||||
|
@ -665,7 +684,8 @@ def merge_old_version(version, new, old):
|
||||||
class Difference(Exception):pass
|
class Difference(Exception):pass
|
||||||
raise Difference, (hex(i), k, old.table[i], new.table[i])
|
raise Difference, (hex(i), k, old.table[i], new.table[i])
|
||||||
new.changed.append((version, zip(bidir_changes, category_changes,
|
new.changed.append((version, zip(bidir_changes, category_changes,
|
||||||
decimal_changes, numeric_changes),
|
decimal_changes, mirrored_changes,
|
||||||
|
numeric_changes),
|
||||||
normalization_changes))
|
normalization_changes))
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue