Unicode 9.0.0

This update was not completely mechanical: support for East Asian Width
changes (emoji code points became Wide) had to be added to unicodedata.
This commit is contained in:
Benjamin Peterson 2016-09-14 23:53:47 -07:00
parent 7ec64562b2
commit 6775231597
10 changed files with 26910 additions and 25252 deletions

View File

@ -17,8 +17,8 @@
This module provides access to the Unicode Character Database (UCD) which
defines character properties for all Unicode characters. The data contained in
this database is compiled from the `UCD version 8.0.0
<http://www.unicode.org/Public/8.0.0/ucd>`_.
this database is compiled from the `UCD version 9.0.0
<http://www.unicode.org/Public/9.0.0/ucd>`_.
The module uses the same names and symbols as defined by Unicode
Standard Annex #44, `"Unicode Character Database"
@ -168,6 +168,6 @@ Examples:
.. rubric:: Footnotes
.. [#] http://www.unicode.org/Public/8.0.0/ucd/NameAliases.txt
.. [#] http://www.unicode.org/Public/9.0.0/ucd/NameAliases.txt
.. [#] http://www.unicode.org/Public/8.0.0/ucd/NamedSequences.txt
.. [#] http://www.unicode.org/Public/9.0.0/ucd/NamedSequences.txt

View File

@ -966,6 +966,13 @@ representing :class:`contextlib.AbstractContextManager`.
(Contributed by Brett Cannon in :issue:`25609`.)
unicodedata
-----------
The internal database has been upgraded to use Unicode 9.0.0. (Contributed by
Benjamin Peterson.)
unittest.mock
-------------

View File

@ -20,7 +20,7 @@ errors = 'surrogatepass'
class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes
expectedchecksum = '5971760872b2f98bb9c701e6c0db3273d756b3ec'
expectedchecksum = 'c1fa98674a683aa8a8d8dee0c84494f8d36346e6'
def test_method_checksum(self):
h = hashlib.sha1()
@ -80,7 +80,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = '5e74827cd07f9e546a30f34b7bcf6cc2eac38c8c'
expectedchecksum = 'f891b1e6430c712531b9bc935a38e22d78ba1bf3'
def test_function_checksum(self):
data = []
h = hashlib.sha1()
@ -222,6 +222,10 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
self.assertEqual(eaw('\u2010'), 'A')
self.assertEqual(eaw('\U00020000'), 'W')
# Check the East Asian Width of U+231A in both databases: the 3.2.0 view
# (self.db.ucd_3_2_0) is expected to report 'N', while the current database
# (self.db) is expected to report 'W'.
def test_east_asian_width_9_0_changes(self):
self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
class UnicodeMiscTest(UnicodeDatabaseTest):
def test_failed_import_during_compiling(self):

View File

@ -10,6 +10,8 @@ What's New in Python 3.6.0 beta 2
Core and Builtins
-----------------
- Upgrade internal Unicode databases to Unicode version 9.0.0.
- Issue #28131: Fix a regression in zipimport's compile_source(). zipimport
should use the same optimization level as the interpreter.

View File

@ -45,6 +45,7 @@ typedef struct change_record {
const unsigned char category_changed;
const unsigned char decimal_changed;
const unsigned char mirrored_changed;
const unsigned char east_asian_width_changed;
const double numeric_changed;
} change_record;
@ -375,6 +376,8 @@ unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
const change_record *old = get_old_record(self, c);
if (old->category_changed == 0)
index = 0; /* unassigned */
else if (old->east_asian_width_changed != 0xFF)
index = old->east_asian_width_changed;
}
return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -42,7 +42,7 @@ VERSION = "3.2"
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "8.0.0"
UNIDATA_VERSION = "9.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@ -796,6 +796,7 @@ def merge_old_version(version, new, old):
category_changes = [0xFF]*0x110000
decimal_changes = [0xFF]*0x110000
mirrored_changes = [0xFF]*0x110000
east_asian_width_changes = [0xFF]*0x110000
# In numeric data, 0 means "no change",
# -1 means "did not have a numeric value"
numeric_changes = [0] * 0x110000
@ -862,6 +863,9 @@ def merge_old_version(version, new, old):
elif k == 14:
# change to simple titlecase mapping; ignore
pass
elif k == 15:
# change to east asian width
east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
elif k == 16:
# derived property changes; not yet
pass
@ -873,8 +877,9 @@ def merge_old_version(version, new, old):
class Difference(Exception):pass
raise Difference(hex(i), k, old.table[i], new.table[i])
new.changed.append((version, list(zip(bidir_changes, category_changes,
decimal_changes, mirrored_changes,
numeric_changes)),
decimal_changes, mirrored_changes,
east_asian_width_changes,
numeric_changes)),
normalization_changes))
def open_data(template, version):

View File

@ -652,7 +652,8 @@ class PyBuildExt(build_ext):
# profiler (_lsprof is for cProfile.py)
exts.append( Extension('_lsprof', ['_lsprof.c', 'rotatingtree.c']) )
# static Unicode character database
exts.append( Extension('unicodedata', ['unicodedata.c']) )
exts.append( Extension('unicodedata', ['unicodedata.c'],
depends=['unicodedata_db.h', 'unicodename_db.h']) )
# _opcode module
exts.append( Extension('_opcode', ['_opcode.c']) )