update to Unicode 6.1

This commit is contained in:
Benjamin Peterson 2012-02-20 22:24:29 -05:00
parent 16fa2a1097
commit 71f660e00f
7 changed files with 24819 additions and 23329 deletions

View File

@ -21,7 +21,7 @@ errors = 'surrogatepass'
class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes
expectedchecksum = 'df0b3ca6785a070b21f837b227dbdbdff3c2e921'
expectedchecksum = 'bf7a78f1a532421b5033600102e23a92044dbba9'
def test_method_checksum(self):
h = hashlib.sha1()
@ -80,7 +80,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
class UnicodeFunctionsTest(UnicodeDatabaseTest):
# update this, if the database changes
expectedchecksum = 'c23dfc0b5eaf3ca2aad32d733de96bb182ccda50'
expectedchecksum = '17fe2f12b788e4fff5479b469c4404bb6ecf841f'
def test_function_checksum(self):
data = []
h = hashlib.sha1()

View File

@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins
-----------------
- Upgrade Unicode data to Unicode 6.1.
- Issue #14040: Remove rarely used file name suffixes for C extensions
(under POSIX mainly).

View File

@ -921,7 +921,7 @@ is_unified_ideograph(Py_UCS4 code)
{
return
(0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
(0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
(0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
(0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
(0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -38,7 +38,7 @@ SCRIPT = sys.argv[0]
VERSION = "3.2"
# The Unicode Database
UNIDATA_VERSION = "6.0.0"
UNIDATA_VERSION = "6.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@ -58,7 +58,7 @@ PUA_16 = range(0x100000, 0x10FFFE)
# we use these ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0100
NAMED_SEQUENCES_START = 0xF0200
old_versions = ["3.2.0"]
@ -95,7 +95,7 @@ EXTENDED_CASE_MASK = 0x4000
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
('3400', '4DB5'),
('4E00', '9FCB'),
('4E00', '9FCC'),
('20000', '2A6D6'),
('2A700', '2B734'),
('2B740', '2B81D')
@ -958,7 +958,7 @@ class UnicodeData:
s = s.strip()
if not s or s.startswith('#'):
continue
char, name = s.split(';')
char, name, abbrev = s.split(';')
char = int(char, 16)
self.aliases.append((name, char))
# also store the name in the PUA 1
@ -971,6 +971,7 @@ class UnicodeData:
# in order to take advantage of the compression and lookup
# algorithms used for the other characters.
assert pua_index < NAMED_SEQUENCES_START
pua_index = NAMED_SEQUENCES_START
with open_data(NAMED_SEQUENCES, version) as file:
for s in file: