Upgrade to Unicode 6.0.0.
makeunicodedata.py: download all data files from unicode.org; switch to extracting the Unihan data from the zip file. Read linebreakprops and derivednormalizationprops even for old versions, even though they are not used in delta records. test_unicode.py: U+11000 is now assigned, so use U+14000 instead.
commit baecd7243a
parent e8930228c7
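A quick way to observe the upgrade from a built interpreter (a sketch, not part of the commit; it assumes a Python built after this change):

import unicodedata

# The database version that makeunicodedata.py compiles in.
print(unicodedata.unidata_version)            # expected: '6.0.0'

# The frozen Unicode 3.2.0 snapshot (used for IDNA) remains available.
print(unicodedata.ucd_3_2_0.unidata_version)  # '3.2.0'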
Lib/test/test_unicode.py

@@ -1349,7 +1349,7 @@ class UnicodeTest(string_tests.CommonTest,

     def test_printable_repr(self):
         self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
-        self.assertEqual(repr('\U00011000'), "'\\U00011000'") # nonprintable
+        self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable

     def test_expandtabs_overflows_gracefully(self):
         # This test only affects 32-bit platforms because expandtabs can only take
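Why the test constant had to move (a sketch, not part of the commit): Unicode 6.0.0 assigns U+11000, the first character of the new Brahmi block, so repr() now treats it as printable, while U+14000 is still unassigned:

import unicodedata

# U+11000 is assigned in Unicode 6.0.0, so its category is no longer
# 'Cn', str.isprintable() returns True, and repr() emits the character
# itself rather than a \U escape.
print(unicodedata.category('\U00011000'))   # an assigned category, not 'Cn'
print('\U00011000'.isprintable())           # True
print(repr('\U00011000'))                   # the character, unescaped

# U+14000 remains unassigned, which is what the updated test relies on.
print(unicodedata.category('\U00014000'))   # 'Cn'
print(repr('\U00014000'))                   # "'\\U00014000'"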
Lib/test/test_unicodedata.py

@@ -21,7 +21,7 @@ errors = 'surrogatepass'
 class UnicodeMethodsTest(unittest.TestCase):

     # update this, if the database changes
-    expectedchecksum = '4504dffd035baea02c5b9de82bebc3d65e0e0baf'
+    expectedchecksum = '21b90f1aed00081b81ca7942b22196af090015a0'

     def test_method_checksum(self):
         h = hashlib.sha1()

@@ -80,7 +80,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
 class UnicodeFunctionsTest(UnicodeDatabaseTest):

     # update this, if the database changes
-    expectedchecksum = 'e89a6380093a00a7685ac7b92e7367d737fcb79b'
+    expectedchecksum = 'c23dfc0b5eaf3ca2aad32d733de96bb182ccda50'
     def test_function_checksum(self):
         data = []
         h = hashlib.sha1()
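The expected checksums are fingerprints of the whole database, so they change with every upgrade. A minimal sketch of the idea (the real tests hash far more methods and properties than this):

import hashlib
import unicodedata

# Hash a few database properties for every code point. Upgrading the
# database from 5.2.0 to 6.0.0 changes this digest, which is why both
# expectedchecksum values above had to be bumped.
h = hashlib.sha1()
for i in range(0x110000):
    char = chr(i)
    record = '%s;%s;%d' % (unicodedata.category(char),
                           unicodedata.bidirectional(char),
                           unicodedata.combining(char))
    h.update(record.encode('ascii'))
print(h.hexdigest())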
Modules/unicodename_db.h: 34172 lines changed; file diff suppressed because it is too large.
Two further file diffs suppressed because they are too large.
Tools/unicode/makeunicodedata.py

@@ -25,17 +25,17 @@
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #

-import sys
+import sys, os, zipfile

 SCRIPT = sys.argv[0]
 VERSION = "3.2"

 # The Unicode Database
-UNIDATA_VERSION = "5.2.0"
+UNIDATA_VERSION = "6.0.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
-UNIHAN = "Unihan%s.txt"
+UNIHAN = "Unihan%s.zip"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
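The '%s' slot in each template takes either an empty string (the current data files) or '-<version>' (archived copies for old versions), which is why switching Unihan from .txt to .zip is a one-line change:

# How the filename templates are instantiated throughout the script.
UNICODE_DATA = "UnicodeData%s.txt"
UNIHAN = "Unihan%s.zip"

print(UNICODE_DATA % '')         # UnicodeData.txt
print(UNICODE_DATA % '-3.2.0')   # UnicodeData-3.2.0.txt
print(UNIHAN % '-6.0.0')         # Unihan-6.0.0.zip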
@@ -75,23 +75,13 @@ def maketables(trace=0):
     print("--- Reading", UNICODE_DATA % "", "...")

     version = ""
-    unicode = UnicodeData(UNICODE_DATA % version,
-                          COMPOSITION_EXCLUSIONS % version,
-                          EASTASIAN_WIDTH % version,
-                          UNIHAN % version,
-                          DERIVED_CORE_PROPERTIES % version,
-                          DERIVEDNORMALIZATION_PROPS % version,
-                          LINE_BREAK % version)
+    unicode = UnicodeData(UNIDATA_VERSION)

     print(len(list(filter(None, unicode.table))), "characters")

     for version in old_versions:
         print("--- Reading", UNICODE_DATA % ("-"+version), "...")
-        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
-                                  COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version),
-                                  UNIHAN % ("-"+version),
-                                  DERIVED_CORE_PROPERTIES % ("-"+version))
+        old_unicode = UnicodeData(version)
         print(len(list(filter(None, old_unicode.table))), "characters")
         merge_old_version(version, unicode, old_unicode)
@@ -771,6 +761,10 @@ def merge_old_version(version, new, old):
                 elif k == 16:
                     # derived property changes; not yet
                     pass
+                elif k == 17:
+                    # normalization quickchecks are not performed
+                    # for older versions
+                    pass
                 else:
                     class Difference(Exception):pass
                     raise Difference(hex(i), k, old.table[i], new.table[i])
@@ -779,6 +773,21 @@ def merge_old_version(version, new, old):
                           numeric_changes)),
            normalization_changes))

+def open_data(template, version):
+    local = template % ('-'+version,)
+    if not os.path.exists(local):
+        import urllib.request
+        if version == '3.2.0':
+            # irregular url structure
+            url = 'http://www.unicode.org/Public/3.2-Update/' + local
+        else:
+            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
+        urllib.request.urlretrieve(url, filename=local)
+    if local.endswith('.txt'):
+        return open(local, encoding='utf-8')
+    else:
+        # Unihan.zip
+        return open(local, 'rb')
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
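Tracing open_data by hand for two inputs (a sketch; data_url is a hypothetical helper that mirrors only the URL construction above, without the download and caching):

UNICODE_DATA = "UnicodeData%s.txt"
UNIHAN = "Unihan%s.zip"

def data_url(template, version):
    # Mirrors open_data's URL logic from the diff above.
    local = template % ('-'+version,)
    if version == '3.2.0':
        # irregular url structure
        return 'http://www.unicode.org/Public/3.2-Update/' + local
    return ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')

print(data_url(UNICODE_DATA, '6.0.0'))
# http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt
print(data_url(UNIHAN, '3.2.0'))
# http://www.unicode.org/Public/3.2-Update/Unihan-3.2.0.zip

Note the asymmetry: files are fetched under their unversioned names from a versioned directory, but cached locally under the versioned names the templates produce.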
@@ -793,11 +802,11 @@ class UnicodeData:
     # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
     # derived-props] (17)

-    def __init__(self, filename, exclusions, eastasianwidth, unihan,
-                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
+    def __init__(self, version,
+                 linebreakprops=False,
                  expand=1):
         self.changed = []
-        file = open(filename)
+        file = open_data(UNICODE_DATA, version)
         table = [None] * 0x110000
         while 1:
             s = file.readline()
@@ -825,11 +834,11 @@ class UnicodeData:
             table[i] = f2

         # public attributes
-        self.filename = filename
+        self.filename = UNICODE_DATA % ''
         self.table = table
         self.chars = list(range(0x110000)) # unicode 3.2

-        file = open(exclusions)
+        file = open_data(COMPOSITION_EXCLUSIONS, version)
         self.exclusions = {}
         for s in file:
             s = s.strip()
@@ -841,7 +850,7 @@ class UnicodeData:
             self.exclusions[char] = 1

         widths = [None] * 0x110000
-        for s in open(eastasianwidth):
+        for s in open_data(EASTASIAN_WIDTH, version):
             s = s.strip()
             if not s:
                 continue
@@ -862,7 +871,7 @@ class UnicodeData:
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(set())
-        for s in open(derivedprops):
+        for s in open_data(DERIVED_CORE_PROPERTIES, version):
             s = s.split('#', 1)[0].strip()
             if not s:
                 continue
|
||||||
# apply to unassigned code points; ignore them
|
# apply to unassigned code points; ignore them
|
||||||
table[char][-1].add(p)
|
table[char][-1].add(p)
|
||||||
|
|
||||||
if linebreakprops:
|
for s in open_data(LINE_BREAK, version):
|
||||||
for s in open(linebreakprops):
|
|
||||||
s = s.partition('#')[0]
|
s = s.partition('#')[0]
|
||||||
s = [i.strip() for i in s.split(';')]
|
s = [i.strip() for i in s.split(';')]
|
||||||
if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
|
if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
|
||||||
|
@@ -894,10 +902,16 @@ class UnicodeData:
             for char in range(first, last+1):
                 table[char][-1].add('Line_Break')

-        if derivednormalizationprops:
-            quickchecks = [0] * 0x110000 # default is Yes
+        # We only want the quickcheck properties
+        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
+        # Yes is the default, hence only N and M occur
+        # In 3.2.0, the format was different (NF?_NO)
+        # The parsing will incorrectly determine these as
+        # "yes", however, unicodedata.c will not perform quickchecks
+        # for older versions, and no delta records will be created.
+        quickchecks = [0] * 0x110000
         qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
-        for s in open(derivednormalizationprops):
+        for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
             if '#' in s:
                 s = s[:s.index('#')]
             s = [i.strip() for i in s.split(';')]
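For reference, the DerivedNormalizationProps lines consumed by this loop pair a code point (or range) with a quickcheck property and a Y/N/M value. A standalone parsing sketch (the sample lines and the printed bit-packing are illustrative, not taken from the commit):

# Parse two representative quickcheck lines the same way the loop above
# does. Only N and M appear in the file, since Y is the default.
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
sample = [
    "0340..0341    ; NFC_QC; N # Mn   [2] COMBINING GRAVE TONE MARK..",
    "0374          ; NFD_QC; N # Lm       GREEK NUMERAL SIGN",
]
for s in sample:
    if '#' in s:
        s = s[:s.index('#')]
    s = [i.strip() for i in s.split(';')]
    if len(s) < 2 or s[1] not in qc_order:
        continue
    quickcheck = 'MN'.index(s[2]) + 1            # Maybe -> 1, No -> 2
    quickcheck_shift = qc_order.index(s[1]) * 2  # two bits per property
    print(s[0], s[1], quickcheck << quickcheck_shift)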
@@ -917,7 +931,12 @@ class UnicodeData:
             if table[i] is not None:
                 table[i].append(quickchecks[i])

-        for line in open(unihan, encoding='utf-8'):
+        zip = zipfile.ZipFile(open_data(UNIHAN, version))
+        if version == '3.2.0':
+            data = zip.open('Unihan-3.2.0.txt').read()
+        else:
+            data = zip.open('Unihan_NumericValues.txt').read()
+        for line in data.decode("utf-8").splitlines():
             if not line.startswith('U+'):
                 continue
             code, tag, value = line.split(None, 3)[:3]
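The Unihan records extracted from the zip are tab-separated 'U+XXXX <tag> <value>' lines. A standalone parsing sketch (sample lines assumed; the tag names follow the Unihan database conventions):

# Parse Unihan-style records the way the loop above does.
sample = [
    "U+4E09\tkPrimaryNumeric\t3",
    "U+58F1\tkAccountingNumeric\t1",
    "# comment lines and other records are skipped",
]
for line in sample:
    if not line.startswith('U+'):
        continue
    code, tag, value = line.split(None, 3)[:3]
    print(int(code[2:], 16), tag, value)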