Upgrade to Unicode 6.0.0.

makeunicodedata.py: download all data files from unicode.org,
  switch to extracting Unihan data from zip file.
  Read linebreakprops and derivednormalizationprops even for
  old versions, even though they are not used in delta records.
test_unicode.py: U+11000 is now assigned, use U+14000 instead.
Martin v. Löwis 2010-10-11 22:42:28 +00:00
parent e8930228c7
commit baecd7243a
6 changed files with 21711 additions and 19847 deletions

Lib/test/test_unicode.py

@@ -1349,7 +1349,7 @@ class UnicodeTest(string_tests.CommonTest,
 
     def test_printable_repr(self):
         self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
-        self.assertEqual(repr('\U00011000'), "'\\U00011000'") # nonprintable
+        self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
 
     def test_expandtabs_overflows_gracefully(self):
         # This test only affects 32-bit platforms because expandtabs can only take
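For context: U+11000 gained an assignment in Unicode 6.0.0 (the new Brahmi block), so its repr() is no longer escaped, while U+14000 remains unassigned. A quick sanity check against an interpreter built with the new database might look like this (a sketch, not part of the commit):

import unicodedata

print(unicodedata.unidata_version)          # expected: '6.0.0'
print(unicodedata.category('\U00011000'))   # no longer 'Cn' (unassigned)
print(unicodedata.category('\U00014000'))   # still 'Cn'
print('\U00014000'.isprintable())           # False, hence the escaped repr()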

Lib/test/test_unicodedata.py

@@ -21,7 +21,7 @@ errors = 'surrogatepass'
 class UnicodeMethodsTest(unittest.TestCase):
 
     # update this, if the database changes
-    expectedchecksum = '4504dffd035baea02c5b9de82bebc3d65e0e0baf'
+    expectedchecksum = '21b90f1aed00081b81ca7942b22196af090015a0'
 
     def test_method_checksum(self):
         h = hashlib.sha1()
@@ -80,7 +80,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
 class UnicodeFunctionsTest(UnicodeDatabaseTest):
 
     # update this, if the database changes
-    expectedchecksum = 'e89a6380093a00a7685ac7b92e7367d737fcb79b'
+    expectedchecksum = 'c23dfc0b5eaf3ca2aad32d733de96bb182ccda50'
     def test_function_checksum(self):
         data = []
         h = hashlib.sha1()
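The expectedchecksum constants above are SHA-1 digests over properties of every code point, so any database upgrade invalidates them. A minimal sketch of the idea (the exact property set and formatting hashed by test_unicodedata.py differ; this is illustrative only):

import hashlib
import unicodedata

h = hashlib.sha1()
for i in range(0x10000):
    char = chr(i)
    # hash a few queryable properties per code point
    h.update(unicodedata.category(char).encode('ascii'))
    h.update(unicodedata.bidirectional(char).encode('ascii'))
print(h.hexdigest())  # changes whenever the underlying UCD changes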

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

Tools/unicode/makeunicodedata.py

@@ -25,17 +25,17 @@
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
 
-import sys
+import sys, os, zipfile
 
 SCRIPT = sys.argv[0]
 VERSION = "3.2"
 
 # The Unicode Database
-UNIDATA_VERSION = "5.2.0"
+UNIDATA_VERSION = "6.0.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
-UNIHAN = "Unihan%s.txt"
+UNIHAN = "Unihan%s.zip"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
@@ -75,23 +75,13 @@ def maketables(trace=0):
 
     print("--- Reading", UNICODE_DATA % "", "...")
 
     version = ""
-    unicode = UnicodeData(UNICODE_DATA % version,
-                          COMPOSITION_EXCLUSIONS % version,
-                          EASTASIAN_WIDTH % version,
-                          UNIHAN % version,
-                          DERIVED_CORE_PROPERTIES % version,
-                          DERIVEDNORMALIZATION_PROPS % version,
-                          LINE_BREAK % version)
+    unicode = UnicodeData(UNIDATA_VERSION)
 
     print(len(list(filter(None, unicode.table))), "characters")
 
     for version in old_versions:
         print("--- Reading", UNICODE_DATA % ("-"+version), "...")
-        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
-                                  COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version),
-                                  UNIHAN % ("-"+version),
-                                  DERIVED_CORE_PROPERTIES % ("-"+version))
+        old_unicode = UnicodeData(version)
         print(len(list(filter(None, old_unicode.table))), "characters")
         merge_old_version(version, unicode, old_unicode)
@@ -771,6 +761,10 @@ def merge_old_version(version, new, old):
                 elif k == 16:
                     # derived property changes; not yet
                     pass
+                elif k == 17:
+                    # normalization quickchecks are not performed
+                    # for older versions
+                    pass
                 else:
                     class Difference(Exception):pass
                     raise Difference(hex(i), k, old.table[i], new.table[i])
@@ -779,6 +773,21 @@ def merge_old_version(version, new, old):
                                           numeric_changes)),
                        normalization_changes))
 
+def open_data(template, version):
+    local = template % ('-'+version,)
+    if not os.path.exists(local):
+        import urllib.request
+        if version == '3.2.0':
+            # irregular url structure
+            url = 'http://www.unicode.org/Public/3.2-Update/' + local
+        else:
+            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
+        urllib.request.urlretrieve(url, filename=local)
+    if local.endswith('.txt'):
+        return open(local, encoding='utf-8')
+    else:
+        # Unihan.zip
+        return open(local, 'rb')
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
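The open_data() helper added above replaces the direct open() calls that follow: it looks for a cached copy next to the script and downloads the file from unicode.org on first use. A sketch of what a call resolves to under the templates defined earlier:

# open_data(UNICODE_DATA, '6.0.0') caches UnicodeData-6.0.0.txt locally,
# fetching http://www.unicode.org/Public/6.0.0/ucd/UnicodeData.txt if absent
file = open_data(UNICODE_DATA, '6.0.0')   # .txt files open in text mode, UTF-8
zfh = open_data(UNIHAN, '6.0.0')          # Unihan-6.0.0.zip opens in binary mode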
@@ -793,11 +802,11 @@ class UnicodeData:
     #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
     #  derived-props] (17)
 
-    def __init__(self, filename, exclusions, eastasianwidth, unihan,
-                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
+    def __init__(self, version,
+                 linebreakprops=False,
                  expand=1):
         self.changed = []
-        file = open(filename)
+        file = open_data(UNICODE_DATA, version)
         table = [None] * 0x110000
         while 1:
             s = file.readline()
@@ -825,11 +834,11 @@ class UnicodeData:
                 table[i] = f2
 
         # public attributes
-        self.filename = filename
+        self.filename = UNICODE_DATA % ''
         self.table = table
         self.chars = list(range(0x110000)) # unicode 3.2
 
-        file = open(exclusions)
+        file = open_data(COMPOSITION_EXCLUSIONS, version)
         self.exclusions = {}
         for s in file:
             s = s.strip()
@@ -841,7 +850,7 @@ class UnicodeData:
                 self.exclusions[char] = 1
 
         widths = [None] * 0x110000
-        for s in open(eastasianwidth):
+        for s in open_data(EASTASIAN_WIDTH, version):
             s = s.strip()
             if not s:
                 continue
@@ -862,7 +871,7 @@ class UnicodeData:
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(set())
-        for s in open(derivedprops):
+        for s in open_data(DERIVED_CORE_PROPERTIES, version):
             s = s.split('#', 1)[0].strip()
             if not s:
                 continue
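Most UCD text files parsed in this constructor share one line format: a code point or range, then semicolon-separated fields, with '#' starting a comment. A standalone sketch of the parsing idiom these loops repeat (the sample line is representative, not quoted from the commit):

line = '0041..005A    ; Uppercase # L& [26] LATIN CAPITAL LETTER A..Z'
s = line.split('#', 1)[0].strip()
field, prop = [f.strip() for f in s.split(';')]
if '..' in field:
    first, last = [int(c, 16) for c in field.split('..')]
else:
    first = last = int(field, 16)
print(hex(first), hex(last), prop)  # 0x41 0x5a Uppercase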
@@ -881,43 +890,53 @@ class UnicodeData:
                 # apply to unassigned code points; ignore them
                 table[char][-1].add(p)
 
-        if linebreakprops:
-            for s in open(linebreakprops):
-                s = s.partition('#')[0]
-                s = [i.strip() for i in s.split(';')]
-                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
-                    continue
-                if '..' not in s[0]:
-                    first = last = int(s[0], 16)
-                else:
-                    first, last = [int(c, 16) for c in s[0].split('..')]
-                for char in range(first, last+1):
-                    table[char][-1].add('Line_Break')
+        for s in open_data(LINE_BREAK, version):
+            s = s.partition('#')[0]
+            s = [i.strip() for i in s.split(';')]
+            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                continue
+            if '..' not in s[0]:
+                first = last = int(s[0], 16)
+            else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
+                table[char][-1].add('Line_Break')
 
-        if derivednormalizationprops:
-            quickchecks = [0] * 0x110000 # default is Yes
-            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
-            for s in open(derivednormalizationprops):
-                if '#' in s:
-                    s = s[:s.index('#')]
-                s = [i.strip() for i in s.split(';')]
-                if len(s) < 2 or s[1] not in qc_order:
-                    continue
-                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
-                quickcheck_shift = qc_order.index(s[1])*2
-                quickcheck <<= quickcheck_shift
-                if '..' not in s[0]:
-                    first = last = int(s[0], 16)
-                else:
-                    first, last = [int(c, 16) for c in s[0].split('..')]
-                for char in range(first, last+1):
-                    assert not (quickchecks[char]>>quickcheck_shift)&3
-                    quickchecks[char] |= quickcheck
-            for i in range(0, 0x110000):
-                if table[i] is not None:
-                    table[i].append(quickchecks[i])
+        # We only want the quickcheck properties
+        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
+        # Yes is the default, hence only N and M occur
+        # In 3.2.0, the format was different (NF?_NO)
+        # The parsing will incorrectly determine these as
+        # "yes", however, unicodedata.c will not perform quickchecks
+        # for older versions, and no delta records will be created.
+        quickchecks = [0] * 0x110000
+        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+        for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
+            if '#' in s:
+                s = s[:s.index('#')]
+            s = [i.strip() for i in s.split(';')]
+            if len(s) < 2 or s[1] not in qc_order:
+                continue
+            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+            quickcheck_shift = qc_order.index(s[1])*2
+            quickcheck <<= quickcheck_shift
+            if '..' not in s[0]:
+                first = last = int(s[0], 16)
+            else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
+                assert not (quickchecks[char]>>quickcheck_shift)&3
+                quickchecks[char] |= quickcheck
+        for i in range(0, 0x110000):
+            if table[i] is not None:
+                table[i].append(quickchecks[i])
 
-        for line in open(unihan, encoding='utf-8'):
+        zip = zipfile.ZipFile(open_data(UNIHAN, version))
+        if version == '3.2.0':
+            data = zip.open('Unihan-3.2.0.txt').read()
+        else:
+            data = zip.open('Unihan_NumericValues.txt').read()
+        for line in data.decode("utf-8").splitlines():
             if not line.startswith('U+'):
                 continue
             code, tag, value = line.split(None, 3)[:3]
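The quickcheck values packed above use two bits per property, in qc_order (NFD_QC in the lowest bits), with 0 meaning Yes, 1 Maybe, and 2 No. A sketch of the inverse mapping, using a hypothetical helper and a made-up packed value:

def decode_quickchecks(packed):
    # mirrors qc_order and the 'MN'.index(...) + 1 encoding above
    names = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
    values = ['Yes', 'Maybe', 'No']
    return {name: values[(packed >> (i*2)) & 3]
            for i, name in enumerate(names)}

print(decode_quickchecks(0b100010))
# {'NFD_QC': 'No', 'NFKD_QC': 'Yes', 'NFC_QC': 'No', 'NFKC_QC': 'Yes'}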