unicode database compression, step 3:

- use unidb compression for the unicodectype module.  smaller,
  faster, and slightly more portable...

- also mention the unicode directory in Tools/README
This commit is contained in:
Fredrik Lundh 2000-09-25 17:59:57 +00:00
parent e53793bf4c
commit e9133f7e2e
2 changed files with 100 additions and 9 deletions

View File

@ -21,6 +21,9 @@ scripts A number of useful single-file programs, e.g. tabnanny.py
(by Tim Peters), which checks for inconsistent mixing
of tabs and spaces.
unicode Tools used to generate unicode database files for
Python 2.0 (by Fredrik Lundh).
versioncheck A tool to automate checking whether you have the latest
version of a package (by Jack Jansen).

View File

@ -1,9 +1,13 @@
#
# generate a compact version of the unicode property database
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
@ -13,7 +17,7 @@ import sys
SCRIPT = sys.argv[0]
VERSION = "1.1"
UNICODE_DATA = "../UnicodeData-Latest.txt"
UNICODE_DATA = "UnicodeData-Latest.txt"
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@ -24,7 +28,16 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
"ON" ]
def maketable():
ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
NUMERIC_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
def maketables():
unicode = UnicodeData(UNICODE_DATA)
@ -74,7 +87,7 @@ def maketable():
i = 0
decomp_index[char] = i
FILE = "unicodedata_db.h"
FILE = "Modules/unicodedata_db.h"
sys.stdout = open(FILE, "w")
@ -87,6 +100,9 @@ def maketable():
print "};"
print
# FIXME: the following tables should be made static, and
# the support code moved into unicodedatabase.c
print "/* string literals */"
print "const char *_PyUnicode_CategoryNames[] = {"
for name in CATEGORY_NAMES:
@ -106,24 +122,96 @@ def maketable():
print " NULL"
print "};"
# split index table
# split record index table
index1, index2, shift = splitbins(index)
print "/* index tables used to find the right database record */"
print "/* index tables for the database records */"
print "#define SHIFT", shift
Array("index1", index1).dump(sys.stdout)
Array("index2", index2).dump(sys.stdout)
# split index table
# split decomposition index table
index1, index2, shift = splitbins(decomp_index)
print "/* same, for the decomposition data */"
print "/* index tables for the decomposition data */"
print "#define DECOMP_SHIFT", shift
Array("decomp_index1", index1).dump(sys.stdout)
Array("decomp_index2", index2).dump(sys.stdout)
sys.stdout = sys.__stdout__
#
# 3) unicode type data
# extract unicode types
dummy = (0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
for char in unicode.chars:
record = unicode.table[char]
if record:
# extract database properties
category = record[2]
bidirectional = record[4]
flags = 0
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
flags |= ALPHA_MASK
if category == "Ll":
flags |= LOWER_MASK
if category == "Zs" or bidirectional in ("WS", "B", "S"):
flags |= SPACE_MASK
if category in ["Lt", "Lu"]:
flags |= TITLE_MASK
if category == "Lu":
flags |= UPPER_MASK
# store each upper/lower/title mapping as a 16-bit offset from the
# character's own code point (0 when the record has no mapping)
if record[12]:
upper = (int(record[12], 16) - char) & 0xffff
else:
upper = 0
if record[13]:
lower = (int(record[13], 16) - char) & 0xffff
else:
lower = 0
if record[14]:
title = (int(record[14], 16) - char) & 0xffff
else:
title = 0
item = (
flags, upper, lower, title
)
# add entry to index and item tables
i = cache.get(item)
if i is None:
cache[item] = i = len(table)
table.append(item)
index[char] = i
FILE = "Objects/unicodetype_db.h"
sys.stdout = open(FILE, "w")
print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
print
print "/* a list of unique character type descriptors */"
print "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
for item in table:
print " {%d, %d, %d, %d}," % item
print "};"
print
# split type index table
index1, index2, shift = splitbins(index)
print "/* type indexes */"
print "#define SHIFT", shift
Array("index1", index1).dump(sys.stdout)
Array("index2", index2).dump(sys.stdout)
sys.stdout = sys.__stdout__
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB
@ -259,4 +347,4 @@ def splitbins(t, trace=0):
return best
if __name__ == "__main__":
maketable()
maketables()