110 lines
3.1 KiB
Python
110 lines
3.1 KiB
Python
#! /usr/bin/env python
|
|
import sys
|
|
import string
|
|
import perfect_hash
|
|
|
|
# This is a user of perfect_hash.py
|
|
# that takes as input the UnicodeData.txt file available from:
|
|
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
|
|
|
# It generates a hash table from Unicode Character Name ->
|
|
# unicode code space value.
|
|
|
|
# These variables determine which hash function is tried first.
|
|
# Yields a multiple of 1.7875 for UnicodeData.txt on 2000/06/24/
|
|
f1Seed = 1694245428
|
|
f2Seed = -1917331657
|
|
|
|
# Maximum allowed multipler, if this isn't None then instead of continually
|
|
# increasing C, it resets it back to initC to keep searching for
|
|
# a solution.
|
|
minC = 1.7875
|
|
# Initial multiplier for trying to find a perfect hash function.
|
|
initC = 1.7875
|
|
|
|
moduleName = "ucnhash"
|
|
dataArrayName = "aucn"
|
|
dataArrayType = "_Py_UnicodeCharacterName"
|
|
headerFileName = "ucnhash.h"
|
|
cFileName = "ucnhash.c"
|
|
structName = "_Py_UCNHashAPI"
|
|
|
|
keys = []
|
|
hashData = {}
|
|
|
|
def generateOutputFiles(perfHash, hashData):
|
|
header = perfHash.generate_header(structName)
|
|
header = header + """
|
|
typedef struct
|
|
{
|
|
const char *pszUCN;
|
|
unsigned int uiValue;
|
|
} _Py_UnicodeCharacterName;
|
|
|
|
"""
|
|
|
|
code = perfHash.generate_code(moduleName,
|
|
dataArrayName,
|
|
dataArrayType,
|
|
structName)
|
|
out = open(headerFileName, "w")
|
|
out.write(header)
|
|
out = open(cFileName, "w")
|
|
out.write("#include <%s>\n" % headerFileName)
|
|
out.write(code)
|
|
perfHash.generate_graph(out)
|
|
out.write("""
|
|
|
|
static const _Py_UnicodeCharacterName aucn[] =
|
|
{
|
|
""")
|
|
for i in xrange(len(keys)):
|
|
v = hashData[keys[i][0]]
|
|
out.write(' { "' + keys[i][0] + '", ' + hex(v) + " }," + "\n")
|
|
out.write("};\n\n")
|
|
sys.stderr.write('\nGenerated output files: \n')
|
|
sys.stderr.write('%s\n%s\n' % (headerFileName, cFileName))
|
|
|
|
def main():
|
|
# Suck in UnicodeData.txt and spit out the generated files.
|
|
input = open(sys.argv[1], 'r')
|
|
i = 0
|
|
while 1:
|
|
line = input.readline()
|
|
if line == "": break
|
|
fields = string.split(line, ';')
|
|
if len(fields) < 2:
|
|
sys.stderr.write('Ill-formated line!\n')
|
|
sys.stderr.write('line #: %d\n' % (i + 1))
|
|
sys.exit()
|
|
data, key = fields[:2]
|
|
key = string.strip( key )
|
|
# Any name starting with '<' is a control, or start/end character,
|
|
# so skip it...
|
|
if key[0] == "<":
|
|
continue
|
|
hashcode = i
|
|
i = i + 1
|
|
# force the name to uppercase
|
|
keys.append( (string.upper(key),hashcode) )
|
|
data = string.atoi(data, 16)
|
|
hashData[key] = data
|
|
|
|
input.close()
|
|
sys.stderr.write('%i key/hash pairs read\n' % len(keys) )
|
|
perfHash = perfect_hash.generate_hash(keys, 1,
|
|
minC, initC,
|
|
f1Seed, f2Seed,
|
|
# increment, tries
|
|
0.0025, 50)
|
|
generateOutputFiles(perfHash, hashData)
|
|
|
|
if __name__ == '__main__':
|
|
if len(sys.argv) == 1:
|
|
sys.stdout = sys.stderr
|
|
print 'Usage: %s <input filename>' % sys.argv[0]
|
|
print ' The input file needs to be UnicodeData.txt'
|
|
sys.exit()
|
|
main()
|
|
|