#! /usr/bin/env python import sys import string import perfect_hash # This is a user of perfect_hash.py # that takes as input the UnicodeData.txt file available from: # ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt # It generates a hash table from Unicode Character Name -> # unicode code space value. # These variables determine which hash function is tried first. # Yields a multiple of 1.7875 for UnicodeData.txt on 2000/06/24/ f1Seed = 1694245428 f2Seed = -1917331657 # Maximum allowed multipler, if this isn't None then instead of continually # increasing C, it resets it back to initC to keep searching for # a solution. minC = 1.7875 # Initial multiplier for trying to find a perfect hash function. initC = 1.7875 moduleName = "ucnhash" dataArrayName = "aucn" dataArrayType = "_Py_UnicodeCharacterName" headerFileName = "ucnhash.h" cFileName = "ucnhash.c" structName = "_Py_UCNHashAPI" keys = [] hashData = {} def generateOutputFiles(perfHash, hashData): header = perfHash.generate_header(structName) header = header + """ typedef struct { const char *pszUCN; unsigned int uiValue; } _Py_UnicodeCharacterName; """ code = perfHash.generate_code(moduleName, dataArrayName, dataArrayType, structName) out = open(headerFileName, "w") out.write(header) out = open(cFileName, "w") out.write("#include \"%s\"\n" % headerFileName) out.write(code) perfHash.generate_graph(out) out.write(""" static const _Py_UnicodeCharacterName aucn[] = { """) for i in xrange(len(keys)): v = hashData[keys[i][0]] out.write(' { "' + keys[i][0] + '", ' + hex(v) + " }," + "\n") out.write("};\n\n") sys.stderr.write('\nGenerated output files: \n') sys.stderr.write('%s\n%s\n' % (headerFileName, cFileName)) def main(): # Suck in UnicodeData.txt and spit out the generated files. input = open(sys.argv[1], 'r') i = 0 while 1: line = input.readline() if line == "": break fields = string.split(line, ';') if len(fields) < 2: sys.stderr.write('Ill-formated line!\n') sys.stderr.write('line #: %d\n' % (i + 1)) sys.exit() data, key = fields[:2] key = string.strip( key ) # Any name starting with '<' is a control, or start/end character, # so skip it... if key[0] == "<": continue hashcode = i i = i + 1 # force the name to uppercase keys.append( (string.upper(key),hashcode) ) data = string.atoi(data, 16) hashData[key] = data input.close() sys.stderr.write('%i key/hash pairs read\n' % len(keys) ) perfHash = perfect_hash.generate_hash(keys, 1, minC, initC, f1Seed, f2Seed, # increment, tries 0.0025, 50) generateOutputFiles(perfHash, hashData) if __name__ == '__main__': if len(sys.argv) == 1: sys.stdout = sys.stderr print 'Usage: %s ' % sys.argv[0] print ' The input file needs to be UnicodeData.txt' sys.exit() main()