Removed Tools/perfecthash, per python-dev discussion

This commit is contained in:
Andrew M. Kuchling 2000-07-29 13:24:39 +00:00
parent 7a4409c1b2
commit 7e11170e85
3 changed files with 0 additions and 842 deletions

View File

@ -1,109 +0,0 @@
#! /usr/bin/env python
import sys
import string
import perfect_hash
# This is a user of perfect_hash.py
# that takes as input the UnicodeData.txt file available from:
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
# It generates a hash table from Unicode Character Name ->
# unicode code space value.
# These variables determine which hash function is tried first.
# Yields a multiple of 1.7875 for UnicodeData.txt on 2000/06/24/
f1Seed = 0x64fc2234
f2Seed = 0x8db7d737
# Maximum allowed multipler, if this isn't None then instead of continually
# increasing C, it resets it back to initC to keep searching for
# a solution.
minC = 1.7875
# Initial multiplier for trying to find a perfect hash function.
initC = 1.7875
moduleName = "ucnhash"
dataArrayName = "aucn"
dataArrayType = "_Py_UnicodeCharacterName"
headerFileName = "ucnhash.h"
cFileName = "ucnhash.c"
structName = "_Py_UCNHashAPI"
keys = []
hashData = {}
def generateOutputFiles(perfHash, hashData):
header = perfHash.generate_header(structName)
header = header + """
typedef struct
{
const char *pszUCN;
Py_UCS4 value;
} _Py_UnicodeCharacterName;
"""
code = perfHash.generate_code(moduleName,
dataArrayName,
dataArrayType,
structName)
out = open(headerFileName, "w")
out.write(header)
out = open(cFileName, "w")
out.write("#include \"%s\"\n" % headerFileName)
out.write(code)
perfHash.generate_graph(out)
out.write("""
static const _Py_UnicodeCharacterName aucn[] =
{
""")
for i in xrange(len(keys)):
v = hashData[keys[i][0]]
out.write(' { "' + keys[i][0] + '", ' + hex(v) + " }," + "\n")
out.write("};\n\n")
sys.stderr.write('\nGenerated output files: \n')
sys.stderr.write('%s\n%s\n' % (headerFileName, cFileName))
def main():
# Suck in UnicodeData.txt and spit out the generated files.
input = open(sys.argv[1], 'r')
i = 0
while 1:
line = input.readline()
if line == "": break
fields = string.split(line, ';')
if len(fields) < 2:
sys.stderr.write('Ill-formated line!\n')
sys.stderr.write('line #: %d\n' % (i + 1))
sys.exit()
data, key = fields[:2]
key = string.strip( key )
# Any name starting with '<' is a control, or start/end character,
# so skip it...
if key[0] == "<":
continue
hashcode = i
i = i + 1
# force the name to uppercase
keys.append( (string.upper(key),hashcode) )
data = string.atoi(data, 16)
hashData[key] = data
input.close()
sys.stderr.write('%i key/hash pairs read\n' % len(keys) )
perfHash = perfect_hash.generate_hash(keys, 1,
minC, initC,
f1Seed, f2Seed,
# increment, tries
0.0025, 50)
generateOutputFiles(perfHash, hashData)
if __name__ == '__main__':
if len(sys.argv) == 1:
sys.stdout = sys.stderr
print 'Usage: %s <input filename>' % sys.argv[0]
print ' The input file needs to be UnicodeData.txt'
sys.exit()
main()

View File

@ -1,619 +0,0 @@
#!/usr/bin/env/python
# perfect_hash.py
#
# Outputs C code for a minimal perfect hash.
# The hash is produced using the algorithm described in
# "Optimal algorithms for minimal perfect hashing",
# G. Havas, B.S. Majewski. Available as a technical report
# from the CS department, University of Queensland
# (ftp://ftp.cs.uq.oz.au/).
#
# This is a modified version of Andrew Kuchling's code
# (http://starship.python.net/crew/amk/python/code/perfect-hash.html)
# and generates C fragments suitable for compilation as a Python
# extension module.
#
# Difference between this algorithm and gperf:
# Gperf will complete in finite time with a successful function,
# or by giving up.
# This algorithm may never complete, although it is extremely likely
# when c >= 2.
# The algorithm works like this:
# 0) You have K keys, that you want to perfectly hash to a bunch
# of hash values.
#
# 1) Choose a number N larger than K. This is the number of
# vertices in a graph G, and also the size of the resulting table.
#
# 2) Pick two random hash functions f1, f2, that output values from
# 0...N-1.
#
# 3) for key in keys:
# h1 = f1(key) ; h2 = f2(key)
# Draw an edge between vertices h1 and h2 of the graph.
# Associate the desired hash value with that edge.
#
# 4) Check if G is acyclic; if not, go back to step 1 and pick a bigger N.
#
# 5) Assign values to each vertex such that, for each edge, you can
# add the values for the two vertices and get the desired value
# for that edge -- which is the desired hash key. This task is
# dead easy, because the graph is acyclic. This is done by
# picking a vertex V, and assigning it a value of 0. You then do a
# depth-first search, assigning values to new vertices so that
# they sum up properly.
#
# 6) f1, f2, and G now make up your perfect hash function.
import sys, whrandom, string
import pprint
import perfhash
import time
class Hash:
"""Random hash function
For simplicity and speed, this doesn't implement any byte-level hashing
scheme. Instead, a random string is generated and prefixing to
str(key), and then Python's hashing function is used."""
def __init__(self, N, caseInsensitive=0):
self.N = N
junk = ""
for i in range(10):
junk = junk + whrandom.choice(string.letters + string.digits)
self.junk = junk
self.caseInsensitive = caseInsensitive
self.seed = perfhash.calcSeed(junk)
def __call__(self, key):
key = str(key)
if self.caseInsensitive:
key = string.upper(key)
x = perfhash.hash(self.seed, len(self.junk), key, self.N)
return x
def generate_code(self):
s = """{
register int len;
register unsigned char *p;
register unsigned long x;
len = cch;
p = (unsigned char *) key;
x = %(junkSeed)s;
while (--len >= 0)
{
/* (1000003 * x) ^ toupper(*(p++))
* translated to handle > 32 bit longs
*/
x = (0xf4243 * x);
x = x & 0xFFFFFFFF;
x = x ^ """ % \
{
"lenJunk" : len(self.junk),
"junkSeed" : hex(self.seed),
}
if self.caseInsensitive:
s = s + "toupper(*(p++));"
else:
s = s + "*(p++);"
s = s + """
}
x ^= cch + %(lenJunk)d;
if (x == 0xFFFFFFFF)
x = 0xfffffffe;
if (x & 0x80000000)
{
/* Emulate 32-bit signed (2's complement) modulo operation */
x = (~x & 0xFFFFFFFF) + 1;
x %%= k_cHashElements;
if (x != 0)
{
x = x + (~k_cHashElements & 0xFFFFFFFF) + 1;
x = (~x & 0xFFFFFFFF) + 1;
}
}
else
x %%= k_cHashElements;
return x;
}
""" % { "lenJunk" : len(self.junk),
"junkSeed" : hex(self.seed), }
return s
WHITE, GREY, BLACK = 0,1,2
class Graph:
"""Graph class. This class isn't particularly efficient or general,
and only has the features I needed to implement this algorithm.
num_vertices -- number of vertices
edges -- maps 2-tuples of vertex numbers to the value for this
edge. If there's an edge between v1 and v2 (v1<v2),
(v1,v2) is a key and the value is the edge's value.
reachable_list -- maps a vertex V to the list of vertices
to which V is connected by edges. Used
for traversing the graph.
values -- numeric value for each vertex
"""
def __init__(self, num_vertices):
self.num_vertices = num_vertices
self.edges = {}
self.reachable_list = {}
self.values = [-1] * num_vertices
def connect(self, vertex1, vertex2, value):
"""Connect 'vertex1' and 'vertex2' with an edge, with associated
value 'value'"""
if vertex1 > vertex2: vertex1, vertex2 = vertex2, vertex1
if self.edges.has_key( (vertex1, vertex2) ):
raise ValueError, 'Collision: vertices already connected'
self.edges[ (vertex1, vertex2) ] = value
# Add vertices to each other's reachable list
if not self.reachable_list.has_key( vertex1 ):
self.reachable_list[ vertex1 ] = [vertex2]
else:
self.reachable_list[vertex1].append(vertex2)
if not self.reachable_list.has_key( vertex2 ):
self.reachable_list[ vertex2 ] = [vertex1]
else:
self.reachable_list[vertex2].append(vertex1)
def get_edge_value(self, vertex1, vertex2):
"""Retrieve the value corresponding to the edge between
'vertex1' and 'vertex2'. Raises KeyError if no such edge"""
if vertex1 > vertex2:
vertex1, vertex2 = vertex2, vertex1
return self.edges[ (vertex1, vertex2) ]
def is_acyclic(self):
"Returns true if the graph is acyclic, otherwise false"
# This is done by doing a depth-first search of the graph;
# painting each vertex grey and then black. If the DFS
# ever finds a vertex that isn't white, there's a cycle.
colour = {}
for i in range(self.num_vertices): colour[i] = WHITE
# Loop over all vertices, taking white ones as starting
# points for a traversal.
for i in range(self.num_vertices):
if colour[i] == WHITE:
# List of vertices to visit
visit_list = [ (None,i) ]
# Do a DFS
while visit_list:
# Colour this vertex grey.
parent, vertex = visit_list[0] ; del visit_list[0]
colour[vertex] = GREY
# Make copy of list of neighbours, removing the vertex
# we arrived here from.
neighbours = self.reachable_list.get(vertex, []) [:]
if parent in neighbours: neighbours.remove( parent )
for neighbour in neighbours:
if colour[neighbour] == WHITE:
visit_list.insert(0, (vertex, neighbour) )
elif colour[neighbour] != WHITE:
# Aha! Already visited this node,
# so the graph isn't acyclic.
return 0
colour[vertex] = BLACK
# We got through, so the graph is acyclic.
return 1
def assign_values(self):
"""Compute values for each vertex, so that they sum up
properly to the associated value for each edge."""
# Also done with a DFS; I simply copied the DFS code
# from is_acyclic(). (Should generalize the logic so
# one function could be used from both methods,
# but I couldn't be bothered.)
colour = {}
for i in range(self.num_vertices): colour[i] = WHITE
# Loop over all vertices, taking white ones as starting
# points for a traversal.
for i in range(self.num_vertices):
if colour[i] == WHITE:
# Set this vertex's value, arbitrarily, to zero.
self.set_vertex_value( i, 0 )
# List of vertices to visit
visit_list = [ (None,i) ]
# Do a DFS
while visit_list:
# Colour this vertex grey.
parent, vertex = visit_list[0] ; del visit_list[0]
colour[vertex] = GREY
# Make copy of list of neighbours, removing the vertex
# we arrived here from.
neighbours = self.reachable_list.get(vertex, []) [:]
if parent in neighbours: neighbours.remove( parent )
for neighbour in self.reachable_list.get(vertex, []):
edge_value = self.get_edge_value( vertex, neighbour )
if colour[neighbour] == WHITE:
visit_list.insert(0, (vertex, neighbour) )
# Set new vertex's value to the desired
# edge value, minus the value of the
# vertex we came here from.
new_val = (edge_value -
self.get_vertex_value( vertex ) )
self.set_vertex_value( neighbour,
new_val % self.num_vertices)
colour[vertex] = BLACK
# Returns nothing
return
def __getitem__(self, index):
if index < self.num_vertices: return index
raise IndexError
def get_vertex_value(self, vertex):
"Get value for a vertex"
return self.values[ vertex ]
def set_vertex_value(self, vertex, value):
"Set value for a vertex"
self.values[ vertex ] = value
def generate_code(self, out, width = 70):
"Return nicely formatted table"
out.write("{ ")
pos = 0
for v in self.values:
v=str(v)+', '
out.write(v)
pos = pos + len(v) + 1
if pos > width: out.write('\n '); pos = 0
out.write('};\n')
class PerfectHash:
def __init__(self, cchMax, f1, f2, G, cHashElements, cKeys, maxHashValue):
self.cchMax = cchMax
self.f1 = f1
self.f2 = f2
self.G = G
self.cHashElements = cHashElements
self.cKeys = cKeys
# determine the necessary type for storing our hash function
# helper table:
self.type = self.determineType(maxHashValue)
def generate_header(self, structName):
header = """
#include "Python.h"
#include <stdlib.h>
/* --- C API ----------------------------------------------------*/
/* C API for usage by other Python modules */
typedef struct %(structName)s
{
unsigned long cKeys;
unsigned long cchMax;
unsigned long (*hash)(const char *key, unsigned int cch);
const void *(*getValue)(unsigned long iKey);
} %(structName)s;
""" % { "structName" : structName }
return header
def determineType(self, maxHashValue):
if maxHashValue <= 255:
return "unsigned char"
elif maxHashValue <= 65535:
return "unsigned short"
else:
# Take the cheesy way out...
return "unsigned long"
def generate_code(self, moduleName, dataArrayName, dataArrayType, structName):
# Output C code for the hash functions and tables
code = """
/*
* The hash is produced using the algorithm described in
* "Optimal algorithms for minimal perfect hashing",
* G. Havas, B.S. Majewski. Available as a technical report
* from the CS department, University of Queensland
* (ftp://ftp.cs.uq.oz.au/).
*
* Generated using a heavily tweaked version of Andrew Kuchling's
* perfect_hash.py:
* http://starship.python.net/crew/amk/python/code/perfect-hash.html
*
* Generated on: %s
*/
""" % time.ctime(time.time())
# MSVC SP3 was complaining when I actually used a global constant
code = code + """
#define k_cHashElements %i
#define k_cchMaxKey %d
#define k_cKeys %i
""" % (self.cHashElements, self.cchMax, self.cKeys)
code = code + """
staticforward const %s G[k_cHashElements];
staticforward const %s %s[k_cKeys];
""" % (self.type, dataArrayType, dataArrayName)
code = code + """
static long f1(const char *key, unsigned int cch)
"""
code = code + self.f1.generate_code()
code = code + """
static long f2(const char *key, unsigned int cch)
"""
code = code + self.f2.generate_code()
code = code + """
static unsigned long hash(const char *key, unsigned int cch)
{
return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) %% k_cHashElements;
}
const void *getValue(unsigned long iKey)
{
return &%(dataArrayName)s[iKey];
}
/* Helper for adding objects to dictionaries. Check for errors with
PyErr_Occurred() */
static
void insobj(PyObject *dict,
char *name,
PyObject *v)
{
PyDict_SetItemString(dict, name, v);
Py_XDECREF(v);
}
static const %(structName)s hashAPI =
{
k_cKeys,
k_cchMaxKey,
&hash,
&getValue,
};
static
PyMethodDef Module_methods[] =
{
{NULL, NULL},
};
static char *Module_docstring = "%(moduleName)s hash function module";
/* Error reporting for module init functions */
#define Py_ReportModuleInitError(modname) { \\
PyObject *exc_type, *exc_value, *exc_tb; \\
PyObject *str_type, *str_value; \\
\\
/* Fetch error objects and convert them to strings */ \\
PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \\
if (exc_type && exc_value) { \\
str_type = PyObject_Str(exc_type); \\
str_value = PyObject_Str(exc_value); \\
} \\
else { \\
str_type = NULL; \\
str_value = NULL; \\
} \\
/* Try to format a more informative error message using the \\
original error */ \\
if (str_type && str_value && \\
PyString_Check(str_type) && PyString_Check(str_value)) \\
PyErr_Format( \\
PyExc_ImportError, \\
"initialization of module "modname" failed " \\
"(%%s:%%s)", \\
PyString_AS_STRING(str_type), \\
PyString_AS_STRING(str_value)); \\
else \\
PyErr_SetString( \\
PyExc_ImportError, \\
"initialization of module "modname" failed"); \\
Py_XDECREF(str_type); \\
Py_XDECREF(str_value); \\
Py_XDECREF(exc_type); \\
Py_XDECREF(exc_value); \\
Py_XDECREF(exc_tb); \\
}
/* Create PyMethodObjects and register them in the module\'s dict */
DL_EXPORT(void)
init%(moduleName)s(void)
{
PyObject *module, *moddict;
/* Create module */
module = Py_InitModule4("%(moduleName)s", /* Module name */
Module_methods, /* Method list */
Module_docstring, /* Module doc-string */
(PyObject *)NULL, /* always pass this as *self */
PYTHON_API_VERSION); /* API Version */
if (module == NULL)
goto onError;
/* Add some constants to the module\'s dict */
moddict = PyModule_GetDict(module);
if (moddict == NULL)
goto onError;
/* Export C API */
insobj(
moddict,
"%(moduleName)sAPI",
PyCObject_FromVoidPtr((void *)&hashAPI, NULL));
onError:
/* Check for errors and report them */
if (PyErr_Occurred())
Py_ReportModuleInitError("%(moduleName)s");
return;
}
""" % { "moduleName" : moduleName,
"dataArrayName" : dataArrayName,
"structName" : structName, }
return code
def generate_graph(self, out):
out.write("""
static const unsigned short G[] =
""")
self.G.generate_code(out)
def generate_hash(keys, caseInsensitive=0,
minC=None, initC=None,
f1Seed=None, f2Seed=None,
cIncrement=None, cTries=None):
"""Print out code for a perfect minimal hash. Input is a list of
(key, desired hash value) tuples. """
# K is the number of keys.
K = len(keys)
# We will be generating graphs of size N, where N = c * K.
# The larger C is, the fewer trial graphs will need to be made, but
# the resulting table is also larger. Increase this starting value
# if you're impatient. After 50 failures, c will be increased by 0.025.
if initC is None:
initC = 1.5
c = initC
if cIncrement is None:
cIncrement = 0.0025
if cTries is None:
cTries = 50
# Number of trial graphs so far
num_graphs = 0
sys.stderr.write('Generating graphs... ')
while 1:
# N is the number of vertices in the graph G
N = int(c*K)
num_graphs = num_graphs + 1
if (num_graphs % cTries) == 0:
# Enough failures at this multiplier,
# increase the multiplier and keep trying....
c = c + cIncrement
# Whats good with searching for a better
# hash function if we exceed the size
# of a function we've generated in the past....
if minC is not None and \
c > minC:
c = initC
sys.stderr.write(' -- c > minC, resetting c to %0.4f\n' % c)
else:
sys.stderr.write(' -- increasing c to %0.4f\n' % c)
sys.stderr.write('Generating graphs... ')
# Output a progress message
sys.stderr.write( str(num_graphs) + ' ')
sys.stderr.flush()
# Create graph w/ N vertices
G = Graph(N)
# Save the seeds used to generate
# the following two hash functions.
_seeds = whrandom._inst._seed
# Create 2 random hash functions
f1 = Hash(N, caseInsensitive)
f2 = Hash(N, caseInsensitive)
# Set the initial hash function seed values if passed in.
# Doing this protects our hash functions from
# changes to whrandom's behavior.
if f1Seed is not None:
f1.seed = f1Seed
f1Seed = None
fSpecifiedSeeds = 1
if f2Seed is not None:
f2.seed = f2Seed
f2Seed = None
fSpecifiedSeeds = 1
# Connect vertices given by the values of the two hash functions
# for each key. Associate the desired hash value with each
# edge.
for k, v in keys:
h1 = f1(k) ; h2 = f2(k)
G.connect( h1, h2, v)
# Check if the resulting graph is acyclic; if it is,
# we're done with step 1.
if G.is_acyclic():
break
elif fSpecifiedSeeds:
sys.stderr.write('\nThe initial f1/f2 seeds you specified didn\'t generate a perfect hash function: \n')
sys.stderr.write('f1 seed: %s\n' % f1.seed)
sys.stderr.write('f2 seed: %s\n' % f2.seed)
sys.stderr.write('multipler: %s\n' % c)
sys.stderr.write('Your data has likely changed, or you forgot what your initial multiplier should be.\n')
sys.stderr.write('continuing the search for a perfect hash function......\n')
fSpecifiedSeeds = 0
# Now we have an acyclic graph, so we assign values to each vertex
# such that, for each edge, you can add the values for the two vertices
# involved and get the desired value for that edge -- which is the
# desired hash key. This task is dead easy, because the graph is acyclic.
sys.stderr.write('\nAcyclic graph found; computing vertex values...\n')
G.assign_values()
sys.stderr.write('Checking uniqueness of hash values...\n')
# Sanity check the result by actually verifying that all the keys
# hash to the right value.
cchMaxKey = 0
maxHashValue = 0
for k, v in keys:
hash1 = G.values[ f1(k) ]
hash2 = G.values[ f2(k) ]
if hash1 > maxHashValue:
maxHashValue = hash1
if hash2 > maxHashValue:
maxHashValue = hash2
perfecthash = (hash1 + hash2) % N
assert perfecthash == v
cch = len(k)
if cch > cchMaxKey:
cchMaxKey = cch
sys.stderr.write('Found perfect hash function!\n')
sys.stderr.write('\nIn order to regenerate this hash function, \n')
sys.stderr.write('you need to pass these following values back in:\n')
sys.stderr.write('f1 seed: %s\n' % hex(f1.seed))
sys.stderr.write('f2 seed: %s\n' % hex(f2.seed))
sys.stderr.write('initial multipler: %s\n' % c)
return PerfectHash(cchMaxKey, f1, f2, G, N, len(keys), maxHashValue)

View File

@ -1,114 +0,0 @@
#include <Python.h>
static PyObject * hashFunction(PyObject *self, PyObject *args, PyObject *kw)
{
PyStringObject *a;
register int len;
register unsigned char *p;
register unsigned long x;
unsigned long ulSeed;
unsigned long cchSeed;
unsigned long cHashElements;
if (!PyArg_ParseTuple(args, "llOl:hash",
&ulSeed, &cchSeed, &a, &cHashElements))
return NULL;
if (!PyString_Check(a))
{
PyErr_SetString(PyExc_TypeError, "arg 3 needs to be a string");
return NULL;
}
len = a->ob_size;
p = (unsigned char *) a->ob_sval;
x = ulSeed;
while (--len >= 0)
{
/* (1000003 * x) ^ *p++
* translated to handle > 32 bit longs
*/
x = (0xf4243 * x);
x = x & 0xFFFFFFFF;
x = x ^ *p++;
}
x ^= a->ob_size + cchSeed;
if (x == 0xFFFFFFFF)
x = 0xfffffffe;
if (x & 0x80000000)
{
/* Emulate Python 32-bit signed (2's complement)
* modulo operation
*/
x = (~x & 0xFFFFFFFF) + 1;
x %= cHashElements;
if (x != 0)
{
x = x + (~cHashElements & 0xFFFFFFFF) + 1;
x = (~x & 0xFFFFFFFF) + 1;
}
}
else
x %= cHashElements;
return PyInt_FromLong((long)x);
}
static PyObject * calcSeed(PyObject *self, PyObject *args, PyObject *kw)
{
PyStringObject *a;
register int len;
register unsigned char *p;
register unsigned long x;
if (!PyString_Check(args))
{
PyErr_SetString(PyExc_TypeError, "arg 1 expected a string, but didn't get it.");
return NULL;
}
a = (PyStringObject *)args;
len = a->ob_size;
p = (unsigned char *) a->ob_sval;
x = (*p << 7) & 0xFFFFFFFF;
while (--len >= 0)
{
/* (1000003 * x) ^ *p++
* translated to handle > 32 bit longs
*/
x = (0xf4243 * x);
x = x & 0xFFFFFFFF;
x = x ^ *p++;
}
return PyInt_FromLong((long)x);
}
static struct PyMethodDef hashMethods[] = {
{ "calcSeed", calcSeed, 0, NULL },
{ "hash", hashFunction, 0, NULL },
{ NULL, NULL, 0, NULL } /* sentinel */
};
#ifdef _MSC_VER
_declspec(dllexport)
#endif
void initperfhash(void)
{
PyObject *m;
m = Py_InitModule4("perfhash", hashMethods,
NULL, NULL, PYTHON_API_VERSION);
if ( m == NULL )
Py_FatalError("can't initialize module perfhash");
}