cpython/Tools/perfecthash/perfect_hash.py

665 lines
21 KiB
Python
Raw Normal View History

#!/usr/bin/env python
# perfect_hash.py
#
# Outputs C code for a minimal perfect hash.
# The hash is produced using the algorithm described in
# "Optimal algorithms for minimal perfect hashing",
# G. Havas, B.S. Majewski. Available as a technical report
# from the CS department, University of Queensland
# (ftp://ftp.cs.uq.oz.au/).
#
# This is a modified version of Andrew Kuchling's code
# (http://starship.python.net/crew/amk/python/code/perfect-hash.html)
# and generates C fragments suitable for compilation as a Python
# extension module.
#
# Difference between this algorithm and gperf:
# Gperf will complete in finite time, either with a successful function
# or by giving up.
# This algorithm may never complete, although completion is extremely
# likely when c >= 2.
# The algorithm works like this:
# 0) You have K keys, that you want to perfectly hash to a bunch
# of hash values.
#
# 1) Choose a number N larger than K. This is the number of
# vertices in a graph G, and also the size of the resulting table.
#
# 2) Pick two random hash functions f1, f2, that output values from
# 0...N-1.
#
# 3) for key in keys:
# h1 = f1(key) ; h2 = f2(key)
# Draw an edge between vertices h1 and h2 of the graph.
# Associate the desired hash value with that edge.
#
# 4) Check if G is acyclic; if not, go back to step 2 and pick new hash
# functions (after repeated failures, go back to step 1 and pick a bigger N).
#
# 5) Assign values to each vertex such that, for each edge, you can
# add the values for the two vertices and get the desired value
# for that edge -- which is the desired hash key. This task is
# dead easy, because the graph is acyclic. This is done by
# picking a vertex V, and assigning it a value of 0. You then do a
# depth-first search, assigning values to new vertices so that
# they sum up properly.
#
# 6) f1, f2, and G now make up your perfect hash function.
import sys, whrandom, string
import pprint
import perfhash
import time
class Hash:
    """Random hash function.

    For simplicity and speed, this doesn't implement any byte-level hashing
    scheme in Python.  Instead, a random 10-character string ("junk") is
    generated, a seed is derived from it with perfhash.calcSeed(), and keys
    are hashed with perfhash.hash() and reduced modulo N.

    Attributes:
    N               -- modulus applied to every hash value
    junk            -- the random string of letters and digits
    caseInsensitive -- if true, keys are upper-cased before hashing
    seed            -- value returned by perfhash.calcSeed(junk)
    """

    def __init__(self, N, caseInsensitive=0):
        """Create a new random hash function with modulus 'N'."""
        self.N = N
        self.caseInsensitive = caseInsensitive
        # Build the alphabet once instead of on every iteration.
        alphabet = string.letters + string.digits
        self.junk = "".join([whrandom.choice(alphabet) for i in range(10)])
        self.seed = perfhash.calcSeed(self.junk)

    def __call__(self, key):
        """Return the hash of str(key), reduced modulo self.N."""
        key = str(key)
        if self.caseInsensitive:
            key = key.upper()
        # NOTE(review): an earlier, disabled check asserted that this equals
        # hash(self.junk + key) % self.N -- presumably perfhash.hash() mimics
        # Python's string hash of junk + key; confirm against perfhash.
        return perfhash.hash(self.seed, len(self.junk), key) % self.N

    def generate_code(self):
        """Return the body of a C function computing this hash.

        The returned text starts with '{' and ends with '}'; it expects the
        enclosing C function to provide 'key' and 'cch', and the macro
        k_cHashElements to be defined.
        """
        # Each template only receives the substitution it actually uses.
        prologue = """{
register int len;
register unsigned char *p;
register long x;
len = cch;
p = (unsigned char *) key;
x = %(junkSeed)d;
while (--len >= 0)
x = (1000003*x) ^ """ % {"junkSeed": self.seed}
        if self.caseInsensitive:
            next_char = "toupper(*(p++));"
        else:
            next_char = "*(p++);"
        epilogue = """
x ^= cch + %(lenJunk)d;
if (x == -1)
x = -2;
x %%= k_cHashElements;
/* ensure the returned value is positive so we mimic Python's %% operator */
if (x < 0)
x += k_cHashElements;
return x;
}
""" % {"lenJunk": len(self.junk)}
        return prologue + next_char + epilogue
# Vertex colours used by the depth-first searches in Graph:
# WHITE = not reached yet, GREY = being expanded, BLACK = fully expanded.
WHITE, GREY, BLACK = 0, 1, 2


class Graph:
    """Graph class.  This class isn't particularly efficient or general,
    and only has the features needed to implement this algorithm.

    num_vertices   -- number of vertices
    edges          -- maps 2-tuples of vertex numbers to the value for this
                      edge.  If there's an edge between v1 and v2 (v1<=v2),
                      (v1,v2) is a key and the value is the edge's value.
    reachable_list -- maps a vertex V to the list of vertices
                      to which V is connected by edges.  Used
                      for traversing the graph.
    values         -- numeric value for each vertex (-1 until assigned)
    """

    def __init__(self, num_vertices):
        self.num_vertices = num_vertices
        self.edges = {}
        self.reachable_list = {}
        self.values = [-1] * num_vertices

    def connect(self, vertex1, vertex2, value):
        """Connect 'vertex1' and 'vertex2' with an edge, with associated
        value 'value'"""
        if vertex1 > vertex2:
            vertex1, vertex2 = vertex2, vertex1
        # A second edge between the same pair overwrites the stored value;
        # the duplicate entries it leaves in reachable_list are what makes
        # is_acyclic() report such a collision as a cycle.
        self.edges[(vertex1, vertex2)] = value
        # Add vertices to each other's reachable list
        self.reachable_list.setdefault(vertex1, []).append(vertex2)
        self.reachable_list.setdefault(vertex2, []).append(vertex1)

    def get_edge_value(self, vertex1, vertex2):
        """Retrieve the value corresponding to the edge between
        'vertex1' and 'vertex2'.  Raises KeyError if no such edge"""
        if vertex1 > vertex2:
            vertex1, vertex2 = vertex2, vertex1
        return self.edges[(vertex1, vertex2)]

    def is_acyclic(self):
        "Returns true (1) if the graph is acyclic, otherwise false (0)"
        # This is done by doing a depth-first search of the graph;
        # painting each vertex grey and then black.  If the DFS
        # ever finds a vertex that isn't white, there's a cycle.
        colour = {}
        for i in range(self.num_vertices):
            colour[i] = WHITE
        # Loop over all vertices, taking white ones as starting
        # points for a traversal.
        for start in range(self.num_vertices):
            if colour[start] != WHITE:
                continue
            # Stack of (parent, vertex) pairs still to visit; pushing and
            # popping at the end of the list keeps LIFO order in O(1).
            stack = [(None, start)]
            while stack:
                parent, vertex = stack.pop()
                colour[vertex] = GREY
                # Copy the neighbour list and drop ONE occurrence of the
                # vertex we arrived from, so a parallel edge back to the
                # parent is still seen as a cycle.
                neighbours = list(self.reachable_list.get(vertex, []))
                if parent in neighbours:
                    neighbours.remove(parent)
                for neighbour in neighbours:
                    if colour[neighbour] != WHITE:
                        # Aha!  Already visited this node,
                        # so the graph isn't acyclic.
                        return 0
                    stack.append((vertex, neighbour))
                colour[vertex] = BLACK
        # We got through, so the graph is acyclic.
        return 1

    def assign_values(self):
        """Compute values for each vertex, so that they sum up
        (modulo num_vertices) to the associated value for each edge.

        Only meaningful when the graph is acyclic.  Returns nothing; the
        results are stored in self.values."""
        colour = {}
        for i in range(self.num_vertices):
            colour[i] = WHITE
        # Loop over all vertices, taking white ones as starting
        # points for a traversal.
        for root in range(self.num_vertices):
            if colour[root] != WHITE:
                continue
            # Set this vertex's value, arbitrarily, to zero.
            self.set_vertex_value(root, 0)
            stack = [root]
            while stack:
                vertex = stack.pop()
                colour[vertex] = GREY
                for neighbour in self.reachable_list.get(vertex, []):
                    if colour[neighbour] != WHITE:
                        # Already handled (this includes the vertex we
                        # arrived from).
                        continue
                    stack.append(neighbour)
                    # Set new vertex's value to the desired edge value,
                    # minus the value of the vertex we came here from.
                    edge_value = self.get_edge_value(vertex, neighbour)
                    new_val = edge_value - self.get_vertex_value(vertex)
                    self.set_vertex_value(neighbour,
                                          new_val % self.num_vertices)
                colour[vertex] = BLACK

    def __getitem__(self, index):
        # Lets "for v in graph" iterate over vertex numbers via the
        # sequence protocol: IndexError ends the iteration.
        if index < self.num_vertices:
            return index
        raise IndexError

    def get_vertex_value(self, vertex):
        "Get value for a vertex"
        return self.values[vertex]

    def set_vertex_value(self, vertex, value):
        "Set value for a vertex"
        self.values[vertex] = value

    def generate_code(self, out, width = 70):
        """Write the vertex values to 'out' as a C array initializer,
        starting a new line once roughly 'width' characters are used"""
        out.write("{ ")
        pos = 0
        for v in self.values:
            v = str(v) + ', '
            out.write(v)
            pos = pos + len(v) + 1
            if pos > width:
                out.write('\n ')
                pos = 0
        out.write('};\n')
class PerfectHash:
    """Holds a finished perfect hash function and emits C code for it.

    cchMax        -- length of the longest key
    f1, f2        -- the two hash function objects; each must provide
                     generate_code() returning a C function body
    G             -- the graph object; must provide generate_code(out)
    cHashElements -- number of entries in the G table
    cKeys         -- number of keys
    type          -- C integer type used for the G table, chosen from
                     the maxHashValue passed to the constructor
    """

    def __init__(self, cchMax, f1, f2, G, cHashElements, cKeys, maxHashValue):
        self.cchMax = cchMax
        self.f1 = f1
        self.f2 = f2
        self.G = G
        self.cHashElements = cHashElements
        self.cKeys = cKeys
        # determine the necessary type for storing our hash function
        # helper table:
        self.type = self.determineType(maxHashValue)

    def generate_header(self, structName):
        """Return C header text declaring the API struct 'structName'."""
        header = """
#include <Python.h>
#include <stdlib.h>
/* --- C API ----------------------------------------------------*/
/* C API for usage by other Python modules */
typedef struct %(structName)s
{
unsigned long cKeys;
unsigned long cchMax;
unsigned long (*hash)(const char *key, unsigned int cch);
const void *(*getValue)(unsigned long iKey);
} %(structName)s;
""" % { "structName" : structName }
        return header

    def determineType(self, maxHashValue):
        """Return the smallest of 'unsigned char', 'unsigned short' and
        'unsigned long' used here for values up to 'maxHashValue'."""
        if maxHashValue <= 255:
            return "unsigned char"
        elif maxHashValue <= 65535:
            return "unsigned short"
        else:
            # Take the cheesy way out...
            return "unsigned long"

    def generate_code(self, moduleName, dataArrayName, dataArrayType, structName):
        """Return C code for the hash functions and the module init code.

        moduleName    -- name of the generated extension module
        dataArrayName -- name of the C array holding the per-key data
        dataArrayType -- C element type of that array
        structName    -- name of the API struct (see generate_header)

        The G table itself is only forward-declared here; its definition
        is written by generate_graph().
        """
        # Output C code for the hash functions and tables
        code = """
/*
* The hash is produced using the algorithm described in
* "Optimal algorithms for minimal perfect hashing",
* G. Havas, B.S. Majewski. Available as a technical report
* from the CS department, University of Queensland
* (ftp://ftp.cs.uq.oz.au/).
*
* Generated using a heavily tweaked version of Andrew Kuchling's
* perfect_hash.py:
* http://starship.python.net/crew/amk/python/code/perfect-hash.html
*
* Generated on: %s
*/
""" % time.ctime(time.time())
        # MSVC SP3 was complaining when I actually used a global constant
        code = code + """
#define k_cHashElements %i
#define k_cchMaxKey %d
#define k_cKeys %i
""" % (self.cHashElements, self.cchMax, self.cKeys)
        code = code + """
static const %s G[k_cHashElements];
static const %s %s[k_cKeys];
""" % (self.type, dataArrayType, dataArrayName)
        code = code + """
static long f1(const char *key, unsigned int cch)
"""
        code = code + self.f1.generate_code()
        code = code + """
static long f2(const char *key, unsigned int cch)
"""
        code = code + self.f2.generate_code()
        code = code + """
static unsigned long hash(const char *key, unsigned int cch)
{
return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) %% k_cHashElements;
}
const void *getValue(unsigned long iKey)
{
return &%(dataArrayName)s[iKey];
}
/* Helper for adding objects to dictionaries. Check for errors with
PyErr_Occurred() */
static
void insobj(PyObject *dict,
char *name,
PyObject *v)
{
PyDict_SetItemString(dict, name, v);
Py_XDECREF(v);
}
static const %(structName)s hashAPI =
{
k_cKeys,
k_cchMaxKey,
&hash,
&getValue,
};
static
PyMethodDef Module_methods[] =
{
{NULL, NULL},
};
static char *Module_docstring = "%(moduleName)s hash function module";
/* Error reporting for module init functions */
#define Py_ReportModuleInitError(modname) { \\
PyObject *exc_type, *exc_value, *exc_tb; \\
PyObject *str_type, *str_value; \\
\\
/* Fetch error objects and convert them to strings */ \\
PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \\
if (exc_type && exc_value) { \\
str_type = PyObject_Str(exc_type); \\
str_value = PyObject_Str(exc_value); \\
} \\
else { \\
str_type = NULL; \\
str_value = NULL; \\
} \\
/* Try to format a more informative error message using the \\
original error */ \\
if (str_type && str_value && \\
PyString_Check(str_type) && PyString_Check(str_value)) \\
PyErr_Format( \\
PyExc_ImportError, \\
"initialization of module "modname" failed " \\
"(%%s:%%s)", \\
PyString_AS_STRING(str_type), \\
PyString_AS_STRING(str_value)); \\
else \\
PyErr_SetString( \\
PyExc_ImportError, \\
"initialization of module "modname" failed"); \\
Py_XDECREF(str_type); \\
Py_XDECREF(str_value); \\
Py_XDECREF(exc_type); \\
Py_XDECREF(exc_value); \\
Py_XDECREF(exc_tb); \\
}
/* Create PyMethodObjects and register them in the module\'s dict */
DL_EXPORT(void)
init%(moduleName)s(void)
{
PyObject *module, *moddict;
/* Create module */
module = Py_InitModule4("%(moduleName)s", /* Module name */
Module_methods, /* Method list */
Module_docstring, /* Module doc-string */
(PyObject *)NULL, /* always pass this as *self */
PYTHON_API_VERSION); /* API Version */
if (module == NULL)
goto onError;
/* Add some constants to the module\'s dict */
moddict = PyModule_GetDict(module);
if (moddict == NULL)
goto onError;
/* Export C API */
insobj(
moddict,
"%(moduleName)sAPI",
PyCObject_FromVoidPtr((void *)&hashAPI, NULL));
onError:
/* Check for errors and report them */
if (PyErr_Occurred())
Py_ReportModuleInitError("%(moduleName)s");
return;
}
""" % { "moduleName" : moduleName,
        "dataArrayName" : dataArrayName,
        "structName" : structName, }
        return code

    def generate_graph(self, out):
        """Write the C definition of the G table to 'out'.

        The element type is self.type so that the definition agrees with
        the forward declaration emitted by generate_code(); a hard-coded
        type would conflict with it whenever self.type differs.
        """
        out.write("""
static const %s G[] =
""" % self.type)
        self.G.generate_code(out)
def generate_hash(keys, caseInsensitive=0,
                  minC=None, initC=None,
                  f1Seed=None, f2Seed=None,
                  cIncrement=None, cTries=None):
    """Search for a perfect minimal hash and return it as a PerfectHash.

    keys            -- list of (key, desired hash value) tuples
    caseInsensitive -- passed on to the Hash objects
    minC            -- if given, c is reset to initC whenever it grows
                       beyond this value
    initC           -- starting multiplier c (default 1.5)
    f1Seed, f2Seed  -- if given, replace the seed of f1 / f2 for the first
                       trial only
    cIncrement      -- amount added to c after every cTries trials
                       (default 0.0025)
    cTries          -- number of trials between increases of c (default 50)

    Progress and the values needed to regenerate the function are written
    to sys.stderr.  The search loops until an acyclic graph is found.
    """
    # K is the number of keys.
    K = len(keys)
    # We will be generating graphs of size N, where N = c * K.
    # The larger C is, the fewer trial graphs will need to be made, but
    # the resulting table is also larger.  Increase this starting value
    # if you're impatient.  After cTries (default 50) failures, c will be
    # increased by cIncrement (default 0.0025).
    if initC is None:
        initC = 1.5
    c = initC
    if cIncrement is None:
        cIncrement = 0.0025
    if cTries is None:
        cTries = 50
    # Number of trial graphs so far
    num_graphs = 0
    # True only while the current trial uses caller-supplied seeds.  It must
    # be bound before the loop: without seeds, the first cyclic graph would
    # otherwise hit an unbound name in the "elif" below.
    fSpecifiedSeeds = 0
    sys.stderr.write('Generating graphs... ')
    while 1:
        # N is the number of vertices in the graph G
        N = int(c*K)
        num_graphs = num_graphs + 1
        if (num_graphs % cTries) == 0:
            # Enough failures at this multiplier,
            # increase the multiplier and keep trying....
            c = c + cIncrement
            # Whats good with searching for a better
            # hash function if we exceed the size
            # of a function we've generated in the past....
            if minC is not None and c > minC:
                c = initC
                sys.stderr.write(' -- c > minC, resetting c to %0.4f\n' % c)
            else:
                sys.stderr.write(' -- increasing c to %0.4f\n' % c)
            sys.stderr.write('Generating graphs... ')
        # Output a progress message
        sys.stderr.write( str(num_graphs) + ' ')
        sys.stderr.flush()
        # Create graph w/ N vertices
        G = Graph(N)
        # Create 2 random hash functions
        f1 = Hash(N, caseInsensitive)
        f2 = Hash(N, caseInsensitive)
        # Set the initial hash function seed values if passed in.
        # Doing this protects our hash functions from
        # changes to whrandom's behavior.
        if f1Seed is not None:
            f1.seed = f1Seed
            f1Seed = None
            fSpecifiedSeeds = 1
        if f2Seed is not None:
            f2.seed = f2Seed
            f2Seed = None
            fSpecifiedSeeds = 1
        # Connect vertices given by the values of the two hash functions
        # for each key.  Associate the desired hash value with each
        # edge.
        for k, v in keys:
            h1 = f1(k) ; h2 = f2(k)
            G.connect( h1,h2, v)
        # Check if the resulting graph is acyclic; if it is,
        # we're done with step 1.
        if G.is_acyclic():
            break
        elif fSpecifiedSeeds:
            sys.stderr.write('\nThe initial f1/f2 seeds you specified didn\'t generate a perfect hash function: \n')
            sys.stderr.write('f1 seed: %s\n' % f1.seed)
            sys.stderr.write('f2 seed: %s\n' % f2.seed)
            sys.stderr.write('multipler: %s\n' % c)
            sys.stderr.write('Your data has likely changed, or you forgot what your initial multiplier should be.\n')
            sys.stderr.write('continuing the search for a perfect hash function......\n')
            fSpecifiedSeeds = 0
    # Now we have an acyclic graph, so we assign values to each vertex
    # such that, for each edge, you can add the values for the two vertices
    # involved and get the desired value for that edge -- which is the
    # desired hash key.  This task is dead easy, because the graph is acyclic.
    sys.stderr.write('\nAcyclic graph found; computing vertex values...\n')
    G.assign_values()
    sys.stderr.write('Checking uniqueness of hash values...\n')
    # Sanity check the result by actually verifying that all the keys
    # hash to the right value.  Along the way, record the largest table
    # entry used and the longest key.
    cchMaxKey = 0
    maxHashValue = 0
    for k, v in keys:
        hash1 = G.values[ f1(k) ]
        hash2 = G.values[ f2(k) ]
        if hash1 > maxHashValue:
            maxHashValue = hash1
        if hash2 > maxHashValue:
            maxHashValue = hash2
        perfecthash = (hash1 + hash2) % N
        assert perfecthash == v
        cch = len(k)
        if cch > cchMaxKey:
            cchMaxKey = cch
    sys.stderr.write('Found perfect hash function!\n')
    sys.stderr.write('\nIn order to regenerate this hash function, \n')
    sys.stderr.write('you need to pass these following values back in:\n')
    sys.stderr.write('f1 seed: %s\n' % repr(f1.seed))
    sys.stderr.write('f2 seed: %s\n' % repr(f2.seed))
    sys.stderr.write('initial multipler: %s\n' % c)
    return PerfectHash(cchMaxKey, f1, f2, G, N, len(keys), maxHashValue)
"""
static
PyObject *codec_tuple(PyObject *unicode,
int len)
{
PyObject *v,*w;
if (unicode == NULL)
return NULL;
v = PyTuple_New(2);
if (v == NULL) {
Py_DECREF(unicode);
return NULL;
}
PyTuple_SET_ITEM(v,0,unicode);
w = PyInt_FromLong(len);
if (w == NULL) {
Py_DECREF(v);
return NULL;
}
PyTuple_SET_ITEM(v,1,w);
return v;
}
static PyObject *
ucn_decode(PyObject *self,
PyObject *args)
{
const char *data;
int size;
const char *errors = NULL;
PyObject *mapping = NULL;
if (!PyArg_ParseTuple(args, "t#|z:ucn_decode",
&data, &size, &errors))
return NULL;
if (mapping == Py_None)
mapping = NULL;
return codec_tuple(PyUnicode_DecodeNamedUnicodeEscape(data, size, errors),
size);
}
static PyMethodDef _codecs_functions[] = {
{ "ucn_decode", ucn_decode, 1 },
};
DL_EXPORT(void)
init_ucn()
{
Py_InitModule("_ucn", _codecs_functions);
}
"""