Removed Tools/perfecthash, per python-dev discussion
This commit is contained in:
parent
7a4409c1b2
commit
7e11170e85
|
@ -1,109 +0,0 @@
|
|||
#! /usr/bin/env python
|
||||
import sys
|
||||
import string
|
||||
import perfect_hash
|
||||
|
||||
# This is a user of perfect_hash.py
|
||||
# that takes as input the UnicodeData.txt file available from:
|
||||
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
|
||||
|
||||
# It generates a hash table from Unicode Character Name ->
|
||||
# unicode code space value.
|
||||
|
||||
# These variables determine which hash function is tried first.
|
||||
# Yields a multiple of 1.7875 for UnicodeData.txt on 2000/06/24/
|
||||
f1Seed = 0x64fc2234
|
||||
f2Seed = 0x8db7d737
|
||||
|
||||
# Maximum allowed multipler, if this isn't None then instead of continually
|
||||
# increasing C, it resets it back to initC to keep searching for
|
||||
# a solution.
|
||||
minC = 1.7875
|
||||
# Initial multiplier for trying to find a perfect hash function.
|
||||
initC = 1.7875
|
||||
|
||||
moduleName = "ucnhash"
|
||||
dataArrayName = "aucn"
|
||||
dataArrayType = "_Py_UnicodeCharacterName"
|
||||
headerFileName = "ucnhash.h"
|
||||
cFileName = "ucnhash.c"
|
||||
structName = "_Py_UCNHashAPI"
|
||||
|
||||
keys = []
|
||||
hashData = {}
|
||||
|
||||
def generateOutputFiles(perfHash, hashData):
|
||||
header = perfHash.generate_header(structName)
|
||||
header = header + """
|
||||
typedef struct
|
||||
{
|
||||
const char *pszUCN;
|
||||
Py_UCS4 value;
|
||||
} _Py_UnicodeCharacterName;
|
||||
|
||||
"""
|
||||
|
||||
code = perfHash.generate_code(moduleName,
|
||||
dataArrayName,
|
||||
dataArrayType,
|
||||
structName)
|
||||
out = open(headerFileName, "w")
|
||||
out.write(header)
|
||||
out = open(cFileName, "w")
|
||||
out.write("#include \"%s\"\n" % headerFileName)
|
||||
out.write(code)
|
||||
perfHash.generate_graph(out)
|
||||
out.write("""
|
||||
|
||||
static const _Py_UnicodeCharacterName aucn[] =
|
||||
{
|
||||
""")
|
||||
for i in xrange(len(keys)):
|
||||
v = hashData[keys[i][0]]
|
||||
out.write(' { "' + keys[i][0] + '", ' + hex(v) + " }," + "\n")
|
||||
out.write("};\n\n")
|
||||
sys.stderr.write('\nGenerated output files: \n')
|
||||
sys.stderr.write('%s\n%s\n' % (headerFileName, cFileName))
|
||||
|
||||
def main():
|
||||
# Suck in UnicodeData.txt and spit out the generated files.
|
||||
input = open(sys.argv[1], 'r')
|
||||
i = 0
|
||||
while 1:
|
||||
line = input.readline()
|
||||
if line == "": break
|
||||
fields = string.split(line, ';')
|
||||
if len(fields) < 2:
|
||||
sys.stderr.write('Ill-formated line!\n')
|
||||
sys.stderr.write('line #: %d\n' % (i + 1))
|
||||
sys.exit()
|
||||
data, key = fields[:2]
|
||||
key = string.strip( key )
|
||||
# Any name starting with '<' is a control, or start/end character,
|
||||
# so skip it...
|
||||
if key[0] == "<":
|
||||
continue
|
||||
hashcode = i
|
||||
i = i + 1
|
||||
# force the name to uppercase
|
||||
keys.append( (string.upper(key),hashcode) )
|
||||
data = string.atoi(data, 16)
|
||||
hashData[key] = data
|
||||
|
||||
input.close()
|
||||
sys.stderr.write('%i key/hash pairs read\n' % len(keys) )
|
||||
perfHash = perfect_hash.generate_hash(keys, 1,
|
||||
minC, initC,
|
||||
f1Seed, f2Seed,
|
||||
# increment, tries
|
||||
0.0025, 50)
|
||||
generateOutputFiles(perfHash, hashData)
|
||||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) == 1:
|
||||
sys.stdout = sys.stderr
|
||||
print 'Usage: %s <input filename>' % sys.argv[0]
|
||||
print ' The input file needs to be UnicodeData.txt'
|
||||
sys.exit()
|
||||
main()
|
||||
|
|
@ -1,619 +0,0 @@
|
|||
#!/usr/bin/env/python
|
||||
|
||||
# perfect_hash.py
|
||||
#
|
||||
# Outputs C code for a minimal perfect hash.
|
||||
# The hash is produced using the algorithm described in
|
||||
# "Optimal algorithms for minimal perfect hashing",
|
||||
# G. Havas, B.S. Majewski. Available as a technical report
|
||||
# from the CS department, University of Queensland
|
||||
# (ftp://ftp.cs.uq.oz.au/).
|
||||
#
|
||||
# This is a modified version of Andrew Kuchling's code
|
||||
# (http://starship.python.net/crew/amk/python/code/perfect-hash.html)
|
||||
# and generates C fragments suitable for compilation as a Python
|
||||
# extension module.
|
||||
#
|
||||
|
||||
# Difference between this algorithm and gperf:
|
||||
# Gperf will complete in finite time with a successful function,
|
||||
# or by giving up.
|
||||
# This algorithm may never complete, although it is extremely likely
|
||||
# when c >= 2.
|
||||
|
||||
# The algorithm works like this:
|
||||
# 0) You have K keys, that you want to perfectly hash to a bunch
|
||||
# of hash values.
|
||||
#
|
||||
# 1) Choose a number N larger than K. This is the number of
|
||||
# vertices in a graph G, and also the size of the resulting table.
|
||||
#
|
||||
# 2) Pick two random hash functions f1, f2, that output values from
|
||||
# 0...N-1.
|
||||
#
|
||||
# 3) for key in keys:
|
||||
# h1 = f1(key) ; h2 = f2(key)
|
||||
# Draw an edge between vertices h1 and h2 of the graph.
|
||||
# Associate the desired hash value with that edge.
|
||||
#
|
||||
# 4) Check if G is acyclic; if not, go back to step 1 and pick a bigger N.
|
||||
#
|
||||
# 5) Assign values to each vertex such that, for each edge, you can
|
||||
# add the values for the two vertices and get the desired value
|
||||
# for that edge -- which is the desired hash key. This task is
|
||||
# dead easy, because the graph is acyclic. This is done by
|
||||
# picking a vertex V, and assigning it a value of 0. You then do a
|
||||
# depth-first search, assigning values to new vertices so that
|
||||
# they sum up properly.
|
||||
#
|
||||
# 6) f1, f2, and G now make up your perfect hash function.
|
||||
|
||||
|
||||
import sys, whrandom, string
|
||||
import pprint
|
||||
import perfhash
|
||||
import time
|
||||
|
||||
class Hash:
|
||||
"""Random hash function
|
||||
For simplicity and speed, this doesn't implement any byte-level hashing
|
||||
scheme. Instead, a random string is generated and prefixing to
|
||||
str(key), and then Python's hashing function is used."""
|
||||
|
||||
def __init__(self, N, caseInsensitive=0):
|
||||
self.N = N
|
||||
junk = ""
|
||||
for i in range(10):
|
||||
junk = junk + whrandom.choice(string.letters + string.digits)
|
||||
self.junk = junk
|
||||
self.caseInsensitive = caseInsensitive
|
||||
self.seed = perfhash.calcSeed(junk)
|
||||
|
||||
def __call__(self, key):
|
||||
key = str(key)
|
||||
if self.caseInsensitive:
|
||||
key = string.upper(key)
|
||||
x = perfhash.hash(self.seed, len(self.junk), key, self.N)
|
||||
return x
|
||||
|
||||
def generate_code(self):
|
||||
s = """{
|
||||
register int len;
|
||||
register unsigned char *p;
|
||||
register unsigned long x;
|
||||
|
||||
len = cch;
|
||||
p = (unsigned char *) key;
|
||||
x = %(junkSeed)s;
|
||||
while (--len >= 0)
|
||||
{
|
||||
/* (1000003 * x) ^ toupper(*(p++))
|
||||
* translated to handle > 32 bit longs
|
||||
*/
|
||||
x = (0xf4243 * x);
|
||||
x = x & 0xFFFFFFFF;
|
||||
x = x ^ """ % \
|
||||
{
|
||||
"lenJunk" : len(self.junk),
|
||||
"junkSeed" : hex(self.seed),
|
||||
}
|
||||
|
||||
if self.caseInsensitive:
|
||||
s = s + "toupper(*(p++));"
|
||||
else:
|
||||
s = s + "*(p++);"
|
||||
s = s + """
|
||||
}
|
||||
x ^= cch + %(lenJunk)d;
|
||||
if (x == 0xFFFFFFFF)
|
||||
x = 0xfffffffe;
|
||||
if (x & 0x80000000)
|
||||
{
|
||||
/* Emulate 32-bit signed (2's complement) modulo operation */
|
||||
x = (~x & 0xFFFFFFFF) + 1;
|
||||
x %%= k_cHashElements;
|
||||
if (x != 0)
|
||||
{
|
||||
x = x + (~k_cHashElements & 0xFFFFFFFF) + 1;
|
||||
x = (~x & 0xFFFFFFFF) + 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
x %%= k_cHashElements;
|
||||
return x;
|
||||
}
|
||||
""" % { "lenJunk" : len(self.junk),
|
||||
"junkSeed" : hex(self.seed), }
|
||||
return s
|
||||
|
||||
WHITE, GREY, BLACK = 0,1,2
|
||||
class Graph:
|
||||
"""Graph class. This class isn't particularly efficient or general,
|
||||
and only has the features I needed to implement this algorithm.
|
||||
|
||||
num_vertices -- number of vertices
|
||||
edges -- maps 2-tuples of vertex numbers to the value for this
|
||||
edge. If there's an edge between v1 and v2 (v1<v2),
|
||||
(v1,v2) is a key and the value is the edge's value.
|
||||
reachable_list -- maps a vertex V to the list of vertices
|
||||
to which V is connected by edges. Used
|
||||
for traversing the graph.
|
||||
values -- numeric value for each vertex
|
||||
"""
|
||||
|
||||
def __init__(self, num_vertices):
|
||||
self.num_vertices = num_vertices
|
||||
self.edges = {}
|
||||
self.reachable_list = {}
|
||||
self.values = [-1] * num_vertices
|
||||
|
||||
def connect(self, vertex1, vertex2, value):
|
||||
"""Connect 'vertex1' and 'vertex2' with an edge, with associated
|
||||
value 'value'"""
|
||||
|
||||
if vertex1 > vertex2: vertex1, vertex2 = vertex2, vertex1
|
||||
if self.edges.has_key( (vertex1, vertex2) ):
|
||||
raise ValueError, 'Collision: vertices already connected'
|
||||
self.edges[ (vertex1, vertex2) ] = value
|
||||
|
||||
# Add vertices to each other's reachable list
|
||||
if not self.reachable_list.has_key( vertex1 ):
|
||||
self.reachable_list[ vertex1 ] = [vertex2]
|
||||
else:
|
||||
self.reachable_list[vertex1].append(vertex2)
|
||||
|
||||
if not self.reachable_list.has_key( vertex2 ):
|
||||
self.reachable_list[ vertex2 ] = [vertex1]
|
||||
else:
|
||||
self.reachable_list[vertex2].append(vertex1)
|
||||
|
||||
def get_edge_value(self, vertex1, vertex2):
|
||||
"""Retrieve the value corresponding to the edge between
|
||||
'vertex1' and 'vertex2'. Raises KeyError if no such edge"""
|
||||
if vertex1 > vertex2:
|
||||
vertex1, vertex2 = vertex2, vertex1
|
||||
return self.edges[ (vertex1, vertex2) ]
|
||||
|
||||
def is_acyclic(self):
|
||||
"Returns true if the graph is acyclic, otherwise false"
|
||||
|
||||
# This is done by doing a depth-first search of the graph;
|
||||
# painting each vertex grey and then black. If the DFS
|
||||
# ever finds a vertex that isn't white, there's a cycle.
|
||||
colour = {}
|
||||
for i in range(self.num_vertices): colour[i] = WHITE
|
||||
|
||||
# Loop over all vertices, taking white ones as starting
|
||||
# points for a traversal.
|
||||
for i in range(self.num_vertices):
|
||||
if colour[i] == WHITE:
|
||||
|
||||
# List of vertices to visit
|
||||
visit_list = [ (None,i) ]
|
||||
|
||||
# Do a DFS
|
||||
while visit_list:
|
||||
# Colour this vertex grey.
|
||||
parent, vertex = visit_list[0] ; del visit_list[0]
|
||||
colour[vertex] = GREY
|
||||
|
||||
# Make copy of list of neighbours, removing the vertex
|
||||
# we arrived here from.
|
||||
neighbours = self.reachable_list.get(vertex, []) [:]
|
||||
if parent in neighbours: neighbours.remove( parent )
|
||||
|
||||
for neighbour in neighbours:
|
||||
if colour[neighbour] == WHITE:
|
||||
visit_list.insert(0, (vertex, neighbour) )
|
||||
elif colour[neighbour] != WHITE:
|
||||
# Aha! Already visited this node,
|
||||
# so the graph isn't acyclic.
|
||||
return 0
|
||||
|
||||
colour[vertex] = BLACK
|
||||
|
||||
# We got through, so the graph is acyclic.
|
||||
return 1
|
||||
|
||||
def assign_values(self):
|
||||
"""Compute values for each vertex, so that they sum up
|
||||
properly to the associated value for each edge."""
|
||||
|
||||
# Also done with a DFS; I simply copied the DFS code
|
||||
# from is_acyclic(). (Should generalize the logic so
|
||||
# one function could be used from both methods,
|
||||
# but I couldn't be bothered.)
|
||||
|
||||
colour = {}
|
||||
for i in range(self.num_vertices): colour[i] = WHITE
|
||||
|
||||
# Loop over all vertices, taking white ones as starting
|
||||
# points for a traversal.
|
||||
for i in range(self.num_vertices):
|
||||
if colour[i] == WHITE:
|
||||
# Set this vertex's value, arbitrarily, to zero.
|
||||
self.set_vertex_value( i, 0 )
|
||||
|
||||
# List of vertices to visit
|
||||
visit_list = [ (None,i) ]
|
||||
|
||||
# Do a DFS
|
||||
while visit_list:
|
||||
# Colour this vertex grey.
|
||||
parent, vertex = visit_list[0] ; del visit_list[0]
|
||||
colour[vertex] = GREY
|
||||
|
||||
# Make copy of list of neighbours, removing the vertex
|
||||
# we arrived here from.
|
||||
neighbours = self.reachable_list.get(vertex, []) [:]
|
||||
if parent in neighbours: neighbours.remove( parent )
|
||||
|
||||
for neighbour in self.reachable_list.get(vertex, []):
|
||||
edge_value = self.get_edge_value( vertex, neighbour )
|
||||
if colour[neighbour] == WHITE:
|
||||
visit_list.insert(0, (vertex, neighbour) )
|
||||
|
||||
# Set new vertex's value to the desired
|
||||
# edge value, minus the value of the
|
||||
# vertex we came here from.
|
||||
new_val = (edge_value -
|
||||
self.get_vertex_value( vertex ) )
|
||||
self.set_vertex_value( neighbour,
|
||||
new_val % self.num_vertices)
|
||||
|
||||
colour[vertex] = BLACK
|
||||
|
||||
# Returns nothing
|
||||
return
|
||||
|
||||
def __getitem__(self, index):
|
||||
if index < self.num_vertices: return index
|
||||
raise IndexError
|
||||
|
||||
def get_vertex_value(self, vertex):
|
||||
"Get value for a vertex"
|
||||
return self.values[ vertex ]
|
||||
|
||||
def set_vertex_value(self, vertex, value):
|
||||
"Set value for a vertex"
|
||||
self.values[ vertex ] = value
|
||||
|
||||
def generate_code(self, out, width = 70):
|
||||
"Return nicely formatted table"
|
||||
out.write("{ ")
|
||||
pos = 0
|
||||
for v in self.values:
|
||||
v=str(v)+', '
|
||||
out.write(v)
|
||||
pos = pos + len(v) + 1
|
||||
if pos > width: out.write('\n '); pos = 0
|
||||
out.write('};\n')
|
||||
|
||||
|
||||
class PerfectHash:
|
||||
def __init__(self, cchMax, f1, f2, G, cHashElements, cKeys, maxHashValue):
|
||||
self.cchMax = cchMax
|
||||
self.f1 = f1
|
||||
self.f2 = f2
|
||||
self.G = G
|
||||
self.cHashElements = cHashElements
|
||||
self.cKeys = cKeys
|
||||
# determine the necessary type for storing our hash function
|
||||
# helper table:
|
||||
self.type = self.determineType(maxHashValue)
|
||||
|
||||
def generate_header(self, structName):
|
||||
header = """
|
||||
#include "Python.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
/* --- C API ----------------------------------------------------*/
|
||||
/* C API for usage by other Python modules */
|
||||
typedef struct %(structName)s
|
||||
{
|
||||
unsigned long cKeys;
|
||||
unsigned long cchMax;
|
||||
unsigned long (*hash)(const char *key, unsigned int cch);
|
||||
const void *(*getValue)(unsigned long iKey);
|
||||
} %(structName)s;
|
||||
""" % { "structName" : structName }
|
||||
return header
|
||||
|
||||
def determineType(self, maxHashValue):
|
||||
if maxHashValue <= 255:
|
||||
return "unsigned char"
|
||||
elif maxHashValue <= 65535:
|
||||
return "unsigned short"
|
||||
else:
|
||||
# Take the cheesy way out...
|
||||
return "unsigned long"
|
||||
|
||||
def generate_code(self, moduleName, dataArrayName, dataArrayType, structName):
|
||||
# Output C code for the hash functions and tables
|
||||
code = """
|
||||
/*
|
||||
* The hash is produced using the algorithm described in
|
||||
* "Optimal algorithms for minimal perfect hashing",
|
||||
* G. Havas, B.S. Majewski. Available as a technical report
|
||||
* from the CS department, University of Queensland
|
||||
* (ftp://ftp.cs.uq.oz.au/).
|
||||
*
|
||||
* Generated using a heavily tweaked version of Andrew Kuchling's
|
||||
* perfect_hash.py:
|
||||
* http://starship.python.net/crew/amk/python/code/perfect-hash.html
|
||||
*
|
||||
* Generated on: %s
|
||||
*/
|
||||
""" % time.ctime(time.time())
|
||||
# MSVC SP3 was complaining when I actually used a global constant
|
||||
code = code + """
|
||||
#define k_cHashElements %i
|
||||
#define k_cchMaxKey %d
|
||||
#define k_cKeys %i
|
||||
|
||||
""" % (self.cHashElements, self.cchMax, self.cKeys)
|
||||
|
||||
code = code + """
|
||||
staticforward const %s G[k_cHashElements];
|
||||
staticforward const %s %s[k_cKeys];
|
||||
""" % (self.type, dataArrayType, dataArrayName)
|
||||
|
||||
code = code + """
|
||||
static long f1(const char *key, unsigned int cch)
|
||||
"""
|
||||
code = code + self.f1.generate_code()
|
||||
code = code + """
|
||||
|
||||
static long f2(const char *key, unsigned int cch)
|
||||
"""
|
||||
code = code + self.f2.generate_code()
|
||||
code = code + """
|
||||
|
||||
static unsigned long hash(const char *key, unsigned int cch)
|
||||
{
|
||||
return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) %% k_cHashElements;
|
||||
}
|
||||
|
||||
const void *getValue(unsigned long iKey)
|
||||
{
|
||||
return &%(dataArrayName)s[iKey];
|
||||
}
|
||||
|
||||
/* Helper for adding objects to dictionaries. Check for errors with
|
||||
PyErr_Occurred() */
|
||||
static
|
||||
void insobj(PyObject *dict,
|
||||
char *name,
|
||||
PyObject *v)
|
||||
{
|
||||
PyDict_SetItemString(dict, name, v);
|
||||
Py_XDECREF(v);
|
||||
}
|
||||
|
||||
static const %(structName)s hashAPI =
|
||||
{
|
||||
k_cKeys,
|
||||
k_cchMaxKey,
|
||||
&hash,
|
||||
&getValue,
|
||||
};
|
||||
|
||||
static
|
||||
PyMethodDef Module_methods[] =
|
||||
{
|
||||
{NULL, NULL},
|
||||
};
|
||||
|
||||
static char *Module_docstring = "%(moduleName)s hash function module";
|
||||
|
||||
/* Error reporting for module init functions */
|
||||
|
||||
#define Py_ReportModuleInitError(modname) { \\
|
||||
PyObject *exc_type, *exc_value, *exc_tb; \\
|
||||
PyObject *str_type, *str_value; \\
|
||||
\\
|
||||
/* Fetch error objects and convert them to strings */ \\
|
||||
PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \\
|
||||
if (exc_type && exc_value) { \\
|
||||
str_type = PyObject_Str(exc_type); \\
|
||||
str_value = PyObject_Str(exc_value); \\
|
||||
} \\
|
||||
else { \\
|
||||
str_type = NULL; \\
|
||||
str_value = NULL; \\
|
||||
} \\
|
||||
/* Try to format a more informative error message using the \\
|
||||
original error */ \\
|
||||
if (str_type && str_value && \\
|
||||
PyString_Check(str_type) && PyString_Check(str_value)) \\
|
||||
PyErr_Format( \\
|
||||
PyExc_ImportError, \\
|
||||
"initialization of module "modname" failed " \\
|
||||
"(%%s:%%s)", \\
|
||||
PyString_AS_STRING(str_type), \\
|
||||
PyString_AS_STRING(str_value)); \\
|
||||
else \\
|
||||
PyErr_SetString( \\
|
||||
PyExc_ImportError, \\
|
||||
"initialization of module "modname" failed"); \\
|
||||
Py_XDECREF(str_type); \\
|
||||
Py_XDECREF(str_value); \\
|
||||
Py_XDECREF(exc_type); \\
|
||||
Py_XDECREF(exc_value); \\
|
||||
Py_XDECREF(exc_tb); \\
|
||||
}
|
||||
|
||||
|
||||
/* Create PyMethodObjects and register them in the module\'s dict */
|
||||
DL_EXPORT(void)
|
||||
init%(moduleName)s(void)
|
||||
{
|
||||
PyObject *module, *moddict;
|
||||
/* Create module */
|
||||
module = Py_InitModule4("%(moduleName)s", /* Module name */
|
||||
Module_methods, /* Method list */
|
||||
Module_docstring, /* Module doc-string */
|
||||
(PyObject *)NULL, /* always pass this as *self */
|
||||
PYTHON_API_VERSION); /* API Version */
|
||||
if (module == NULL)
|
||||
goto onError;
|
||||
/* Add some constants to the module\'s dict */
|
||||
moddict = PyModule_GetDict(module);
|
||||
if (moddict == NULL)
|
||||
goto onError;
|
||||
|
||||
/* Export C API */
|
||||
insobj(
|
||||
moddict,
|
||||
"%(moduleName)sAPI",
|
||||
PyCObject_FromVoidPtr((void *)&hashAPI, NULL));
|
||||
|
||||
onError:
|
||||
/* Check for errors and report them */
|
||||
if (PyErr_Occurred())
|
||||
Py_ReportModuleInitError("%(moduleName)s");
|
||||
return;
|
||||
}
|
||||
""" % { "moduleName" : moduleName,
|
||||
"dataArrayName" : dataArrayName,
|
||||
"structName" : structName, }
|
||||
|
||||
return code
|
||||
|
||||
def generate_graph(self, out):
|
||||
out.write("""
|
||||
static const unsigned short G[] =
|
||||
""")
|
||||
self.G.generate_code(out)
|
||||
|
||||
|
||||
def generate_hash(keys, caseInsensitive=0,
|
||||
minC=None, initC=None,
|
||||
f1Seed=None, f2Seed=None,
|
||||
cIncrement=None, cTries=None):
|
||||
"""Print out code for a perfect minimal hash. Input is a list of
|
||||
(key, desired hash value) tuples. """
|
||||
|
||||
# K is the number of keys.
|
||||
K = len(keys)
|
||||
|
||||
# We will be generating graphs of size N, where N = c * K.
|
||||
# The larger C is, the fewer trial graphs will need to be made, but
|
||||
# the resulting table is also larger. Increase this starting value
|
||||
# if you're impatient. After 50 failures, c will be increased by 0.025.
|
||||
if initC is None:
|
||||
initC = 1.5
|
||||
|
||||
c = initC
|
||||
if cIncrement is None:
|
||||
cIncrement = 0.0025
|
||||
|
||||
if cTries is None:
|
||||
cTries = 50
|
||||
|
||||
# Number of trial graphs so far
|
||||
num_graphs = 0
|
||||
sys.stderr.write('Generating graphs... ')
|
||||
|
||||
while 1:
|
||||
# N is the number of vertices in the graph G
|
||||
N = int(c*K)
|
||||
num_graphs = num_graphs + 1
|
||||
if (num_graphs % cTries) == 0:
|
||||
# Enough failures at this multiplier,
|
||||
# increase the multiplier and keep trying....
|
||||
c = c + cIncrement
|
||||
|
||||
# Whats good with searching for a better
|
||||
# hash function if we exceed the size
|
||||
# of a function we've generated in the past....
|
||||
if minC is not None and \
|
||||
c > minC:
|
||||
c = initC
|
||||
sys.stderr.write(' -- c > minC, resetting c to %0.4f\n' % c)
|
||||
else:
|
||||
sys.stderr.write(' -- increasing c to %0.4f\n' % c)
|
||||
sys.stderr.write('Generating graphs... ')
|
||||
|
||||
# Output a progress message
|
||||
sys.stderr.write( str(num_graphs) + ' ')
|
||||
sys.stderr.flush()
|
||||
|
||||
# Create graph w/ N vertices
|
||||
G = Graph(N)
|
||||
# Save the seeds used to generate
|
||||
# the following two hash functions.
|
||||
_seeds = whrandom._inst._seed
|
||||
|
||||
# Create 2 random hash functions
|
||||
f1 = Hash(N, caseInsensitive)
|
||||
f2 = Hash(N, caseInsensitive)
|
||||
|
||||
# Set the initial hash function seed values if passed in.
|
||||
# Doing this protects our hash functions from
|
||||
# changes to whrandom's behavior.
|
||||
if f1Seed is not None:
|
||||
f1.seed = f1Seed
|
||||
f1Seed = None
|
||||
fSpecifiedSeeds = 1
|
||||
if f2Seed is not None:
|
||||
f2.seed = f2Seed
|
||||
f2Seed = None
|
||||
fSpecifiedSeeds = 1
|
||||
|
||||
# Connect vertices given by the values of the two hash functions
|
||||
# for each key. Associate the desired hash value with each
|
||||
# edge.
|
||||
for k, v in keys:
|
||||
h1 = f1(k) ; h2 = f2(k)
|
||||
G.connect( h1, h2, v)
|
||||
|
||||
# Check if the resulting graph is acyclic; if it is,
|
||||
# we're done with step 1.
|
||||
if G.is_acyclic():
|
||||
break
|
||||
elif fSpecifiedSeeds:
|
||||
sys.stderr.write('\nThe initial f1/f2 seeds you specified didn\'t generate a perfect hash function: \n')
|
||||
sys.stderr.write('f1 seed: %s\n' % f1.seed)
|
||||
sys.stderr.write('f2 seed: %s\n' % f2.seed)
|
||||
sys.stderr.write('multipler: %s\n' % c)
|
||||
sys.stderr.write('Your data has likely changed, or you forgot what your initial multiplier should be.\n')
|
||||
sys.stderr.write('continuing the search for a perfect hash function......\n')
|
||||
fSpecifiedSeeds = 0
|
||||
|
||||
# Now we have an acyclic graph, so we assign values to each vertex
|
||||
# such that, for each edge, you can add the values for the two vertices
|
||||
# involved and get the desired value for that edge -- which is the
|
||||
# desired hash key. This task is dead easy, because the graph is acyclic.
|
||||
sys.stderr.write('\nAcyclic graph found; computing vertex values...\n')
|
||||
G.assign_values()
|
||||
|
||||
sys.stderr.write('Checking uniqueness of hash values...\n')
|
||||
|
||||
# Sanity check the result by actually verifying that all the keys
|
||||
# hash to the right value.
|
||||
cchMaxKey = 0
|
||||
maxHashValue = 0
|
||||
|
||||
for k, v in keys:
|
||||
hash1 = G.values[ f1(k) ]
|
||||
hash2 = G.values[ f2(k) ]
|
||||
if hash1 > maxHashValue:
|
||||
maxHashValue = hash1
|
||||
if hash2 > maxHashValue:
|
||||
maxHashValue = hash2
|
||||
perfecthash = (hash1 + hash2) % N
|
||||
assert perfecthash == v
|
||||
cch = len(k)
|
||||
if cch > cchMaxKey:
|
||||
cchMaxKey = cch
|
||||
|
||||
sys.stderr.write('Found perfect hash function!\n')
|
||||
sys.stderr.write('\nIn order to regenerate this hash function, \n')
|
||||
sys.stderr.write('you need to pass these following values back in:\n')
|
||||
sys.stderr.write('f1 seed: %s\n' % hex(f1.seed))
|
||||
sys.stderr.write('f2 seed: %s\n' % hex(f2.seed))
|
||||
sys.stderr.write('initial multipler: %s\n' % c)
|
||||
|
||||
return PerfectHash(cchMaxKey, f1, f2, G, N, len(keys), maxHashValue)
|
||||
|
|
@ -1,114 +0,0 @@
|
|||
#include <Python.h>
|
||||
|
||||
static PyObject * hashFunction(PyObject *self, PyObject *args, PyObject *kw)
|
||||
{
|
||||
PyStringObject *a;
|
||||
register int len;
|
||||
register unsigned char *p;
|
||||
register unsigned long x;
|
||||
unsigned long ulSeed;
|
||||
unsigned long cchSeed;
|
||||
unsigned long cHashElements;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "llOl:hash",
|
||||
&ulSeed, &cchSeed, &a, &cHashElements))
|
||||
return NULL;
|
||||
if (!PyString_Check(a))
|
||||
{
|
||||
PyErr_SetString(PyExc_TypeError, "arg 3 needs to be a string");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
len = a->ob_size;
|
||||
p = (unsigned char *) a->ob_sval;
|
||||
x = ulSeed;
|
||||
while (--len >= 0)
|
||||
{
|
||||
/* (1000003 * x) ^ *p++
|
||||
* translated to handle > 32 bit longs
|
||||
*/
|
||||
x = (0xf4243 * x);
|
||||
x = x & 0xFFFFFFFF;
|
||||
x = x ^ *p++;
|
||||
}
|
||||
x ^= a->ob_size + cchSeed;
|
||||
if (x == 0xFFFFFFFF)
|
||||
x = 0xfffffffe;
|
||||
if (x & 0x80000000)
|
||||
{
|
||||
/* Emulate Python 32-bit signed (2's complement)
|
||||
* modulo operation
|
||||
*/
|
||||
x = (~x & 0xFFFFFFFF) + 1;
|
||||
x %= cHashElements;
|
||||
if (x != 0)
|
||||
{
|
||||
x = x + (~cHashElements & 0xFFFFFFFF) + 1;
|
||||
x = (~x & 0xFFFFFFFF) + 1;
|
||||
}
|
||||
}
|
||||
else
|
||||
x %= cHashElements;
|
||||
return PyInt_FromLong((long)x);
|
||||
}
|
||||
|
||||
static PyObject * calcSeed(PyObject *self, PyObject *args, PyObject *kw)
|
||||
{
|
||||
PyStringObject *a;
|
||||
register int len;
|
||||
register unsigned char *p;
|
||||
register unsigned long x;
|
||||
|
||||
if (!PyString_Check(args))
|
||||
{
|
||||
PyErr_SetString(PyExc_TypeError, "arg 1 expected a string, but didn't get it.");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
a = (PyStringObject *)args;
|
||||
|
||||
len = a->ob_size;
|
||||
p = (unsigned char *) a->ob_sval;
|
||||
x = (*p << 7) & 0xFFFFFFFF;
|
||||
while (--len >= 0)
|
||||
{
|
||||
/* (1000003 * x) ^ *p++
|
||||
* translated to handle > 32 bit longs
|
||||
*/
|
||||
x = (0xf4243 * x);
|
||||
x = x & 0xFFFFFFFF;
|
||||
x = x ^ *p++;
|
||||
}
|
||||
return PyInt_FromLong((long)x);
|
||||
}
|
||||
|
||||
|
||||
static struct PyMethodDef hashMethods[] = {
|
||||
{ "calcSeed", calcSeed, 0, NULL },
|
||||
{ "hash", hashFunction, 0, NULL },
|
||||
{ NULL, NULL, 0, NULL } /* sentinel */
|
||||
};
|
||||
|
||||
#ifdef _MSC_VER
|
||||
_declspec(dllexport)
|
||||
#endif
|
||||
void initperfhash(void)
|
||||
{
|
||||
PyObject *m;
|
||||
|
||||
m = Py_InitModule4("perfhash", hashMethods,
|
||||
NULL, NULL, PYTHON_API_VERSION);
|
||||
if ( m == NULL )
|
||||
Py_FatalError("can't initialize module perfhash");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue