#!/usr/bin/env python
# perfect_hash.py
#
# Outputs C code for a minimal perfect hash.
# The hash is produced using the algorithm described in
# "Optimal algorithms for minimal perfect hashing",
# G. Havas, B.S. Majewski.  Available as a technical report
# from the CS department, University of Queensland
# (ftp://ftp.cs.uq.oz.au/).
#
# This is a modified version of Andrew Kuchling's code
# (http://starship.python.net/crew/amk/python/code/perfect-hash.html)
# and generates C fragments suitable for compilation as a Python
# extension module.
#
# Difference between this algorithm and gperf:
# Gperf will complete in finite time with a successful function,
# or by giving up.
# This algorithm may never complete, although it is extremely likely
# to when c >= 2.

# The algorithm works like this:
# 0) You have K keys, that you want to perfectly hash to a bunch
#    of hash values.
#
# 1) Choose a number N larger than K.  This is the number of
#    vertices in a graph G, and also the size of the resulting table.
#
# 2) Pick two random hash functions f1, f2, that output values from
#    0...N-1.
#
# 3) for key in keys:
#        h1 = f1(key) ; h2 = f2(key)
#        Draw an edge between vertices h1 and h2 of the graph.
#        Associate the desired hash value with that edge.
#
# 4) Check if G is acyclic; if not, go back to step 1 and pick a bigger N.
#
# 5) Assign values to each vertex such that, for each edge, you can
#    add the values for the two vertices and get the desired value
#    for that edge -- which is the desired hash key.  This task is
#    dead easy, because the graph is acyclic.  This is done by
#    picking a vertex V, and assigning it a value of 0.  You then do a
#    depth-first search, assigning values to new vertices so that
#    they sum up properly.
#
# 6) f1, f2, and G now make up your perfect hash function.

import sys, string
import pprint
import perfhash
import time

try:
    import whrandom
except ImportError:
    # whrandom was removed from the standard library in Python 2.5.
    # Only choice() is used below, and random provides the same call.
    import random as whrandom


class Hash:
    """Random hash function.

    For simplicity and speed, this doesn't implement any byte-level
    hashing scheme.  Instead, a random string is generated and
    prefixed to str(key), and then Python's hashing function is used.

    Attributes:
    N -- modulus applied to the hash; results are in 0...N-1
    junk -- the random 10-character prefix
    caseInsensitive -- if true, keys are upper-cased before hashing
    seed -- perfhash.calcSeed(junk); the starting value of the hash
    """

    def __init__(self, N, caseInsensitive=0):
        self.N = N
        alphabet = string.letters + string.digits
        junk = ""
        for i in range(10):
            junk = junk + whrandom.choice(alphabet)
        self.junk = junk
        self.caseInsensitive = caseInsensitive
        self.seed = perfhash.calcSeed(junk)

    def __call__(self, key):
        "Return the hash of str(key), reduced modulo N"
        key = str(key)
        if self.caseInsensitive:
            key = string.upper(key)
        return perfhash.hash(self.seed, len(self.junk), key) % self.N

    def generate_code(self):
        """Return the body (braces included) of a C function that
        computes this hash.

        The emitted code reads the parameters 'key' and 'cch' and the
        macro 'k_cHashElements', all of which the caller must supply."""
        params = {
            "lenJunk" : len(self.junk),
            "junkSeed" : self.seed,
        }

        s = """{
    register int len;
    register unsigned char *p;
    register long x;

    len = cch;
    p = (unsigned char *) key;
    x = %(junkSeed)d;
    while (--len >= 0)
        x = (1000003*x) ^ """ % params

        if self.caseInsensitive:
            s = s + "toupper(*(p++));"
        else:
            s = s + "*(p++);"

        s = s + """
    x ^= cch + %(lenJunk)d;
    if (x == -1)
        x = -2;
    x %%= k_cHashElements;
    /* ensure the returned value is positive so we mimic Python's %% operator */
    if (x < 0)
        x += k_cHashElements;
    return x;
}
""" % params
        return s


# Vertex colours used by the depth-first searches in Graph.
WHITE, GREY, BLACK = 0,1,2

class Graph:
    """Graph class.  This class isn't particularly efficient or general,
    and only has the features I needed to implement this algorithm.

    num_vertices -- number of vertices
    edges -- maps 2-tuples of vertex numbers to the value for this
             edge.  If there's an edge between v1 and v2 (v1 <= v2),
             (v1,v2) is a key and the value is the edge's value.
    reachable_list -- maps a vertex V to the list of vertices
                      to which V is connected by edges.  Used
                      for traversing the graph.
    values -- numeric value for each vertex
    """

    # NOTE(review): the tail of the docstring above, __init__, and the
    # first lines of connect() were lost when this file was extracted
    # (everything between a '<' and the next '>' was stripped).  They
    # are reconstructed from how the attributes are used in the rest
    # of the class; confirm against a pristine copy if one is at hand.

    def __init__(self, num_vertices):
        self.num_vertices = num_vertices
        self.edges = {}
        self.reachable_list = {}
        # Must be a list indexable by every vertex number:
        # set_vertex_value() stores by index and generate_code()
        # iterates it in order.  -1 marks "not yet assigned".
        self.values = [-1] * num_vertices

    def connect(self, vertex1, vertex2, value):
        """Connect 'vertex1' and 'vertex2' with an edge, with associated
        value 'value'"""

        # Edges are keyed with the smaller vertex first.
        if vertex1 > vertex2:
            vertex1, vertex2 = vertex2, vertex1
        # A second edge between the same pair overwrites the first one's
        # value here; is_acyclic() then reports the duplicate as a cycle.
        self.edges[ (vertex1, vertex2) ] = value

        # Add vertices to each other's reachable list
        for here, there in ((vertex1, vertex2), (vertex2, vertex1)):
            try:
                self.reachable_list[here].append(there)
            except KeyError:
                self.reachable_list[here] = [there]

    def get_edge_value(self, vertex1, vertex2):
        """Retrieve the value corresponding to the edge between
        'vertex1' and 'vertex2'.  Raises KeyError if no such edge"""
        if vertex1 > vertex2:
            vertex1, vertex2 = vertex2, vertex1
        return self.edges[ (vertex1, vertex2) ]

    def is_acyclic(self):
        "Returns true if the graph is acyclic, otherwise false"

        # This is done by doing a depth-first search of the graph;
        # painting each vertex grey and then black.  If the DFS
        # ever finds a vertex that isn't white, there's a cycle.
        colour = {}
        for i in range(self.num_vertices):
            colour[i] = WHITE

        # Loop over all vertices, taking white ones as starting
        # points for a traversal.
        for i in range(self.num_vertices):
            if colour[i] != WHITE:
                continue

            # Stack of (parent, vertex) pairs still to visit.
            visit_list = [ (None, i) ]
            while visit_list:
                parent, vertex = visit_list.pop()
                colour[vertex] = GREY

                # Make copy of list of neighbours, removing (one
                # occurrence of) the vertex we arrived here from.  A
                # parallel edge back to the parent stays in the list
                # and is correctly reported as a cycle.
                neighbours = self.reachable_list.get(vertex, [])[:]
                if parent in neighbours:
                    neighbours.remove( parent )

                for neighbour in neighbours:
                    if colour[neighbour] != WHITE:
                        # Aha!  Already visited this node,
                        # so the graph isn't acyclic.
                        return 0
                    visit_list.append( (vertex, neighbour) )

                colour[vertex] = BLACK

        # We got through, so the graph is acyclic.
        return 1

    def assign_values(self):
        """Compute values for each vertex, so that they sum up
        properly to the associated value for each edge.

        Only meaningful when the graph is acyclic (see is_acyclic)."""

        # Also done with a DFS, mirroring is_acyclic().
        colour = {}
        for i in range(self.num_vertices):
            colour[i] = WHITE

        # Loop over all vertices, taking white ones as starting
        # points for a traversal.
        for i in range(self.num_vertices):
            if colour[i] != WHITE:
                continue

            # Set this vertex's value, arbitrarily, to zero.
            self.set_vertex_value( i, 0 )

            # Stack of (parent, vertex) pairs still to visit.
            visit_list = [ (None, i) ]
            while visit_list:
                parent, vertex = visit_list.pop()
                colour[vertex] = GREY

                # The vertex we came from is already grey or black, so
                # the WHITE test below skips it without special-casing.
                for neighbour in self.reachable_list.get(vertex, []):
                    if colour[neighbour] != WHITE:
                        continue
                    visit_list.append( (vertex, neighbour) )

                    # Set new vertex's value to the desired
                    # edge value, minus the value of the
                    # vertex we came here from.
                    edge_value = self.get_edge_value( vertex, neighbour )
                    new_val = edge_value - self.get_vertex_value( vertex )
                    self.set_vertex_value( neighbour,
                                           new_val % self.num_vertices )

                colour[vertex] = BLACK

        # Returns nothing
        return

    def __getitem__(self, index):
        # Lets 'for v in graph' walk the vertex numbers via the
        # old-style sequence iteration protocol.
        if index < self.num_vertices:
            return index
        raise IndexError

    def get_vertex_value(self, vertex):
        "Get value for a vertex"
        return self.values[ vertex ]

    def set_vertex_value(self, vertex, value):
        "Set value for a vertex"
        self.values[ vertex ] = value

    def generate_code(self, out, width = 70):
        """Write the vertex values to 'out' as a C array initializer,
        starting a new line once roughly 'width' columns are used"""
        out.write("{ ")
        pos = 0
        for v in self.values:
            v = str(v) + ', '
            out.write(v)
            pos = pos + len(v) + 1
            if pos > width:
                out.write('\n ')
                pos = 0
        out.write('};\n')


class PerfectHash:
    """A finished perfect hash function, able to emit itself as C.

    cchMax -- length of the longest key
    f1, f2 -- the two Hash objects
    G -- the Graph holding the vertex values
    cHashElements -- number of vertices in G (size of the C table)
    cKeys -- number of keys
    type -- C integer type used to store the elements of G
    """

    def __init__(self, cchMax, f1, f2, G, cHashElements, cKeys,
                 maxHashValue):
        self.cchMax = cchMax
        self.f1 = f1
        self.f2 = f2
        self.G = G
        self.cHashElements = cHashElements
        self.cKeys = cKeys
        # determine the necessary type for storing our hash function
        # helper table:
        self.type = self.determineType(maxHashValue)

    def generate_header(self, structName):
        """Return the C prologue: the #include lines and the typedef
        of the exported API struct, named 'structName'"""
        # NOTE(review): the header name on the second #include was
        # lost from the source this file was recovered from (only a
        # bare '#include' survived).  <stdlib.h> is a best guess at the
        # original; <ctype.h> is added because the case-insensitive
        # hash functions call toupper().
        header = """
#include "Python.h"
#include <stdlib.h>
#include <ctype.h>

/* --- C API ----------------------------------------------------*/
/* C API for usage by other Python modules */
typedef struct %(structName)s
{
    unsigned long cKeys;
    unsigned long cchMax;
    unsigned long (*hash)(const char *key, unsigned int cch);
    const void *(*getValue)(unsigned long iKey);
} %(structName)s;
""" % { "structName" : structName }
        return header

    def determineType(self, maxHashValue):
        "Return the smallest C unsigned type that can hold maxHashValue"
        if maxHashValue <= 255:
            return "unsigned char"
        elif maxHashValue <= 65535:
            return "unsigned short"
        else:
            # Take the cheesy way out...
            return "unsigned long"

    def generate_code(self, moduleName, dataArrayName, dataArrayType,
                      structName):
        """Return C code for the hash functions, the API struct
        instance and the module init function.

        moduleName -- name of the generated extension module
        dataArrayName, dataArrayType -- name and element type of the
            data array; it is only forward-declared here
        structName -- name of the API struct from generate_header()
        """
        # Output C code for the hash functions and tables
        code = """
/*
 * The hash is produced using the algorithm described in
 * "Optimal algorithms for minimal perfect hashing",
 * G. Havas, B.S. Majewski.  Available as a technical report
 * from the CS department, University of Queensland
 * (ftp://ftp.cs.uq.oz.au/).
 *
 * Generated using a heavily tweaked version of Andrew Kuchling's
 * perfect_hash.py:
 * http://starship.python.net/crew/amk/python/code/perfect-hash.html
 *
 * Generated on: %s
 */
""" % time.ctime(time.time())

        # MSVC SP3 was complaining when I actually used a global constant
        code = code + """
#define k_cHashElements %i
#define k_cchMaxKey  %d
#define k_cKeys  %i

""" % (self.cHashElements, self.cchMax, self.cKeys)

        # Forward declarations; G is defined by generate_graph().
        code = code + """
static const %s G[k_cHashElements];
static const %s %s[k_cKeys];
""" % (self.type, dataArrayType, dataArrayName)

        code = code + """
static long f1(const char *key, unsigned int cch)
"""
        code = code + self.f1.generate_code()

        code = code + """
static long f2(const char *key, unsigned int cch)
"""
        code = code + self.f2.generate_code()

        code = code + """
static unsigned long hash(const char *key, unsigned int cch)
{
    return ((unsigned long)(G[ f1(key, cch) ]) + (unsigned long)(G[ f2(key, cch) ]) ) %% k_cHashElements;
}

const void *getValue(unsigned long iKey)
{
    return &%(dataArrayName)s[iKey];
}

/* Helper for adding objects to dictionaries. Check for errors with
   PyErr_Occurred() */
static
void insobj(PyObject *dict,
     char *name,
     PyObject *v)
{
    PyDict_SetItemString(dict, name, v);
    Py_XDECREF(v);
}

static const %(structName)s hashAPI =
{
    k_cKeys,
    k_cchMaxKey,
    &hash,
    &getValue,
};

static
PyMethodDef Module_methods[] =
{
    {NULL, NULL},
};

static char *Module_docstring = "%(moduleName)s hash function module";

/* Error reporting for module init functions */

#define Py_ReportModuleInitError(modname) { \\
    PyObject *exc_type, *exc_value, *exc_tb; \\
    PyObject *str_type, *str_value; \\
 \\
    /* Fetch error objects and convert them to strings */ \\
    PyErr_Fetch(&exc_type, &exc_value, &exc_tb); \\
    if (exc_type && exc_value) { \\
        str_type = PyObject_Str(exc_type); \\
        str_value = PyObject_Str(exc_value); \\
    } \\
    else { \\
       str_type = NULL; \\
       str_value = NULL; \\
    } \\
    /* Try to format a more informative error message using the \\
       original error */ \\
    if (str_type && str_value && \\
        PyString_Check(str_type) && PyString_Check(str_value)) \\
        PyErr_Format( \\
            PyExc_ImportError, \\
            "initialization of module "modname" failed " \\
            "(%%s:%%s)", \\
            PyString_AS_STRING(str_type), \\
            PyString_AS_STRING(str_value)); \\
    else \\
        PyErr_SetString( \\
            PyExc_ImportError, \\
            "initialization of module "modname" failed"); \\
    Py_XDECREF(str_type); \\
    Py_XDECREF(str_value); \\
    Py_XDECREF(exc_type); \\
    Py_XDECREF(exc_value); \\
    Py_XDECREF(exc_tb); \\
}


/* Create PyMethodObjects and register them in the module\'s dict */
DL_EXPORT(void)
init%(moduleName)s(void)
{
    PyObject *module, *moddict;

    /* Create module */
    module = Py_InitModule4("%(moduleName)s", /* Module name */
             Module_methods, /* Method list */
             Module_docstring, /* Module doc-string */
             (PyObject *)NULL, /* always pass this as *self */
             PYTHON_API_VERSION); /* API Version */
    if (module == NULL)
        goto onError;
    /* Add some constants to the module\'s dict */
    moddict = PyModule_GetDict(module);
    if (moddict == NULL)
        goto onError;

    /* Export C API */
    insobj(
        moddict,
        "%(moduleName)sAPI",
        PyCObject_FromVoidPtr((void *)&hashAPI, NULL));

onError:
    /* Check for errors and report them */
    if (PyErr_Occurred())
        Py_ReportModuleInitError("%(moduleName)s");
    return;
}
""" % { "moduleName" : moduleName,
        "dataArrayName" : dataArrayName,
        "structName" : structName, }

        return code

    def generate_graph(self, out):
        "Write the C definition of the table G to 'out'"
        # The element type must match the forward declaration emitted
        # by generate_code(), which uses self.type; a hard-coded type
        # here would give G conflicting types in the generated C.
        out.write("""
static const %s G[] =
""" % self.type)
        self.G.generate_code(out)


def generate_hash(keys, caseInsensitive=0,
                  minC=None, initC=None,
                  f1Seed=None, f2Seed=None,
                  cIncrement=None, cTries=None):
    """Search for a minimal perfect hash and return it as a
    PerfectHash object.  Input is a list of (key, desired hash value)
    tuples.  Progress is reported on stderr.

    caseInsensitive -- if true, keys are upper-cased before hashing
    minC -- if given, c is reset to initC whenever it grows past minC
    initC -- starting multiplier c (default 1.5); graphs have
             int(c * len(keys)) vertices
    f1Seed, f2Seed -- seeds to force on the first trial's two hash
                      functions, to regenerate a known function
    cIncrement -- amount added to c every cTries trials
                  (default 0.0025)
    cTries -- number of trials between increases of c (default 50)

    Raises AssertionError if the function found fails verification.
    This loops until an acyclic graph is found, so it may never return.
    """

    # K is the number of keys.
    K = len(keys)

    # We will be generating graphs of size N, where N = c * K.
    # The larger C is, the fewer trial graphs will need to be made, but
    # the resulting table is also larger.  Increase this starting value
    # if you're impatient.  After every cTries (default 50) trials,
    # c is increased by cIncrement (default 0.0025).
    if initC is None:
        initC = 1.5
    c = initC

    if cIncrement is None:
        cIncrement = 0.0025

    if cTries is None:
        cTries = 50

    # Number of trial graphs so far
    num_graphs = 0

    # True only while the trial in progress uses caller-supplied
    # seeds.  It has to exist before the loop: the failure branch
    # below reads it even when no seed was passed in.
    fSpecifiedSeeds = 0

    sys.stderr.write('Generating graphs... ')

    while 1:
        # N is the number of vertices in the graph G
        N = int(c*K)
        num_graphs = num_graphs + 1
        if (num_graphs % cTries) == 0:
            # Enough failures at this multiplier,
            # increase the multiplier and keep trying....
            # (The new c only takes effect from the next trial on.)
            c = c + cIncrement

            # Whats good with searching for a better
            # hash function if we exceed the size
            # of a function we've generated in the past....
            if minC is not None and c > minC:
                c = initC
                sys.stderr.write(' -- c > minC, resetting c to %0.4f\n' % c)
            else:
                sys.stderr.write(' -- increasing c to %0.4f\n' % c)
            sys.stderr.write('Generating graphs... ')

        # Output a progress message
        sys.stderr.write( str(num_graphs) + ' ')
        sys.stderr.flush()

        # Create graph w/ N vertices
        G = Graph(N)

        # Create 2 random hash functions
        f1 = Hash(N, caseInsensitive)
        f2 = Hash(N, caseInsensitive)

        # Set the initial hash function seed values if passed in.
        # Doing this protects our hash functions from
        # changes to whrandom's behavior.
        if f1Seed is not None:
            f1.seed = f1Seed
            f1Seed = None
            fSpecifiedSeeds = 1
        if f2Seed is not None:
            f2.seed = f2Seed
            f2Seed = None
            fSpecifiedSeeds = 1

        # Connect vertices given by the values of the two hash functions
        # for each key.  Associate the desired hash value with each
        # edge.
        for k, v in keys:
            h1 = f1(k) ; h2 = f2(k)
            G.connect( h1,h2, v)

        # Check if the resulting graph is acyclic; if it is,
        # we're done with step 1.
        if G.is_acyclic():
            break
        elif fSpecifiedSeeds:
            sys.stderr.write('\nThe initial f1/f2 seeds you specified didn\'t generate a perfect hash function: \n')
            sys.stderr.write('f1 seed: %s\n' % f1.seed)
            sys.stderr.write('f2 seed: %s\n' % f2.seed)
            sys.stderr.write('multipler: %s\n' % c)
            sys.stderr.write('Your data has likely changed, or you forgot what your initial multiplier should be.\n')
            sys.stderr.write('continuing the search for a perfect hash function......\n')
            fSpecifiedSeeds = 0

    # Now we have an acyclic graph, so we assign values to each vertex
    # such that, for each edge, you can add the values for the two vertices
    # involved and get the desired value for that edge -- which is the
    # desired hash key.  This task is dead easy, because the graph is acyclic.
    sys.stderr.write('\nAcyclic graph found; computing vertex values...\n')
    G.assign_values()

    sys.stderr.write('Checking uniqueness of hash values...\n')

    # Sanity check the result by actually verifying that all the keys
    # hash to the right value.
    cchMaxKey = 0
    maxHashValue = 0

    for k, v in keys:
        hash1 = G.values[ f1(k) ]
        hash2 = G.values[ f2(k) ]
        if hash1 > maxHashValue:
            maxHashValue = hash1
        if hash2 > maxHashValue:
            maxHashValue = hash2
        perfecthash = (hash1 + hash2) % N
        # Raised explicitly rather than with an assert statement so
        # that the check still runs under 'python -O'.
        if perfecthash != v:
            raise AssertionError('key %s hashes to %s, expected %s'
                                 % (repr(k), perfecthash, v))
        cch = len(k)
        if cch > cchMaxKey:
            cchMaxKey = cch

    sys.stderr.write('Found perfect hash function!\n')
    sys.stderr.write('\nIn order to regenerate this hash function, \n')
    sys.stderr.write('you need to pass these following values back in:\n')
    sys.stderr.write('f1 seed: %s\n' % repr(f1.seed))
    sys.stderr.write('f2 seed: %s\n' % repr(f2.seed))
    sys.stderr.write('initial multipler: %s\n' % c)

    return PerfectHash(cchMaxKey, f1, f2, G, N, len(keys), maxHashValue)