gh-96954: use a directed acyclic word graph for storing the unicodedata codepoint names (#97906)

Co-authored-by: Łukasz Langa <lukasz@langa.pl>
Co-authored-by: Pieter Eendebak <pieter.eendebak@gmail.com>
Co-authored-by: Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com>
CF Bolz-Tereick 2023-11-04 15:56:58 +01:00, committed by GitHub
parent 0e9c364f4a
commit 9573d14215
8 changed files with 18134 additions and 30444 deletions


@@ -0,0 +1,121 @@
import unittest
from test.test_tools import toolsdir, imports_under_tool
from test import support
from test.support.hypothesis_helper import hypothesis
st = hypothesis.strategies
given = hypothesis.given
example = hypothesis.example
with imports_under_tool("unicode"):
from dawg import Dawg, build_compression_dawg, lookup, inverse_lookup
@st.composite
def char_name_db(draw, min_length=1, max_length=30):
m = draw(st.integers(min_value=min_length, max_value=max_length))
names = draw(
st.sets(st.text("abcd", min_size=1, max_size=10), min_size=m, max_size=m)
)
characters = draw(st.sets(st.characters(), min_size=m, max_size=m))
return list(zip(names, characters))
class TestDawg(unittest.TestCase):
"""Tests for the directed acyclic word graph data structure that is used
to store the unicode character names in unicodedata. Tests ported from PyPy
"""
def test_dawg_direct_simple(self):
dawg = Dawg()
dawg.insert("a", -4)
dawg.insert("c", -2)
dawg.insert("cat", -1)
dawg.insert("catarr", 0)
dawg.insert("catnip", 1)
dawg.insert("zcatnip", 5)
packed, data, inverse = dawg.finish()
self.assertEqual(lookup(packed, data, b"a"), -4)
self.assertEqual(lookup(packed, data, b"c"), -2)
self.assertEqual(lookup(packed, data, b"cat"), -1)
self.assertEqual(lookup(packed, data, b"catarr"), 0)
self.assertEqual(lookup(packed, data, b"catnip"), 1)
self.assertEqual(lookup(packed, data, b"zcatnip"), 5)
self.assertRaises(KeyError, lookup, packed, data, b"b")
self.assertRaises(KeyError, lookup, packed, data, b"catni")
self.assertRaises(KeyError, lookup, packed, data, b"catnipp")
self.assertEqual(inverse_lookup(packed, inverse, -4), b"a")
self.assertEqual(inverse_lookup(packed, inverse, -2), b"c")
self.assertEqual(inverse_lookup(packed, inverse, -1), b"cat")
self.assertEqual(inverse_lookup(packed, inverse, 0), b"catarr")
self.assertEqual(inverse_lookup(packed, inverse, 1), b"catnip")
self.assertEqual(inverse_lookup(packed, inverse, 5), b"zcatnip")
self.assertRaises(KeyError, inverse_lookup, packed, inverse, 12)
def test_forbid_empty_dawg(self):
dawg = Dawg()
self.assertRaises(ValueError, dawg.finish)
@given(char_name_db())
@example([("abc", "a"), ("abd", "b")])
@example(
[
("bab", "1"),
("a", ":"),
("ad", "@"),
("b", "<"),
("aacc", "?"),
("dab", "D"),
("aa", "0"),
("ab", "F"),
("aaa", "7"),
("cbd", "="),
("abad", ";"),
("ac", "B"),
("abb", "4"),
("bb", "2"),
("aab", "9"),
("caaaaba", "E"),
("ca", ">"),
("bbaaa", "5"),
("d", "3"),
("baac", "8"),
("c", "6"),
("ba", "A"),
]
)
@example(
[
("bcdac", "9"),
("acc", "g"),
("d", "d"),
("daabdda", "0"),
("aba", ";"),
("c", "6"),
("aa", "7"),
("abbd", "c"),
("badbd", "?"),
("bbd", "f"),
("cc", "@"),
("bb", "8"),
("daca", ">"),
("ba", ":"),
("baac", "3"),
("dbdddac", "a"),
("a", "2"),
("cabd", "b"),
("b", "="),
("abd", "4"),
("adcbd", "5"),
("abc", "e"),
("ab", "1"),
]
)
def test_dawg(self, data):
# suppress debug prints
with support.captured_stdout() as output:
# it's enough to build it, building will also check the result
build_compression_dawg(data)


@@ -104,6 +104,26 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
if looked_name := self.db.name(char, None):
self.assertEqual(self.db.lookup(looked_name), char)
def test_no_names_in_pua(self):
puas = [*range(0xe000, 0xf8ff),
*range(0xf0000, 0xfffff),
*range(0x100000, 0x10ffff)]
for i in puas:
char = chr(i)
self.assertRaises(ValueError, self.db.name, char)
def test_lookup_nonexistent(self):
# just make sure that lookup can fail
for nonexistent in [
"LATIN SMLL LETR A",
"OPEN HANDS SIGHS",
"DREGS",
"HANDBUG",
"MODIFIER LETTER CYRILLIC SMALL QUESTION MARK",
"???",
]:
self.assertRaises(KeyError, self.db.lookup, nonexistent)
def test_digit(self):
self.assertEqual(self.db.digit('A', None), None)
self.assertEqual(self.db.digit('9'), 9)


@@ -1342,6 +1342,14 @@ check-abidump: all
regen-limited-abi: all
$(RUNSHARED) ./$(BUILDPYTHON) $(srcdir)/Tools/build/stable_abi.py --generate-all $(srcdir)/Misc/stable_abi.toml
############################################################################
# Regenerate Unicode Data
.PHONY: regen-unicodedata
regen-unicodedata:
$(PYTHON_FOR_REGEN) Tools/unicode/makeunicodedata.py
############################################################################
# Regenerate all generated files
@@ -1350,7 +1358,7 @@ regen-limited-abi: all
regen-all: regen-cases regen-typeslots \
regen-token regen-ast regen-keyword regen-sre regen-frozen \
regen-pegen-metaparser regen-pegen regen-test-frozenmain \
regen-test-levenshtein regen-global-objects
regen-test-levenshtein regen-global-objects regen-unicodedata
@echo
@echo "Note: make regen-stdlib-module-names, make regen-limited-abi"
@echo "and make regen-configure should be run manually"


@@ -0,0 +1,5 @@
Switch the storage of the unicode codepoint names to use a different
data structure, a `directed acyclic word graph
<https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton>`_.
This makes the unicodedata shared library about 440 KiB smaller. Contributed by
Carl Friedrich Bolz-Tereick using code from the PyPy project.
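
The user-visible API is unchanged; only the internal representation differs. For illustration (this snippet is not part of the patch), the mapping the DAWG stores is the one exposed by unicodedata.name() and unicodedata.lookup():

    import unicodedata

    # code point -> name: internally an inverse DAWG lookup
    assert unicodedata.name("A") == "LATIN CAPITAL LETTER A"
    # name -> code point: internally a forward DAWG lookup
    assert unicodedata.lookup("LATIN CAPITAL LETTER A") == "A"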


@@ -977,21 +977,6 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */
static unsigned long
_gethash(const char *s, int len, int scale)
{
int i;
unsigned long h = 0;
unsigned long ix;
for (i = 0; i < len; i++) {
h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]);
ix = h & 0xff000000;
if (ix)
h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
}
return h;
}
static const char * const hangul_syllables[][3] = {
{ "G", "A", "" },
{ "GG", "AE", "G" },
@@ -1046,6 +1031,247 @@ is_unified_ideograph(Py_UCS4 code)
#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
(cp < named_sequences_end))
// DAWG decoding functions
static unsigned int
_dawg_decode_varint_unsigned(unsigned int index, unsigned int* result)
{
unsigned int res = 0;
unsigned int shift = 0;
for (;;) {
unsigned char byte = packed_name_dawg[index];
res |= (byte & 0x7f) << shift;
index++;
shift += 7;
if (!(byte & 0x80)) {
*result = res;
return index;
}
}
}
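// Concretely, this is the unsigned LEB128 format: seven bits per byte,
// least-significant group first, high bit set on every byte except the
// last. For example, 300 == 0b10_0101100 is stored as 0xAC (low seven
// bits 0b0101100 with the continuation bit set) followed by 0x02.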
static int
_dawg_match_edge(const char* name, unsigned int namelen, unsigned int size,
unsigned int label_offset, unsigned int namepos)
{
// This returns 1 if the edge matched, 0 if it didn't (but further edges
// could match) and -1 if the name cannot match at all.
if (size > 1 && namepos + size > namelen) {
return 0;
}
for (unsigned int i = 0; i < size; i++) {
if (packed_name_dawg[label_offset + i] != Py_TOUPPER(name[namepos + i])) {
if (i > 0) {
return -1; // cannot match at all
}
return 0;
}
}
return 1;
}
// reading DAWG node information:
// a node is encoded by a varint. The lowest bit of that int is set if the node
// is a final, accepting state. The higher bits of that int represent the
// number of names that are encoded by the sub-DAWG started by this node. It's
// used to compute the position of a name.
//
// the starting node of the DAWG is at position 0.
//
// the varint representing a node is followed by the node's edges, the encoding
// is described below
static unsigned int
_dawg_decode_node(unsigned int node_offset, bool* final)
{
unsigned int num;
node_offset = _dawg_decode_varint_unsigned(node_offset, &num);
*final = num & 1;
return node_offset;
}
static bool
_dawg_node_is_final(unsigned int node_offset)
{
unsigned int num;
_dawg_decode_varint_unsigned(node_offset, &num);
return num & 1;
}
static unsigned int
_dawg_node_descendant_count(unsigned int node_offset)
{
unsigned int num;
_dawg_decode_varint_unsigned(node_offset, &num);
return num >> 1;
}
// reading DAWG edge information:
// a DAWG edge is comprised of the following information:
// (1) the size of the label of the string attached to the edge
// (2) the characters of that edge
// (3) the target node
// (4) whether the edge is the last edge in the list of edges following a node
//
// this information is encoded in a compact form as follows:
//
// +---------+-----------------+--------------+--------------------
// | varint | size (if != 1) | label chars | ... next edge ...
// +---------+-----------------+--------------+--------------------
//
// - first comes a varint
// - the lowest bit of that varint is set if the edge is the last one in
//   the list of edges following the node (4)
// - the second lowest bit of that varint is set if the length of the
//   label is 1 (1)
// - the rest of the varint is an offset that can be used to compute
// the offset of the target node of that edge (3)
// - if the size is not 1, the first varint is followed by a
// character encoding the number of characters of the label (1)
// (unicode character names aren't larger than 256 bytes, therefore each
// edge label can be at most 256 chars, but is usually smaller)
// - the next size bytes are the characters of the label (2)
//
// the offset of the target node is computed as follows: the number in the
// upper bits of the varint needs to be added to the offset of the target node
// of the previous edge. For the first edge, where there is no previous target
// node, the offset of the first edge is used.
// The intuition here is that edges going out from a node often lead to nodes
// that are close by, leading to small offsets from the current node and thus
// fewer bytes.
//
// There is a special case: if a final node has no outgoing edges, it has to be
// followed by a 0 byte to indicate that there are no edges (because the end of
// the edge list is normally indicated in a bit in the edge encoding). This is
// indicated by _dawg_decode_edge returning -1
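// For example, an edge with the three-character label "CAT", an offset
// difference of 5 to the previous target node, and further edges following
// it is encoded as the varint (5 << 2) | (0 << 1) | 0 == 20, i.e. the byte
// 0x14, followed by the size byte 0x03 and the label bytes 'C', 'A', 'T'.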
static int
_dawg_decode_edge(bool is_first_edge, unsigned int prev_target_node_offset,
unsigned int edge_offset, unsigned int* size,
unsigned int* label_offset, unsigned int* target_node_offset)
{
unsigned int num;
edge_offset = _dawg_decode_varint_unsigned(edge_offset, &num);
if (num == 0 && is_first_edge) {
return -1; // trying to decode past a final node without outgoing edges
}
bool last_edge = num & 1;
num >>= 1;
bool len_is_one = num & 1;
num >>= 1;
*target_node_offset = prev_target_node_offset + num;
if (len_is_one) {
*size = 1;
} else {
*size = packed_name_dawg[edge_offset++];
}
*label_offset = edge_offset;
return last_edge;
}
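// Search the packed DAWG for name. Returns the position of the name in the
// DAWG's enumeration order (the same index used by dawg_pos_to_codepoint),
// or -1 if the name is not present.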
static int
_lookup_dawg_packed(const char* name, unsigned int namelen)
{
unsigned int stringpos = 0;
unsigned int node_offset = 0;
unsigned int result = 0; // this is the number of final nodes that we skipped to match name
while (stringpos < namelen) {
bool final;
unsigned int edge_offset = _dawg_decode_node(node_offset, &final);
unsigned int prev_target_node_offset = edge_offset;
bool is_first_edge = true;
for (;;) {
unsigned int size;
unsigned int label_offset, target_node_offset;
int last_edge = _dawg_decode_edge(
is_first_edge, prev_target_node_offset, edge_offset,
&size, &label_offset, &target_node_offset);
if (last_edge == -1) {
return -1;
}
is_first_edge = false;
prev_target_node_offset = target_node_offset;
int matched = _dawg_match_edge(name, namelen, size, label_offset, stringpos);
if (matched == -1) {
return -1;
}
if (matched) {
if (final)
result += 1;
stringpos += size;
node_offset = target_node_offset;
break;
}
if (last_edge) {
return -1;
}
result += _dawg_node_descendant_count(target_node_offset);
edge_offset = label_offset + size;
}
}
if (_dawg_node_is_final(node_offset)) {
return result;
}
return -1;
}
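// Reconstruct the name stored at position pos by walking the DAWG,
// descending into the edge whose sub-DAWG contains that position and
// appending the edge labels to buffer. Returns 1 on success, 0 on failure
// (position out of range or buffer too small).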
static int
_inverse_dawg_lookup(char* buffer, unsigned int buflen, unsigned int pos)
{
unsigned int node_offset = 0;
unsigned int bufpos = 0;
for (;;) {
bool final;
unsigned int edge_offset = _dawg_decode_node(node_offset, &final);
if (final) {
if (pos == 0) {
if (bufpos + 1 == buflen) {
return 0;
}
buffer[bufpos] = '\0';
return 1;
}
pos--;
}
unsigned int prev_target_node_offset = edge_offset;
bool is_first_edge = true;
for (;;) {
unsigned int size;
unsigned int label_offset, target_node_offset;
int last_edge = _dawg_decode_edge(
is_first_edge, prev_target_node_offset, edge_offset,
&size, &label_offset, &target_node_offset);
if (last_edge == -1) {
return 0;
}
is_first_edge = false;
prev_target_node_offset = target_node_offset;
unsigned int descendant_count = _dawg_node_descendant_count(target_node_offset);
if (pos < descendant_count) {
if (bufpos + size >= buflen) {
return 0; // buffer overflow
}
for (unsigned int i = 0; i < size; i++) {
buffer[bufpos++] = packed_name_dawg[label_offset++];
}
node_offset = target_node_offset;
break;
} else if (!last_edge) {
pos -= descendant_count;
edge_offset = label_offset + size;
} else {
return 0;
}
}
}
}
static int
_getucname(PyObject *self,
Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
@@ -1054,9 +1280,6 @@ _getucname(PyObject *self,
* If with_alias_and_seq is 1, check for names in the Private Use Area 15
* that we are using for aliases and named sequences. */
int offset;
int i;
int word;
const unsigned char* w;
if (code >= 0x110000)
return 0;
@@ -1107,45 +1330,15 @@ _getucname(PyObject *self,
return 1;
}
/* get offset into phrasebook */
offset = phrasebook_offset1[(code>>phrasebook_shift)];
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
(code&((1<<phrasebook_shift)-1))];
if (!offset)
/* get position of codepoint in order of names in the dawg */
offset = dawg_codepoint_to_pos_index1[(code>>DAWG_CODEPOINT_TO_POS_SHIFT)];
offset = dawg_codepoint_to_pos_index2[(offset<<DAWG_CODEPOINT_TO_POS_SHIFT) +
(code&((1<<DAWG_CODEPOINT_TO_POS_SHIFT)-1))];
if (offset == DAWG_CODEPOINT_TO_POS_NOTFOUND)
return 0;
i = 0;
for (;;) {
/* get word index */
word = phrasebook[offset] - phrasebook_short;
if (word >= 0) {
word = (word << 8) + phrasebook[offset+1];
offset += 2;
} else
word = phrasebook[offset++];
if (i) {
if (i > buflen)
return 0; /* buffer overflow */
buffer[i++] = ' ';
}
/* copy word string from lexicon. the last character in the
word has bit 7 set. the last word in a string ends with
0x80 */
w = lexicon + lexicon_offset[word];
while (*w < 128) {
if (i >= buflen)
return 0; /* buffer overflow */
buffer[i++] = *w++;
}
if (i >= buflen)
return 0; /* buffer overflow */
buffer[i++] = *w & 127;
if (*w == 128)
break; /* end of word */
}
return 1;
assert(buflen >= 0);
return _inverse_dawg_lookup(buffer, Py_SAFE_DOWNCAST(buflen, int, unsigned int), offset);
}
static int
@@ -1157,21 +1350,6 @@ capi_getucname(Py_UCS4 code,
}
static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
/* check if code corresponds to the given name */
int i;
char buffer[NAME_MAXLEN+1];
if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
return 0;
for (i = 0; i < namelen; i++) {
if (Py_TOUPPER(name[i]) != buffer[i])
return 0;
}
return buffer[namelen] == '\0';
}
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
@@ -1193,31 +1371,25 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
}
static int
_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
_check_alias_and_seq(Py_UCS4* code, int with_named_seq)
{
/* check if named sequences are allowed */
if (!with_named_seq && IS_NAMED_SEQ(cp))
if (!with_named_seq && IS_NAMED_SEQ(*code))
return 0;
/* if the code point is in the PUA range that we use for aliases,
* convert it to obtain the right code point */
if (IS_ALIAS(cp))
*code = name_aliases[cp-aliases_start];
else
*code = cp;
if (IS_ALIAS(*code))
*code = name_aliases[*code-aliases_start];
return 1;
}
static int
_getcode(PyObject* self,
const char* name, int namelen, Py_UCS4* code, int with_named_seq)
_getcode(const char* name, int namelen, Py_UCS4* code)
{
/* Return the code point associated with the given name.
* Named aliases are resolved too (unless self != NULL (i.e. we are using
* 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
* using for the named sequence, and the caller must then convert it. */
unsigned int h, v;
unsigned int mask = code_size-1;
unsigned int i, incr;
* Named aliases are not resolved, they are returned as a code point in the
* PUA */
/* Check for hangul syllables. */
if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
@@ -1240,6 +1412,7 @@ _getcode(PyObject* self,
/* Check for unified ideographs. */
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
/* Four or five hexdigits must follow. */
unsigned int v;
v = 0;
name += 22;
namelen -= 22;
@@ -1261,41 +1434,24 @@ _getcode(PyObject* self,
return 1;
}
/* the following is the same as python's dictionary lookup, with
only minor changes. see the makeunicodedata script for more
details */
h = (unsigned int) _gethash(name, namelen, code_magic);
i = (~h) & mask;
v = code_hash[i];
if (!v)
assert(namelen >= 0);
int position = _lookup_dawg_packed(name, Py_SAFE_DOWNCAST(namelen, int, unsigned int));
if (position < 0) {
return 0;
if (_cmpname(self, v, name, namelen)) {
return _check_alias_and_seq(v, code, with_named_seq);
}
incr = (h ^ (h >> 3)) & mask;
if (!incr)
incr = mask;
for (;;) {
i = (i + incr) & mask;
v = code_hash[i];
if (!v)
return 0;
if (_cmpname(self, v, name, namelen)) {
return _check_alias_and_seq(v, code, with_named_seq);
}
incr = incr << 1;
if (incr > mask)
incr = incr ^ code_poly;
}
*code = dawg_pos_to_codepoint[position];
return 1;
}
static int
capi_getcode(const char* name, int namelen, Py_UCS4* code,
int with_named_seq)
{
return _getcode(NULL, name, namelen, code, with_named_seq);
if (!_getcode(name, namelen, code)) {
return 0;
}
return _check_alias_and_seq(code, with_named_seq);
}
static void
@@ -1388,10 +1544,17 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
return NULL;
}
if (!_getcode(self, name, (int)name_length, &code, 1)) {
if (!_getcode(name, (int)name_length, &code)) {
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
return NULL;
}
if (UCD_Check(self)) {
/* in 3.2.0 there are no aliases and named sequences */
if (IS_ALIAS(code) || IS_NAMED_SEQ(code)) {
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
return NULL;
}
}
/* check if code is in the PUA range that we use for named sequences
and convert it */
if (IS_NAMED_SEQ(code)) {
@@ -1400,6 +1563,9 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
named_sequences[index].seq,
named_sequences[index].seqlen);
}
if (IS_ALIAS(code)) {
code = name_aliases[code-aliases_start];
}
return PyUnicode_FromOrdinal(code);
}

Modules/unicodename_db.h (generated, 47267 changed lines)

File diff suppressed because it is too large

Tools/unicode/dawg.py (new file, 533 lines)

@@ -0,0 +1,533 @@
# Original Algorithm:
# By Steve Hanov, 2011. Released to the public domain.
# Please see http://stevehanov.ca/blog/index.php?id=115 for the accompanying article.
#
# Adapted for PyPy/CPython by Carl Friedrich Bolz-Tereick
#
# Based on Daciuk, Jan, et al. "Incremental construction of minimal acyclic finite-state automata."
# Computational linguistics 26.1 (2000): 3-16.
#
# Updated 2014 to use DAWG as a mapping; see
# Kowaltowski, T.; CL. Lucchesi (1993), "Applications of finite automata representing large vocabularies",
# Software-Practice and Experience 1993
from collections import defaultdict
from functools import cached_property
# This class represents a node in the directed acyclic word graph (DAWG). It
# has a list of edges to other nodes. It has functions for testing whether it
# is equivalent to another node. Nodes are equivalent if they have identical
# edges, and each identical edge leads to identical states. The __hash__ and
# __eq__ functions allow it to be used as a key in a python dictionary.
class DawgNode:
def __init__(self, dawg):
self.id = dawg.next_id
dawg.next_id += 1
self.final = False
self.edges = {}
self.linear_edges = None # later: list of (string, next_state)
def __str__(self):
if self.final:
arr = ["1"]
else:
arr = ["0"]
for (label, node) in sorted(self.edges.items()):
arr.append(label)
arr.append(str(node.id))
return "_".join(arr)
__repr__ = __str__
def _as_tuple(self):
edges = sorted(self.edges.items())
edge_tuple = tuple((label, node.id) for label, node in edges)
return (self.final, edge_tuple)
def __hash__(self):
return hash(self._as_tuple())
def __eq__(self, other):
return self._as_tuple() == other._as_tuple()
@cached_property
def num_reachable_linear(self):
# returns the number of different paths to final nodes reachable from
# this one
count = 0
# staying at self counts as a path if self is final
if self.final:
count += 1
for label, node in self.linear_edges:
count += node.num_reachable_linear
return count
class Dawg:
def __init__(self):
self.previous_word = ""
self.next_id = 0
self.root = DawgNode(self)
# Here is a list of nodes that have not been checked for duplication.
self.unchecked_nodes = []
# To deduplicate, maintain a dictionary with
# minimized_nodes[canonical_node] is canonical_node.
# Based on __hash__ and __eq__, minimized_nodes[n] is the
# canonical node equal to n.
# In other words, self.minimized_nodes[x] == x for all nodes found in
# the dict.
self.minimized_nodes = {}
# word: value mapping
self.data = {}
# value: word mapping
self.inverse = {}
def insert(self, word, value):
if not all(0 <= ord(c) < 128 for c in word):
raise ValueError("Use 7-bit ASCII characters only")
if word <= self.previous_word:
raise ValueError("Error: Words must be inserted in alphabetical order.")
if value in self.inverse:
raise ValueError(f"value {value} is duplicate, got it for word {self.inverse[value]} and now {word}")
# find common prefix between word and previous word
common_prefix = 0
for i in range(min(len(word), len(self.previous_word))):
if word[i] != self.previous_word[i]:
break
common_prefix += 1
# Check the unchecked_nodes for redundant nodes, proceeding from last
# one down to the common prefix size. Then truncate the list at that
# point.
self._minimize(common_prefix)
self.data[word] = value
self.inverse[value] = word
# add the suffix, starting from the correct node mid-way through the
# graph
if len(self.unchecked_nodes) == 0:
node = self.root
else:
node = self.unchecked_nodes[-1][2]
for letter in word[common_prefix:]:
next_node = DawgNode(self)
node.edges[letter] = next_node
self.unchecked_nodes.append((node, letter, next_node))
node = next_node
node.final = True
self.previous_word = word
def finish(self):
if not self.data:
raise ValueError("need at least one word in the dawg")
# minimize all unchecked_nodes
self._minimize(0)
self._linearize_edges()
topoorder, linear_data, inverse = self._topological_order()
return self.compute_packed(topoorder), linear_data, inverse
def _minimize(self, down_to):
# proceed from the leaf up to a certain point
for i in range(len(self.unchecked_nodes) - 1, down_to - 1, -1):
(parent, letter, child) = self.unchecked_nodes[i]
if child in self.minimized_nodes:
# replace the child with the previously encountered one
parent.edges[letter] = self.minimized_nodes[child]
else:
# add the state to the minimized nodes.
self.minimized_nodes[child] = child
self.unchecked_nodes.pop()
def _lookup(self, word):
""" Return an integer 0 <= k < number of strings in dawg
where word is the kth successful traversal of the dawg. """
node = self.root
skipped = 0 # keep track of number of final nodes that we skipped
index = 0
while index < len(word):
for label, child in node.linear_edges:
if word[index] == label[0]:
if word[index:index + len(label)] == label:
if node.final:
skipped += 1
index += len(label)
node = child
break
else:
return None
skipped += child.num_reachable_linear
else:
return None
return skipped
def enum_all_nodes(self):
stack = [self.root]
done = set()
while stack:
node = stack.pop()
if node.id in done:
continue
yield node
done.add(node.id)
for label, child in sorted(node.edges.items()):
stack.append(child)
def prettyprint(self):
for node in sorted(self.enum_all_nodes(), key=lambda e: e.id):
s_final = " final" if node.final else ""
print(f"{node.id}: ({node}) {s_final}")
for label, child in sorted(node.edges.items()):
print(f" {label} goto {child.id}")
def _inverse_lookup(self, number):
assert 0, "not working in the current form, but keep it as the pure python version of compact lookup"
pos = number # the traversal below tracks the remaining rank in pos
result = []
node = self.root
while 1:
if node.final:
if pos == 0:
return "".join(result)
pos -= 1
for label, child in sorted(node.edges.items()):
nextpos = pos - child.num_reachable_linear
if nextpos < 0:
result.append(label)
node = child
break
else:
pos = nextpos
else:
assert 0
def _linearize_edges(self):
# compute "linear" edges. the idea is that long chains of edges without
# any of the intermediate states being final or any extra incoming or
# outgoing edges can be represented by removing them, and
# instead using longer strings as edge labels (instead of single
# characters)
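# e.g. a chain A -"b"-> B -"c"-> C, where B is not final and has no other
# incoming or outgoing edges, collapses into the single edge A -"bc"-> C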
incoming = defaultdict(list)
nodes = sorted(self.enum_all_nodes(), key=lambda e: e.id)
for node in nodes:
for label, child in sorted(node.edges.items()):
incoming[child].append(node)
for node in nodes:
node.linear_edges = []
for label, child in sorted(node.edges.items()):
s = [label]
while len(child.edges) == 1 and len(incoming[child]) == 1 and not child.final:
(c, child), = child.edges.items()
s.append(c)
node.linear_edges.append((''.join(s), child))
def _topological_order(self):
# compute reachable linear nodes, and the set of incoming edges for each node
order = []
stack = [self.root]
seen = set()
while stack:
# depth first traversal
node = stack.pop()
if node.id in seen:
continue
seen.add(node.id)
order.append(node)
for label, child in node.linear_edges:
stack.append(child)
# do a (slightly bad) topological sort
incoming = defaultdict(set)
for node in order:
for label, child in node.linear_edges:
incoming[child].add((label, node))
no_incoming = [order[0]]
topoorder = []
positions = {}
while no_incoming:
node = no_incoming.pop()
topoorder.append(node)
positions[node] = len(topoorder)
# use "reversed" to make sure that the linear_edges get reorderd
# from their alphabetical order as little as necessary (no_incoming
# is LIFO)
for label, child in reversed(node.linear_edges):
incoming[child].discard((label, node))
if not incoming[child]:
no_incoming.append(child)
del incoming[child]
# check result
assert set(topoorder) == set(order)
assert len(set(topoorder)) == len(topoorder)
for node in order:
node.linear_edges.sort(key=lambda element: positions[element[1]])
for node in order:
for label, child in node.linear_edges:
assert positions[child] > positions[node]
# number the nodes. afterwards every input string in the set has a
# unique number in the range 0 <= number < len(data). We then put the data
# in self.data into a linear list using these numbers as indexes.
topoorder[0].num_reachable_linear # forces the cached_property to compute for all nodes
linear_data = [None] * len(self.data)
inverse = {} # maps value back to index
for word, value in self.data.items():
index = self._lookup(word)
linear_data[index] = value
inverse[value] = index
return topoorder, linear_data, inverse
def compute_packed(self, order):
def compute_chunk(node, offsets):
""" compute the packed node/edge data for a node. result is a
list of bytes as long as order. the jump distance calculations use
the offsets dictionary to know where in the final big output
bytestring the individual nodes will end up. """
result = bytearray()
offset = offsets[node]
encode_varint_unsigned(number_add_bits(node.num_reachable_linear, node.final), result)
if len(node.linear_edges) == 0:
assert node.final
encode_varint_unsigned(0, result) # add a 0 saying "done"
prev_child_offset = offset + len(result)
for edgeindex, (label, targetnode) in enumerate(node.linear_edges):
label = label.encode('ascii')
child_offset = offsets[targetnode]
child_offset_difference = child_offset - prev_child_offset
info = number_add_bits(child_offset_difference, len(label) == 1, edgeindex == len(node.linear_edges) - 1)
if edgeindex == 0:
assert info != 0
encode_varint_unsigned(info, result)
prev_child_offset = child_offset
if len(label) > 1:
encode_varint_unsigned(len(label), result)
result.extend(label)
return result
def compute_new_offsets(chunks, offsets):
""" Given a list of chunks, compute the new offsets (by adding the
chunk lengths together). Also check if we cannot shrink the output
further because none of the node offsets are smaller now. if that's
the case return None. """
new_offsets = {}
curr_offset = 0
should_continue = False
for node, result in zip(order, chunks):
if curr_offset < offsets[node]:
# the new offset is below the current assumption, this
# means we can shrink the output more
should_continue = True
new_offsets[node] = curr_offset
curr_offset += len(result)
if not should_continue:
return None
return new_offsets
# assign initial offsets to every node
offsets = {}
for i, node in enumerate(order):
# we don't know the positions of the nodes yet, just use something big
# as the starting position. we'll have to do further iterations anyway,
# but this gives at least a lower bound on the size
offsets[node] = i * 2 ** 30
# due to the variable integer width encoding of edge targets we need to
# run this to fixpoint. in the process we shrink the output more and
# more until we can't any more. at any point we can stop and use the
# output, but we might need padding zero bytes when joining the chunks
# to have the correct jump distances
last_offsets = None
while 1:
chunks = [compute_chunk(node, offsets) for node in order]
last_offsets = offsets
offsets = compute_new_offsets(chunks, offsets)
if offsets is None: # couldn't shrink
break
# build the final packed string
total_result = bytearray()
for node, result in zip(order, chunks):
node_offset = last_offsets[node]
if node_offset > len(total_result):
# need to pad to get the offsets correct
padding = b"\x00" * (node_offset - len(total_result))
total_result.extend(padding)
assert node_offset == len(total_result)
total_result.extend(result)
return bytes(total_result)
# ______________________________________________________________________
# the following functions operate on the packed representation
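# number_add_bits packs flag bits into the low bits of a number and
# number_split_bits undoes it; e.g. number_add_bits(5, 0, 1) == 0b10101 == 21
# and number_split_bits(21, 2) == (5, 0, 1)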
def number_add_bits(x, *bits):
for bit in bits:
assert bit == 0 or bit == 1
x = (x << 1) | bit
return x
def encode_varint_unsigned(i, res):
# https://en.wikipedia.org/wiki/LEB128 unsigned variant
more = True
startlen = len(res)
if i < 0:
raise ValueError("only positive numbers supported", i)
while more:
lowest7bits = i & 0b1111111
i >>= 7
if i == 0:
more = False
else:
lowest7bits |= 0b10000000
res.append(lowest7bits)
return len(res) - startlen
def number_split_bits(x, n, acc=()):
if n == 1:
return x >> 1, x & 1
if n == 2:
return x >> 2, (x >> 1) & 1, x & 1
assert 0, "implement me!"
def decode_varint_unsigned(b, index=0):
res = 0
shift = 0
while True:
byte = b[index]
res = res | ((byte & 0b1111111) << shift)
index += 1
shift += 7
if not (byte & 0b10000000):
return res, index
def decode_node(packed, node):
x, node = decode_varint_unsigned(packed, node)
node_count, final = number_split_bits(x, 1)
return node_count, final, node
def decode_edge(packed, edgeindex, prev_child_offset, offset):
x, offset = decode_varint_unsigned(packed, offset)
if x == 0 and edgeindex == 0:
raise KeyError # trying to decode past a final node
child_offset_difference, len1, last_edge = number_split_bits(x, 2)
child_offset = prev_child_offset + child_offset_difference
if len1:
size = 1
else:
size, offset = decode_varint_unsigned(packed, offset)
return child_offset, last_edge, size, offset
def _match_edge(packed, s, size, label_offset, stringpos):
if size > 1 and stringpos + size > len(s):
# past the end of the string, can't match
return False
for i in range(size):
if packed[label_offset + i] != s[stringpos + i]:
# if a subsequent char of an edge doesn't match, the word isn't in
# the dawg
if i > 0:
raise KeyError
return False
return True
def lookup(packed, data, s):
return data[_lookup(packed, s)]
def _lookup(packed, s):
stringpos = 0
node_offset = 0
skipped = 0 # keep track of number of final nodes that we skipped
while stringpos < len(s):
#print(f"{node_offset=} {stringpos=}")
_, final, edge_offset = decode_node(packed, node_offset)
prev_child_offset = edge_offset
edgeindex = 0
while 1:
child_offset, last_edge, size, edgelabel_chars_offset = decode_edge(packed, edgeindex, prev_child_offset, edge_offset)
#print(f" {edge_offset=} {child_offset=} {last_edge=} {size=} {edgelabel_chars_offset=}")
edgeindex += 1
prev_child_offset = child_offset
if _match_edge(packed, s, size, edgelabel_chars_offset, stringpos):
# match
if final:
skipped += 1
stringpos += size
node_offset = child_offset
break
if last_edge:
raise KeyError
descendant_count, _, _ = decode_node(packed, child_offset)
skipped += descendant_count
edge_offset = edgelabel_chars_offset + size
_, final, _ = decode_node(packed, node_offset)
if final:
return skipped
raise KeyError
def inverse_lookup(packed, inverse, x):
pos = inverse[x]
return _inverse_lookup(packed, pos)
def _inverse_lookup(packed, pos):
result = bytearray()
node_offset = 0
while 1:
node_count, final, edge_offset = decode_node(packed, node_offset)
if final:
if pos == 0:
return bytes(result)
pos -= 1
prev_child_offset = edge_offset
edgeindex = 0
while 1:
child_offset, last_edge, size, edgelabel_chars_offset = decode_edge(packed, edgeindex, prev_child_offset, edge_offset)
edgeindex += 1
prev_child_offset = child_offset
descendant_count, _, _ = decode_node(packed, child_offset)
nextpos = pos - descendant_count
if nextpos < 0:
assert edgelabel_chars_offset >= 0
result.extend(packed[edgelabel_chars_offset: edgelabel_chars_offset + size])
node_offset = child_offset
break
elif not last_edge:
pos = nextpos
edge_offset = edgelabel_chars_offset + size
else:
raise KeyError
else:
raise KeyError
def build_compression_dawg(ucdata):
d = Dawg()
ucdata.sort()
for name, value in ucdata:
d.insert(name, value)
packed, pos_to_code, reversedict = d.finish()
print("size of dawg [KiB]", round(len(packed) / 1024, 2))
# check that lookup and inverse_lookup work correctly on the input data
for name, value in ucdata:
assert lookup(packed, pos_to_code, name.encode('ascii')) == value
assert inverse_lookup(packed, reversedict, value) == name.encode('ascii')
return packed, pos_to_code


@@ -623,120 +623,12 @@ def makeunicodetype(unicode, trace):
# unicode name database
def makeunicodename(unicode, trace):
from dawg import build_compression_dawg
FILE = "Modules/unicodename_db.h"
print("--- Preparing", FILE, "...")
# collect names
names = [None] * len(unicode.chars)
for char in unicode.chars:
record = unicode.table[char]
if record:
name = record.name.strip()
if name and name[0] != "<":
names[char] = name + chr(0)
print(len([n for n in names if n is not None]), "distinct names")
# collect unique words from names (note that we distinguish between
# words inside a sentence, and words ending a sentence. the
# latter includes the trailing null byte.)
words = {}
n = b = 0
for char in unicode.chars:
name = names[char]
if name:
w = name.split()
b = b + len(name)
n = n + len(w)
for w in w:
l = words.get(w)
if l:
l.append(None)
else:
words[w] = [len(words)]
print(n, "words in text;", b, "bytes")
wordlist = list(words.items())
# sort on falling frequency, then by name
def word_key(a):
aword, alist = a
return -len(alist), aword
wordlist.sort(key=word_key)
# figure out how many phrasebook escapes we need
escapes = 0
while escapes * 256 < len(wordlist):
escapes = escapes + 1
print(escapes, "escapes")
short = 256 - escapes
assert short > 0
print(short, "short indexes in lexicon")
# statistics
n = 0
for i in range(short):
n = n + len(wordlist[i][1])
print(n, "short indexes in phrasebook")
# pick the most commonly used words, and sort the rest on falling
# length (to maximize overlap)
wordlist, wordtail = wordlist[:short], wordlist[short:]
wordtail.sort(key=lambda a: a[0], reverse=True)
wordlist.extend(wordtail)
# generate lexicon from words
lexicon_offset = [0]
lexicon = ""
words = {}
# build a lexicon string
offset = 0
for w, x in wordlist:
# encoding: bit 7 indicates last character in word (chr(128)
# indicates the last character in an entire string)
ww = w[:-1] + chr(ord(w[-1])+128)
# reuse string tails, when possible
o = lexicon.find(ww)
if o < 0:
o = offset
lexicon = lexicon + ww
offset = offset + len(w)
words[w] = len(lexicon_offset)
lexicon_offset.append(o)
lexicon = list(map(ord, lexicon))
# generate phrasebook from names and lexicon
phrasebook = [0]
phrasebook_offset = [0] * len(unicode.chars)
for char in unicode.chars:
name = names[char]
if name:
w = name.split()
phrasebook_offset[char] = len(phrasebook)
for w in w:
i = words[w]
if i < short:
phrasebook.append(i)
else:
# store as two bytes
phrasebook.append((i>>8) + short)
phrasebook.append(i&255)
assert getsize(phrasebook) == 1
#
# unicode name hash table
# extract names
@@ -748,12 +640,6 @@ def makeunicodename(unicode, trace):
if name and name[0] != "<":
data.append((name, char))
# the magic number 47 was chosen to minimize the number of
# collisions on the current data set. if you like, change it
# and see what happens...
codehash = Hash("code", data, 47)
print("--- Writing", FILE, "...")
with open(FILE, "w") as fp:
@@ -762,24 +648,22 @@ def makeunicodename(unicode, trace):
fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
fprint()
fprint("#define NAME_MAXLEN", 256)
assert max(len(x) for x in data) < 256
fprint()
fprint("/* lexicon */")
Array("lexicon", lexicon).dump(fp, trace)
Array("lexicon_offset", lexicon_offset).dump(fp, trace)
# split decomposition index table
offset1, offset2, shift = splitbins(phrasebook_offset, trace)
fprint("/* code->name phrasebook */")
fprint("#define phrasebook_shift", shift)
fprint("#define phrasebook_short", short)
Array("phrasebook", phrasebook).dump(fp, trace)
Array("phrasebook_offset1", offset1).dump(fp, trace)
Array("phrasebook_offset2", offset2).dump(fp, trace)
fprint("/* name->code dictionary */")
codehash.dump(fp, trace)
packed_dawg, pos_to_codepoint = build_compression_dawg(data)
notfound = len(pos_to_codepoint)
inverse_list = [notfound] * len(unicode.chars)
for pos, codepoint in enumerate(pos_to_codepoint):
inverse_list[codepoint] = pos
Array("packed_name_dawg", list(packed_dawg)).dump(fp, trace)
Array("dawg_pos_to_codepoint", pos_to_codepoint).dump(fp, trace)
index1, index2, shift = splitbins(inverse_list, trace)
fprint("#define DAWG_CODEPOINT_TO_POS_SHIFT", shift)
fprint("#define DAWG_CODEPOINT_TO_POS_NOTFOUND", notfound)
Array("dawg_codepoint_to_pos_index1", index1).dump(fp, trace)
Array("dawg_codepoint_to_pos_index2", index2).dump(fp, trace)
fprint()
fprint('static const unsigned int aliases_start = %#x;' %
@@ -1188,94 +1072,6 @@ class UnicodeData:
self.chars = list(range(256))
# hash table tools
# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.
def myhash(s, magic):
h = 0
for c in map(ord, s.upper()):
h = (h * magic) + c
ix = h & 0xff000000
if ix:
h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
return h
SIZES = [
(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
(2097152,5), (4194304,3), (8388608,33), (16777216,27)
]
class Hash:
def __init__(self, name, data, magic):
# turn a (key, value) list into a static hash table structure
# determine table size
for size, poly in SIZES:
if size > len(data):
poly = size + poly
break
else:
raise AssertionError("ran out of polynomials")
print(size, "slots in hash table")
table = [None] * size
mask = size-1
n = 0
hash = myhash
# initialize hash table
for key, value in data:
h = hash(key, magic)
i = (~h) & mask
v = table[i]
if v is None:
table[i] = value
continue
incr = (h ^ (h >> 3)) & mask
if not incr:
incr = mask
while 1:
n = n + 1
i = (i + incr) & mask
v = table[i]
if v is None:
table[i] = value
break
incr = incr << 1
if incr > mask:
incr = incr ^ poly
print(n, "collisions")
self.collisions = n
for i in range(len(table)):
if table[i] is None:
table[i] = 0
self.data = Array(name + "_hash", table)
self.magic = magic
self.name = name
self.size = size
self.poly = poly
def dump(self, file, trace):
# write data to file, as a C array
self.data.dump(file, trace)
file.write("#define %s_magic %d\n" % (self.name, self.magic))
file.write("#define %s_size %d\n" % (self.name, self.size))
file.write("#define %s_poly %d\n" % (self.name, self.poly))
# stuff to deal with arrays of unsigned integers