mirror of https://github.com/python/cpython

gh-96954: use a directed acyclic word graph for storing the unicodedata codepoint names (#97906)

Co-authored-by: Łukasz Langa <lukasz@langa.pl>
Co-authored-by: Pieter Eendebak <pieter.eendebak@gmail.com>
Co-authored-by: Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com>

parent 0e9c364f4a
commit 9573d14215
@@ -0,0 +1,121 @@
import unittest
from test.test_tools import toolsdir, imports_under_tool
from test import support
from test.support.hypothesis_helper import hypothesis

st = hypothesis.strategies
given = hypothesis.given
example = hypothesis.example


with imports_under_tool("unicode"):
    from dawg import Dawg, build_compression_dawg, lookup, inverse_lookup


@st.composite
def char_name_db(draw, min_length=1, max_length=30):
    m = draw(st.integers(min_value=min_length, max_value=max_length))
    names = draw(
        st.sets(st.text("abcd", min_size=1, max_size=10), min_size=m, max_size=m)
    )
    characters = draw(st.sets(st.characters(), min_size=m, max_size=m))
    return list(zip(names, characters))


class TestDawg(unittest.TestCase):
    """Tests for the directed acyclic word graph data structure that is used
    to store the unicode character names in unicodedata. Tests ported from PyPy
    """

    def test_dawg_direct_simple(self):
        dawg = Dawg()
        dawg.insert("a", -4)
        dawg.insert("c", -2)
        dawg.insert("cat", -1)
        dawg.insert("catarr", 0)
        dawg.insert("catnip", 1)
        dawg.insert("zcatnip", 5)
        packed, data, inverse = dawg.finish()

        self.assertEqual(lookup(packed, data, b"a"), -4)
        self.assertEqual(lookup(packed, data, b"c"), -2)
        self.assertEqual(lookup(packed, data, b"cat"), -1)
        self.assertEqual(lookup(packed, data, b"catarr"), 0)
        self.assertEqual(lookup(packed, data, b"catnip"), 1)
        self.assertEqual(lookup(packed, data, b"zcatnip"), 5)
        self.assertRaises(KeyError, lookup, packed, data, b"b")
        self.assertRaises(KeyError, lookup, packed, data, b"catni")
        self.assertRaises(KeyError, lookup, packed, data, b"catnipp")

        self.assertEqual(inverse_lookup(packed, inverse, -4), b"a")
        self.assertEqual(inverse_lookup(packed, inverse, -2), b"c")
        self.assertEqual(inverse_lookup(packed, inverse, -1), b"cat")
        self.assertEqual(inverse_lookup(packed, inverse, 0), b"catarr")
        self.assertEqual(inverse_lookup(packed, inverse, 1), b"catnip")
        self.assertEqual(inverse_lookup(packed, inverse, 5), b"zcatnip")
        self.assertRaises(KeyError, inverse_lookup, packed, inverse, 12)

    def test_forbid_empty_dawg(self):
        dawg = Dawg()
        self.assertRaises(ValueError, dawg.finish)

    @given(char_name_db())
    @example([("abc", "a"), ("abd", "b")])
    @example(
        [
            ("bab", "1"),
            ("a", ":"),
            ("ad", "@"),
            ("b", "<"),
            ("aacc", "?"),
            ("dab", "D"),
            ("aa", "0"),
            ("ab", "F"),
            ("aaa", "7"),
            ("cbd", "="),
            ("abad", ";"),
            ("ac", "B"),
            ("abb", "4"),
            ("bb", "2"),
            ("aab", "9"),
            ("caaaaba", "E"),
            ("ca", ">"),
            ("bbaaa", "5"),
            ("d", "3"),
            ("baac", "8"),
            ("c", "6"),
            ("ba", "A"),
        ]
    )
    @example(
        [
            ("bcdac", "9"),
            ("acc", "g"),
            ("d", "d"),
            ("daabdda", "0"),
            ("aba", ";"),
            ("c", "6"),
            ("aa", "7"),
            ("abbd", "c"),
            ("badbd", "?"),
            ("bbd", "f"),
            ("cc", "@"),
            ("bb", "8"),
            ("daca", ">"),
            ("ba", ":"),
            ("baac", "3"),
            ("dbdddac", "a"),
            ("a", "2"),
            ("cabd", "b"),
            ("b", "="),
            ("abd", "4"),
            ("adcbd", "5"),
            ("abc", "e"),
            ("ab", "1"),
        ]
    )
    def test_dawg(self, data):
        # suppress debug prints
        with support.captured_stdout() as output:
            # it's enough to build it, building will also check the result
            build_compression_dawg(data)
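For readers skimming the diff, the round trip the new test exercises looks like this when driven by hand (a sketch, assuming the Tools/unicode directory is importable via imports_under_tool as above; the values are invented):

    from test.test_tools import imports_under_tool

    with imports_under_tool("unicode"):
        from dawg import Dawg, lookup, inverse_lookup

    dawg = Dawg()
    for word, value in [("a", -4), ("cat", -1), ("catnip", 1)]:
        dawg.insert(word, value)          # insertions must be in sorted order
    packed, data, inverse = dawg.finish()

    assert lookup(packed, data, b"cat") == -1
    assert inverse_lookup(packed, inverse, 1) == b"catnip"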
@@ -104,6 +104,26 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
             if looked_name := self.db.name(char, None):
                 self.assertEqual(self.db.lookup(looked_name), char)

+    def test_no_names_in_pua(self):
+        puas = [*range(0xe000, 0xf8ff),
+                *range(0xf0000, 0xfffff),
+                *range(0x100000, 0x10ffff)]
+        for i in puas:
+            char = chr(i)
+            self.assertRaises(ValueError, self.db.name, char)
+
+    def test_lookup_nonexistant(self):
+        # just make sure that lookup can fail
+        for nonexistant in [
+            "LATIN SMLL LETR A",
+            "OPEN HANDS SIGHS",
+            "DREGS",
+            "HANDBUG",
+            "MODIFIER LETTER CYRILLIC SMALL QUESTION MARK",
+            "???",
+        ]:
+            self.assertRaises(KeyError, self.db.lookup, nonexistant)
+
     def test_digit(self):
         self.assertEqual(self.db.digit('A', None), None)
         self.assertEqual(self.db.digit('9'), 9)
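The user-visible unicodedata API is unchanged by the new storage; the functions exercised above still behave as before, for example:

    import unicodedata

    assert unicodedata.lookup("CAT FACE") == "\U0001F431"
    assert unicodedata.name("\U0001F431") == "CAT FACE"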
@@ -1342,6 +1342,14 @@ check-abidump: all
 regen-limited-abi: all
 	$(RUNSHARED) ./$(BUILDPYTHON) $(srcdir)/Tools/build/stable_abi.py --generate-all $(srcdir)/Misc/stable_abi.toml

+############################################################################
+# Regenerate Unicode Data
+
+.PHONY: regen-unicodedata
+regen-unicodedata:
+	$(PYTHON_FOR_REGEN) Tools/unicode/makeunicodedata.py
+
+
 ############################################################################
 # Regenerate all generated files

@@ -1350,7 +1358,7 @@ regen-limited-abi: all
 regen-all: regen-cases regen-typeslots \
 		regen-token regen-ast regen-keyword regen-sre regen-frozen \
 		regen-pegen-metaparser regen-pegen regen-test-frozenmain \
-		regen-test-levenshtein regen-global-objects
+		regen-test-levenshtein regen-global-objects regen-unicodedata
 	@echo
 	@echo "Note: make regen-stdlib-module-names, make regen-limited-abi"
 	@echo "and make regen-configure should be run manually"
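In practice this means `make regen-unicodedata` re-runs Tools/unicode/makeunicodedata.py to refresh the generated Unicode database headers, and the target now also runs as part of `make regen-all`.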
@@ -0,0 +1,5 @@
Switch the storage of the unicode codepoint names to use a different
data structure, a `directed acyclic word graph
<https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton>`_.
This makes the unicodedata shared library about 440 KiB smaller. Contributed by
Carl Friedrich Bolz-Tereick using code from the PyPy project.
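To make the data-structure change concrete: the new tables rely on the fact that an acyclic word graph can give every stored name a dense index by counting, during the lookup walk, how many accepting states are skipped. A small self-contained sketch of that numbering trick, using a plain trie and invented data (the real packed DAWG is built by Tools/unicode/dawg.py):

    class Node:
        def __init__(self):
            self.edges = {}        # char -> Node
            self.final = False
            self.n_words = 0       # names reachable from (and including) this node

    def build(words):
        root = Node()
        for w in sorted(words):
            node = root
            for ch in w:
                node = node.edges.setdefault(ch, Node())
            node.final = True
        def count(node):
            node.n_words = int(node.final) + sum(count(c) for c in node.edges.values())
            return node.n_words
        count(root)
        return root

    def rank(root, word):
        """Dense index of `word` among all stored words, or None if absent."""
        node, skipped = root, 0
        for ch in word:
            if node.final:
                skipped += 1
            for label, child in sorted(node.edges.items()):
                if label == ch:
                    node = child
                    break
                skipped += child.n_words
            else:
                return None
        return skipped if node.final else None

    root = build(["CAR", "CAT", "CATNIP"])
    assert [rank(root, w) for w in ("CAR", "CAT", "CATNIP")] == [0, 1, 2]

The C decoder below performs the same walk over the packed byte representation, and the generated dawg_pos_to_codepoint table turns the resulting index back into a code point.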
@ -977,21 +977,6 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
|
||||||
/* -------------------------------------------------------------------- */
|
/* -------------------------------------------------------------------- */
|
||||||
/* database code (cut and pasted from the unidb package) */
|
/* database code (cut and pasted from the unidb package) */
|
||||||
|
|
||||||
static unsigned long
|
|
||||||
_gethash(const char *s, int len, int scale)
|
|
||||||
{
|
|
||||||
int i;
|
|
||||||
unsigned long h = 0;
|
|
||||||
unsigned long ix;
|
|
||||||
for (i = 0; i < len; i++) {
|
|
||||||
h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]);
|
|
||||||
ix = h & 0xff000000;
|
|
||||||
if (ix)
|
|
||||||
h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
|
|
||||||
}
|
|
||||||
return h;
|
|
||||||
}
|
|
||||||
|
|
||||||
static const char * const hangul_syllables[][3] = {
|
static const char * const hangul_syllables[][3] = {
|
||||||
{ "G", "A", "" },
|
{ "G", "A", "" },
|
||||||
{ "GG", "AE", "G" },
|
{ "GG", "AE", "G" },
|
||||||
|
@ -1046,6 +1031,247 @@ is_unified_ideograph(Py_UCS4 code)
|
||||||
#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
|
#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
|
||||||
(cp < named_sequences_end))
|
(cp < named_sequences_end))
|
||||||
|
|
||||||
|
|
||||||
|
// DAWG decoding functions
|
||||||
|
|
||||||
|
static unsigned int
|
||||||
|
_dawg_decode_varint_unsigned(unsigned int index, unsigned int* result)
|
||||||
|
{
|
||||||
|
unsigned int res = 0;
|
||||||
|
unsigned int shift = 0;
|
||||||
|
for (;;) {
|
||||||
|
unsigned char byte = packed_name_dawg[index];
|
||||||
|
res |= (byte & 0x7f) << shift;
|
||||||
|
index++;
|
||||||
|
shift += 7;
|
||||||
|
if (!(byte & 0x80)) {
|
||||||
|
*result = res;
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
_dawg_match_edge(const char* name, unsigned int namelen, unsigned int size,
|
||||||
|
unsigned int label_offset, unsigned int namepos)
|
||||||
|
{
|
||||||
|
// This returns 1 if the edge matched, 0 if it didn't (but further edges
|
||||||
|
// could match) and -1 if the name cannot match at all.
|
||||||
|
if (size > 1 && namepos + size > namelen) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
for (unsigned int i = 0; i < size; i++) {
|
||||||
|
if (packed_name_dawg[label_offset + i] != Py_TOUPPER(name[namepos + i])) {
|
||||||
|
if (i > 0) {
|
||||||
|
return -1; // cannot match at all
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// reading DAWG node information:
// a node is encoded by a varint. The lowest bit of that int is set if the node
// is a final, accepting state. The higher bits of that int give the number of
// names encoded by the sub-DAWG that starts at this node; this count is used
// to compute the position of a name.
//
// the starting node of the DAWG is at offset 0.
//
// the varint representing a node is followed by the node's edges; the edge
// encoding is described below.

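// Worked example (illustrative values, not taken from the real table): a node
// varint of 0x0B = 0b1011 decodes to final = 1 (lowest bit) and a descendant
// count of 0b101 = 5, i.e. five names are reachable through this node.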
static unsigned int
|
||||||
|
_dawg_decode_node(unsigned int node_offset, bool* final)
|
||||||
|
{
|
||||||
|
unsigned int num;
|
||||||
|
node_offset = _dawg_decode_varint_unsigned(node_offset, &num);
|
||||||
|
*final = num & 1;
|
||||||
|
return node_offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool
|
||||||
|
_dawg_node_is_final(unsigned int node_offset)
|
||||||
|
{
|
||||||
|
unsigned int num;
|
||||||
|
_dawg_decode_varint_unsigned(node_offset, &num);
|
||||||
|
return num & 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
static unsigned int
|
||||||
|
_dawg_node_descendant_count(unsigned int node_offset)
|
||||||
|
{
|
||||||
|
unsigned int num;
|
||||||
|
_dawg_decode_varint_unsigned(node_offset, &num);
|
||||||
|
return num >> 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// reading DAWG edge information:
// a DAWG edge is comprised of the following information:
// (1) the size of the label of the string attached to the edge
// (2) the characters of that edge
// (3) the target node
// (4) whether the edge is the last edge in the list of edges following a node
//
// this information is encoded in a compact form as follows:
//
// +---------+-----------------+--------------+--------------------
// | varint  | size (if != 1)  | label chars  | ... next edge ...
// +---------+-----------------+--------------+--------------------
//
// - first comes a varint
// - the lowest bit of that varint is set if this is the last edge of the
//   node (4)
// - the second lowest bit of that varint is set if the length of the label
//   is 1 (1)
// - the rest of the varint is an offset that can be used to compute the
//   offset of the target node of that edge (3)
// - if the label length is not 1, the first varint is followed by a byte
//   encoding the number of characters of the label (1)
//   (unicode character names aren't larger than 256 bytes, therefore each
//   edge label can be at most 256 chars, but is usually smaller)
// - the next size bytes are the characters of the label (2)
//
// the offset of the target node is computed as follows: the number in the
// upper bits of the varint needs to be added to the offset of the target node
// of the previous edge. For the first edge, where there is no previous target
// node, the offset of the first edge is used.
// The intuition here is that edges going out from a node often lead to nodes
// that are close by, leading to small offsets from the current node and thus
// fewer bytes.
//
// There is a special case: if a final node has no outgoing edges, it has to be
// followed by a 0 byte to indicate that there are no edges (because the end of
// the edge list is normally indicated by a bit in the edge encoding). This is
// indicated by _dawg_decode_edge returning -1.

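// Worked example (illustrative): an edge varint of 0b10110 (= 22) decodes to
// last_edge = 0, len_is_one = 1 and a delta of 0b101 = 5, so
// target_node_offset = prev_target_node_offset + 5 and the one-byte label
// immediately follows the varint.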
static int
|
||||||
|
_dawg_decode_edge(bool is_first_edge, unsigned int prev_target_node_offset,
|
||||||
|
unsigned int edge_offset, unsigned int* size,
|
||||||
|
unsigned int* label_offset, unsigned int* target_node_offset)
|
||||||
|
{
|
||||||
|
unsigned int num;
|
||||||
|
edge_offset = _dawg_decode_varint_unsigned(edge_offset, &num);
|
||||||
|
if (num == 0 && is_first_edge) {
|
||||||
|
return -1; // trying to decode past a final node without outgoing edges
|
||||||
|
}
|
||||||
|
bool last_edge = num & 1;
|
||||||
|
num >>= 1;
|
||||||
|
bool len_is_one = num & 1;
|
||||||
|
num >>= 1;
|
||||||
|
*target_node_offset = prev_target_node_offset + num;
|
||||||
|
if (len_is_one) {
|
||||||
|
*size = 1;
|
||||||
|
} else {
|
||||||
|
*size = packed_name_dawg[edge_offset++];
|
||||||
|
}
|
||||||
|
*label_offset = edge_offset;
|
||||||
|
return last_edge;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
_lookup_dawg_packed(const char* name, unsigned int namelen)
|
||||||
|
{
|
||||||
|
unsigned int stringpos = 0;
|
||||||
|
unsigned int node_offset = 0;
|
||||||
|
unsigned int result = 0; // this is the number of final nodes that we skipped to match name
|
||||||
|
while (stringpos < namelen) {
|
||||||
|
bool final;
|
||||||
|
unsigned int edge_offset = _dawg_decode_node(node_offset, &final);
|
||||||
|
unsigned int prev_target_node_offset = edge_offset;
|
||||||
|
bool is_first_edge = true;
|
||||||
|
for (;;) {
|
||||||
|
unsigned int size;
|
||||||
|
unsigned int label_offset, target_node_offset;
|
||||||
|
int last_edge = _dawg_decode_edge(
|
||||||
|
is_first_edge, prev_target_node_offset, edge_offset,
|
||||||
|
&size, &label_offset, &target_node_offset);
|
||||||
|
if (last_edge == -1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
is_first_edge = false;
|
||||||
|
prev_target_node_offset = target_node_offset;
|
||||||
|
int matched = _dawg_match_edge(name, namelen, size, label_offset, stringpos);
|
||||||
|
if (matched == -1) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (matched) {
|
||||||
|
if (final)
|
||||||
|
result += 1;
|
||||||
|
stringpos += size;
|
||||||
|
node_offset = target_node_offset;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (last_edge) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
result += _dawg_node_descendant_count(target_node_offset);
|
||||||
|
edge_offset = label_offset + size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (_dawg_node_is_final(node_offset)) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
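
// Example: looking up "CATNIP" in a DAWG that also stores "CAT" passes the
// accepting node that terminates "CAT" along the way, so that name's slot is
// added to `result`; the value returned is the unique position of "CATNIP"
// among all stored names, which later indexes dawg_pos_to_codepoint[].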
|
||||||
|
|
||||||
|
static int
|
||||||
|
_inverse_dawg_lookup(char* buffer, unsigned int buflen, unsigned int pos)
|
||||||
|
{
|
||||||
|
unsigned int node_offset = 0;
|
||||||
|
unsigned int bufpos = 0;
|
||||||
|
for (;;) {
|
||||||
|
bool final;
|
||||||
|
unsigned int edge_offset = _dawg_decode_node(node_offset, &final);
|
||||||
|
|
||||||
|
if (final) {
|
||||||
|
if (pos == 0) {
|
||||||
|
if (bufpos + 1 == buflen) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
buffer[bufpos] = '\0';
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
pos--;
|
||||||
|
}
|
||||||
|
unsigned int prev_target_node_offset = edge_offset;
|
||||||
|
bool is_first_edge = true;
|
||||||
|
for (;;) {
|
||||||
|
unsigned int size;
|
||||||
|
unsigned int label_offset, target_node_offset;
|
||||||
|
int last_edge = _dawg_decode_edge(
|
||||||
|
is_first_edge, prev_target_node_offset, edge_offset,
|
||||||
|
&size, &label_offset, &target_node_offset);
|
||||||
|
if (last_edge == -1) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
is_first_edge = false;
|
||||||
|
prev_target_node_offset = target_node_offset;
|
||||||
|
|
||||||
|
unsigned int descendant_count = _dawg_node_descendant_count(target_node_offset);
|
||||||
|
if (pos < descendant_count) {
|
||||||
|
if (bufpos + size >= buflen) {
|
||||||
|
return 0; // buffer overflow
|
||||||
|
}
|
||||||
|
for (unsigned int i = 0; i < size; i++) {
|
||||||
|
buffer[bufpos++] = packed_name_dawg[label_offset++];
|
||||||
|
}
|
||||||
|
node_offset = target_node_offset;
|
||||||
|
break;
|
||||||
|
} else if (!last_edge) {
|
||||||
|
pos -= descendant_count;
|
||||||
|
edge_offset = label_offset + size;
|
||||||
|
} else {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
_getucname(PyObject *self,
|
_getucname(PyObject *self,
|
||||||
Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
|
Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
|
||||||
|
@ -1054,9 +1280,6 @@ _getucname(PyObject *self,
|
||||||
* If with_alias_and_seq is 1, check for names in the Private Use Area 15
|
* If with_alias_and_seq is 1, check for names in the Private Use Area 15
|
||||||
* that we are using for aliases and named sequences. */
|
* that we are using for aliases and named sequences. */
|
||||||
int offset;
|
int offset;
|
||||||
int i;
|
|
||||||
int word;
|
|
||||||
const unsigned char* w;
|
|
||||||
|
|
||||||
if (code >= 0x110000)
|
if (code >= 0x110000)
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -1107,45 +1330,15 @@ _getucname(PyObject *self,
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* get offset into phrasebook */
|
/* get position of codepoint in order of names in the dawg */
|
||||||
offset = phrasebook_offset1[(code>>phrasebook_shift)];
|
offset = dawg_codepoint_to_pos_index1[(code>>DAWG_CODEPOINT_TO_POS_SHIFT)];
|
||||||
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
|
offset = dawg_codepoint_to_pos_index2[(offset<<DAWG_CODEPOINT_TO_POS_SHIFT) +
|
||||||
(code&((1<<phrasebook_shift)-1))];
|
(code&((1<<DAWG_CODEPOINT_TO_POS_SHIFT)-1))];
|
||||||
if (!offset)
|
if (offset == DAWG_CODEPOINT_TO_POS_NOTFOUND)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
i = 0;
|
assert(buflen >= 0);
|
||||||
|
return _inverse_dawg_lookup(buffer, Py_SAFE_DOWNCAST(buflen, int, unsigned int), offset);
|
||||||
for (;;) {
|
|
||||||
/* get word index */
|
|
||||||
word = phrasebook[offset] - phrasebook_short;
|
|
||||||
if (word >= 0) {
|
|
||||||
word = (word << 8) + phrasebook[offset+1];
|
|
||||||
offset += 2;
|
|
||||||
} else
|
|
||||||
word = phrasebook[offset++];
|
|
||||||
if (i) {
|
|
||||||
if (i > buflen)
|
|
||||||
return 0; /* buffer overflow */
|
|
||||||
buffer[i++] = ' ';
|
|
||||||
}
|
|
||||||
/* copy word string from lexicon. the last character in the
|
|
||||||
word has bit 7 set. the last word in a string ends with
|
|
||||||
0x80 */
|
|
||||||
w = lexicon + lexicon_offset[word];
|
|
||||||
while (*w < 128) {
|
|
||||||
if (i >= buflen)
|
|
||||||
return 0; /* buffer overflow */
|
|
||||||
buffer[i++] = *w++;
|
|
||||||
}
|
|
||||||
if (i >= buflen)
|
|
||||||
return 0; /* buffer overflow */
|
|
||||||
buffer[i++] = *w & 127;
|
|
||||||
if (*w == 128)
|
|
||||||
break; /* end of word */
|
|
||||||
}
|
|
||||||
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
|
@ -1157,21 +1350,6 @@ capi_getucname(Py_UCS4 code,
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
|
||||||
_cmpname(PyObject *self, int code, const char* name, int namelen)
|
|
||||||
{
|
|
||||||
/* check if code corresponds to the given name */
|
|
||||||
int i;
|
|
||||||
char buffer[NAME_MAXLEN+1];
|
|
||||||
if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
|
|
||||||
return 0;
|
|
||||||
for (i = 0; i < namelen; i++) {
|
|
||||||
if (Py_TOUPPER(name[i]) != buffer[i])
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
return buffer[namelen] == '\0';
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
find_syllable(const char *str, int *len, int *pos, int count, int column)
|
find_syllable(const char *str, int *len, int *pos, int count, int column)
|
||||||
{
|
{
|
||||||
|
@ -1193,31 +1371,25 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
|
_check_alias_and_seq(Py_UCS4* code, int with_named_seq)
|
||||||
{
|
{
|
||||||
/* check if named sequences are allowed */
|
/* check if named sequences are allowed */
|
||||||
if (!with_named_seq && IS_NAMED_SEQ(cp))
|
if (!with_named_seq && IS_NAMED_SEQ(*code))
|
||||||
return 0;
|
return 0;
|
||||||
/* if the code point is in the PUA range that we use for aliases,
|
/* if the code point is in the PUA range that we use for aliases,
|
||||||
* convert it to obtain the right code point */
|
* convert it to obtain the right code point */
|
||||||
if (IS_ALIAS(cp))
|
if (IS_ALIAS(*code))
|
||||||
*code = name_aliases[cp-aliases_start];
|
*code = name_aliases[*code-aliases_start];
|
||||||
else
|
|
||||||
*code = cp;
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
_getcode(PyObject* self,
|
_getcode(const char* name, int namelen, Py_UCS4* code)
|
||||||
const char* name, int namelen, Py_UCS4* code, int with_named_seq)
|
|
||||||
{
|
{
|
||||||
/* Return the code point associated with the given name.
|
/* Return the code point associated with the given name.
|
||||||
* Named aliases are resolved too (unless self != NULL (i.e. we are using
|
* Named aliases are not resolved, they are returned as a code point in the
|
||||||
* 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
|
* PUA */
|
||||||
* using for the named sequence, and the caller must then convert it. */
|
|
||||||
unsigned int h, v;
|
|
||||||
unsigned int mask = code_size-1;
|
|
||||||
unsigned int i, incr;
|
|
||||||
|
|
||||||
/* Check for hangul syllables. */
|
/* Check for hangul syllables. */
|
||||||
if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
|
if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
|
||||||
|
@ -1240,6 +1412,7 @@ _getcode(PyObject* self,
|
||||||
/* Check for unified ideographs. */
|
/* Check for unified ideographs. */
|
||||||
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
|
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
|
||||||
/* Four or five hexdigits must follow. */
|
/* Four or five hexdigits must follow. */
|
||||||
|
unsigned int v;
|
||||||
v = 0;
|
v = 0;
|
||||||
name += 22;
|
name += 22;
|
||||||
namelen -= 22;
|
namelen -= 22;
|
||||||
|
@ -1261,41 +1434,24 @@ _getcode(PyObject* self,
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* the following is the same as python's dictionary lookup, with
|
assert(namelen >= 0);
|
||||||
only minor changes. see the makeunicodedata script for more
|
int position = _lookup_dawg_packed(name, Py_SAFE_DOWNCAST(namelen, int, unsigned int));
|
||||||
details */
|
if (position < 0) {
|
||||||
|
|
||||||
h = (unsigned int) _gethash(name, namelen, code_magic);
|
|
||||||
i = (~h) & mask;
|
|
||||||
v = code_hash[i];
|
|
||||||
if (!v)
|
|
||||||
return 0;
|
return 0;
|
||||||
if (_cmpname(self, v, name, namelen)) {
|
|
||||||
return _check_alias_and_seq(v, code, with_named_seq);
|
|
||||||
}
|
|
||||||
incr = (h ^ (h >> 3)) & mask;
|
|
||||||
if (!incr)
|
|
||||||
incr = mask;
|
|
||||||
for (;;) {
|
|
||||||
i = (i + incr) & mask;
|
|
||||||
v = code_hash[i];
|
|
||||||
if (!v)
|
|
||||||
return 0;
|
|
||||||
if (_cmpname(self, v, name, namelen)) {
|
|
||||||
return _check_alias_and_seq(v, code, with_named_seq);
|
|
||||||
}
|
|
||||||
incr = incr << 1;
|
|
||||||
if (incr > mask)
|
|
||||||
incr = incr ^ code_poly;
|
|
||||||
}
|
}
|
||||||
|
*code = dawg_pos_to_codepoint[position];
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
capi_getcode(const char* name, int namelen, Py_UCS4* code,
|
capi_getcode(const char* name, int namelen, Py_UCS4* code,
|
||||||
int with_named_seq)
|
int with_named_seq)
|
||||||
{
|
{
|
||||||
return _getcode(NULL, name, namelen, code, with_named_seq);
|
if (!_getcode(name, namelen, code)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return _check_alias_and_seq(code, with_named_seq);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@ -1388,10 +1544,17 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!_getcode(self, name, (int)name_length, &code, 1)) {
|
if (!_getcode(name, (int)name_length, &code)) {
|
||||||
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
|
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
if (UCD_Check(self)) {
|
||||||
|
/* in 3.2.0 there are no aliases and named sequences */
|
||||||
|
if (IS_ALIAS(code) || IS_NAMED_SEQ(code)) {
|
||||||
|
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
/* check if code is in the PUA range that we use for named sequences
|
/* check if code is in the PUA range that we use for named sequences
|
||||||
and convert it */
|
and convert it */
|
||||||
if (IS_NAMED_SEQ(code)) {
|
if (IS_NAMED_SEQ(code)) {
|
||||||
|
@ -1400,6 +1563,9 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
|
||||||
named_sequences[index].seq,
|
named_sequences[index].seq,
|
||||||
named_sequences[index].seqlen);
|
named_sequences[index].seqlen);
|
||||||
}
|
}
|
||||||
|
if (IS_ALIAS(code)) {
|
||||||
|
code = name_aliases[code-aliases_start];
|
||||||
|
}
|
||||||
return PyUnicode_FromOrdinal(code);
|
return PyUnicode_FromOrdinal(code);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
@@ -0,0 +1,533 @@
# Original Algorithm:
# By Steve Hanov, 2011. Released to the public domain.
# Please see http://stevehanov.ca/blog/index.php?id=115 for the accompanying article.
#
# Adapted for PyPy/CPython by Carl Friedrich Bolz-Tereick
#
# Based on Daciuk, Jan, et al. "Incremental construction of minimal acyclic finite-state automata."
# Computational Linguistics 26.1 (2000): 3-16.
#
# Updated 2014 to use DAWG as a mapping; see
# Kowaltowski, T.; CL. Lucchesi (1993), "Applications of finite automata representing large vocabularies",
# Software - Practice and Experience 1993

from collections import defaultdict
from functools import cached_property


# This class represents a node in the directed acyclic word graph (DAWG). It
# has a list of edges to other nodes. It has functions for testing whether it
# is equivalent to another node. Nodes are equivalent if they have identical
# edges, and each identical edge leads to identical states. The __hash__ and
# __eq__ functions allow it to be used as a key in a Python dictionary.


class DawgNode:
|
||||||
|
|
||||||
|
def __init__(self, dawg):
|
||||||
|
self.id = dawg.next_id
|
||||||
|
dawg.next_id += 1
|
||||||
|
self.final = False
|
||||||
|
self.edges = {}
|
||||||
|
|
||||||
|
self.linear_edges = None # later: list of (string, next_state)
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
if self.final:
|
||||||
|
arr = ["1"]
|
||||||
|
else:
|
||||||
|
arr = ["0"]
|
||||||
|
|
||||||
|
for (label, node) in sorted(self.edges.items()):
|
||||||
|
arr.append(label)
|
||||||
|
arr.append(str(node.id))
|
||||||
|
|
||||||
|
return "_".join(arr)
|
||||||
|
__repr__ = __str__
|
||||||
|
|
||||||
|
def _as_tuple(self):
|
||||||
|
edges = sorted(self.edges.items())
|
||||||
|
edge_tuple = tuple((label, node.id) for label, node in edges)
|
||||||
|
return (self.final, edge_tuple)
|
||||||
|
|
||||||
|
def __hash__(self):
|
||||||
|
return hash(self._as_tuple())
|
||||||
|
|
||||||
|
def __eq__(self, other):
|
||||||
|
return self._as_tuple() == other._as_tuple()
|
||||||
|
|
||||||
|
@cached_property
|
||||||
|
def num_reachable_linear(self):
|
||||||
|
# returns the number of different paths to final nodes reachable from
|
||||||
|
# this one
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
# staying at self counts as a path if self is final
|
||||||
|
if self.final:
|
||||||
|
count += 1
|
||||||
|
for label, node in self.linear_edges:
|
||||||
|
count += node.num_reachable_linear
|
||||||
|
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
class Dawg:
|
||||||
|
def __init__(self):
|
||||||
|
self.previous_word = ""
|
||||||
|
self.next_id = 0
|
||||||
|
self.root = DawgNode(self)
|
||||||
|
|
||||||
|
# Here is a list of nodes that have not been checked for duplication.
|
||||||
|
self.unchecked_nodes = []
|
||||||
|
|
||||||
|
# To deduplicate, maintain a dictionary with
|
||||||
|
# minimized_nodes[canonical_node] is canonical_node.
|
||||||
|
# Based on __hash__ and __eq__, minimized_nodes[n] is the
|
||||||
|
# canonical node equal to n.
|
||||||
|
# In other words, self.minimized_nodes[x] == x for all nodes found in
|
||||||
|
# the dict.
|
||||||
|
self.minimized_nodes = {}
|
||||||
|
|
||||||
|
# word: value mapping
|
||||||
|
self.data = {}
|
||||||
|
# value: word mapping
|
||||||
|
self.inverse = {}
|
||||||
|
|
||||||
|
def insert(self, word, value):
|
||||||
|
if not all(0 <= ord(c) < 128 for c in word):
|
||||||
|
raise ValueError("Use 7-bit ASCII characters only")
|
||||||
|
if word <= self.previous_word:
|
||||||
|
raise ValueError("Error: Words must be inserted in alphabetical order.")
|
||||||
|
if value in self.inverse:
|
||||||
|
raise ValueError(f"value {value} is duplicate, got it for word {self.inverse[value]} and now {word}")
|
||||||
|
|
||||||
|
# find common prefix between word and previous word
|
||||||
|
common_prefix = 0
|
||||||
|
for i in range(min(len(word), len(self.previous_word))):
|
||||||
|
if word[i] != self.previous_word[i]:
|
||||||
|
break
|
||||||
|
common_prefix += 1
|
||||||
|
|
||||||
|
# Check the unchecked_nodes for redundant nodes, proceeding from last
|
||||||
|
# one down to the common prefix size. Then truncate the list at that
|
||||||
|
# point.
|
||||||
|
self._minimize(common_prefix)
|
||||||
|
|
||||||
|
self.data[word] = value
|
||||||
|
self.inverse[value] = word
|
||||||
|
|
||||||
|
# add the suffix, starting from the correct node mid-way through the
|
||||||
|
# graph
|
||||||
|
if len(self.unchecked_nodes) == 0:
|
||||||
|
node = self.root
|
||||||
|
else:
|
||||||
|
node = self.unchecked_nodes[-1][2]
|
||||||
|
|
||||||
|
for letter in word[common_prefix:]:
|
||||||
|
next_node = DawgNode(self)
|
||||||
|
node.edges[letter] = next_node
|
||||||
|
self.unchecked_nodes.append((node, letter, next_node))
|
||||||
|
node = next_node
|
||||||
|
|
||||||
|
node.final = True
|
||||||
|
self.previous_word = word
|
||||||
|
|
||||||
|
def finish(self):
|
||||||
|
if not self.data:
|
||||||
|
raise ValueError("need at least one word in the dawg")
|
||||||
|
# minimize all unchecked_nodes
|
||||||
|
self._minimize(0)
|
||||||
|
|
||||||
|
self._linearize_edges()
|
||||||
|
|
||||||
|
topoorder, linear_data, inverse = self._topological_order()
|
||||||
|
return self.compute_packed(topoorder), linear_data, inverse
|
||||||
|
|
||||||
|
def _minimize(self, down_to):
|
||||||
|
# proceed from the leaf up to a certain point
|
||||||
|
for i in range(len(self.unchecked_nodes) - 1, down_to - 1, -1):
|
||||||
|
(parent, letter, child) = self.unchecked_nodes[i]
|
||||||
|
if child in self.minimized_nodes:
|
||||||
|
# replace the child with the previously encountered one
|
||||||
|
parent.edges[letter] = self.minimized_nodes[child]
|
||||||
|
else:
|
||||||
|
# add the state to the minimized nodes.
|
||||||
|
self.minimized_nodes[child] = child
|
||||||
|
self.unchecked_nodes.pop()
|
||||||
|
|
||||||
|
def _lookup(self, word):
|
||||||
|
""" Return an integer 0 <= k < number of strings in dawg
|
||||||
|
where word is the kth successful traversal of the dawg. """
|
||||||
|
node = self.root
|
||||||
|
skipped = 0 # keep track of number of final nodes that we skipped
|
||||||
|
index = 0
|
||||||
|
while index < len(word):
|
||||||
|
for label, child in node.linear_edges:
|
||||||
|
if word[index] == label[0]:
|
||||||
|
if word[index:index + len(label)] == label:
|
||||||
|
if node.final:
|
||||||
|
skipped += 1
|
||||||
|
index += len(label)
|
||||||
|
node = child
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
skipped += child.num_reachable_linear
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
return skipped
|
||||||
|
|
||||||
|
def enum_all_nodes(self):
|
||||||
|
stack = [self.root]
|
||||||
|
done = set()
|
||||||
|
while stack:
|
||||||
|
node = stack.pop()
|
||||||
|
if node.id in done:
|
||||||
|
continue
|
||||||
|
yield node
|
||||||
|
done.add(node.id)
|
||||||
|
for label, child in sorted(node.edges.items()):
|
||||||
|
stack.append(child)
|
||||||
|
|
||||||
|
def prettyprint(self):
|
||||||
|
for node in sorted(self.enum_all_nodes(), key=lambda e: e.id):
|
||||||
|
s_final = " final" if node.final else ""
|
||||||
|
print(f"{node.id}: ({node}) {s_final}")
|
||||||
|
for label, child in sorted(node.edges.items()):
|
||||||
|
print(f" {label} goto {child.id}")
|
||||||
|
|
||||||
|
def _inverse_lookup(self, number):
|
||||||
|
assert 0, "not working in the current form, but keep it as the pure python version of compact lookup"
|
||||||
|
result = []
|
||||||
|
node = self.root
|
||||||
|
while 1:
|
||||||
|
if node.final:
|
||||||
|
if pos == 0:
|
||||||
|
return "".join(result)
|
||||||
|
pos -= 1
|
||||||
|
for label, child in sorted(node.edges.items()):
|
||||||
|
nextpos = pos - child.num_reachable_linear
|
||||||
|
if nextpos < 0:
|
||||||
|
result.append(label)
|
||||||
|
node = child
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
pos = nextpos
|
||||||
|
else:
|
||||||
|
assert 0
|
||||||
|
|
||||||
|
def _linearize_edges(self):
|
||||||
|
# compute "linear" edges. the idea is that long chains of edges without
|
||||||
|
# any of the intermediate states being final or any extra incoming or
|
||||||
|
# outgoing edges can be represented by having removing them, and
|
||||||
|
# instead using longer strings as edge labels (instead of single
|
||||||
|
# characters)
|
||||||
|
incoming = defaultdict(list)
|
||||||
|
nodes = sorted(self.enum_all_nodes(), key=lambda e: e.id)
|
||||||
|
for node in nodes:
|
||||||
|
for label, child in sorted(node.edges.items()):
|
||||||
|
incoming[child].append(node)
|
||||||
|
for node in nodes:
|
||||||
|
node.linear_edges = []
|
||||||
|
for label, child in sorted(node.edges.items()):
|
||||||
|
s = [label]
|
||||||
|
while len(child.edges) == 1 and len(incoming[child]) == 1 and not child.final:
|
||||||
|
(c, child), = child.edges.items()
|
||||||
|
s.append(c)
|
||||||
|
node.linear_edges.append((''.join(s), child))
|
||||||
|
|
||||||
|
def _topological_order(self):
|
||||||
|
# compute reachable linear nodes, and the set of incoming edges for each node
|
||||||
|
order = []
|
||||||
|
stack = [self.root]
|
||||||
|
seen = set()
|
||||||
|
while stack:
|
||||||
|
# depth first traversal
|
||||||
|
node = stack.pop()
|
||||||
|
if node.id in seen:
|
||||||
|
continue
|
||||||
|
seen.add(node.id)
|
||||||
|
order.append(node)
|
||||||
|
for label, child in node.linear_edges:
|
||||||
|
stack.append(child)
|
||||||
|
|
||||||
|
# do a (slightly bad) topological sort
|
||||||
|
incoming = defaultdict(set)
|
||||||
|
for node in order:
|
||||||
|
for label, child in node.linear_edges:
|
||||||
|
incoming[child].add((label, node))
|
||||||
|
no_incoming = [order[0]]
|
||||||
|
topoorder = []
|
||||||
|
positions = {}
|
||||||
|
while no_incoming:
|
||||||
|
node = no_incoming.pop()
|
||||||
|
topoorder.append(node)
|
||||||
|
positions[node] = len(topoorder)
|
||||||
|
# use "reversed" to make sure that the linear_edges get reorderd
|
||||||
|
# from their alphabetical order as little as necessary (no_incoming
|
||||||
|
# is LIFO)
|
||||||
|
for label, child in reversed(node.linear_edges):
|
||||||
|
incoming[child].discard((label, node))
|
||||||
|
if not incoming[child]:
|
||||||
|
no_incoming.append(child)
|
||||||
|
del incoming[child]
|
||||||
|
# check result
|
||||||
|
assert set(topoorder) == set(order)
|
||||||
|
assert len(set(topoorder)) == len(topoorder)
|
||||||
|
|
||||||
|
for node in order:
|
||||||
|
node.linear_edges.sort(key=lambda element: positions[element[1]])
|
||||||
|
|
||||||
|
for node in order:
|
||||||
|
for label, child in node.linear_edges:
|
||||||
|
assert positions[child] > positions[node]
|
||||||
|
# number the nodes. afterwards every input string in the set has a
|
||||||
|
# unique number in the 0 <= number < len(data). We then put the data in
|
||||||
|
# self.data into a linear list using these numbers as indexes.
|
||||||
|
topoorder[0].num_reachable_linear
|
||||||
|
linear_data = [None] * len(self.data)
|
||||||
|
inverse = {} # maps value back to index
|
||||||
|
for word, value in self.data.items():
|
||||||
|
index = self._lookup(word)
|
||||||
|
linear_data[index] = value
|
||||||
|
inverse[value] = index
|
||||||
|
|
||||||
|
return topoorder, linear_data, inverse
|
||||||
|
|
||||||
|
def compute_packed(self, order):
|
||||||
|
def compute_chunk(node, offsets):
|
||||||
|
""" compute the packed node/edge data for a node. result is a
|
||||||
|
list of bytes as long as order. the jump distance calculations use
|
||||||
|
the offsets dictionary to know where in the final big output
|
||||||
|
bytestring the individual nodes will end up. """
|
||||||
|
result = bytearray()
|
||||||
|
offset = offsets[node]
|
||||||
|
encode_varint_unsigned(number_add_bits(node.num_reachable_linear, node.final), result)
|
||||||
|
if len(node.linear_edges) == 0:
|
||||||
|
assert node.final
|
||||||
|
encode_varint_unsigned(0, result) # add a 0 saying "done"
|
||||||
|
prev_child_offset = offset + len(result)
|
||||||
|
for edgeindex, (label, targetnode) in enumerate(node.linear_edges):
|
||||||
|
label = label.encode('ascii')
|
||||||
|
child_offset = offsets[targetnode]
|
||||||
|
child_offset_difference = child_offset - prev_child_offset
|
||||||
|
|
||||||
|
info = number_add_bits(child_offset_difference, len(label) == 1, edgeindex == len(node.linear_edges) - 1)
|
||||||
|
if edgeindex == 0:
|
||||||
|
assert info != 0
|
||||||
|
encode_varint_unsigned(info, result)
|
||||||
|
prev_child_offset = child_offset
|
||||||
|
if len(label) > 1:
|
||||||
|
encode_varint_unsigned(len(label), result)
|
||||||
|
result.extend(label)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def compute_new_offsets(chunks, offsets):
|
||||||
|
""" Given a list of chunks, compute the new offsets (by adding the
|
||||||
|
chunk lengths together). Also check if we cannot shrink the output
|
||||||
|
further because none of the node offsets are smaller now. if that's
|
||||||
|
the case return None. """
|
||||||
|
new_offsets = {}
|
||||||
|
curr_offset = 0
|
||||||
|
should_continue = False
|
||||||
|
for node, result in zip(order, chunks):
|
||||||
|
if curr_offset < offsets[node]:
|
||||||
|
# the new offset is below the current assumption, this
|
||||||
|
# means we can shrink the output more
|
||||||
|
should_continue = True
|
||||||
|
new_offsets[node] = curr_offset
|
||||||
|
curr_offset += len(result)
|
||||||
|
if not should_continue:
|
||||||
|
return None
|
||||||
|
return new_offsets
|
||||||
|
|
||||||
|
# assign initial offsets to every node
|
||||||
|
offsets = {}
|
||||||
|
for i, node in enumerate(order):
|
||||||
|
# we don't know position of the edge yet, just use something big as
|
||||||
|
# the starting position. we'll have to do further iterations anyway,
|
||||||
|
# but the size is at least a lower limit then
|
||||||
|
offsets[node] = i * 2 ** 30
|
||||||
|
|
||||||
|
|
||||||
|
# due to the variable integer width encoding of edge targets we need to
|
||||||
|
# run this to fixpoint. in the process we shrink the output more and
|
||||||
|
# more until we can't any more. at any point we can stop and use the
|
||||||
|
# output, but we might need padding zero bytes when joining the chunks
|
||||||
|
# to have the correct jump distances
|
||||||
|
last_offsets = None
|
||||||
|
while 1:
|
||||||
|
chunks = [compute_chunk(node, offsets) for node in order]
|
||||||
|
last_offsets = offsets
|
||||||
|
offsets = compute_new_offsets(chunks, offsets)
|
||||||
|
if offsets is None: # couldn't shrink
|
||||||
|
break
|
||||||
|
|
||||||
|
# build the final packed string
|
||||||
|
total_result = bytearray()
|
||||||
|
for node, result in zip(order, chunks):
|
||||||
|
node_offset = last_offsets[node]
|
||||||
|
if node_offset > len(total_result):
|
||||||
|
# need to pad to get the offsets correct
|
||||||
|
padding = b"\x00" * (node_offset - len(total_result))
|
||||||
|
total_result.extend(padding)
|
||||||
|
assert node_offset == len(total_result)
|
||||||
|
total_result.extend(result)
|
||||||
|
return bytes(total_result)
|
||||||
|
|
||||||
|
|
||||||
|
# ______________________________________________________________________
|
||||||
|
# the following functions operate on the packed representation
|
||||||
|
|
||||||
|
def number_add_bits(x, *bits):
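    # e.g. number_add_bits(5, 1, 0) == 0b10110 == 22: the value 5 sits in the
    # high bits with the flag bits appended below it (undone by
    # number_split_bits further down).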
|
||||||
|
for bit in bits:
|
||||||
|
assert bit == 0 or bit == 1
|
||||||
|
x = (x << 1) | bit
|
||||||
|
return x
|
||||||
|
|
||||||
|
def encode_varint_unsigned(i, res):
|
||||||
|
# https://en.wikipedia.org/wiki/LEB128 unsigned variant
|
||||||
|
more = True
|
||||||
|
startlen = len(res)
|
||||||
|
if i < 0:
|
||||||
|
raise ValueError("only positive numbers supported", i)
|
||||||
|
while more:
|
||||||
|
lowest7bits = i & 0b1111111
|
||||||
|
i >>= 7
|
||||||
|
if i == 0:
|
||||||
|
more = False
|
||||||
|
else:
|
||||||
|
lowest7bits |= 0b10000000
|
||||||
|
res.append(lowest7bits)
|
||||||
|
return len(res) - startlen
|
||||||
|
|
||||||
|
def number_split_bits(x, n, acc=()):
|
||||||
|
if n == 1:
|
||||||
|
return x >> 1, x & 1
|
||||||
|
if n == 2:
|
||||||
|
return x >> 2, (x >> 1) & 1, x & 1
|
||||||
|
assert 0, "implement me!"
|
||||||
|
|
||||||
|
def decode_varint_unsigned(b, index=0):
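    # Inverse of encode_varint_unsigned above. Round-trip illustration (not
    # part of the patch): encode_varint_unsigned(300, buf) appends the two
    # bytes 0b10101100, 0b00000010, and decode_varint_unsigned(buf, 0)
    # returns (300, 2).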
|
||||||
|
res = 0
|
||||||
|
shift = 0
|
||||||
|
while True:
|
||||||
|
byte = b[index]
|
||||||
|
res = res | ((byte & 0b1111111) << shift)
|
||||||
|
index += 1
|
||||||
|
shift += 7
|
||||||
|
if not (byte & 0b10000000):
|
||||||
|
return res, index
|
||||||
|
|
||||||
|
def decode_node(packed, node):
|
||||||
|
x, node = decode_varint_unsigned(packed, node)
|
||||||
|
node_count, final = number_split_bits(x, 1)
|
||||||
|
return node_count, final, node
|
||||||
|
|
||||||
|
def decode_edge(packed, edgeindex, prev_child_offset, offset):
|
||||||
|
x, offset = decode_varint_unsigned(packed, offset)
|
||||||
|
if x == 0 and edgeindex == 0:
|
||||||
|
raise KeyError # trying to decode past a final node
|
||||||
|
child_offset_difference, len1, last_edge = number_split_bits(x, 2)
|
||||||
|
child_offset = prev_child_offset + child_offset_difference
|
||||||
|
if len1:
|
||||||
|
size = 1
|
||||||
|
else:
|
||||||
|
size, offset = decode_varint_unsigned(packed, offset)
|
||||||
|
return child_offset, last_edge, size, offset
|
||||||
|
|
||||||
|
def _match_edge(packed, s, size, node_offset, stringpos):
|
||||||
|
if size > 1 and stringpos + size > len(s):
|
||||||
|
# past the end of the string, can't match
|
||||||
|
return False
|
||||||
|
for i in range(size):
|
||||||
|
if packed[node_offset + i] != s[stringpos + i]:
|
||||||
|
# if a subsequent char of an edge doesn't match, the word isn't in
|
||||||
|
# the dawg
|
||||||
|
if i > 0:
|
||||||
|
raise KeyError
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
def lookup(packed, data, s):
|
||||||
|
return data[_lookup(packed, s)]
|
||||||
|
|
||||||
|
def _lookup(packed, s):
|
||||||
|
stringpos = 0
|
||||||
|
node_offset = 0
|
||||||
|
skipped = 0 # keep track of number of final nodes that we skipped
|
||||||
|
false = False
|
||||||
|
while stringpos < len(s):
|
||||||
|
#print(f"{node_offset=} {stringpos=}")
|
||||||
|
_, final, edge_offset = decode_node(packed, node_offset)
|
||||||
|
prev_child_offset = edge_offset
|
||||||
|
edgeindex = 0
|
||||||
|
while 1:
|
||||||
|
child_offset, last_edge, size, edgelabel_chars_offset = decode_edge(packed, edgeindex, prev_child_offset, edge_offset)
|
||||||
|
#print(f" {edge_offset=} {child_offset=} {last_edge=} {size=} {edgelabel_chars_offset=}")
|
||||||
|
edgeindex += 1
|
||||||
|
prev_child_offset = child_offset
|
||||||
|
if _match_edge(packed, s, size, edgelabel_chars_offset, stringpos):
|
||||||
|
# match
|
||||||
|
if final:
|
||||||
|
skipped += 1
|
||||||
|
stringpos += size
|
||||||
|
node_offset = child_offset
|
||||||
|
break
|
||||||
|
if last_edge:
|
||||||
|
raise KeyError
|
||||||
|
descendant_count, _, _ = decode_node(packed, child_offset)
|
||||||
|
skipped += descendant_count
|
||||||
|
edge_offset = edgelabel_chars_offset + size
|
||||||
|
_, final, _ = decode_node(packed, node_offset)
|
||||||
|
if final:
|
||||||
|
return skipped
|
||||||
|
raise KeyError
|
||||||
|
|
||||||
|
def inverse_lookup(packed, inverse, x):
|
||||||
|
pos = inverse[x]
|
||||||
|
return _inverse_lookup(packed, pos)
|
||||||
|
|
||||||
|
def _inverse_lookup(packed, pos):
|
||||||
|
result = bytearray()
|
||||||
|
node_offset = 0
|
||||||
|
while 1:
|
||||||
|
node_count, final, edge_offset = decode_node(packed, node_offset)
|
||||||
|
if final:
|
||||||
|
if pos == 0:
|
||||||
|
return bytes(result)
|
||||||
|
pos -= 1
|
||||||
|
prev_child_offset = edge_offset
|
||||||
|
edgeindex = 0
|
||||||
|
while 1:
|
||||||
|
child_offset, last_edge, size, edgelabel_chars_offset = decode_edge(packed, edgeindex, prev_child_offset, edge_offset)
|
||||||
|
edgeindex += 1
|
||||||
|
prev_child_offset = child_offset
|
||||||
|
descendant_count, _, _ = decode_node(packed, child_offset)
|
||||||
|
nextpos = pos - descendant_count
|
||||||
|
if nextpos < 0:
|
||||||
|
assert edgelabel_chars_offset >= 0
|
||||||
|
result.extend(packed[edgelabel_chars_offset: edgelabel_chars_offset + size])
|
||||||
|
node_offset = child_offset
|
||||||
|
break
|
||||||
|
elif not last_edge:
|
||||||
|
pos = nextpos
|
||||||
|
edge_offset = edgelabel_chars_offset + size
|
||||||
|
else:
|
||||||
|
raise KeyError
|
||||||
|
else:
|
||||||
|
raise KeyError
|
||||||
|
|
||||||
|
|
||||||
|
def build_compression_dawg(ucdata):
|
||||||
|
d = Dawg()
|
||||||
|
ucdata.sort()
|
||||||
|
for name, value in ucdata:
|
||||||
|
d.insert(name, value)
|
||||||
|
packed, pos_to_code, reversedict = d.finish()
|
||||||
|
print("size of dawg [KiB]", round(len(packed) / 1024, 2))
|
||||||
|
# check that lookup and inverse_lookup work correctly on the input data
|
||||||
|
for name, value in ucdata:
|
||||||
|
assert lookup(packed, pos_to_code, name.encode('ascii')) == value
|
||||||
|
assert inverse_lookup(packed, reversedict, value) == name.encode('ascii')
|
||||||
|
return packed, pos_to_code
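
# Usage illustration (not part of the patch; the data is invented). The real
# caller is Tools/unicode/makeunicodedata.py, which passes every
# (name, codepoint) pair from the Unicode Character Database:
#
#     data = [("CAT FACE", 0x1F431), ("DOG FACE", 0x1F436)]
#     packed, pos_to_code = build_compression_dawg(data)
#     assert lookup(packed, pos_to_code, b"DOG FACE") == 0x1F436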
|
|
@ -623,120 +623,12 @@ def makeunicodetype(unicode, trace):
|
||||||
# unicode name database
|
# unicode name database
|
||||||
|
|
||||||
def makeunicodename(unicode, trace):
|
def makeunicodename(unicode, trace):
|
||||||
|
from dawg import build_compression_dawg
|
||||||
|
|
||||||
FILE = "Modules/unicodename_db.h"
|
FILE = "Modules/unicodename_db.h"
|
||||||
|
|
||||||
print("--- Preparing", FILE, "...")
|
print("--- Preparing", FILE, "...")
|
||||||
|
|
||||||
# collect names
|
|
||||||
names = [None] * len(unicode.chars)
|
|
||||||
|
|
||||||
for char in unicode.chars:
|
|
||||||
record = unicode.table[char]
|
|
||||||
if record:
|
|
||||||
name = record.name.strip()
|
|
||||||
if name and name[0] != "<":
|
|
||||||
names[char] = name + chr(0)
|
|
||||||
|
|
||||||
print(len([n for n in names if n is not None]), "distinct names")
|
|
||||||
|
|
||||||
# collect unique words from names (note that we differ between
|
|
||||||
# words inside a sentence, and words ending a sentence. the
|
|
||||||
# latter includes the trailing null byte.
|
|
||||||
|
|
||||||
words = {}
|
|
||||||
n = b = 0
|
|
||||||
for char in unicode.chars:
|
|
||||||
name = names[char]
|
|
||||||
if name:
|
|
||||||
w = name.split()
|
|
||||||
b = b + len(name)
|
|
||||||
n = n + len(w)
|
|
||||||
for w in w:
|
|
||||||
l = words.get(w)
|
|
||||||
if l:
|
|
||||||
l.append(None)
|
|
||||||
else:
|
|
||||||
words[w] = [len(words)]
|
|
||||||
|
|
||||||
print(n, "words in text;", b, "bytes")
|
|
||||||
|
|
||||||
wordlist = list(words.items())
|
|
||||||
|
|
||||||
# sort on falling frequency, then by name
|
|
||||||
def word_key(a):
|
|
||||||
aword, alist = a
|
|
||||||
return -len(alist), aword
|
|
||||||
wordlist.sort(key=word_key)
|
|
||||||
|
|
||||||
# figure out how many phrasebook escapes we need
|
|
||||||
escapes = 0
|
|
||||||
while escapes * 256 < len(wordlist):
|
|
||||||
escapes = escapes + 1
|
|
||||||
print(escapes, "escapes")
|
|
||||||
|
|
||||||
short = 256 - escapes
|
|
||||||
|
|
||||||
assert short > 0
|
|
||||||
|
|
||||||
print(short, "short indexes in lexicon")
|
|
||||||
|
|
||||||
# statistics
|
|
||||||
n = 0
|
|
||||||
for i in range(short):
|
|
||||||
n = n + len(wordlist[i][1])
|
|
||||||
print(n, "short indexes in phrasebook")
|
|
||||||
|
|
||||||
# pick the most commonly used words, and sort the rest on falling
|
|
||||||
# length (to maximize overlap)
|
|
||||||
|
|
||||||
wordlist, wordtail = wordlist[:short], wordlist[short:]
|
|
||||||
wordtail.sort(key=lambda a: a[0], reverse=True)
|
|
||||||
wordlist.extend(wordtail)
|
|
||||||
|
|
||||||
# generate lexicon from words
|
|
||||||
|
|
||||||
lexicon_offset = [0]
|
|
||||||
lexicon = ""
|
|
||||||
words = {}
|
|
||||||
|
|
||||||
# build a lexicon string
|
|
||||||
offset = 0
|
|
||||||
for w, x in wordlist:
|
|
||||||
# encoding: bit 7 indicates last character in word (chr(128)
|
|
||||||
# indicates the last character in an entire string)
|
|
||||||
ww = w[:-1] + chr(ord(w[-1])+128)
|
|
||||||
# reuse string tails, when possible
|
|
||||||
o = lexicon.find(ww)
|
|
||||||
if o < 0:
|
|
||||||
o = offset
|
|
||||||
lexicon = lexicon + ww
|
|
||||||
offset = offset + len(w)
|
|
||||||
words[w] = len(lexicon_offset)
|
|
||||||
lexicon_offset.append(o)
|
|
||||||
|
|
||||||
lexicon = list(map(ord, lexicon))
|
|
||||||
|
|
||||||
# generate phrasebook from names and lexicon
|
|
||||||
phrasebook = [0]
|
|
||||||
phrasebook_offset = [0] * len(unicode.chars)
|
|
||||||
for char in unicode.chars:
|
|
||||||
name = names[char]
|
|
||||||
if name:
|
|
||||||
w = name.split()
|
|
||||||
phrasebook_offset[char] = len(phrasebook)
|
|
||||||
for w in w:
|
|
||||||
i = words[w]
|
|
||||||
if i < short:
|
|
||||||
phrasebook.append(i)
|
|
||||||
else:
|
|
||||||
# store as two bytes
|
|
||||||
phrasebook.append((i>>8) + short)
|
|
||||||
phrasebook.append(i&255)
|
|
||||||
|
|
||||||
assert getsize(phrasebook) == 1
|
|
||||||
|
|
||||||
#
|
|
||||||
# unicode name hash table
|
# unicode name hash table
|
||||||
|
|
||||||
# extract names
|
# extract names
|
||||||
|
@ -748,12 +640,6 @@ def makeunicodename(unicode, trace):
|
||||||
if name and name[0] != "<":
|
if name and name[0] != "<":
|
||||||
data.append((name, char))
|
data.append((name, char))
|
||||||
|
|
||||||
# the magic number 47 was chosen to minimize the number of
|
|
||||||
# collisions on the current data set. if you like, change it
|
|
||||||
# and see what happens...
|
|
||||||
|
|
||||||
codehash = Hash("code", data, 47)
|
|
||||||
|
|
||||||
print("--- Writing", FILE, "...")
|
print("--- Writing", FILE, "...")
|
||||||
|
|
||||||
with open(FILE, "w") as fp:
|
with open(FILE, "w") as fp:
|
||||||
|
@ -762,24 +648,22 @@ def makeunicodename(unicode, trace):
|
||||||
fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
|
fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
|
||||||
fprint()
|
fprint()
|
||||||
fprint("#define NAME_MAXLEN", 256)
|
fprint("#define NAME_MAXLEN", 256)
|
||||||
|
assert max(len(x) for x in data) < 256
|
||||||
fprint()
|
fprint()
|
||||||
fprint("/* lexicon */")
|
|
||||||
Array("lexicon", lexicon).dump(fp, trace)
|
|
||||||
Array("lexicon_offset", lexicon_offset).dump(fp, trace)
|
|
||||||
|
|
||||||
# split decomposition index table
|
|
||||||
offset1, offset2, shift = splitbins(phrasebook_offset, trace)
|
|
||||||
|
|
||||||
fprint("/* code->name phrasebook */")
|
|
||||||
fprint("#define phrasebook_shift", shift)
|
|
||||||
fprint("#define phrasebook_short", short)
|
|
||||||
|
|
||||||
Array("phrasebook", phrasebook).dump(fp, trace)
|
|
||||||
Array("phrasebook_offset1", offset1).dump(fp, trace)
|
|
||||||
Array("phrasebook_offset2", offset2).dump(fp, trace)
|
|
||||||
|
|
||||||
fprint("/* name->code dictionary */")
|
fprint("/* name->code dictionary */")
|
||||||
codehash.dump(fp, trace)
|
packed_dawg, pos_to_codepoint = build_compression_dawg(data)
|
||||||
|
notfound = len(pos_to_codepoint)
|
||||||
|
inverse_list = [notfound] * len(unicode.chars)
|
||||||
|
for pos, codepoint in enumerate(pos_to_codepoint):
|
||||||
|
inverse_list[codepoint] = pos
|
||||||
|
Array("packed_name_dawg", list(packed_dawg)).dump(fp, trace)
|
||||||
|
Array("dawg_pos_to_codepoint", pos_to_codepoint).dump(fp, trace)
|
||||||
|
index1, index2, shift = splitbins(inverse_list, trace)
|
||||||
|
fprint("#define DAWG_CODEPOINT_TO_POS_SHIFT", shift)
|
||||||
|
fprint("#define DAWG_CODEPOINT_TO_POS_NOTFOUND", notfound)
|
||||||
|
Array("dawg_codepoint_to_pos_index1", index1).dump(fp, trace)
|
||||||
|
Array("dawg_codepoint_to_pos_index2", index2).dump(fp, trace)
|
||||||
|
|
||||||
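
# Illustration (not part of the patch): at runtime the two generated index
# arrays are consulted with the same arithmetic as in Modules/unicodedata.c,
# roughly:
#
#     pos = index2[(index1[cp >> shift] << shift) + (cp & ((1 << shift) - 1))]
#     # pos == notfound means the code point has no name stored in the DAWG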
fprint()
|
fprint()
|
||||||
fprint('static const unsigned int aliases_start = %#x;' %
|
fprint('static const unsigned int aliases_start = %#x;' %
|
||||||
|
@ -1188,94 +1072,6 @@ class UnicodeData:
|
||||||
self.chars = list(range(256))
|
self.chars = list(range(256))
|
||||||
|
|
||||||
|
|
||||||
# hash table tools
|
|
||||||
|
|
||||||
# this is a straight-forward reimplementation of Python's built-in
|
|
||||||
# dictionary type, using a static data structure, and a custom string
|
|
||||||
# hash algorithm.
|
|
||||||
|
|
||||||
def myhash(s, magic):
|
|
||||||
h = 0
|
|
||||||
for c in map(ord, s.upper()):
|
|
||||||
h = (h * magic) + c
|
|
||||||
ix = h & 0xff000000
|
|
||||||
if ix:
|
|
||||||
h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
|
|
||||||
return h
|
|
||||||
|
|
||||||
|
|
||||||
SIZES = [
|
|
||||||
(4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
|
|
||||||
(1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
|
|
||||||
(65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
|
|
||||||
(2097152,5), (4194304,3), (8388608,33), (16777216,27)
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class Hash:
|
|
||||||
def __init__(self, name, data, magic):
|
|
||||||
# turn a (key, value) list into a static hash table structure
|
|
||||||
|
|
||||||
# determine table size
|
|
||||||
for size, poly in SIZES:
|
|
||||||
if size > len(data):
|
|
||||||
poly = size + poly
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
raise AssertionError("ran out of polynomials")
|
|
||||||
|
|
||||||
print(size, "slots in hash table")
|
|
||||||
|
|
||||||
table = [None] * size
|
|
||||||
|
|
||||||
mask = size-1
|
|
||||||
|
|
||||||
n = 0
|
|
||||||
|
|
||||||
hash = myhash
|
|
||||||
|
|
||||||
# initialize hash table
|
|
||||||
for key, value in data:
|
|
||||||
h = hash(key, magic)
|
|
||||||
i = (~h) & mask
|
|
||||||
v = table[i]
|
|
||||||
if v is None:
|
|
||||||
table[i] = value
|
|
||||||
continue
|
|
||||||
incr = (h ^ (h >> 3)) & mask
|
|
||||||
if not incr:
|
|
||||||
incr = mask
|
|
||||||
while 1:
|
|
||||||
n = n + 1
|
|
||||||
i = (i + incr) & mask
|
|
||||||
v = table[i]
|
|
||||||
if v is None:
|
|
||||||
table[i] = value
|
|
||||||
break
|
|
||||||
incr = incr << 1
|
|
||||||
if incr > mask:
|
|
||||||
incr = incr ^ poly
|
|
||||||
|
|
||||||
print(n, "collisions")
|
|
||||||
self.collisions = n
|
|
||||||
|
|
||||||
for i in range(len(table)):
|
|
||||||
if table[i] is None:
|
|
||||||
table[i] = 0
|
|
||||||
|
|
||||||
self.data = Array(name + "_hash", table)
|
|
||||||
self.magic = magic
|
|
||||||
self.name = name
|
|
||||||
self.size = size
|
|
||||||
self.poly = poly
|
|
||||||
|
|
||||||
def dump(self, file, trace):
|
|
||||||
# write data to file, as a C array
|
|
||||||
self.data.dump(file, trace)
|
|
||||||
file.write("#define %s_magic %d\n" % (self.name, self.magic))
|
|
||||||
file.write("#define %s_size %d\n" % (self.name, self.size))
|
|
||||||
file.write("#define %s_poly %d\n" % (self.name, self.poly))
|
|
||||||
|
|
||||||
|
|
||||||
# stuff to deal with arrays of unsigned integers
|
# stuff to deal with arrays of unsigned integers
|
||||||
|
|
||||||
|
|