mirror of https://github.com/python/cpython
gh-96954: use a directed acyclic word graph for storing the unicodedata codepoint names (#97906)
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
Co-authored-by: Pieter Eendebak <pieter.eendebak@gmail.com>
Co-authored-by: Dennis Sweeney <36520290+sweeneyde@users.noreply.github.com>
This commit is contained in:
parent 0e9c364f4a
commit 9573d14215
@@ -0,0 +1,121 @@
import unittest
from test.test_tools import toolsdir, imports_under_tool
from test import support
from test.support.hypothesis_helper import hypothesis

st = hypothesis.strategies
given = hypothesis.given
example = hypothesis.example


with imports_under_tool("unicode"):
    from dawg import Dawg, build_compression_dawg, lookup, inverse_lookup


@st.composite
def char_name_db(draw, min_length=1, max_length=30):
    m = draw(st.integers(min_value=min_length, max_value=max_length))
    names = draw(
        st.sets(st.text("abcd", min_size=1, max_size=10), min_size=m, max_size=m)
    )
    characters = draw(st.sets(st.characters(), min_size=m, max_size=m))
    return list(zip(names, characters))


class TestDawg(unittest.TestCase):
    """Tests for the directed acyclic word graph data structure that is used
    to store the unicode character names in unicodedata. Tests ported from PyPy
    """

    def test_dawg_direct_simple(self):
        dawg = Dawg()
        dawg.insert("a", -4)
        dawg.insert("c", -2)
        dawg.insert("cat", -1)
        dawg.insert("catarr", 0)
        dawg.insert("catnip", 1)
        dawg.insert("zcatnip", 5)
        packed, data, inverse = dawg.finish()

        self.assertEqual(lookup(packed, data, b"a"), -4)
        self.assertEqual(lookup(packed, data, b"c"), -2)
        self.assertEqual(lookup(packed, data, b"cat"), -1)
        self.assertEqual(lookup(packed, data, b"catarr"), 0)
        self.assertEqual(lookup(packed, data, b"catnip"), 1)
        self.assertEqual(lookup(packed, data, b"zcatnip"), 5)
        self.assertRaises(KeyError, lookup, packed, data, b"b")
        self.assertRaises(KeyError, lookup, packed, data, b"catni")
        self.assertRaises(KeyError, lookup, packed, data, b"catnipp")

        self.assertEqual(inverse_lookup(packed, inverse, -4), b"a")
        self.assertEqual(inverse_lookup(packed, inverse, -2), b"c")
        self.assertEqual(inverse_lookup(packed, inverse, -1), b"cat")
        self.assertEqual(inverse_lookup(packed, inverse, 0), b"catarr")
        self.assertEqual(inverse_lookup(packed, inverse, 1), b"catnip")
        self.assertEqual(inverse_lookup(packed, inverse, 5), b"zcatnip")
        self.assertRaises(KeyError, inverse_lookup, packed, inverse, 12)

    def test_forbid_empty_dawg(self):
        dawg = Dawg()
        self.assertRaises(ValueError, dawg.finish)

    @given(char_name_db())
    @example([("abc", "a"), ("abd", "b")])
    @example(
        [
            ("bab", "1"),
            ("a", ":"),
            ("ad", "@"),
            ("b", "<"),
            ("aacc", "?"),
            ("dab", "D"),
            ("aa", "0"),
            ("ab", "F"),
            ("aaa", "7"),
            ("cbd", "="),
            ("abad", ";"),
            ("ac", "B"),
            ("abb", "4"),
            ("bb", "2"),
            ("aab", "9"),
            ("caaaaba", "E"),
            ("ca", ">"),
            ("bbaaa", "5"),
            ("d", "3"),
            ("baac", "8"),
            ("c", "6"),
            ("ba", "A"),
        ]
    )
    @example(
        [
            ("bcdac", "9"),
            ("acc", "g"),
            ("d", "d"),
            ("daabdda", "0"),
            ("aba", ";"),
            ("c", "6"),
            ("aa", "7"),
            ("abbd", "c"),
            ("badbd", "?"),
            ("bbd", "f"),
            ("cc", "@"),
            ("bb", "8"),
            ("daca", ">"),
            ("ba", ":"),
            ("baac", "3"),
            ("dbdddac", "a"),
            ("a", "2"),
            ("cabd", "b"),
            ("b", "="),
            ("abd", "4"),
            ("adcbd", "5"),
            ("abc", "e"),
            ("ab", "1"),
        ]
    )
    def test_dawg(self, data):
        # suppress debug prints
        with support.captured_stdout() as output:
            # it's enough to build it, building will also check the result
            build_compression_dawg(data)
@@ -104,6 +104,26 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
            if looked_name := self.db.name(char, None):
                self.assertEqual(self.db.lookup(looked_name), char)

    def test_no_names_in_pua(self):
        puas = [*range(0xe000, 0xf8ff),
                *range(0xf0000, 0xfffff),
                *range(0x100000, 0x10ffff)]
        for i in puas:
            char = chr(i)
            self.assertRaises(ValueError, self.db.name, char)

    def test_lookup_nonexistant(self):
        # just make sure that lookup can fail
        for nonexistant in [
            "LATIN SMLL LETR A",
            "OPEN HANDS SIGHS",
            "DREGS",
            "HANDBUG",
            "MODIFIER LETTER CYRILLIC SMALL QUESTION MARK",
            "???",
        ]:
            self.assertRaises(KeyError, self.db.lookup, nonexistant)

    def test_digit(self):
        self.assertEqual(self.db.digit('A', None), None)
        self.assertEqual(self.db.digit('9'), 9)
@@ -1342,6 +1342,14 @@ check-abidump: all
regen-limited-abi: all
        $(RUNSHARED) ./$(BUILDPYTHON) $(srcdir)/Tools/build/stable_abi.py --generate-all $(srcdir)/Misc/stable_abi.toml

############################################################################
# Regenerate Unicode Data

.PHONY: regen-unicodedata
regen-unicodedata:
        $(PYTHON_FOR_REGEN) Tools/unicode/makeunicodedata.py


############################################################################
# Regenerate all generated files

@@ -1350,7 +1358,7 @@ regen-limited-abi: all
regen-all: regen-cases regen-typeslots \
        regen-token regen-ast regen-keyword regen-sre regen-frozen \
        regen-pegen-metaparser regen-pegen regen-test-frozenmain \
        regen-test-levenshtein regen-global-objects
        regen-test-levenshtein regen-global-objects regen-unicodedata
        @echo
        @echo "Note: make regen-stdlib-module-names, make regen-limited-abi"
        @echo "and make regen-configure should be run manually"
@@ -0,0 +1,5 @@
Switch the storage of the unicode codepoint names to use a different
data-structure, a `directed acyclic word graph
<https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton>`_.
This makes the unicodedata shared library about 440 KiB smaller. Contributed by
Carl Friedrich Bolz-Tereick using code from the PyPy project.
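The change is internal to the unicodedata extension module; the documented name()/lookup() API and its results are unchanged. A quick sanity check against the standard library (not part of the patch, shown only as an illustration):

    import unicodedata

    # name() and lookup() remain inverses of each other, whatever the storage format
    assert unicodedata.name("a") == "LATIN SMALL LETTER A"
    assert unicodedata.lookup("LATIN SMALL LETTER A") == "a"
    # formal name aliases from NameAliases.txt still resolve through lookup()
    assert unicodedata.lookup("LATIN CAPITAL LETTER GHA") == "\u01a2"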
@@ -977,21 +977,6 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */

static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;
    for (i = 0; i < len; i++) {
        h = (h * scale) + (unsigned char) Py_TOUPPER(s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}

static const char * const hangul_syllables[][3] = {
    { "G", "A", "" },
    { "GG", "AE", "G" },

@@ -1046,6 +1031,247 @@ is_unified_ideograph(Py_UCS4 code)
#define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
                          (cp < named_sequences_end))


// DAWG decoding functions

static unsigned int
_dawg_decode_varint_unsigned(unsigned int index, unsigned int* result)
{
    unsigned int res = 0;
    unsigned int shift = 0;
    for (;;) {
        unsigned char byte = packed_name_dawg[index];
        res |= (byte & 0x7f) << shift;
        index++;
        shift += 7;
        if (!(byte & 0x80)) {
            *result = res;
            return index;
        }
    }
}

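The varint format used throughout the packed DAWG is the unsigned LEB128 scheme: seven payload bits per byte, with the high bit set on every byte except the last. A small standalone Python sketch of the same encoding (mirroring Tools/unicode/dawg.py, shown only to illustrate the byte layout):

    def encode_varint(n):
        out = bytearray()
        while True:
            byte = n & 0x7f
            n >>= 7
            if n:
                out.append(byte | 0x80)   # more bytes follow
            else:
                out.append(byte)          # last byte: high bit clear
                return bytes(out)

    assert encode_varint(5) == b"\x05"        # small values fit in one byte
    assert encode_varint(300) == b"\xac\x02"  # 300 = 0b10_0101100 -> 0xAC, 0x02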
static int
_dawg_match_edge(const char* name, unsigned int namelen, unsigned int size,
                 unsigned int label_offset, unsigned int namepos)
{
    // This returns 1 if the edge matched, 0 if it didn't (but further edges
    // could match) and -1 if the name cannot match at all.
    if (size > 1 && namepos + size > namelen) {
        return 0;
    }
    for (unsigned int i = 0; i < size; i++) {
        if (packed_name_dawg[label_offset + i] != Py_TOUPPER(name[namepos + i])) {
            if (i > 0) {
                return -1; // cannot match at all
            }
            return 0;
        }
    }
    return 1;
}

// reading DAWG node information:
// a node is encoded by a varint. The lowest bit of that int is set if the node
// is a final, accepting state. The higher bits of that int represent the
// number of names that are encoded by the sub-DAWG started by this node. It's
// used to compute the position of a name.
//
// the starting node of the DAWG is at position 0.
//
// the varint representing a node is followed by the node's edges, the encoding
// is described below

static unsigned int
_dawg_decode_node(unsigned int node_offset, bool* final)
{
    unsigned int num;
    node_offset = _dawg_decode_varint_unsigned(node_offset, &num);
    *final = num & 1;
    return node_offset;
}

static bool
_dawg_node_is_final(unsigned int node_offset)
{
    unsigned int num;
    _dawg_decode_varint_unsigned(node_offset, &num);
    return num & 1;
}

static unsigned int
_dawg_node_descendant_count(unsigned int node_offset)
{
    unsigned int num;
    _dawg_decode_varint_unsigned(node_offset, &num);
    return num >> 1;
}

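As a concrete (purely illustrative) instance of the node layout described above, a final node whose sub-DAWG encodes 5 names stores the varint of (5 << 1) | 1, while a non-final node with the same count stores (5 << 1):

    # hypothetical node headers, following the "count << 1 | is_final" layout
    final_node_with_5_names = (5 << 1) | 1      # == 11, fits in a single varint byte
    interior_node_with_5_names = (5 << 1) | 0   # == 10
    assert final_node_with_5_names & 1          # low bit: accepting state
    assert final_node_with_5_names >> 1 == 5    # high bits: names below this node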
// reading DAWG edge information:
// a DAWG edge is comprised of the following information:
// (1) the size of the label of the string attached to the edge
// (2) the characters of that edge
// (3) the target node
// (4) whether the edge is the last edge in the list of edges following a node
//
// this information is encoded in a compact form as follows:
//
// +---------+-----------------+--------------+--------------------
// | varint  | size (if != 1)  | label chars  | ... next edge ...
// +---------+-----------------+--------------+--------------------
//
// - first comes a varint
//     - the lowest bit of that varint is whether the edge is final (4)
//     - the second lowest bit of that varint is true if the size of
//       the length of the label is 1 (1)
//     - the rest of the varint is an offset that can be used to compute
//       the offset of the target node of that edge (3)
// - if the size is not 1, the first varint is followed by a
//   character encoding the number of characters of the label (1)
//   (unicode character names aren't larger than 256 bytes, therefore each
//   edge label can be at most 256 chars, but is usually smaller)
// - the next size bytes are the characters of the label (2)
//
// the offset of the target node is computed as follows: the number in the
// upper bits of the varint needs to be added to the offset of the target node
// of the previous edge. For the first edge, where there is no previous target
// node, the offset of the first edge is used.
// The intuition here is that edges going out from a node often lead to nodes
// that are close by, leading to small offsets from the current node and thus
// fewer bytes.
//
// There is a special case: if a final node has no outgoing edges, it has to be
// followed by a 0 byte to indicate that there are no edges (because the end of
// the edge list is normally indicated in a bit in the edge encoding). This is
// indicated by _dawg_decode_edge returning -1


static int
_dawg_decode_edge(bool is_first_edge, unsigned int prev_target_node_offset,
                  unsigned int edge_offset, unsigned int* size,
                  unsigned int* label_offset, unsigned int* target_node_offset)
{
    unsigned int num;
    edge_offset = _dawg_decode_varint_unsigned(edge_offset, &num);
    if (num == 0 && is_first_edge) {
        return -1; // trying to decode past a final node without outgoing edges
    }
    bool last_edge = num & 1;
    num >>= 1;
    bool len_is_one = num & 1;
    num >>= 1;
    *target_node_offset = prev_target_node_offset + num;
    if (len_is_one) {
        *size = 1;
    } else {
        *size = packed_name_dawg[edge_offset++];
    }
    *label_offset = edge_offset;
    return last_edge;
}

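The pure-Python decoder in Tools/unicode/dawg.py mirrors these C functions, which makes the encoding easy to poke at interactively (with Tools/unicode on sys.path, as the test file arranges via imports_under_tool). A hedged sketch for a tiny two-word DAWG; the exact packed bytes are an implementation detail, so only decoded fields are asserted:

    from dawg import Dawg, decode_node, decode_edge

    d = Dawg()
    d.insert("cat", 0)
    d.insert("dog", 1)
    packed, data, inverse = d.finish()

    # the root node sits at offset 0; its first edge follows the node varint
    count, final, edge_offset = decode_node(packed, 0)
    assert count == 2 and not final     # two names below the root, root not accepting
    child, last_edge, size, label_offset = decode_edge(packed, 0, edge_offset, edge_offset)
    assert packed[label_offset:label_offset + size] == b"cat"   # the whole word rides on one edge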
static int
_lookup_dawg_packed(const char* name, unsigned int namelen)
{
    unsigned int stringpos = 0;
    unsigned int node_offset = 0;
    unsigned int result = 0; // this is the number of final nodes that we skipped to match name
    while (stringpos < namelen) {
        bool final;
        unsigned int edge_offset = _dawg_decode_node(node_offset, &final);
        unsigned int prev_target_node_offset = edge_offset;
        bool is_first_edge = true;
        for (;;) {
            unsigned int size;
            unsigned int label_offset, target_node_offset;
            int last_edge = _dawg_decode_edge(
                    is_first_edge, prev_target_node_offset, edge_offset,
                    &size, &label_offset, &target_node_offset);
            if (last_edge == -1) {
                return -1;
            }
            is_first_edge = false;
            prev_target_node_offset = target_node_offset;
            int matched = _dawg_match_edge(name, namelen, size, label_offset, stringpos);
            if (matched == -1) {
                return -1;
            }
            if (matched) {
                if (final)
                    result += 1;
                stringpos += size;
                node_offset = target_node_offset;
                break;
            }
            if (last_edge) {
                return -1;
            }
            result += _dawg_node_descendant_count(target_node_offset);
            edge_offset = label_offset + size;
        }
    }
    if (_dawg_node_is_final(node_offset)) {
        return result;
    }
    return -1;
}

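Note that the int returned here is not a code point: it is the position of the matched name in the sorted list of all names (the number of accepting paths skipped along the way), which dawg_pos_to_codepoint then maps to a code point. The pure-Python twin makes the property easy to see (hypothetical three-word DAWG, again with Tools/unicode importable):

    from dawg import Dawg, _lookup

    d = Dawg()
    for i, word in enumerate(["ant", "bee", "cat"]):   # inserted in sorted order
        d.insert(word, i * 100)
    packed, linear_data, inverse = d.finish()

    # _lookup returns the index of the name in sorted order ...
    assert [_lookup(packed, w) for w in (b"ant", b"bee", b"cat")] == [0, 1, 2]
    # ... and linear_data (dawg_pos_to_codepoint in the C module) maps that index to the value
    assert linear_data == [0, 100, 200]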
static int
_inverse_dawg_lookup(char* buffer, unsigned int buflen, unsigned int pos)
{
    unsigned int node_offset = 0;
    unsigned int bufpos = 0;
    for (;;) {
        bool final;
        unsigned int edge_offset = _dawg_decode_node(node_offset, &final);

        if (final) {
            if (pos == 0) {
                if (bufpos + 1 == buflen) {
                    return 0;
                }
                buffer[bufpos] = '\0';
                return 1;
            }
            pos--;
        }
        unsigned int prev_target_node_offset = edge_offset;
        bool is_first_edge = true;
        for (;;) {
            unsigned int size;
            unsigned int label_offset, target_node_offset;
            int last_edge = _dawg_decode_edge(
                    is_first_edge, prev_target_node_offset, edge_offset,
                    &size, &label_offset, &target_node_offset);
            if (last_edge == -1) {
                return 0;
            }
            is_first_edge = false;
            prev_target_node_offset = target_node_offset;

            unsigned int descendant_count = _dawg_node_descendant_count(target_node_offset);
            if (pos < descendant_count) {
                if (bufpos + size >= buflen) {
                    return 0; // buffer overflow
                }
                for (unsigned int i = 0; i < size; i++) {
                    buffer[bufpos++] = packed_name_dawg[label_offset++];
                }
                node_offset = target_node_offset;
                break;
            } else if (!last_edge) {
                pos -= descendant_count;
                edge_offset = label_offset + size;
            } else {
                return 0;
            }
        }
    }
}


static int
_getucname(PyObject *self,
           Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)

@@ -1054,9 +1280,6 @@ _getucname(PyObject *self,
     * If with_alias_and_seq is 1, check for names in the Private Use Area 15
     * that we are using for aliases and named sequences. */
    int offset;
    int i;
    int word;
    const unsigned char* w;

    if (code >= 0x110000)
        return 0;

@@ -1107,45 +1330,15 @@ _getucname(PyObject *self,
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
    /* get position of codepoint in order of names in the dawg */
    offset = dawg_codepoint_to_pos_index1[(code>>DAWG_CODEPOINT_TO_POS_SHIFT)];
    offset = dawg_codepoint_to_pos_index2[(offset<<DAWG_CODEPOINT_TO_POS_SHIFT) +
                                          (code&((1<<DAWG_CODEPOINT_TO_POS_SHIFT)-1))];
    if (offset == DAWG_CODEPOINT_TO_POS_NOTFOUND)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            if (i > buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon. the last character in the
           word has bit 7 set. the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
    assert(buflen >= 0);
    return _inverse_dawg_lookup(buffer, Py_SAFE_DOWNCAST(buflen, int, unsigned int), offset);
}

static int
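The code-point-to-position step above uses the same two-level table trick (splitbins in makeunicodedata.py) as the rest of the Unicode database: index1 picks a block from the high bits, index2 holds the per-code entries. A minimal Python sketch of the access pattern, with hypothetical shift and table contents rather than the generated ones:

    def codepoint_to_pos(code, index1, index2, shift, notfound):
        # mirrors the C lookup: block number from the high bits, entry from the low bits
        block = index1[code >> shift]
        pos = index2[(block << shift) + (code & ((1 << shift) - 1))]
        return None if pos == notfound else pos

    # toy tables: shift=1, position 7 stored for code 0, "not found" sentinel 99 for code 1
    assert codepoint_to_pos(0, [0], [7, 99], shift=1, notfound=99) == 7
    assert codepoint_to_pos(1, [0], [7, 99], shift=1, notfound=99) is None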
@@ -1157,21 +1350,6 @@ capi_getucname(Py_UCS4 code,

}

static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    /* check if code corresponds to the given name */
    int i;
    char buffer[NAME_MAXLEN+1];
    if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
        return 0;
    for (i = 0; i < namelen; i++) {
        if (Py_TOUPPER(name[i]) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}

static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{

@@ -1193,31 +1371,25 @@ find_syllable(const char *str, int *len, int *pos, int count, int column)
}

static int
_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
_check_alias_and_seq(Py_UCS4* code, int with_named_seq)
{
    /* check if named sequences are allowed */
    if (!with_named_seq && IS_NAMED_SEQ(cp))
    if (!with_named_seq && IS_NAMED_SEQ(*code))
        return 0;
    /* if the code point is in the PUA range that we use for aliases,
     * convert it to obtain the right code point */
    if (IS_ALIAS(cp))
        *code = name_aliases[cp-aliases_start];
    else
        *code = cp;
    if (IS_ALIAS(*code))
        *code = name_aliases[*code-aliases_start];
    return 1;
}


static int
_getcode(PyObject* self,
         const char* name, int namelen, Py_UCS4* code, int with_named_seq)
_getcode(const char* name, int namelen, Py_UCS4* code)
{
    /* Return the code point associated with the given name.
     * Named aliases are resolved too (unless self != NULL (i.e. we are using
     * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
     * using for the named sequence, and the caller must then convert it. */
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;
     * Named aliases are not resolved, they are returned as a code point in the
     * PUA */

    /* Check for hangul syllables. */
    if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {

@@ -1240,6 +1412,7 @@ _getcode(PyObject* self,
    /* Check for unified ideographs. */
    if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
        /* Four or five hexdigits must follow. */
        unsigned int v;
        v = 0;
        name += 22;
        namelen -= 22;

@@ -1261,41 +1434,24 @@ _getcode(PyObject* self,
        return 1;
    }

    /* the following is the same as python's dictionary lookup, with
       only minor changes. see the makeunicodedata script for more
       details */

    h = (unsigned int) _gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
    assert(namelen >= 0);
    int position = _lookup_dawg_packed(name, Py_SAFE_DOWNCAST(namelen, int, unsigned int));
    if (position < 0) {
        return 0;
    if (_cmpname(self, v, name, namelen)) {
        return _check_alias_and_seq(v, code, with_named_seq);
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            return 0;
        if (_cmpname(self, v, name, namelen)) {
            return _check_alias_and_seq(v, code, with_named_seq);
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
    *code = dawg_pos_to_codepoint[position];
    return 1;
}


static int
capi_getcode(const char* name, int namelen, Py_UCS4* code,
             int with_named_seq)
{
    return _getcode(NULL, name, namelen, code, with_named_seq);

    if (!_getcode(name, namelen, code)) {
        return 0;
    }
    return _check_alias_and_seq(code, with_named_seq);
}

static void

@@ -1388,10 +1544,17 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
        return NULL;
    }

    if (!_getcode(self, name, (int)name_length, &code, 1)) {
    if (!_getcode(name, (int)name_length, &code)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
        return NULL;
    }
    if (UCD_Check(self)) {
        /* in 3.2.0 there are no aliases and named sequences */
        if (IS_ALIAS(code) || IS_NAMED_SEQ(code)) {
            PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
            return 0;
        }
    }
    /* check if code is in the PUA range that we use for named sequences
       and convert it */
    if (IS_NAMED_SEQ(code)) {

@@ -1400,6 +1563,9 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
                                     named_sequences[index].seq,
                                     named_sequences[index].seqlen);
    }
    if (IS_ALIAS(code)) {
        code = name_aliases[code-aliases_start];
    }
    return PyUnicode_FromOrdinal(code);
}

File diff suppressed because it is too large
@@ -0,0 +1,533 @@
# Original Algorithm:
# By Steve Hanov, 2011. Released to the public domain.
# Please see http://stevehanov.ca/blog/index.php?id=115 for the accompanying article.
#
# Adapted for PyPy/CPython by Carl Friedrich Bolz-Tereick
#
# Based on Daciuk, Jan, et al. "Incremental construction of minimal acyclic finite-state automata."
# Computational linguistics 26.1 (2000): 3-16.
#
# Updated 2014 to use DAWG as a mapping; see
# Kowaltowski, T.; CL. Lucchesi (1993), "Applications of finite automata representing large vocabularies",
# Software-Practice and Experience 1993

from collections import defaultdict
from functools import cached_property


# This class represents a node in the directed acyclic word graph (DAWG). It
# has a list of edges to other nodes. It has functions for testing whether it
# is equivalent to another node. Nodes are equivalent if they have identical
# edges, and each identical edge leads to identical states. The __hash__ and
# __eq__ functions allow it to be used as a key in a python dictionary.


class DawgNode:

    def __init__(self, dawg):
        self.id = dawg.next_id
        dawg.next_id += 1
        self.final = False
        self.edges = {}

        self.linear_edges = None  # later: list of (string, next_state)

    def __str__(self):
        if self.final:
            arr = ["1"]
        else:
            arr = ["0"]

        for (label, node) in sorted(self.edges.items()):
            arr.append(label)
            arr.append(str(node.id))

        return "_".join(arr)
    __repr__ = __str__

    def _as_tuple(self):
        edges = sorted(self.edges.items())
        edge_tuple = tuple((label, node.id) for label, node in edges)
        return (self.final, edge_tuple)

    def __hash__(self):
        return hash(self._as_tuple())

    def __eq__(self, other):
        return self._as_tuple() == other._as_tuple()

    @cached_property
    def num_reachable_linear(self):
        # returns the number of different paths to final nodes reachable from
        # this one

        count = 0
        # staying at self counts as a path if self is final
        if self.final:
            count += 1
        for label, node in self.linear_edges:
            count += node.num_reachable_linear

        return count

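The practical effect of the __hash__/__eq__-based minimization is that identical suffixes are stored only once. A quick way to see it, counting states via enum_all_nodes (defined further down in this file); the word list and node count here are illustrative only:

    d = Dawg()
    for word in ["tap", "taps", "top", "tops"]:
        d.insert(word, len(d.data))
    d.finish()   # runs the final _minimize() pass
    # a plain trie of these four words needs 8 states; after minimization
    # only 5 remain: root, "t", the shared "a"/"o" state, the shared "p"
    # state, and one shared final state
    assert len(list(d.enum_all_nodes())) == 5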
class Dawg:
    def __init__(self):
        self.previous_word = ""
        self.next_id = 0
        self.root = DawgNode(self)

        # Here is a list of nodes that have not been checked for duplication.
        self.unchecked_nodes = []

        # To deduplicate, maintain a dictionary with
        # minimized_nodes[canonical_node] is canonical_node.
        # Based on __hash__ and __eq__, minimized_nodes[n] is the
        # canonical node equal to n.
        # In other words, self.minimized_nodes[x] == x for all nodes found in
        # the dict.
        self.minimized_nodes = {}

        # word: value mapping
        self.data = {}
        # value: word mapping
        self.inverse = {}

    def insert(self, word, value):
        if not all(0 <= ord(c) < 128 for c in word):
            raise ValueError("Use 7-bit ASCII characters only")
        if word <= self.previous_word:
            raise ValueError("Error: Words must be inserted in alphabetical order.")
        if value in self.inverse:
            raise ValueError(f"value {value} is duplicate, got it for word {self.inverse[value]} and now {word}")

        # find common prefix between word and previous word
        common_prefix = 0
        for i in range(min(len(word), len(self.previous_word))):
            if word[i] != self.previous_word[i]:
                break
            common_prefix += 1

        # Check the unchecked_nodes for redundant nodes, proceeding from last
        # one down to the common prefix size. Then truncate the list at that
        # point.
        self._minimize(common_prefix)

        self.data[word] = value
        self.inverse[value] = word

        # add the suffix, starting from the correct node mid-way through the
        # graph
        if len(self.unchecked_nodes) == 0:
            node = self.root
        else:
            node = self.unchecked_nodes[-1][2]

        for letter in word[common_prefix:]:
            next_node = DawgNode(self)
            node.edges[letter] = next_node
            self.unchecked_nodes.append((node, letter, next_node))
            node = next_node

        node.final = True
        self.previous_word = word

    def finish(self):
        if not self.data:
            raise ValueError("need at least one word in the dawg")
        # minimize all unchecked_nodes
        self._minimize(0)

        self._linearize_edges()

        topoorder, linear_data, inverse = self._topological_order()
        return self.compute_packed(topoorder), linear_data, inverse

    def _minimize(self, down_to):
        # proceed from the leaf up to a certain point
        for i in range(len(self.unchecked_nodes) - 1, down_to - 1, -1):
            (parent, letter, child) = self.unchecked_nodes[i]
            if child in self.minimized_nodes:
                # replace the child with the previously encountered one
                parent.edges[letter] = self.minimized_nodes[child]
            else:
                # add the state to the minimized nodes.
                self.minimized_nodes[child] = child
            self.unchecked_nodes.pop()

    def _lookup(self, word):
        """ Return an integer 0 <= k < number of strings in dawg
        where word is the kth successful traversal of the dawg. """
        node = self.root
        skipped = 0  # keep track of number of final nodes that we skipped
        index = 0
        while index < len(word):
            for label, child in node.linear_edges:
                if word[index] == label[0]:
                    if word[index:index + len(label)] == label:
                        if node.final:
                            skipped += 1
                        index += len(label)
                        node = child
                        break
                    else:
                        return None
                skipped += child.num_reachable_linear
            else:
                return None
        return skipped

    def enum_all_nodes(self):
        stack = [self.root]
        done = set()
        while stack:
            node = stack.pop()
            if node.id in done:
                continue
            yield node
            done.add(node.id)
            for label, child in sorted(node.edges.items()):
                stack.append(child)

    def prettyprint(self):
        for node in sorted(self.enum_all_nodes(), key=lambda e: e.id):
            s_final = " final" if node.final else ""
            print(f"{node.id}: ({node}) {s_final}")
            for label, child in sorted(node.edges.items()):
                print(f"    {label} goto {child.id}")

    def _inverse_lookup(self, number):
        assert 0, "not working in the current form, but keep it as the pure python version of compact lookup"
        result = []
        node = self.root
        while 1:
            if node.final:
                if pos == 0:
                    return "".join(result)
                pos -= 1
            for label, child in sorted(node.edges.items()):
                nextpos = pos - child.num_reachable_linear
                if nextpos < 0:
                    result.append(label)
                    node = child
                    break
                else:
                    pos = nextpos
            else:
                assert 0

    def _linearize_edges(self):
        # compute "linear" edges. the idea is that long chains of edges without
        # any of the intermediate states being final or any extra incoming or
        # outgoing edges can be represented by having removing them, and
        # instead using longer strings as edge labels (instead of single
        # characters)
        incoming = defaultdict(list)
        nodes = sorted(self.enum_all_nodes(), key=lambda e: e.id)
        for node in nodes:
            for label, child in sorted(node.edges.items()):
                incoming[child].append(node)
        for node in nodes:
            node.linear_edges = []
            for label, child in sorted(node.edges.items()):
                s = [label]
                while len(child.edges) == 1 and len(incoming[child]) == 1 and not child.final:
                    (c, child), = child.edges.items()
                    s.append(c)
                node.linear_edges.append((''.join(s), child))

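After _linearize_edges, unbranching chains collapse into single string-labelled edges, which is what lets long shared name fragments ride on one edge each. A sketch for a two-word DAWG (it peeks at the internal linear_edges attribute, so it is illustrative rather than supported API):

    d = Dawg()
    d.insert("catarr", 0)
    d.insert("catnip", 1)
    d.finish()
    # one shared "cat" edge out of the root, then the two diverging tails,
    # each as a single multi-character edge
    assert [label for label, _ in d.root.linear_edges] == ["cat"]
    (_, branch_node), = d.root.linear_edges
    assert sorted(label for label, _ in branch_node.linear_edges) == ["arr", "nip"]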
    def _topological_order(self):
        # compute reachable linear nodes, and the set of incoming edges for each node
        order = []
        stack = [self.root]
        seen = set()
        while stack:
            # depth first traversal
            node = stack.pop()
            if node.id in seen:
                continue
            seen.add(node.id)
            order.append(node)
            for label, child in node.linear_edges:
                stack.append(child)

        # do a (slightly bad) topological sort
        incoming = defaultdict(set)
        for node in order:
            for label, child in node.linear_edges:
                incoming[child].add((label, node))
        no_incoming = [order[0]]
        topoorder = []
        positions = {}
        while no_incoming:
            node = no_incoming.pop()
            topoorder.append(node)
            positions[node] = len(topoorder)
            # use "reversed" to make sure that the linear_edges get reorderd
            # from their alphabetical order as little as necessary (no_incoming
            # is LIFO)
            for label, child in reversed(node.linear_edges):
                incoming[child].discard((label, node))
                if not incoming[child]:
                    no_incoming.append(child)
                    del incoming[child]
        # check result
        assert set(topoorder) == set(order)
        assert len(set(topoorder)) == len(topoorder)

        for node in order:
            node.linear_edges.sort(key=lambda element: positions[element[1]])

        for node in order:
            for label, child in node.linear_edges:
                assert positions[child] > positions[node]
        # number the nodes. afterwards every input string in the set has a
        # unique number in the 0 <= number < len(data). We then put the data in
        # self.data into a linear list using these numbers as indexes.
        topoorder[0].num_reachable_linear
        linear_data = [None] * len(self.data)
        inverse = {}  # maps value back to index
        for word, value in self.data.items():
            index = self._lookup(word)
            linear_data[index] = value
            inverse[value] = index

        return topoorder, linear_data, inverse

    def compute_packed(self, order):
        def compute_chunk(node, offsets):
            """ compute the packed node/edge data for a node. result is a
            list of bytes as long as order. the jump distance calculations use
            the offsets dictionary to know where in the final big output
            bytestring the individual nodes will end up. """
            result = bytearray()
            offset = offsets[node]
            encode_varint_unsigned(number_add_bits(node.num_reachable_linear, node.final), result)
            if len(node.linear_edges) == 0:
                assert node.final
                encode_varint_unsigned(0, result)  # add a 0 saying "done"
            prev_child_offset = offset + len(result)
            for edgeindex, (label, targetnode) in enumerate(node.linear_edges):
                label = label.encode('ascii')
                child_offset = offsets[targetnode]
                child_offset_difference = child_offset - prev_child_offset

                info = number_add_bits(child_offset_difference, len(label) == 1, edgeindex == len(node.linear_edges) - 1)
                if edgeindex == 0:
                    assert info != 0
                encode_varint_unsigned(info, result)
                prev_child_offset = child_offset
                if len(label) > 1:
                    encode_varint_unsigned(len(label), result)
                result.extend(label)
            return result

        def compute_new_offsets(chunks, offsets):
            """ Given a list of chunks, compute the new offsets (by adding the
            chunk lengths together). Also check if we cannot shrink the output
            further because none of the node offsets are smaller now. if that's
            the case return None. """
            new_offsets = {}
            curr_offset = 0
            should_continue = False
            for node, result in zip(order, chunks):
                if curr_offset < offsets[node]:
                    # the new offset is below the current assumption, this
                    # means we can shrink the output more
                    should_continue = True
                new_offsets[node] = curr_offset
                curr_offset += len(result)
            if not should_continue:
                return None
            return new_offsets

        # assign initial offsets to every node
        offsets = {}
        for i, node in enumerate(order):
            # we don't know position of the edge yet, just use something big as
            # the starting position. we'll have to do further iterations anyway,
            # but the size is at least a lower limit then
            offsets[node] = i * 2 ** 30


        # due to the variable integer width encoding of edge targets we need to
        # run this to fixpoint. in the process we shrink the output more and
        # more until we can't any more. at any point we can stop and use the
        # output, but we might need padding zero bytes when joining the chunks
        # to have the correct jump distances
        last_offsets = None
        while 1:
            chunks = [compute_chunk(node, offsets) for node in order]
            last_offsets = offsets
            offsets = compute_new_offsets(chunks, offsets)
            if offsets is None:  # couldn't shrink
                break

        # build the final packed string
        total_result = bytearray()
        for node, result in zip(order, chunks):
            node_offset = last_offsets[node]
            if node_offset > len(total_result):
                # need to pad to get the offsets correct
                padding = b"\x00" * (node_offset - len(total_result))
                total_result.extend(padding)
            assert node_offset == len(total_result)
            total_result.extend(result)
        return bytes(total_result)

# ______________________________________________________________________
# the following functions operate on the packed representation

def number_add_bits(x, *bits):
    for bit in bits:
        assert bit == 0 or bit == 1
        x = (x << 1) | bit
    return x

def encode_varint_unsigned(i, res):
    # https://en.wikipedia.org/wiki/LEB128 unsigned variant
    more = True
    startlen = len(res)
    if i < 0:
        raise ValueError("only positive numbers supported", i)
    while more:
        lowest7bits = i & 0b1111111
        i >>= 7
        if i == 0:
            more = False
        else:
            lowest7bits |= 0b10000000
        res.append(lowest7bits)
    return len(res) - startlen

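number_add_bits and encode_varint_unsigned together produce the per-node and per-edge headers; decode_varint_unsigned and number_split_bits (defined next) undo them. A round-trip check with illustrative numbers:

    from dawg import (number_add_bits, encode_varint_unsigned,
                      decode_varint_unsigned, number_split_bits)

    buf = bytearray()
    header = number_add_bits(300, 1)     # e.g. "300 descendants, final" -> (300 << 1) | 1 == 601
    encode_varint_unsigned(header, buf)
    assert bytes(buf) == b"\xd9\x04"     # LEB128 bytes for 601

    value, index = decode_varint_unsigned(buf, 0)
    assert number_split_bits(value, 1) == (300, 1)
    assert index == 2                    # two bytes consumed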
def number_split_bits(x, n, acc=()):
    if n == 1:
        return x >> 1, x & 1
    if n == 2:
        return x >> 2, (x >> 1) & 1, x & 1
    assert 0, "implement me!"

def decode_varint_unsigned(b, index=0):
    res = 0
    shift = 0
    while True:
        byte = b[index]
        res = res | ((byte & 0b1111111) << shift)
        index += 1
        shift += 7
        if not (byte & 0b10000000):
            return res, index

def decode_node(packed, node):
    x, node = decode_varint_unsigned(packed, node)
    node_count, final = number_split_bits(x, 1)
    return node_count, final, node

def decode_edge(packed, edgeindex, prev_child_offset, offset):
    x, offset = decode_varint_unsigned(packed, offset)
    if x == 0 and edgeindex == 0:
        raise KeyError  # trying to decode past a final node
    child_offset_difference, len1, last_edge = number_split_bits(x, 2)
    child_offset = prev_child_offset + child_offset_difference
    if len1:
        size = 1
    else:
        size, offset = decode_varint_unsigned(packed, offset)
    return child_offset, last_edge, size, offset

def _match_edge(packed, s, size, node_offset, stringpos):
    if size > 1 and stringpos + size > len(s):
        # past the end of the string, can't match
        return False
    for i in range(size):
        if packed[node_offset + i] != s[stringpos + i]:
            # if a subsequent char of an edge doesn't match, the word isn't in
            # the dawg
            if i > 0:
                raise KeyError
            return False
    return True

def lookup(packed, data, s):
    return data[_lookup(packed, s)]

def _lookup(packed, s):
    stringpos = 0
    node_offset = 0
    skipped = 0  # keep track of number of final nodes that we skipped
    false = False
    while stringpos < len(s):
        #print(f"{node_offset=} {stringpos=}")
        _, final, edge_offset = decode_node(packed, node_offset)
        prev_child_offset = edge_offset
        edgeindex = 0
        while 1:
            child_offset, last_edge, size, edgelabel_chars_offset = decode_edge(packed, edgeindex, prev_child_offset, edge_offset)
            #print(f"    {edge_offset=} {child_offset=} {last_edge=} {size=} {edgelabel_chars_offset=}")
            edgeindex += 1
            prev_child_offset = child_offset
            if _match_edge(packed, s, size, edgelabel_chars_offset, stringpos):
                # match
                if final:
                    skipped += 1
                stringpos += size
                node_offset = child_offset
                break
            if last_edge:
                raise KeyError
            descendant_count, _, _ = decode_node(packed, child_offset)
            skipped += descendant_count
            edge_offset = edgelabel_chars_offset + size
    _, final, _ = decode_node(packed, node_offset)
    if final:
        return skipped
    raise KeyError

def inverse_lookup(packed, inverse, x):
    pos = inverse[x]
    return _inverse_lookup(packed, pos)

def _inverse_lookup(packed, pos):
    result = bytearray()
    node_offset = 0
    while 1:
        node_count, final, edge_offset = decode_node(packed, node_offset)
        if final:
            if pos == 0:
                return bytes(result)
            pos -= 1
        prev_child_offset = edge_offset
        edgeindex = 0
        while 1:
            child_offset, last_edge, size, edgelabel_chars_offset = decode_edge(packed, edgeindex, prev_child_offset, edge_offset)
            edgeindex += 1
            prev_child_offset = child_offset
            descendant_count, _, _ = decode_node(packed, child_offset)
            nextpos = pos - descendant_count
            if nextpos < 0:
                assert edgelabel_chars_offset >= 0
                result.extend(packed[edgelabel_chars_offset: edgelabel_chars_offset + size])
                node_offset = child_offset
                break
            elif not last_edge:
                pos = nextpos
                edge_offset = edgelabel_chars_offset + size
            else:
                raise KeyError
        else:
            raise KeyError


def build_compression_dawg(ucdata):
    d = Dawg()
    ucdata.sort()
    for name, value in ucdata:
        d.insert(name, value)
    packed, pos_to_code, reversedict = d.finish()
    print("size of dawg [KiB]", round(len(packed) / 1024, 2))
    # check that lookup and inverse_lookup work correctly on the input data
    for name, value in ucdata:
        assert lookup(packed, pos_to_code, name.encode('ascii')) == value
        assert inverse_lookup(packed, reversedict, value) == name.encode('ascii')
    return packed, pos_to_code
@@ -623,120 +623,12 @@ def makeunicodetype(unicode, trace):
# unicode name database

def makeunicodename(unicode, trace):
    from dawg import build_compression_dawg

    FILE = "Modules/unicodename_db.h"

    print("--- Preparing", FILE, "...")

    # collect names
    names = [None] * len(unicode.chars)

    for char in unicode.chars:
        record = unicode.table[char]
        if record:
            name = record.name.strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)

    print(len([n for n in names if n is not None]), "distinct names")

    # collect unique words from names (note that we differ between
    # words inside a sentence, and words ending a sentence.  the
    # latter includes the trailing null byte.

    words = {}
    n = b = 0
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            b = b + len(name)
            n = n + len(w)
            for w in w:
                l = words.get(w)
                if l:
                    l.append(None)
                else:
                    words[w] = [len(words)]

    print(n, "words in text;", b, "bytes")

    wordlist = list(words.items())

    # sort on falling frequency, then by name
    def word_key(a):
        aword, alist = a
        return -len(alist), aword
    wordlist.sort(key=word_key)

    # figure out how many phrasebook escapes we need
    escapes = 0
    while escapes * 256 < len(wordlist):
        escapes = escapes + 1
    print(escapes, "escapes")

    short = 256 - escapes

    assert short > 0

    print(short, "short indexes in lexicon")

    # statistics
    n = 0
    for i in range(short):
        n = n + len(wordlist[i][1])
    print(n, "short indexes in phrasebook")

    # pick the most commonly used words, and sort the rest on falling
    # length (to maximize overlap)

    wordlist, wordtail = wordlist[:short], wordlist[short:]
    wordtail.sort(key=lambda a: a[0], reverse=True)
    wordlist.extend(wordtail)

    # generate lexicon from words

    lexicon_offset = [0]
    lexicon = ""
    words = {}

    # build a lexicon string
    offset = 0
    for w, x in wordlist:
        # encoding: bit 7 indicates last character in word (chr(128)
        # indicates the last character in an entire string)
        ww = w[:-1] + chr(ord(w[-1])+128)
        # reuse string tails, when possible
        o = lexicon.find(ww)
        if o < 0:
            o = offset
            lexicon = lexicon + ww
            offset = offset + len(w)
        words[w] = len(lexicon_offset)
        lexicon_offset.append(o)

    lexicon = list(map(ord, lexicon))

    # generate phrasebook from names and lexicon
    phrasebook = [0]
    phrasebook_offset = [0] * len(unicode.chars)
    for char in unicode.chars:
        name = names[char]
        if name:
            w = name.split()
            phrasebook_offset[char] = len(phrasebook)
            for w in w:
                i = words[w]
                if i < short:
                    phrasebook.append(i)
                else:
                    # store as two bytes
                    phrasebook.append((i>>8) + short)
                    phrasebook.append(i&255)

    assert getsize(phrasebook) == 1

    #
    # unicode name hash table

    # extract names

@@ -748,12 +640,6 @@ def makeunicodename(unicode, trace):
            if name and name[0] != "<":
                data.append((name, char))

    # the magic number 47 was chosen to minimize the number of
    # collisions on the current data set. if you like, change it
    # and see what happens...

    codehash = Hash("code", data, 47)

    print("--- Writing", FILE, "...")

    with open(FILE, "w") as fp:

@@ -762,24 +648,22 @@ def makeunicodename(unicode, trace):
        fprint("/* this file was generated by %s %s */" % (SCRIPT, VERSION))
        fprint()
        fprint("#define NAME_MAXLEN", 256)
        assert max(len(x) for x in data) < 256
        fprint()
        fprint("/* lexicon */")
        Array("lexicon", lexicon).dump(fp, trace)
        Array("lexicon_offset", lexicon_offset).dump(fp, trace)

        # split decomposition index table
        offset1, offset2, shift = splitbins(phrasebook_offset, trace)

        fprint("/* code->name phrasebook */")
        fprint("#define phrasebook_shift", shift)
        fprint("#define phrasebook_short", short)

        Array("phrasebook", phrasebook).dump(fp, trace)
        Array("phrasebook_offset1", offset1).dump(fp, trace)
        Array("phrasebook_offset2", offset2).dump(fp, trace)

        fprint("/* name->code dictionary */")
        codehash.dump(fp, trace)
        packed_dawg, pos_to_codepoint = build_compression_dawg(data)
        notfound = len(pos_to_codepoint)
        inverse_list = [notfound] * len(unicode.chars)
        for pos, codepoint in enumerate(pos_to_codepoint):
            inverse_list[codepoint] = pos
        Array("packed_name_dawg", list(packed_dawg)).dump(fp, trace)
        Array("dawg_pos_to_codepoint", pos_to_codepoint).dump(fp, trace)
        index1, index2, shift = splitbins(inverse_list, trace)
        fprint("#define DAWG_CODEPOINT_TO_POS_SHIFT", shift)
        fprint("#define DAWG_CODEPOINT_TO_POS_NOTFOUND", notfound)
        Array("dawg_codepoint_to_pos_index1", index1).dump(fp, trace)
        Array("dawg_codepoint_to_pos_index2", index2).dump(fp, trace)

        fprint()
        fprint('static const unsigned int aliases_start = %#x;' %

@@ -1188,94 +1072,6 @@ class UnicodeData:
            self.chars = list(range(256))


# hash table tools

# this is a straight-forward reimplementation of Python's built-in
# dictionary type, using a static data structure, and a custom string
# hash algorithm.

def myhash(s, magic):
    h = 0
    for c in map(ord, s.upper()):
        h = (h * magic) + c
        ix = h & 0xff000000
        if ix:
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff
    return h


SIZES = [
    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3), (256,29), (512,17),
    (1024,9), (2048,5), (4096,83), (8192,27), (16384,43), (32768,3),
    (65536,45), (131072,9), (262144,39), (524288,39), (1048576,9),
    (2097152,5), (4194304,3), (8388608,33), (16777216,27)
]


class Hash:
    def __init__(self, name, data, magic):
        # turn a (key, value) list into a static hash table structure

        # determine table size
        for size, poly in SIZES:
            if size > len(data):
                poly = size + poly
                break
        else:
            raise AssertionError("ran out of polynomials")

        print(size, "slots in hash table")

        table = [None] * size

        mask = size-1

        n = 0

        hash = myhash

        # initialize hash table
        for key, value in data:
            h = hash(key, magic)
            i = (~h) & mask
            v = table[i]
            if v is None:
                table[i] = value
                continue
            incr = (h ^ (h >> 3)) & mask
            if not incr:
                incr = mask
            while 1:
                n = n + 1
                i = (i + incr) & mask
                v = table[i]
                if v is None:
                    table[i] = value
                    break
                incr = incr << 1
                if incr > mask:
                    incr = incr ^ poly

        print(n, "collisions")
        self.collisions = n

        for i in range(len(table)):
            if table[i] is None:
                table[i] = 0

        self.data = Array(name + "_hash", table)
        self.magic = magic
        self.name = name
        self.size = size
        self.poly = poly

    def dump(self, file, trace):
        # write data to file, as a C array
        self.data.dump(file, trace)
        file.write("#define %s_magic %d\n" % (self.name, self.magic))
        file.write("#define %s_size %d\n" % (self.name, self.size))
        file.write("#define %s_poly %d\n" % (self.name, self.poly))


# stuff to deal with arrays of unsigned integers