213 lines
5.0 KiB
C
213 lines
5.0 KiB
C
/* unicode character name tables */
|
|
/* rewritten for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
|
|
|
|
#include "Python.h"
|
|
#include "ucnhash.h"
|
|
|
|
/* data file generated by Tools/unicode/makeunicodedata.py */
|
|
#include "unicodename_db.h"
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
/* database code (cut and pasted from the unidb package) */
|
|
|
|
/* Compute the case-insensitive hash of a character name.
 *
 *   s      name bytes (only the first len bytes are read; no NUL needed)
 *   len    number of bytes of s to hash
 *   scale  multiplier applied to the running hash before each byte is added
 *
 * Every byte is upper-cased before being mixed in, so "abc" and "ABC" hash
 * to the same value.  Whenever the running value reaches into bits 24-31,
 * that byte is XORed into the low bits and the value is masked back to
 * 24 bits.
 */
static unsigned long
gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;

    for (i = 0; i < len; i++) {
        /* go through unsigned char first: handing toupper() a negative
           plain char (any byte >= 0x80 where char is signed) is undefined */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
|
|
|
|
static int
|
|
getname(Py_UCS4 code, char* buffer, int buflen)
|
|
{
|
|
int offset;
|
|
int i;
|
|
int word;
|
|
unsigned char* w;
|
|
|
|
if (code < 0 || code >= 65536)
|
|
return 0;
|
|
|
|
/* get offset into phrasebook */
|
|
offset = phrasebook_offset1[(code>>phrasebook_shift)];
|
|
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
|
|
(code&((1<<phrasebook_shift)-1))];
|
|
if (!offset)
|
|
return 0;
|
|
|
|
i = 0;
|
|
|
|
for (;;) {
|
|
/* get word index */
|
|
word = phrasebook[offset] - phrasebook_short;
|
|
if (word >= 0) {
|
|
word = (word << 8) + phrasebook[offset+1];
|
|
offset += 2;
|
|
} else
|
|
word = phrasebook[offset++];
|
|
if (i) {
|
|
if (i > buflen)
|
|
return 0; /* buffer overflow */
|
|
buffer[i++] = ' ';
|
|
}
|
|
/* copy word string from lexicon. the last character in the
|
|
word has bit 7 set. the last word in a string ends with
|
|
0x80 */
|
|
w = lexicon + lexicon_offset[word];
|
|
while (*w < 128) {
|
|
if (i >= buflen)
|
|
return 0; /* buffer overflow */
|
|
buffer[i++] = *w++;
|
|
}
|
|
if (i >= buflen)
|
|
return 0; /* buffer overflow */
|
|
buffer[i++] = *w & 127;
|
|
if (*w == 128)
|
|
break; /* end of word */
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
static int
|
|
cmpname(int code, const char* name, int namelen)
|
|
{
|
|
/* check if code corresponds to the given name */
|
|
int i;
|
|
char buffer[NAME_MAXLEN];
|
|
if (!getname(code, buffer, sizeof(buffer)))
|
|
return 0;
|
|
for (i = 0; i < namelen; i++) {
|
|
if (toupper(name[i]) != buffer[i])
|
|
return 0;
|
|
}
|
|
return buffer[namelen] == '\0';
|
|
}
|
|
|
|
static int
|
|
getcode(const char* name, int namelen, Py_UCS4* code)
|
|
{
|
|
unsigned int h, v;
|
|
unsigned int mask = code_size-1;
|
|
unsigned int i, incr;
|
|
|
|
/* the following is the same as python's dictionary lookup, with
|
|
only minor changes. see the makeunicodedata script for more
|
|
details */
|
|
|
|
h = (unsigned int) gethash(name, namelen, code_magic);
|
|
i = (~h) & mask;
|
|
v = code_hash[i];
|
|
if (!v)
|
|
return 0;
|
|
if (cmpname(v, name, namelen)) {
|
|
*code = v;
|
|
return 1;
|
|
}
|
|
incr = (h ^ (h >> 3)) & mask;
|
|
if (!incr)
|
|
incr = mask;
|
|
for (;;) {
|
|
i = (i + incr) & mask;
|
|
v = code_hash[i];
|
|
if (!v)
|
|
return -1;
|
|
if (cmpname(v, name, namelen)) {
|
|
*code = v;
|
|
return 1;
|
|
}
|
|
incr = incr << 1;
|
|
if (incr > mask)
|
|
incr = incr ^ code_poly;
|
|
}
|
|
}
|
|
|
|
static const _PyUnicode_Name_CAPI hashAPI =
|
|
{
|
|
sizeof(_PyUnicode_Name_CAPI),
|
|
getname,
|
|
getcode
|
|
};
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
/* Python bindings */
|
|
|
|
/* Python binding: getname(code) -> name string.
   Takes one integer argument; sets ValueError and returns NULL when
   getname() has no name for that code. */
static PyObject *
ucnhash_getname(PyObject* self, PyObject* args)
{
    int code;
    char name[NAME_MAXLEN];

    if (PyArg_ParseTuple(args, "i", &code)) {
        if (getname((Py_UCS4) code, name, sizeof(name)))
            return Py_BuildValue("s", name);
        PyErr_SetString(PyExc_ValueError, "undefined character code");
    }

    /* either argument parsing or the lookup failed */
    return NULL;
}
|
|
|
|
/* Python binding: getcode(name) -> integer character code.
   Takes one string argument; sets ValueError and returns NULL when
   getcode() does not find the name. */
static PyObject *
ucnhash_getcode(PyObject* self, PyObject* args)
{
    Py_UCS4 code;

    char* name;
    int namelen;
    if (!PyArg_ParseTuple(args, "s#", &name, &namelen))
        return NULL;

    /* getcode() reports success as 1 only; it can also come back with
       -1 for a miss, which a plain "!getcode(...)" test would accept
       and then build a result from an uninitialised code */
    if (getcode(name, namelen, &code) <= 0) {
        PyErr_SetString(PyExc_ValueError, "undefined character name");
        return NULL;
    }

    /* "i" consumes an int, so pass exactly that */
    return Py_BuildValue("i", (int) code);
}
|
|
|
|
static
|
|
PyMethodDef ucnhash_methods[] =
|
|
{
|
|
{"getname", ucnhash_getname, 1},
|
|
{"getcode", ucnhash_getcode, 1},
|
|
{NULL, NULL},
|
|
};
|
|
|
|
/* Doc-string passed to Py_InitModule4() for the ucnhash module. */
static char *ucnhash_docstring =
    "ucnhash hash function module";
|
|
|
|
|
|
/* Create PyMethodObjects and register them in the module's dict */
|
|
DL_EXPORT(void)
|
|
initucnhash(void)
|
|
{
|
|
PyObject *m, *d, *v;
|
|
|
|
m = Py_InitModule4(
|
|
"ucnhash", /* Module name */
|
|
ucnhash_methods, /* Method list */
|
|
ucnhash_docstring, /* Module doc-string */
|
|
(PyObject *)NULL, /* always pass this as *self */
|
|
PYTHON_API_VERSION); /* API Version */
|
|
if (!m)
|
|
return;
|
|
|
|
d = PyModule_GetDict(m);
|
|
if (!d)
|
|
return;
|
|
|
|
/* Export C API */
|
|
v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
|
|
PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
|
|
Py_XDECREF(v);
|
|
}
|