Move ucnhash functionality into unicodedata (after the recent
crop of changes, the files are small enough to do this).  Also
adds "name" and "lookup" functions to unicodedata.
This commit is contained in:
Fredrik Lundh 2001-01-24 07:59:11 +00:00
parent eda28445c0
commit 06d126803c
4 changed files with 248 additions and 228 deletions

View File

@ -1,6 +1,7 @@
""" Test script for the Unicode implementation.
Written by Bill Tutt.
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
@ -46,23 +47,24 @@ except UnicodeError, v:
print v
print "done."
import ucnhash
import unicodedata
print "Testing name to code mapping....",
for char in "SPAM":
name = "LATIN SMALL LETTER %s" % char
code = ucnhash.getcode(name)
verify(ucnhash.getname(code) == name)
code = unicodedata.lookup(name)
verify(unicodedata.name(code) == name)
print "done."
print "Testing code to name mapping for all characters....",
count = 0
for code in range(65536):
try:
name = ucnhash.getname(code)
verify(ucnhash.getcode(name) == code)
char = unichr(code)
name = unicodedata.name(char)
verify(unicodedata.lookup(name) == char)
count += 1
except ValueError:
except (KeyError, ValueError):
pass
print "done."
@ -78,7 +80,6 @@ verify(u"\N{FULLWIDTH LATIN SMALL LETTER A}" == u"\uFF41")
"""
print "done."
# strict error testing:
print "Testing unicode character name expansion strict error handling....",
try:

View File

@ -1,212 +1,22 @@
/* unicode character name tables */
/* rewritten for Python 2.1 by Fredrik Lundh (fredrik@pythonware.com) */
/* obsolete -- remove this file! */
#include "Python.h"
#include "ucnhash.h"
/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"
/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */
/* Compute the case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and mixed into the running value as
   h = h * scale + byte.  Whenever bits 24..31 of the result become
   non-zero they are XORed back into the low byte and the value is
   masked down to 24 bits.  Returns the final hash value. */
static unsigned long
gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;

    for (i = 0; i < len; i++) {
        /* convert to unsigned char *before* calling toupper(): passing
           a negative plain-char value to a <ctype.h> function is
           undefined behaviour */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Look up the character name for `code` and copy it into `buffer`,
   which has room for `buflen` bytes.

   The name is stored as a sequence of word indices in the phrasebook;
   each word is copied from the lexicon, with a single space between
   words.  The byte that ends the last word is 0x80, which is stored
   as 0 and so terminates the string.

   Returns 1 on success, 0 if the code is out of range, has no
   phrasebook entry, or the name does not fit in the buffer. */
static int
getname(Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code < 0 || code >= 65536)
        return 0;

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            /* was "i > buflen", which let i == buflen through and
               wrote the separator one byte past the end of buffer */
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
/* Check whether `code` corresponds to the given name.

   The name of `code` is expanded with getname() and compared,
   ignoring the case of `name`, against the first `namelen` bytes of
   `name`.  Returns 1 on an exact match, 0 otherwise (including when
   `code` has no name). */
static int
cmpname(int code, const char* name, int namelen)
{
    int i;
    char buffer[NAME_MAXLEN];

    /* a candidate that cannot fit in the buffer together with its
       terminator cannot match; this also keeps buffer[namelen] below
       inside the array */
    if (namelen < 0 || namelen >= (int) sizeof(buffer))
        return 0;
    if (!getname(code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        /* stop at the end of the expanded name: bytes after the
           terminator were never written by getname() */
        if (buffer[i] == '\0')
            return 0;
        /* toupper() needs an unsigned char value; a negative plain
           char is undefined behaviour */
        if (toupper((unsigned char) name[i]) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}
/* Find the code point whose name equals the first `namelen` bytes of
   `name` (compared case-insensitively via cmpname).

   On success stores the code point in *code and returns 1.  Returns 0
   when the name is not in the table; *code is left untouched then. */
static int
getcode(const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (cmpname(v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            /* empty slot: not found.  this used to return -1, which
               callers testing "!getcode(...)" took for success and
               then read an unset *code */
            return 0;
        if (cmpname(v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
/* C-level API table: the size of the structure followed by pointers to
   the getname and getcode functions defined above.  The module init
   function publishes its address as the "Unicode_Names_CAPI"
   attribute. */
static const _PyUnicode_Name_CAPI hashAPI =
{
sizeof(_PyUnicode_Name_CAPI),
getname,
getcode
};
/* -------------------------------------------------------------------- */
/* Python bindings */
/* getname(code) -> string

   Python binding: takes an integer code point and returns the
   character's name as a string.  Raises ValueError when getname()
   reports that the code has no name. */
static PyObject *
ucnhash_getname(PyObject* self, PyObject* args)
{
    int codepoint;
    char namebuf[NAME_MAXLEN];

    if (!PyArg_ParseTuple(args, "i", &codepoint))
        return NULL;

    if (getname((Py_UCS4) codepoint, namebuf, sizeof(namebuf)))
        return Py_BuildValue("s", namebuf);

    PyErr_SetString(PyExc_ValueError, "undefined character code");
    return NULL;
}
/* getcode(name) -> integer

   Python binding: takes a character name and returns its code point
   as an integer.  Raises ValueError when the name is unknown. */
static PyObject *
ucnhash_getcode(PyObject* self, PyObject* args)
{
    Py_UCS4 code = 0;
    char* name;
    int namelen;

    if (!PyArg_ParseTuple(args, "s#", &name, &namelen))
        return NULL;

    /* getcode() returns 1 on success but can return either 0 or -1
       for an unknown name; "!getcode(...)" let the -1 case through
       and used `code` uninitialized */
    if (getcode(name, namelen, &code) <= 0) {
        PyErr_SetString(PyExc_ValueError, "undefined character name");
        return NULL;
    }

    return Py_BuildValue("i", code);
}
/* Method table for the ucnhash module: maps the Python-level names
   "getname" and "getcode" to their C implementations.  The table is
   terminated by a NULL entry.
   NOTE(review): the flag value 1 is presumably METH_VARARGS -- confirm. */
static
PyMethodDef ucnhash_methods[] =
{
{"getname", ucnhash_getname, 1},
{"getcode", ucnhash_getcode, 1},
{NULL, NULL},
};
static char *ucnhash_docstring = "ucnhash hash function module";
static char *ucnhash_docstring = "ucnhash hash function module (obsolete)";
/* Create PyMethodObjects and register them in the module's dict */
DL_EXPORT(void)
initucnhash(void)
{
PyObject *m, *d, *v;
m = Py_InitModule4(
Py_InitModule4(
"ucnhash", /* Module name */
ucnhash_methods, /* Method list */
ucnhash_docstring, /* Module doc-string */
(PyObject *)NULL, /* always pass this as *self */
PYTHON_API_VERSION); /* API Version */
if (!m)
return;
d = PyModule_GetDict(m);
if (!d)
return;
/* Export C API */
v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
PyDict_SetItemString(d, "Unicode_Names_CAPI", v);
Py_XDECREF(v);
}

View File

@ -12,6 +12,9 @@
------------------------------------------------------------------------ */
#include "Python.h"
#include "ucnhash.h"
/* character properties */
typedef struct {
const unsigned char category; /* index into
@ -52,8 +55,7 @@ unicodedata_decimal(PyObject *self, PyObject *args)
PyObject *defobj = NULL;
long rc;
if (!PyArg_ParseTuple(args, "O!|O:decimal",
&PyUnicode_Type, &v, &defobj))
if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &v, &defobj))
return NULL;
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
@ -82,8 +84,7 @@ unicodedata_digit(PyObject *self, PyObject *args)
PyObject *defobj = NULL;
long rc;
if (!PyArg_ParseTuple(args, "O!|O:digit",
&PyUnicode_Type, &v, &defobj))
if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &v, &defobj))
return NULL;
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
@ -93,8 +94,7 @@ unicodedata_digit(PyObject *self, PyObject *args)
rc = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(v));
if (rc < 0) {
if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError,
"not a digit");
PyErr_SetString(PyExc_ValueError, "not a digit");
return NULL;
}
else {
@ -112,8 +112,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
PyObject *defobj = NULL;
double rc;
if (!PyArg_ParseTuple(args, "O!|O:numeric",
&PyUnicode_Type, &v, &defobj))
if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &v, &defobj))
return NULL;
if (PyUnicode_GET_SIZE(v) != 1) {
PyErr_SetString(PyExc_TypeError,
@ -123,8 +122,7 @@ unicodedata_numeric(PyObject *self, PyObject *args)
rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));
if (rc < 0) {
if (defobj == NULL) {
PyErr_SetString(PyExc_ValueError,
"not a numeric character");
PyErr_SetString(PyExc_ValueError, "not a numeric character");
return NULL;
}
else {
@ -252,22 +250,231 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
return PyString_FromString(decomp);
}
/* -------------------------------------------------------------------- */
/* unicode character name tables */
/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"
/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */
/* Compute the case-insensitive hash of the first `len` bytes of `s`.

   Each byte is upper-cased and mixed into the running value as
   h = h * scale + byte.  Whenever bits 24..31 of the result become
   non-zero they are XORed back into the low byte and the value is
   masked down to 24 bits.  Returns the final hash value. */
static unsigned long
gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;

    for (i = 0; i < len; i++) {
        /* convert to unsigned char *before* calling toupper(): passing
           a negative plain-char value to a <ctype.h> function is
           undefined behaviour */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix>>24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
/* Look up the character name for `code` and copy it into `buffer`,
   which has room for `buflen` bytes.

   The name is stored as a sequence of word indices in the phrasebook;
   each word is copied from the lexicon, with a single space between
   words.  The byte that ends the last word is 0x80, which is stored
   as 0 and so terminates the string.

   Returns 1 on success, 0 if the code is out of range, has no
   phrasebook entry, or the name does not fit in the buffer. */
static int
getname(Py_UCS4 code, char* buffer, int buflen)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code < 0 || code >= 65536)
        return 0;

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                                (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            /* was "i > buflen", which let i == buflen through and
               wrote the separator one byte past the end of buffer */
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
/* Check whether `code` corresponds to the given name.

   The name of `code` is expanded with getname() and compared,
   ignoring the case of `name`, against the first `namelen` bytes of
   `name`.  Returns 1 on an exact match, 0 otherwise (including when
   `code` has no name). */
static int
cmpname(int code, const char* name, int namelen)
{
    int i;
    char buffer[NAME_MAXLEN];

    /* a candidate that cannot fit in the buffer together with its
       terminator cannot match; this also keeps buffer[namelen] below
       inside the array */
    if (namelen < 0 || namelen >= (int) sizeof(buffer))
        return 0;
    if (!getname(code, buffer, sizeof(buffer)))
        return 0;
    for (i = 0; i < namelen; i++) {
        /* stop at the end of the expanded name: bytes after the
           terminator were never written by getname() */
        if (buffer[i] == '\0')
            return 0;
        /* toupper() needs an unsigned char value; a negative plain
           char is undefined behaviour */
        if (toupper((unsigned char) name[i]) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}
/* Find the code point whose name equals the first `namelen` bytes of
   `name` (compared case-insensitively via cmpname).

   On success stores the code point in *code and returns 1.  Returns 0
   when the name is not in the table; *code is left untouched then. */
static int
getcode(const char* name, int namelen, Py_UCS4* code)
{
    unsigned int h, v;
    unsigned int mask = code_size-1;
    unsigned int i, incr;

    /* the following is the same as python's dictionary lookup, with
       only minor changes.  see the makeunicodedata script for more
       details */

    h = (unsigned int) gethash(name, namelen, code_magic);
    i = (~h) & mask;
    v = code_hash[i];
    if (!v)
        return 0;
    if (cmpname(v, name, namelen)) {
        *code = v;
        return 1;
    }
    incr = (h ^ (h >> 3)) & mask;
    if (!incr)
        incr = mask;
    for (;;) {
        i = (i + incr) & mask;
        v = code_hash[i];
        if (!v)
            /* empty slot: not found.  this used to return -1, which
               callers testing "!getcode(...)" took for success and
               then read an unset *code */
            return 0;
        if (cmpname(v, name, namelen)) {
            *code = v;
            return 1;
        }
        incr = incr << 1;
        if (incr > mask)
            incr = incr ^ code_poly;
    }
}
/* C-level API table: the size of the structure followed by pointers to
   the getname and getcode functions defined above.  The module init
   function publishes its address as the "ucnhash_CAPI" attribute. */
static const _PyUnicode_Name_CAPI hashAPI =
{
sizeof(_PyUnicode_Name_CAPI),
getname,
getcode
};
/* -------------------------------------------------------------------- */
/* Python bindings */
/* name(unichr[, default]) -> string

   Python binding: returns the name of a single Unicode character.
   If the character has no name, returns a new reference to `default`
   when one was given and raises ValueError otherwise.  Raises
   TypeError if the argument is not exactly one character long. */
static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    PyObject* fallback = NULL;
    PyUnicodeObject* ch;
    char namebuf[NAME_MAXLEN];

    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &ch, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    /* common case first: the character has a name */
    if (getname((Py_UCS4) *PyUnicode_AS_UNICODE(ch), namebuf, sizeof(namebuf)))
        return Py_BuildValue("s", namebuf);

    if (fallback != NULL) {
        Py_INCREF(fallback);
        return fallback;
    }

    PyErr_SetString(PyExc_ValueError, "no such name");
    return NULL;
}
/* lookup(name) -> unicode string

   Python binding: takes a character name and returns the matching
   character as a one-element Unicode string.  Raises KeyError when
   the name is unknown. */
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code = 0;
    Py_UNICODE str[1];
    char* name;
    int namelen;

    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    /* getcode() returns 1 on success but can return either 0 or -1
       for an unknown name; "!getcode(...)" let the -1 case through
       and used `code` uninitialized */
    if (getcode(name, namelen, &code) <= 0) {
        PyErr_SetString(PyExc_KeyError, "undefined character name");
        return NULL;
    }

    str[0] = (Py_UNICODE) code;
    return PyUnicode_FromUnicode(str, 1);
}
/* XXX Add doc strings. */
static PyMethodDef unicodedata_functions[] = {
{"decimal", unicodedata_decimal, 1},
{"digit", unicodedata_digit, 1},
{"numeric", unicodedata_numeric, 1},
{"category", unicodedata_category, 1},
{"bidirectional", unicodedata_bidirectional, 1},
{"combining", unicodedata_combining, 1},
{"mirrored", unicodedata_mirrored, 1},
{"decomposition", unicodedata_decomposition, 1},
{"decimal", unicodedata_decimal, METH_VARARGS},
{"digit", unicodedata_digit, METH_VARARGS},
{"numeric", unicodedata_numeric, METH_VARARGS},
{"category", unicodedata_category, METH_VARARGS},
{"bidirectional", unicodedata_bidirectional, METH_VARARGS},
{"combining", unicodedata_combining, METH_VARARGS},
{"mirrored", unicodedata_mirrored, METH_VARARGS},
{"decomposition",unicodedata_decomposition, METH_VARARGS},
{"name", unicodedata_name, METH_VARARGS},
{"lookup", unicodedata_lookup, METH_VARARGS},
{NULL, NULL} /* sentinel */
};
static char *unicodedata_docstring = "unicode character database";
/* Module initialization: create the "unicodedata" module (once) with
   its method table and docstring, then publish the name-lookup C API
   table as the module attribute "ucnhash_CAPI".  Returns silently if
   the module or its dictionary cannot be obtained. */
DL_EXPORT(void)
initunicodedata(void)
{
    PyObject *m, *d, *v;

    m = Py_InitModule4(
        "unicodedata", unicodedata_functions,
        unicodedata_docstring, NULL, PYTHON_API_VERSION);
    if (!m)
        return;

    d = PyModule_GetDict(m);
    if (!d)
        return;

    /* Export C API */
    v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
    /* only store the object if it was actually created; a NULL value
       must not be handed to PyDict_SetItemString */
    if (v != NULL) {
        PyDict_SetItemString(d, "ucnhash_CAPI", v);
        Py_DECREF(v);
    }
}

View File

@ -1103,7 +1103,7 @@ int unicodeescape_decoding_error(const char **source,
}
}
static _PyUnicode_Name_CAPI *unicode_names = NULL;
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
int size,
@ -1236,18 +1236,18 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
/* Ok, we need to deal with Unicode Character Names now,
* make sure we've imported the hash table data...
*/
if (unicode_names == NULL) {
if (ucnhash_CAPI == NULL) {
PyObject *mod = 0, *v = 0;
mod = PyImport_ImportModule("ucnhash");
mod = PyImport_ImportModule("unicodedata");
if (mod == NULL)
goto ucnhashError;
v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI");
v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
Py_DECREF(mod);
if (v == NULL)
goto ucnhashError;
unicode_names = PyCObject_AsVoidPtr(v);
ucnhash_CAPI = PyCObject_AsVoidPtr(v);
Py_DECREF(v);
if (unicode_names == NULL)
if (ucnhash_CAPI == NULL)
goto ucnhashError;
}
@ -1259,7 +1259,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
while (*endBrace != '}' && endBrace < end)
endBrace++;
if (endBrace != end && *endBrace == '}') {
if (!unicode_names->getcode(start, endBrace-start, &chr)) {
if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
if (unicodeescape_decoding_error(
&s, &x, errors,
"Invalid Unicode Character Name")
@ -1312,8 +1312,10 @@ store:
return (PyObject *)v;
ucnhashError:
PyErr_SetString(PyExc_UnicodeError,
"\\N escapes not supported (can't load ucnhash module)");
PyErr_SetString(
PyExc_UnicodeError,
"\\N escapes not supported (can't load unicodedata module)"
);
return NULL;
onError: