478 lines
12 KiB
C
478 lines
12 KiB
C
/* ------------------------------------------------------------------------

   unicodedata -- Provides access to the Unicode 3.0 database.

   Data was extracted from the Unicode 3.0 UnicodeData.txt file.

   Written by Marc-Andre Lemburg (mal@lemburg.com).
   Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)

   Copyright (c) Corporation for National Research Initiatives.

   ------------------------------------------------------------------------ */
|
|
|
|
#include "Python.h"
|
|
#include "ucnhash.h"
|
|
|
|
/* character properties */
|
|
|
|
/* One property record per distinct combination of character properties.
   All four fields are single bytes. */
typedef struct {
    /* index into _PyUnicode_CategoryNames */
    const unsigned char category;
    /* combining class value 0 - 255 */
    const unsigned char combining;
    /* index into _PyUnicode_BidirectionalNames */
    const unsigned char bidirectional;
    /* true if mirrored in bidir mode */
    const unsigned char mirrored;
} _PyUnicode_DatabaseRecord;
|
|
|
|
/* data file generated by Tools/unicode/makeunicodedata.py */
|
|
#include "unicodedata_db.h"
|
|
|
|
static const _PyUnicode_DatabaseRecord*
|
|
_getrecord(PyUnicodeObject* v)
|
|
{
|
|
int code;
|
|
int index;
|
|
|
|
code = (int) *PyUnicode_AS_UNICODE(v);
|
|
|
|
if (code < 0 || code >= 0x110000)
|
|
index = 0;
|
|
else {
|
|
index = index1[(code>>SHIFT)];
|
|
index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
|
|
}
|
|
|
|
return &_PyUnicode_Database_Records[index];
|
|
}
|
|
|
|
/* --- Module API --------------------------------------------------------- */
|
|
|
|
/* decimal(unichr[, default])

   Return the decimal value of a one-character unicode string as an int.
   When the character has no decimal value, return a new reference to
   default if it was supplied, otherwise raise ValueError.  Raises
   TypeError when the string is not exactly one character long. */
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    PyObject *fallback = NULL;
    long value;

    if (!PyArg_ParseTuple(args, "O!|O:decimal",
                          &PyUnicode_Type, &ch, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    value = Py_UNICODE_TODECIMAL(*PyUnicode_AS_UNICODE(ch));
    if (value >= 0)
        return PyInt_FromLong(value);

    /* no decimal value: fall back to the caller's default, if any */
    if (fallback != NULL) {
        Py_INCREF(fallback);
        return fallback;
    }
    PyErr_SetString(PyExc_ValueError,
                    "not a decimal");
    return NULL;
}
|
|
|
|
/* digit(unichr[, default])

   Return the digit value of a one-character unicode string as an int.
   When the character has no digit value, return a new reference to
   default if it was supplied, otherwise raise ValueError.  Raises
   TypeError when the string is not exactly one character long. */
static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    PyObject *fallback = NULL;
    long value;

    if (!PyArg_ParseTuple(args, "O!|O:digit",
                          &PyUnicode_Type, &ch, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    value = Py_UNICODE_TODIGIT(*PyUnicode_AS_UNICODE(ch));
    if (value >= 0)
        return PyInt_FromLong(value);

    /* no digit value: fall back to the caller's default, if any */
    if (fallback != NULL) {
        Py_INCREF(fallback);
        return fallback;
    }
    PyErr_SetString(PyExc_ValueError, "not a digit");
    return NULL;
}
|
|
|
|
/* numeric(unichr[, default])

   Return the numeric value of a one-character unicode string as a float.
   When the character has no numeric value, return a new reference to
   default if it was supplied, otherwise raise ValueError.  Raises
   TypeError when the string is not exactly one character long. */
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *v;
    PyObject *defobj = NULL;
    double rc;

    if (!PyArg_ParseTuple(args, "O!|O:numeric",
                          &PyUnicode_Type, &v, &defobj))
        return NULL;

    if (PyUnicode_GET_SIZE(v) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rc = Py_UNICODE_TONUMERIC(*PyUnicode_AS_UNICODE(v));

    /* The "no numeric value" sentinel is exactly -1.0.  Testing for any
       negative result would also reject characters whose real numeric
       value is negative. */
    if (rc == -1.0) {
        if (defobj == NULL) {
            PyErr_SetString(PyExc_ValueError, "not a numeric character");
            return NULL;
        }
        Py_INCREF(defobj);
        return defobj;
    }
    return PyFloat_FromDouble(rc);
}
|
|
|
|
/* category(unichr)

   Return the general category name of a one-character unicode string,
   looked up in _PyUnicode_CategoryNames.  Raises TypeError when the
   string is not exactly one character long. */
static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    const _PyUnicode_DatabaseRecord *rec;

    if (!PyArg_ParseTuple(args, "O!:category", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rec = _getrecord(ch);
    return PyString_FromString(_PyUnicode_CategoryNames[(int) rec->category]);
}
|
|
|
|
/* bidirectional(unichr)

   Return the bidirectional category name of a one-character unicode
   string, looked up in _PyUnicode_BidirectionalNames.  Raises TypeError
   when the string is not exactly one character long. */
static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    const _PyUnicode_DatabaseRecord *rec;

    if (!PyArg_ParseTuple(args, "O!:bidirectional", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rec = _getrecord(ch);
    return PyString_FromString(
        _PyUnicode_BidirectionalNames[(int) rec->bidirectional]);
}
|
|
|
|
/* combining(unichr)

   Return the combining class of a one-character unicode string as an
   int.  Raises TypeError when the string is not exactly one character
   long. */
static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    const _PyUnicode_DatabaseRecord *rec;

    if (!PyArg_ParseTuple(args, "O!:combining", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rec = _getrecord(ch);
    return PyInt_FromLong((int) rec->combining);
}
|
|
|
|
/* mirrored(unichr)

   Return the mirrored flag of a one-character unicode string as an int.
   Raises TypeError when the string is not exactly one character long. */
static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    const _PyUnicode_DatabaseRecord *rec;

    if (!PyArg_ParseTuple(args, "O!:mirrored", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    rec = _getrecord(ch);
    return PyInt_FromLong((int) rec->mirrored);
}
|
|
|
|
/* decomposition(unichr)

   Return the decomposition mapping of a one-character unicode string as
   a string: an optional prefix followed by space-separated, upper-case
   hexadecimal values of at least four digits.  Raises TypeError when the
   string is not exactly one character long. */
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ch;
    char text[256];
    const char *prefix;
    int cp, slot, remaining, pos;

    if (!PyArg_ParseTuple(args, "O!:decomposition", &PyUnicode_Type, &ch))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    cp = (int) *PyUnicode_AS_UNICODE(ch);

    /* two-stage lookup; out-of-range values use entry 0 */
    slot = 0;
    if (cp >= 0 && cp < 0x110000) {
        slot = decomp_index1[cp >> DECOMP_SHIFT];
        slot = decomp_index2[(slot << DECOMP_SHIFT) +
                             (cp & ((1 << DECOMP_SHIFT) - 1))];
    }

    /* header entry: the high byte is the number of values that follow
       (usually one or two), the low byte is the prefix code, used as an
       index into decomp_prefix */
    remaining = decomp_data[slot] >> 8;
    prefix = decomp_prefix[decomp_data[slot] & 255];

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* copy prefix (without its terminator) */
    pos = strlen(prefix);
    memcpy(text, prefix, pos);

    /* append each value, separated by a space from whatever precedes it */
    for (; remaining > 0; remaining--) {
        if (pos)
            text[pos++] = ' ';
        assert((size_t)pos < sizeof(text));
        slot++;
        PyOS_snprintf(text + pos, sizeof(text) - pos, "%04X",
                      decomp_data[slot]);
        pos += strlen(text + pos);
    }

    text[pos] = '\0';

    return PyString_FromString(text);
}
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
/* unicode character name tables */
|
|
|
|
/* data file generated by Tools/unicode/makeunicodedata.py */
|
|
#include "unicodename_db.h"
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
/* database code (cut and pasted from the unidb package) */
|
|
|
|
/* Case-insensitive hash of the first len bytes of s.

   Each byte is upper-cased and mixed in as h = h * scale + byte.
   Whenever any of bits 24..31 become set, that byte is XORed into the
   low bits and the value is reduced to 24 bits again.

   s      bytes to hash (need not be NUL-terminated)
   len    number of bytes to hash
   scale  multiplier applied before each byte is added

   Returns the hash value. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    int i;
    unsigned long h = 0;
    unsigned long ix;

    for (i = 0; i < len; i++) {
        /* toupper() is only defined for values representable as
           unsigned char (or EOF); plain char may be negative for
           bytes >= 0x80, so convert before the call */
        h = (h * scale) + (unsigned char) toupper((unsigned char) s[i]);
        ix = h & 0xff000000;
        if (ix)
            h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff;
    }
    return h;
}
|
|
|
|
static int
|
|
_getucname(Py_UCS4 code, char* buffer, int buflen)
|
|
{
|
|
int offset;
|
|
int i;
|
|
int word;
|
|
unsigned char* w;
|
|
|
|
if (code >= 0x110000)
|
|
return 0;
|
|
|
|
/* get offset into phrasebook */
|
|
offset = phrasebook_offset1[(code>>phrasebook_shift)];
|
|
offset = phrasebook_offset2[(offset<<phrasebook_shift) +
|
|
(code&((1<<phrasebook_shift)-1))];
|
|
if (!offset)
|
|
return 0;
|
|
|
|
i = 0;
|
|
|
|
for (;;) {
|
|
/* get word index */
|
|
word = phrasebook[offset] - phrasebook_short;
|
|
if (word >= 0) {
|
|
word = (word << 8) + phrasebook[offset+1];
|
|
offset += 2;
|
|
} else
|
|
word = phrasebook[offset++];
|
|
if (i) {
|
|
if (i > buflen)
|
|
return 0; /* buffer overflow */
|
|
buffer[i++] = ' ';
|
|
}
|
|
/* copy word string from lexicon. the last character in the
|
|
word has bit 7 set. the last word in a string ends with
|
|
0x80 */
|
|
w = lexicon + lexicon_offset[word];
|
|
while (*w < 128) {
|
|
if (i >= buflen)
|
|
return 0; /* buffer overflow */
|
|
buffer[i++] = *w++;
|
|
}
|
|
if (i >= buflen)
|
|
return 0; /* buffer overflow */
|
|
buffer[i++] = *w & 127;
|
|
if (*w == 128)
|
|
break; /* end of word */
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
static int
|
|
_cmpname(int code, const char* name, int namelen)
|
|
{
|
|
/* check if code corresponds to the given name */
|
|
int i;
|
|
char buffer[NAME_MAXLEN];
|
|
if (!_getucname(code, buffer, sizeof(buffer)))
|
|
return 0;
|
|
for (i = 0; i < namelen; i++) {
|
|
if (toupper(name[i]) != buffer[i])
|
|
return 0;
|
|
}
|
|
return buffer[namelen] == '\0';
|
|
}
|
|
|
|
static int
|
|
_getcode(const char* name, int namelen, Py_UCS4* code)
|
|
{
|
|
unsigned int h, v;
|
|
unsigned int mask = code_size-1;
|
|
unsigned int i, incr;
|
|
|
|
/* the following is the same as python's dictionary lookup, with
|
|
only minor changes. see the makeunicodedata script for more
|
|
details */
|
|
|
|
h = (unsigned int) _gethash(name, namelen, code_magic);
|
|
i = (~h) & mask;
|
|
v = code_hash[i];
|
|
if (!v)
|
|
return 0;
|
|
if (_cmpname(v, name, namelen)) {
|
|
*code = v;
|
|
return 1;
|
|
}
|
|
incr = (h ^ (h >> 3)) & mask;
|
|
if (!incr)
|
|
incr = mask;
|
|
for (;;) {
|
|
i = (i + incr) & mask;
|
|
v = code_hash[i];
|
|
if (!v)
|
|
return 0;
|
|
if (_cmpname(v, name, namelen)) {
|
|
*code = v;
|
|
return 1;
|
|
}
|
|
incr = incr << 1;
|
|
if (incr > mask)
|
|
incr = incr ^ code_poly;
|
|
}
|
|
}
|
|
|
|
static const _PyUnicode_Name_CAPI hashAPI =
|
|
{
|
|
sizeof(_PyUnicode_Name_CAPI),
|
|
_getucname,
|
|
_getcode
|
|
};
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
/* Python bindings */
|
|
|
|
/* name(unichr[, default])

   Return the name of a one-character unicode string as a string.  When
   the character has no name, return a new reference to default if it
   was supplied, otherwise raise ValueError.  Raises TypeError when the
   string is not exactly one character long. */
static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char text[NAME_MAXLEN];
    PyUnicodeObject* ch;
    PyObject* fallback = NULL;

    if (!PyArg_ParseTuple(args, "O!|O:name",
                          &PyUnicode_Type, &ch, &fallback))
        return NULL;

    if (PyUnicode_GET_SIZE(ch) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return NULL;
    }

    if (_getucname((Py_UCS4) *PyUnicode_AS_UNICODE(ch),
                   text, sizeof(text)))
        return Py_BuildValue("s", text);

    /* no name available: fall back to the caller's default, if any */
    if (fallback != NULL) {
        Py_INCREF(fallback);
        return fallback;
    }
    PyErr_SetString(PyExc_ValueError, "no such name");
    return NULL;
}
|
|
|
|
/* lookup(name)

   Return the one-character unicode string whose character name is
   `name` (matched case-insensitively).  Raises KeyError when no such
   name exists. */
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    char* name;
    int namelen;
    Py_UCS4 code;
    Py_UNICODE result[1];

    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (_getcode(name, namelen, &code)) {
        /* NOTE(review): the cast narrows code to Py_UNICODE; confirm
           that every named code point fits on all build configurations */
        result[0] = (Py_UNICODE) code;
        return PyUnicode_FromUnicode(result, 1);
    }

    PyErr_SetString(PyExc_KeyError, "undefined character name");
    return NULL;
}
|
|
|
|
/* XXX Add doc strings. */
|
|
|
|
static PyMethodDef unicodedata_functions[] = {
|
|
{"decimal", unicodedata_decimal, METH_VARARGS},
|
|
{"digit", unicodedata_digit, METH_VARARGS},
|
|
{"numeric", unicodedata_numeric, METH_VARARGS},
|
|
{"category", unicodedata_category, METH_VARARGS},
|
|
{"bidirectional", unicodedata_bidirectional, METH_VARARGS},
|
|
{"combining", unicodedata_combining, METH_VARARGS},
|
|
{"mirrored", unicodedata_mirrored, METH_VARARGS},
|
|
{"decomposition",unicodedata_decomposition, METH_VARARGS},
|
|
{"name", unicodedata_name, METH_VARARGS},
|
|
{"lookup", unicodedata_lookup, METH_VARARGS},
|
|
{NULL, NULL} /* sentinel */
|
|
};
|
|
|
|
PyDoc_STRVAR(unicodedata_docstring, "unicode character database");
|
|
|
|
PyMODINIT_FUNC
|
|
initunicodedata(void)
|
|
{
|
|
PyObject *m, *v;
|
|
|
|
m = Py_InitModule3(
|
|
"unicodedata", unicodedata_functions, unicodedata_docstring);
|
|
if (!m)
|
|
return;
|
|
|
|
/* Export C API */
|
|
v = PyCObject_FromVoidPtr((void *) &hashAPI, NULL);
|
|
if (v != NULL)
|
|
PyModule_AddObject(m, "ucnhash_CAPI", v);
|
|
}
|