cpython/Modules/unicodedata.c

1383 lines
41 KiB
C

/* ------------------------------------------------------------------------
unicodedata -- Provides access to the Unicode database.
Data was extracted from the UnicodeData.txt file.
The current version number is reported in the unidata_version constant.
Written by Marc-Andre Lemburg (mal@lemburg.com).
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Modified by Martin v. Löwis (martin@v.loewis.de)
Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */
#include "Python.h"
#include "ucnhash.h"
#include "structmember.h"
/* character properties */
/* One row of the per-code-point property table.  _getrecord_ex() returns a
   pointer to the row that applies to a given code point; the accessors in
   this file read the individual fields from it. */
typedef struct {
const unsigned char category; /* index into
_PyUnicode_CategoryNames */
const unsigned char combining; /* combining class value 0 - 255 */
const unsigned char bidirectional; /* index into
_PyUnicode_BidirectionalNames */
const unsigned char mirrored; /* true if mirrored in bidir mode */
const unsigned char east_asian_width; /* index into
_PyUnicode_EastAsianWidth */
const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
/* Per-code-point differences between the current database and a previous
   Unicode version, as returned by a PreviousDBVersion's getrecord callback.
   The accessors in this file treat 0xFF in an unsigned char field as "no
   change" and category_changed == 0 as "unassigned in the old version". */
typedef struct change_record {
/* sequence of fields should be the same as in merge_old_version */
const unsigned char bidir_changed;
const unsigned char category_changed;
const unsigned char decimal_changed;
const unsigned char mirrored_changed;
/* NOTE(review): not read by any code visible in this file -- confirm
   whether unicodedata_numeric() is expected to consult it. */
const double numeric_changed;
} change_record;
/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodedata_db.h"
/* Return the property record that applies to a code point.

   Values outside the Unicode range (>= 0x110000) resolve to record 0;
   everything else is resolved through the two-level index1/index2 table
   generated into unicodedata_db.h. */
static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)
{
    int slot = 0;

    if (code < 0x110000) {
        slot = index1[(code>>SHIFT)];
        slot = index2[(slot<<SHIFT)+(code&((1<<SHIFT)-1))];
    }
    return &_PyUnicode_Database_Records[slot];
}
/* ------------- Previous-version API ------------------------------------- */
/* Python object giving access to an older Unicode database version.
   Instances are created by new_previous_version(). */
typedef struct previous_version {
PyObject_HEAD
/* version string; exposed to Python as "unidata_version" via DB_members */
const char *name;
/* returns the change_record describing how a code point differed */
const change_record* (*getrecord)(Py_UCS4);
/* returns a replacement code point to decompose instead, or 0 for none
   (see its use in nfd_nfkd()) */
Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;
/* Fetch the change_record for code point v from the previous-version
   database object self by calling its getrecord callback.  The argument is
   parenthesized so that an expression argument is cast as a whole. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
/* Read-only attributes of UCD objects: "unidata_version" exposes the
   PreviousDBVersion.name string. */
static PyMemberDef DB_members[] = {
{"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
{NULL}
};
/* forward declaration; the type object itself is defined after the method
   table further down in this file */
static PyTypeObject UCD_Type;
/* True when o is exactly a UCD instance (exact type comparison, so
   subclasses do not match).  The module-level functions use this to tell a
   previous-version object apart from whatever else is passed as self. */
#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
/* Create a UCD object describing a previous Unicode database version.

   name is stored as-is (not copied); getrecord and normalization are the
   callbacks consulted by the accessors in this file.  Returns a new
   reference, or NULL if the allocation failed. */
static PyObject*
new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
                     Py_UCS4 (*normalization)(Py_UCS4))
{
    PreviousDBVersion *db = PyObject_New(PreviousDBVersion, &UCD_Type);

    if (db != NULL) {
        db->name = name;
        db->getrecord = getrecord;
        db->normalization = normalization;
    }
    return (PyObject*)db;
}
/* Extract the single code point held by a one-character string.

   Returns (Py_UCS4)-1 on failure: either PyUnicode_READY() failed, or the
   string does not have length 1, in which case TypeError is set here.
   The string is readied exactly once; the length and character reads
   that follow rely on that. */
static Py_UCS4 getuchar(PyUnicodeObject *obj)
{
    if (PyUnicode_READY(obj))
        return (Py_UCS4)-1;
    if (PyUnicode_GET_LENGTH(obj) != 1) {
        PyErr_SetString(PyExc_TypeError,
                        "need a single Unicode character as parameter");
        return (Py_UCS4)-1;
    }
    return PyUnicode_READ_CHAR(obj, 0);
}
/* --- Module API --------------------------------------------------------- */
PyDoc_STRVAR(unicodedata_decimal__doc__,
"decimal(unichr[, default])\n\
\n\
Returns the decimal value assigned to the Unicode character unichr\n\
as integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");
/* Implementation of decimal().  When self is a previous-version object,
   its change record takes precedence over the current database: an
   unassigned character has no value, and a recorded old decimal value is
   used directly. */
static PyObject *
unicodedata_decimal(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ustr;
    PyObject *fallback = NULL;
    int resolved = 0;
    long value;
    Py_UCS4 ch;

    if (!PyArg_ParseTuple(args, "O!|O:decimal", &PyUnicode_Type, &ustr, &fallback))
        return NULL;
    ch = getuchar(ustr);
    if (ch == (Py_UCS4)-1)
        return NULL;

    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, ch);
        if (old->category_changed == 0) {
            /* unassigned */
            value = -1;
            resolved = 1;
        }
        else if (old->decimal_changed != 0xFF) {
            value = old->decimal_changed;
            resolved = 1;
        }
    }
    if (!resolved)
        value = Py_UNICODE_TODECIMAL(ch);

    if (value >= 0)
        return PyLong_FromLong(value);
    if (fallback != NULL) {
        Py_INCREF(fallback);
        return fallback;
    }
    PyErr_SetString(PyExc_ValueError,
                    "not a decimal");
    return NULL;
}
PyDoc_STRVAR(unicodedata_digit__doc__,
"digit(unichr[, default])\n\
\n\
Returns the digit value assigned to the Unicode character unichr as\n\
integer. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");
/* Implementation of digit().  The value always comes from
   Py_UNICODE_TODIGIT(); a negative result means "no digit value". */
static PyObject *
unicodedata_digit(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ustr;
    PyObject *fallback = NULL;
    long value;
    Py_UCS4 ch;

    if (!PyArg_ParseTuple(args, "O!|O:digit", &PyUnicode_Type, &ustr, &fallback))
        return NULL;
    ch = getuchar(ustr);
    if (ch == (Py_UCS4)-1)
        return NULL;

    value = Py_UNICODE_TODIGIT(ch);
    if (value >= 0)
        return PyLong_FromLong(value);
    if (fallback != NULL) {
        Py_INCREF(fallback);
        return fallback;
    }
    PyErr_SetString(PyExc_ValueError, "not a digit");
    return NULL;
}
PyDoc_STRVAR(unicodedata_numeric__doc__,
"numeric(unichr[, default])\n\
\n\
Returns the numeric value assigned to the Unicode character unichr\n\
as float. If no such value is defined, default is returned, or, if\n\
not given, ValueError is raised.");
/* Implementation of numeric().  -1.0 is the in-band marker for "no numeric
   value".  When self is a previous-version object its change record is
   consulted first.
   NOTE(review): the old-version branch reads decimal_changed even though
   change_record also carries numeric_changed -- confirm this is intended. */
static PyObject *
unicodedata_numeric(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ustr;
    PyObject *fallback = NULL;
    int resolved = 0;
    double value;
    Py_UCS4 ch;

    if (!PyArg_ParseTuple(args, "O!|O:numeric", &PyUnicode_Type, &ustr, &fallback))
        return NULL;
    ch = getuchar(ustr);
    if (ch == (Py_UCS4)-1)
        return NULL;

    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, ch);
        if (old->category_changed == 0) {
            /* unassigned */
            value = -1.0;
            resolved = 1;
        }
        else if (old->decimal_changed != 0xFF) {
            value = old->decimal_changed;
            resolved = 1;
        }
    }
    if (!resolved)
        value = Py_UNICODE_TONUMERIC(ch);

    if (value != -1.0)
        return PyFloat_FromDouble(value);
    if (fallback != NULL) {
        Py_INCREF(fallback);
        return fallback;
    }
    PyErr_SetString(PyExc_ValueError, "not a numeric character");
    return NULL;
}
PyDoc_STRVAR(unicodedata_category__doc__,
"category(unichr)\n\
\n\
Returns the general category assigned to the Unicode character\n\
unichr as string.");
/* Implementation of category().  The current record supplies the category
   index unless a previous-version object records a different one. */
static PyObject *
unicodedata_category(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ustr;
    Py_UCS4 ch;
    int cat;

    if (!PyArg_ParseTuple(args, "O!:category",
                          &PyUnicode_Type, &ustr))
        return NULL;
    ch = getuchar(ustr);
    if (ch == (Py_UCS4)-1)
        return NULL;

    cat = (int) _getrecord_ex(ch)->category;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, ch);
        if (old->category_changed != 0xFF)
            cat = old->category_changed;
    }
    return PyUnicode_FromString(_PyUnicode_CategoryNames[cat]);
}
PyDoc_STRVAR(unicodedata_bidirectional__doc__,
"bidirectional(unichr)\n\
\n\
Returns the bidirectional class assigned to the Unicode character\n\
unichr as string. If no such value is defined, an empty string is\n\
returned.");
/* Implementation of bidirectional().  For a previous-version object, an
   unassigned character maps to name index 0 and a recorded old class
   overrides the current one. */
static PyObject *
unicodedata_bidirectional(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ustr;
    Py_UCS4 ch;
    int bidi;

    if (!PyArg_ParseTuple(args, "O!:bidirectional",
                          &PyUnicode_Type, &ustr))
        return NULL;
    ch = getuchar(ustr);
    if (ch == (Py_UCS4)-1)
        return NULL;

    bidi = (int) _getrecord_ex(ch)->bidirectional;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, ch);
        if (old->category_changed == 0)
            bidi = 0; /* unassigned */
        else if (old->bidir_changed != 0xFF)
            bidi = old->bidir_changed;
    }
    return PyUnicode_FromString(_PyUnicode_BidirectionalNames[bidi]);
}
PyDoc_STRVAR(unicodedata_combining__doc__,
"combining(unichr)\n\
\n\
Returns the canonical combining class assigned to the Unicode\n\
character unichr as integer. Returns 0 if no combining class is\n\
defined.");
/* Implementation of combining().  A character that was unassigned in the
   requested previous version reports class 0. */
static PyObject *
unicodedata_combining(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ustr;
    Py_UCS4 ch;
    int ccc;

    if (!PyArg_ParseTuple(args, "O!:combining",
                          &PyUnicode_Type, &ustr))
        return NULL;
    ch = getuchar(ustr);
    if (ch == (Py_UCS4)-1)
        return NULL;

    ccc = (int) _getrecord_ex(ch)->combining;
    if (self && UCD_Check(self)
        && get_old_record(self, ch)->category_changed == 0) {
        ccc = 0; /* unassigned */
    }
    return PyLong_FromLong(ccc);
}
PyDoc_STRVAR(unicodedata_mirrored__doc__,
"mirrored(unichr)\n\
\n\
Returns the mirrored property assigned to the Unicode character\n\
unichr as integer. Returns 1 if the character has been identified as\n\
a \"mirrored\" character in bidirectional text, 0 otherwise.");
/* Implementation of mirrored().  For a previous-version object, an
   unassigned character reports 0 and a recorded old flag overrides the
   current one. */
static PyObject *
unicodedata_mirrored(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ustr;
    Py_UCS4 ch;
    int flag;

    if (!PyArg_ParseTuple(args, "O!:mirrored",
                          &PyUnicode_Type, &ustr))
        return NULL;
    ch = getuchar(ustr);
    if (ch == (Py_UCS4)-1)
        return NULL;

    flag = (int) _getrecord_ex(ch)->mirrored;
    if (self && UCD_Check(self)) {
        const change_record *old = get_old_record(self, ch);
        if (old->category_changed == 0)
            flag = 0; /* unassigned */
        else if (old->mirrored_changed != 0xFF)
            flag = old->mirrored_changed;
    }
    return PyLong_FromLong(flag);
}
PyDoc_STRVAR(unicodedata_east_asian_width__doc__,
"east_asian_width(unichr)\n\
\n\
Returns the east asian width assigned to the Unicode character\n\
unichr as string.");
/* Implementation of east_asian_width().  A character that was unassigned
   in the requested previous version maps to name index 0. */
static PyObject *
unicodedata_east_asian_width(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ustr;
    Py_UCS4 ch;
    int width;

    if (!PyArg_ParseTuple(args, "O!:east_asian_width",
                          &PyUnicode_Type, &ustr))
        return NULL;
    ch = getuchar(ustr);
    if (ch == (Py_UCS4)-1)
        return NULL;

    width = (int) _getrecord_ex(ch)->east_asian_width;
    if (self && UCD_Check(self)
        && get_old_record(self, ch)->category_changed == 0) {
        width = 0; /* unassigned */
    }
    return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[width]);
}
PyDoc_STRVAR(unicodedata_decomposition__doc__,
"decomposition(unichr)\n\
\n\
Returns the character decomposition mapping assigned to the Unicode\n\
character unichr as string. An empty string is returned in case no\n\
such mapping is defined.");
/* Implementation of decomposition().  Formats the decomposition as the
   prefix string (possibly empty) followed by the mapped code points in
   upper-case hex, separated by single spaces. */
static PyObject *
unicodedata_decomposition(PyObject *self, PyObject *args)
{
    PyUnicodeObject *ustr;
    char text[256];
    int cp, slot, remaining;
    size_t used;
    unsigned int prefix_slot;
    Py_UCS4 ch;

    if (!PyArg_ParseTuple(args, "O!:decomposition",
                          &PyUnicode_Type, &ustr))
        return NULL;
    ch = getuchar(ustr);
    if (ch == (Py_UCS4)-1)
        return NULL;
    cp = (int)ch;

    if (self && UCD_Check(self)
        && get_old_record(self, ch)->category_changed == 0) {
        return PyUnicode_FromString(""); /* unassigned */
    }

    if (cp >= 0 && cp < 0x110000) {
        slot = decomp_index1[(cp>>DECOMP_SHIFT)];
        slot = decomp_index2[(slot<<DECOMP_SHIFT)+
                             (cp&((1<<DECOMP_SHIFT)-1))];
    }
    else {
        slot = 0;
    }

    /* The header word at decomp_data[slot] holds the number of code points
       that follow in its high byte and the decomp_prefix index in its low
       byte. */
    remaining = decomp_data[slot] >> 8;

    /* Based on how slot is calculated above and decomp_data is generated
       from Tools/unicode/makeunicodedata.py, it should not be possible
       to overflow decomp_prefix. */
    prefix_slot = decomp_data[slot] & 255;
    assert(prefix_slot < Py_ARRAY_LENGTH(decomp_prefix));

    /* XXX: could allocate the PyString up front instead
       (strlen(prefix) + 5 * count + 1 bytes) */

    /* Start with the prefix; it is copied without its terminator because
       the length is tracked in `used`. */
    used = strlen(decomp_prefix[prefix_slot]);
    memcpy(text, decomp_prefix[prefix_slot], used);

    for (; remaining > 0; remaining--) {
        if (used)
            text[used++] = ' ';
        assert(used < sizeof(text));
        slot++;
        PyOS_snprintf(text + used, sizeof(text) - used, "%04X",
                      decomp_data[slot]);
        used += strlen(text + used);
    }
    return PyUnicode_FromStringAndSize(text, used);
}
/* Locate the decomposition entry for a code point.

   On return *count is the number of code points in the decomposition,
   *prefix is the decomp_prefix index, and *index is the position in
   decomp_data of the first mapped code point.  Code points outside the
   Unicode range, or unassigned in the previous version selected by self,
   use entry 0. */
static void
get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
{
    int slot = 0;
    int use_table = 1;

    if (code >= 0x110000) {
        use_table = 0;
    }
    else if (self && UCD_Check(self) &&
             get_old_record(self, code)->category_changed==0) {
        /* unassigned in old version */
        use_table = 0;
    }

    if (use_table) {
        slot = decomp_index1[(code>>DECOMP_SHIFT)];
        slot = decomp_index2[(slot<<DECOMP_SHIFT)+
                             (code&((1<<DECOMP_SHIFT)-1))];
    }

    /* header word: high byte is the number of code points that follow,
       low byte is the prefix code (index into decomp_prefix) */
    *count = decomp_data[slot] >> 8;
    *prefix = decomp_data[slot] & 255;
    *index = slot + 1;
}
#define SBase 0xAC00
#define LBase 0x1100
#define VBase 0x1161
#define TBase 0x11A7
#define LCount 19
#define VCount 21
#define TCount 28
#define NCount (VCount*TCount)
#define SCount (LCount*NCount)
/* Decompose `input` and put the result into canonical order.

   k == 0 applies only decompositions without a prefix (NFD); a nonzero k
   also applies prefixed (compatibility) decompositions (NFKD).  When self
   is a previous-version object, its normalization callback and change
   records are consulted as well.

   Returns a new string, or NULL with an exception set on allocation
   failure.  The input is read with PyUnicode_KIND/PyUnicode_DATA, so it
   must already be ready; the canonical-ordering pass reads index 0 of the
   result unconditionally, so callers are expected to pass a non-empty
   string (unicodedata_normalize() special-cases empty input). */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
PyObject *result;
Py_UCS4 *output;
Py_ssize_t i, o, osize;
int kind;
void *data;
/* Longest decomposition in Unicode 3.2: U+FDFA */
/* Work stack of code points still to be decomposed; nothing below checks
   stackptr against its size, so it relies on the generated data. */
Py_UCS4 stack[20];
Py_ssize_t space, isize;
int index, prefix, count, stackptr;
unsigned char prev, cur;
stackptr = 0;
isize = PyUnicode_GET_LENGTH(input);
/* `space` counts the free slots left in `output`; `osize` is its total
   capacity. */
space = isize;
/* Overallocate at most 10 characters. */
if (space > 10) {
if (space <= PY_SSIZE_T_MAX - 10)
space += 10;
}
else {
space *= 2;
}
osize = space;
output = PyMem_NEW(Py_UCS4, space);
if (!output) {
PyErr_NoMemory();
return NULL;
}
i = o = 0;
kind = PyUnicode_KIND(input);
data = PyUnicode_DATA(input);
/* Phase 1: push each input character and expand it until the stack is
   empty, appending fully decomposed code points to output. */
while (i < isize) {
stack[stackptr++] = PyUnicode_READ(kind, data, i++);
while(stackptr) {
Py_UCS4 code = stack[--stackptr];
/* Hangul Decomposition adds three characters in
a single step, so we need at least that much room. */
if (space < 3) {
Py_UCS4 *new_output;
osize += 10;
space += 10;
new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
if (new_output == NULL) {
/* the old buffer is still valid after a failed realloc; release it */
PyMem_Free(output);
PyErr_NoMemory();
return NULL;
}
output = new_output;
}
/* Hangul Decomposition. */
if (SBase <= code && code < (SBase+SCount)) {
int SIndex = code - SBase;
int L = LBase + SIndex / NCount;
int V = VBase + (SIndex % NCount) / TCount;
int T = TBase + SIndex % TCount;
output[o++] = L;
output[o++] = V;
space -= 2;
/* T == TBase means the syllable has no trailing consonant */
if (T != TBase) {
output[o++] = T;
space --;
}
continue;
}
/* normalization changes */
if (self && UCD_Check(self)) {
Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
/* a nonzero value replaces the character; push it so that it gets
   decomposed in turn */
if (value != 0) {
stack[stackptr++] = value;
continue;
}
}
/* Other decompositions. */
get_decomp_record(self, code, &index, &prefix, &count);
/* Copy character if it is not decomposable, or has a
compatibility decomposition, but we do NFD. */
if (!count || (prefix && !k)) {
output[o++] = code;
space--;
continue;
}
/* Copy decomposition onto the stack, in reverse
order. */
while(count) {
code = decomp_data[index + (--count)];
stack[stackptr++] = code;
}
}
}
result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
output, o);
PyMem_Free(output);
if (!result)
return NULL;
/* result is guaranteed to be ready, as it is compact. */
kind = PyUnicode_KIND(result);
data = PyUnicode_DATA(result);
/* Sort canonically. */
/* Phase 2: reorder in place.  A character whose nonzero combining class is
   lower than that of its predecessor is moved left, one swap at a time,
   until the order is restored or a class-0 character is reached. */
i = 0;
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
if (prev == 0 || cur == 0 || prev <= cur) {
prev = cur;
continue;
}
/* Non-canonical order. Need to switch *i with previous. */
o = i - 1;
while (1) {
Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
PyUnicode_WRITE(kind, data, o+1,
PyUnicode_READ(kind, data, o));
PyUnicode_WRITE(kind, data, o, tmp);
o--;
if (o < 0)
break;
prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
if (prev == 0 || prev <= cur)
break;
}
/* position i now holds a different character; reload its class */
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
}
return result;
}
/* Map a code point to its row/column number in the composition table.

   nfc is an array of ranges terminated by an entry whose start is 0, in
   ascending order of start.  Returns the range's base index plus the
   offset of code within the range, or -1 if no range contains code.
   self is not used. */
static int
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
{
    struct reindex *entry;

    for (entry = nfc; entry->start; entry++) {
        unsigned int first = entry->start;

        if (code < first)
            break;              /* sorted: no later range can match */
        if (code <= first + entry->count) {
            unsigned int delta = code - first;
            return entry->index + delta;
        }
    }
    return -1;
}
/* Compose `input` into NFC (k == 0) or NFKC (k != 0).

   The input is first decomposed with nfd_nfkd(), then adjacent characters
   are recombined: Hangul jamo arithmetically, everything else through the
   nfc_first/nfc_last/comp_data tables.

   Returns a new reference (the decomposed string itself when composition
   changed nothing), or NULL with an exception set on failure.

   Hangul composition only accepts modern jamo:
     L in [LBase, LBase+LCount), V in [VBase, VBase+VCount),
     T in (TBase, TBase+TCount).
   TBase itself is excluded because it denotes "no trailing consonant"
   (nfd_nfkd() never emits it); treating it as a trailing consonant would
   silently drop the character.  The upper bounds are exclusive because the
   counts are sizes, not last indices. */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    void *data;
    Py_UCS4 *output;
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    /* Positions in the decomposed string that were already merged into an
       earlier base character and must be skipped by the main loop. */
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_NEW(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return NULL;
    }
    i = o = 0;

  again:
    while (i < len) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        code = PyUnicode_READ(kind, data, i);
        if (LBase <= code && code < (LBase+LCount) &&
            i + 1 < len &&
            VBase <= PyUnicode_READ(kind, data, i+1) &&
            PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
            /* L is a modern leading consonant and V a modern vowel. */
            int LIndex, VIndex;
            LIndex = code - LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < len &&
                TBase < PyUnicode_READ(kind, data, i) &&
                PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
                /* T is a modern trailing consonant. */
                code += PyUnicode_READ(kind, data, i)-TBase;
                i++;
            }
            output[o++] = code;
            continue;
        }

        /* code is still input[i] here */
        f = find_nfc_index(self, nfc_first, code);
        if (f == -1) {
            /* cannot start a composition: copy through */
            output[o++] = code;
            i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        comb = 0;
        /* output base character for now; might be updated later. */
        output[o] = PyUnicode_READ(kind, data, i);
        while (i1 < len) {
            Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
            int comb1 = _getrecord_ex(code1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(self, nfc_last, code1);
            /* i1 cannot be combined with i. If i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            output[o] = code;
            /* Mark the second character unused. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            /* the composed character may itself start a further
               composition */
            f = find_nfc_index(self, nfc_first, output[o]);
            if (f == -1)
                break;
        }
        /* Output character was already written.
           Just advance the indices. */
        o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string. */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
/* Quick check: return 1 if `input` is certainly already in the requested
   normal form, 0 if it might not be (the caller then normalizes for real).

   nfc selects a composed form, k a compatibility form.  Quick checks are
   never attempted for a previous-version database. */
static int
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
{
    Py_ssize_t pos, length;
    int kind;
    void *data;
    unsigned char last_ccc = 0;
    unsigned char mask;

    /* An older version of the database is requested, quickchecks must be
       disabled. */
    if (self && UCD_Check(self))
        return 0;

    /* Each form owns two bits of normalization_quick_check; the two
       quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
       as described in http://unicode.org/reports/tr15/#Annex8. */
    mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));

    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);
    length = PyUnicode_GET_LENGTH(input);

    for (pos = 0; pos < length; pos++) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        const _PyUnicode_DatabaseRecord *rec = _getrecord_ex(ch);
        unsigned char ccc = rec->combining;

        if (rec->normalization_quick_check & mask)
            return 0; /* this string might need normalization */
        if (ccc && last_ccc > ccc)
            return 0; /* non-canonical sort order, not normalized */
        last_ccc = ccc;
    }
    return 1; /* certainly normalized */
}
PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\
\n\
Return the normal form 'form' for the Unicode string unistr. Valid\n\
values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.");
/* Implementation of normalize().  An empty string is returned unchanged
   before the form name is even looked at; otherwise the form is validated,
   the quick check is tried, and only then is the string normalized. */
static PyObject*
unicodedata_normalize(PyObject *self, PyObject *args)
{
    char *form;
    PyObject *input;
    int compose, compat;

    if(!PyArg_ParseTuple(args, "sO!:normalize",
                         &form, &PyUnicode_Type, &input))
        return NULL;

    if (PyUnicode_READY(input) == -1)
        return NULL;

    if (PyUnicode_GET_LENGTH(input) == 0) {
        /* Special case empty input strings, since resizing
           them later would cause internal errors. */
        Py_INCREF(input);
        return input;
    }

    /* Translate the form name into (compose, compat) flags. */
    if (strcmp(form, "NFC") == 0) {
        compose = 1;
        compat = 0;
    }
    else if (strcmp(form, "NFKC") == 0) {
        compose = 1;
        compat = 1;
    }
    else if (strcmp(form, "NFD") == 0) {
        compose = 0;
        compat = 0;
    }
    else if (strcmp(form, "NFKD") == 0) {
        compose = 0;
        compat = 1;
    }
    else {
        PyErr_SetString(PyExc_ValueError, "invalid normalization form");
        return NULL;
    }

    if (is_normalized(self, input, compose, compat)) {
        Py_INCREF(input);
        return input;
    }
    if (compose)
        return nfc_nfkc(self, input, compat);
    return nfd_nfkd(self, input, compat);
}
/* -------------------------------------------------------------------- */
/* unicode character name tables */
/* data file generated by Tools/unicode/makeunicodedata.py */
#include "unicodename_db.h"
/* -------------------------------------------------------------------- */
/* database code (cut and pasted from the unidb package) */
/* Case-insensitive hash of the first len bytes of s, using multiplier
   scale.  After every step the value is folded back into 24 bits by
   XOR-ing the top byte into the low bits. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos;

    for (pos = 0; pos < len; pos++) {
        unsigned long top;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        top = hash & 0xff000000;
        if (top)
            hash = (hash ^ ((top>>24) & 0xff)) & 0x00ffffff;
    }
    return hash;
}
/* Short names of the Hangul jamo, used to build and to parse
   "HANGUL SYLLABLE ..." names.  Column 0 holds the leading consonants,
   column 1 the vowels and column 2 the trailing consonants; the row number
   is the L, V or T index respectively (see _getucname() and
   find_syllable()).  The columns have LCount, VCount and TCount entries;
   the shorter ones are padded with null pointers, which must not be read.
   An empty string is a valid entry (e.g. "no trailing consonant"). */
static char *hangul_syllables[][3] = {
{ "G", "A", "" },
{ "GG", "AE", "G" },
{ "N", "YA", "GG" },
{ "D", "YAE", "GS" },
{ "DD", "EO", "N", },
{ "R", "E", "NJ" },
{ "M", "YEO", "NH" },
{ "B", "YE", "D" },
{ "BB", "O", "L" },
{ "S", "WA", "LG" },
{ "SS", "WAE", "LM" },
{ "", "OE", "LB" },
{ "J", "YO", "LS" },
{ "JJ", "U", "LT" },
{ "C", "WEO", "LP" },
{ "K", "WE", "LH" },
{ "T", "WI", "M" },
{ "P", "YU", "B" },
{ "H", "EU", "BS" },
{ 0, "YI", "S" },
{ 0, "I", "SS" },
{ 0, 0, "NG" },
{ 0, 0, "J" },
{ 0, 0, "C" },
{ 0, 0, "K" },
{ 0, 0, "T" },
{ 0, 0, "P" },
{ 0, 0, "H" }
};
/* Return 1 if code lies in one of the CJK unified ideograph ranges whose
   names are generated algorithmically, 0 otherwise.
   These ranges need to match makeunicodedata.py:cjk_ranges. */
static int
is_unified_ideograph(Py_UCS4 code)
{
    /* inclusive {first, last} pairs */
    static const Py_UCS4 ranges[][2] = {
        { 0x3400,  0x4DB5  },   /* CJK Ideograph Extension A */
        { 0x4E00,  0x9FCC  },   /* CJK Ideograph */
        { 0x20000, 0x2A6D6 },   /* CJK Ideograph Extension B */
        { 0x2A700, 0x2B734 },   /* CJK Ideograph Extension C */
        { 0x2B740, 0x2B81D }    /* CJK Ideograph Extension D */
    };
    size_t r;

    for (r = 0; r < sizeof(ranges) / sizeof(ranges[0]); r++) {
        if (ranges[r][0] <= code && code <= ranges[r][1])
            return 1;
    }
    return 0;
}
/* Macros used to determine if the given codepoint is in the PUA range that
 * we are using to store aliases and named sequences.  Both ranges are
 * half-open: [start, end).  The argument is parenthesized so that an
 * expression such as `a & b` is compared as a whole; note that it is
 * evaluated twice, so it must not have side effects. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
/* Find the name associated with the given codepoint and store it,
 * NUL-terminated, in buffer (capacity buflen bytes).
 *
 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
 * that we are using for aliases and named sequences; otherwise those
 * code points have no name.  When self is a previous-version object,
 * aliases, named sequences and characters unassigned in that version have
 * no name either.
 *
 * Returns 1 on success and 0 if the code point has no name or the name
 * does not fit into the buffer. */
static int
_getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
           int with_alias_and_seq)
{
    int offset;
    int i;
    int word;
    unsigned char* w;

    if (code >= 0x110000)
        return 0;

    /* XXX should we just skip all the codepoints in the PUAs here? */
    if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
        return 0;

    if (self && UCD_Check(self)) {
        /* in 3.2.0 there are no aliases and named sequences */
        const change_record *old;
        if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
            return 0;
        old = get_old_record(self, code);
        if (old->category_changed == 0) {
            /* unassigned */
            return 0;
        }
    }

    if (SBase <= code && code < SBase+SCount) {
        /* Hangul syllable: the name is assembled from the jamo short
           names of its L, V and T components. */
        int SIndex = code - SBase;
        int L = SIndex / NCount;
        int V = (SIndex % NCount) / TCount;
        int T = SIndex % TCount;

        if (buflen < 27)
            /* Worst case: HANGUL SYLLABLE <10chars>. */
            return 0;
        strcpy(buffer, "HANGUL SYLLABLE ");
        buffer += 16;
        strcpy(buffer, hangul_syllables[L][0]);
        buffer += strlen(hangul_syllables[L][0]);
        strcpy(buffer, hangul_syllables[V][1]);
        buffer += strlen(hangul_syllables[V][1]);
        strcpy(buffer, hangul_syllables[T][2]);
        buffer += strlen(hangul_syllables[T][2]);
        *buffer = '\0';
        return 1;
    }

    if (is_unified_ideograph(code)) {
        if (buflen < 28)
            /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
            return 0;
        sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
        return 1;
    }

    /* get offset into phrasebook */
    offset = phrasebook_offset1[(code>>phrasebook_shift)];
    offset = phrasebook_offset2[(offset<<phrasebook_shift) +
                               (code&((1<<phrasebook_shift)-1))];
    if (!offset)
        return 0;

    i = 0;

    for (;;) {
        /* get word index */
        word = phrasebook[offset] - phrasebook_short;
        if (word >= 0) {
            word = (word << 8) + phrasebook[offset+1];
            offset += 2;
        } else
            word = phrasebook[offset++];
        if (i) {
            /* Words after the first are preceded by a space.  i may equal
               buflen here (the previous word can end exactly at the end of
               the buffer), so the check must be >=, not >. */
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = ' ';
        }
        /* copy word string from lexicon.  the last character in the
           word has bit 7 set.  the last word in a string ends with
           0x80 */
        w = lexicon + lexicon_offset[word];
        while (*w < 128) {
            if (i >= buflen)
                return 0; /* buffer overflow */
            buffer[i++] = *w++;
        }
        if (i >= buflen)
            return 0; /* buffer overflow */
        /* For the final word this stores 0x80 & 127 == 0, which is the
           string terminator. */
        buffer[i++] = *w & 127;
        if (*w == 128)
            break; /* end of word */
    }

    return 1;
}
/* Check if code corresponds to the given name (name need not be
   NUL-terminated; namelen is its length).  The comparison ignores the case
   of `name`.  Returns 1 on a match, 0 otherwise. */
static int
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
    int i;
    char buffer[NAME_MAXLEN];

    /* _getucname() only succeeds when the name, including its terminator,
       fits into buffer, so a candidate of NAME_MAXLEN or more characters
       can never match.  Rejecting it up front keeps buffer[namelen] in
       bounds. */
    if (namelen < 0 || (size_t)namelen >= sizeof(buffer))
        return 0;
    if (!_getucname(self, code, buffer, sizeof(buffer), 1))
        return 0;
    for (i = 0; i < namelen; i++) {
        /* Stop at the end of the stored name: bytes after the terminator
           are uninitialized and must not be compared (a candidate with an
           embedded NUL would otherwise run past it). */
        if (buffer[i] == '\0')
            return 0;
        if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
            return 0;
    }
    return buffer[namelen] == '\0';
}
/* Find the longest entry in the given column of hangul_syllables (rows
   0..count-1) that is a prefix of str.  *len receives its length and *pos
   its row.  If nothing matches, *len is 0 and *pos is left untouched.  An
   empty-string entry matches with length 0 and does set *pos. */
static void
find_syllable(const char *str, int *len, int *pos, int count, int column)
{
    int row;
    int best = -1;

    for (row = 0; row < count; row++) {
        const char *candidate = hangul_syllables[row][column];
        int candidate_len = Py_SAFE_DOWNCAST(strlen(candidate), size_t, int);

        /* only a strictly longer match can replace the current one */
        if (candidate_len > best &&
            strncmp(str, candidate, candidate_len) == 0) {
            best = candidate_len;
            *pos = row;
        }
    }
    *len = (best == -1) ? 0 : best;
}
/* Turn the raw value found in the name hash table into the code point
   reported to the caller.

   Returns 0 if cp denotes a named sequence and with_named_seq is 0.
   Otherwise stores the result in *code and returns 1: an alias is replaced
   by the code point it stands for, anything else (including a named
   sequence placeholder) is passed through unchanged. */
static int
_check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
{
    /* check if named sequences are allowed */
    if (IS_NAMED_SEQ(cp) && !with_named_seq)
        return 0;

    /* if the codepoint is in the PUA range that we use for aliases,
     * convert it to obtain the right codepoint */
    if (!IS_ALIAS(cp)) {
        *code = cp;
        return 1;
    }
    *code = name_aliases[cp-aliases_start];
    return 1;
}
static int
_getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
int with_named_seq)
{
/* Return the codepoint associated with the given name.
* Named aliases are resolved too (unless self != NULL (i.e. we are using
* 3.2.0)). If with_named_seq is 1, returns the PUA codepoint that we are
* using for the named sequence, and the caller must then convert it.
*
* Returns 1 and stores the result in *code on success, 0 if the name is
* unknown.  No exception is set here.
*
* NOTE(review): the strncmp() prefix tests and find_syllable() read name as
* a C string, so name is presumably expected to be NUL-terminated in
* addition to carrying namelen -- confirm against external callers. */
unsigned int h, v;
/* the probe sequence below masks with code_size-1 */
unsigned int mask = code_size-1;
unsigned int i, incr;
/* Check for hangul syllables. */
if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
int len, L = -1, V = -1, T = -1;
const char *pos = name + 16;
/* greedily match the longest leading consonant, vowel and trailing
   consonant short names, in that order */
find_syllable(pos, &len, &L, LCount, 0);
pos += len;
find_syllable(pos, &len, &V, VCount, 1);
pos += len;
find_syllable(pos, &len, &T, TCount, 2);
pos += len;
/* all three parts must have matched and the whole name must be consumed */
if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
*code = SBase + (L*VCount+V)*TCount + T;
return 1;
}
/* Otherwise, it's an illegal syllable name. */
return 0;
}
/* Check for unified ideographs. */
if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
/* Four or five hexdigits must follow. */
v = 0;
name += 22;
namelen -= 22;
if (namelen != 4 && namelen != 5)
return 0;
/* parse upper-case hex digits only; lower-case digits are rejected */
while (namelen--) {
v *= 16;
if (*name >= '0' && *name <= '9')
v += *name - '0';
else if (*name >= 'A' && *name <= 'F')
v += *name - 'A' + 10;
else
return 0;
name++;
}
if (!is_unified_ideograph(v))
return 0;
*code = v;
return 1;
}
/* the following is the same as python's dictionary lookup, with
only minor changes. see the makeunicodedata script for more
details */
h = (unsigned int) _gethash(name, namelen, code_magic);
/* first probe */
i = (~h) & mask;
v = code_hash[i];
/* an empty slot (0) ends the search: the name is not in the table */
if (!v)
return 0;
if (_cmpname(self, v, name, namelen))
return _check_alias_and_seq(v, code, with_named_seq);
/* collision: derive a nonzero step from the hash and keep probing */
incr = (h ^ (h >> 3)) & mask;
if (!incr)
incr = mask;
for (;;) {
i = (i + incr) & mask;
v = code_hash[i];
if (!v)
return 0;
if (_cmpname(self, v, name, namelen))
return _check_alias_and_seq(v, code, with_named_seq);
/* advance the step; code_poly folds it back under the mask */
incr = incr << 1;
if (incr > mask)
incr = incr ^ code_poly;
}
}
/* C-level name lookup API (structure declared in ucnhash.h).  The module
   init function wraps a pointer to this table in a capsule and publishes
   it as "ucnhash_CAPI". */
static const _PyUnicode_Name_CAPI hashAPI =
{
sizeof(_PyUnicode_Name_CAPI),
_getucname,
_getcode
};
/* -------------------------------------------------------------------- */
/* Python bindings */
PyDoc_STRVAR(unicodedata_name__doc__,
"name(unichr[, default])\n\
Returns the name assigned to the Unicode character unichr as a\n\
string. If no name is defined, default is returned, or, if not\n\
given, ValueError is raised.");
/* Implementation of name().  Aliases and named sequences are not reported
   (the lookup is done with with_alias_and_seq == 0). */
static PyObject *
unicodedata_name(PyObject* self, PyObject* args)
{
    char text[NAME_MAXLEN];
    Py_UCS4 ch;
    PyUnicodeObject* ustr;
    PyObject* fallback = NULL;

    if (!PyArg_ParseTuple(args, "O!|O:name", &PyUnicode_Type, &ustr, &fallback))
        return NULL;
    ch = getuchar(ustr);
    if (ch == (Py_UCS4)-1)
        return NULL;

    if (_getucname(self, ch, text, sizeof(text), 0))
        return PyUnicode_FromString(text);

    if (fallback != NULL) {
        Py_INCREF(fallback);
        return fallback;
    }
    PyErr_SetString(PyExc_ValueError, "no such name");
    return NULL;
}
PyDoc_STRVAR(unicodedata_lookup__doc__,
"lookup(name)\n\
\n\
Look up character by name. If a character with the\n\
given name is found, return the corresponding Unicode\n\
character. If not found, KeyError is raised.");
/* Implementation of lookup().  Named sequences are allowed: _getcode()
   reports them as a placeholder code point, which is expanded here into
   the actual sequence. */
static PyObject *
unicodedata_lookup(PyObject* self, PyObject* args)
{
    Py_UCS4 code;
    char* name;
    int namelen;
    unsigned int seq;

    if (!PyArg_ParseTuple(args, "s#:lookup", &name, &namelen))
        return NULL;

    if (!_getcode(self, name, namelen, &code, 1)) {
        PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
        return NULL;
    }

    /* ordinary character (aliases were already resolved by _getcode) */
    if (!IS_NAMED_SEQ(code))
        return PyUnicode_FromOrdinal(code);

    /* code is in the PUA range that we use for named sequences:
       convert it */
    seq = code-named_sequences_start;
    return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
                                     named_sequences[seq].seq,
                                     named_sequences[seq].seqlen);
}
/* Method table.  It is used twice: as the module-level function list (see
   unicodedatamodule) and as the method list of UCD objects (tp_methods of
   UCD_Type), which is why every implementation checks whether self is a
   UCD instance.  Each entry carries the docstring defined next to its
   implementation. */
static PyMethodDef unicodedata_functions[] = {
{"decimal", unicodedata_decimal, METH_VARARGS, unicodedata_decimal__doc__},
{"digit", unicodedata_digit, METH_VARARGS, unicodedata_digit__doc__},
{"numeric", unicodedata_numeric, METH_VARARGS, unicodedata_numeric__doc__},
{"category", unicodedata_category, METH_VARARGS,
unicodedata_category__doc__},
{"bidirectional", unicodedata_bidirectional, METH_VARARGS,
unicodedata_bidirectional__doc__},
{"combining", unicodedata_combining, METH_VARARGS,
unicodedata_combining__doc__},
{"mirrored", unicodedata_mirrored, METH_VARARGS,
unicodedata_mirrored__doc__},
{"east_asian_width", unicodedata_east_asian_width, METH_VARARGS,
unicodedata_east_asian_width__doc__},
{"decomposition", unicodedata_decomposition, METH_VARARGS,
unicodedata_decomposition__doc__},
{"name", unicodedata_name, METH_VARARGS, unicodedata_name__doc__},
{"lookup", unicodedata_lookup, METH_VARARGS, unicodedata_lookup__doc__},
{"normalize", unicodedata_normalize, METH_VARARGS,
unicodedata_normalize__doc__},
{NULL, NULL} /* sentinel */
};
/* Type object for previous-version database objects ("unicodedata.UCD").
   Instances are PreviousDBVersion structs; they expose the same methods as
   the module (unicodedata_functions) plus the read-only members in
   DB_members.  tp_new is left empty; the only instance visible in this
   file is created from C by new_previous_version(). */
static PyTypeObject UCD_Type = {
/* The ob_type field must be initialized in the module init function
* to be portable to Windows without using C++. */
PyVarObject_HEAD_INIT(NULL, 0)
"unicodedata.UCD", /*tp_name*/
sizeof(PreviousDBVersion), /*tp_basicsize*/
0, /*tp_itemsize*/
/* methods */
(destructor)PyObject_Del, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_reserved*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash*/
0, /*tp_call*/
0, /*tp_str*/
PyObject_GenericGetAttr,/*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT, /*tp_flags*/
0, /*tp_doc*/
0, /*tp_traverse*/
0, /*tp_clear*/
0, /*tp_richcompare*/
0, /*tp_weaklistoffset*/
0, /*tp_iter*/
0, /*tp_iternext*/
unicodedata_functions, /*tp_methods*/
DB_members, /*tp_members*/
0, /*tp_getset*/
0, /*tp_base*/
0, /*tp_dict*/
0, /*tp_descr_get*/
0, /*tp_descr_set*/
0, /*tp_dictoffset*/
0, /*tp_init*/
0, /*tp_alloc*/
0, /*tp_new*/
0, /*tp_free*/
0, /*tp_is_gc*/
};
/* Module docstring; UNIDATA_VERSION is spliced in at compile time by
   string-literal concatenation. */
PyDoc_STRVAR(unicodedata_docstring,
"This module provides access to the Unicode Character Database which\n\
defines character properties for all Unicode characters. The data in\n\
this database is based on the UnicodeData.txt file version\n\
" UNIDATA_VERSION " which is publically available from ftp://ftp.unicode.org/.\n\
\n\
The module uses the same names and symbols as defined by the\n\
UnicodeData File Format " UNIDATA_VERSION ".");
/* Module definition: name, docstring, size -1, and the shared method
   table; the remaining optional slots are unused. */
static struct PyModuleDef unicodedatamodule = {
PyModuleDef_HEAD_INIT,
"unicodedata",
unicodedata_docstring,
-1,
unicodedata_functions,
NULL,
NULL,
NULL,
NULL
};
/* Module initialization.

   Creates the module and publishes:
     unidata_version  - version string of the current database
     UCD              - the previous-version database type
     ucd_3_2_0        - a UCD instance for Unicode 3.2.0
     ucnhash_CAPI     - capsule exposing the C name lookup API (hashAPI)

   Returns the new module, or NULL with an exception set if any step
   fails.  PyModule_AddObject() takes over the reference only when it
   succeeds, so on failure the reference is released here. */
PyMODINIT_FUNC
PyInit_unicodedata(void)
{
    PyObject *m, *v;

    /* see the comment in UCD_Type: ob_type cannot be set statically */
    Py_TYPE(&UCD_Type) = &PyType_Type;

    m = PyModule_Create(&unicodedatamodule);
    if (!m)
        return NULL;

    if (PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION) < 0)
        goto error;

    Py_INCREF(&UCD_Type);
    if (PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type) < 0) {
        Py_DECREF(&UCD_Type);
        goto error;
    }

    /* Previous versions */
    v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
    if (v == NULL)
        goto error;
    if (PyModule_AddObject(m, "ucd_3_2_0", v) < 0) {
        Py_DECREF(v);
        goto error;
    }

    /* Export C API */
    v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
    if (v == NULL)
        goto error;
    if (PyModule_AddObject(m, "ucnhash_CAPI", v) < 0) {
        Py_DECREF(v);
        goto error;
    }

    return m;

  error:
    Py_DECREF(m);
    return NULL;
}
/*
Local variables:
c-basic-offset: 4
indent-tabs-mode: nil
End:
*/