Port normalization to new API.
This commit is contained in:
parent
f0ddadcf2e
commit
22970667da
|
@ -494,36 +494,44 @@ static PyObject*
|
||||||
nfd_nfkd(PyObject *self, PyObject *input, int k)
|
nfd_nfkd(PyObject *self, PyObject *input, int k)
|
||||||
{
|
{
|
||||||
PyObject *result;
|
PyObject *result;
|
||||||
Py_UNICODE *i, *end, *o;
|
Py_UCS4 *output;
|
||||||
|
Py_ssize_t i, o, osize;
|
||||||
|
int kind;
|
||||||
|
void *data;
|
||||||
/* Longest decomposition in Unicode 3.2: U+FDFA */
|
/* Longest decomposition in Unicode 3.2: U+FDFA */
|
||||||
Py_UNICODE stack[20];
|
Py_UCS4 stack[20];
|
||||||
Py_ssize_t space, isize;
|
Py_ssize_t space, isize;
|
||||||
int index, prefix, count, stackptr;
|
int index, prefix, count, stackptr;
|
||||||
unsigned char prev, cur;
|
unsigned char prev, cur;
|
||||||
|
|
||||||
stackptr = 0;
|
stackptr = 0;
|
||||||
isize = PyUnicode_GET_SIZE(input);
|
isize = PyUnicode_GET_LENGTH(input);
|
||||||
/* Overallocate atmost 10 characters. */
|
/* Overallocate atmost 10 characters. */
|
||||||
space = (isize > 10 ? 10 : isize) + isize;
|
space = (isize > 10 ? 10 : isize) + isize;
|
||||||
result = PyUnicode_FromUnicode(NULL, space);
|
osize = space;
|
||||||
if (!result)
|
output = PyMem_Malloc(space * sizeof(Py_UCS4));
|
||||||
|
if (!output) {
|
||||||
|
PyErr_NoMemory();
|
||||||
return NULL;
|
return NULL;
|
||||||
i = PyUnicode_AS_UNICODE(input);
|
}
|
||||||
end = i + isize;
|
i = o = 0;
|
||||||
o = PyUnicode_AS_UNICODE(result);
|
kind = PyUnicode_KIND(input);
|
||||||
|
data = PyUnicode_DATA(input);
|
||||||
|
|
||||||
while (i < end) {
|
while (i < isize) {
|
||||||
stack[stackptr++] = *i++;
|
stack[stackptr++] = PyUnicode_READ(kind, data, i++);
|
||||||
while(stackptr) {
|
while(stackptr) {
|
||||||
Py_UNICODE code = stack[--stackptr];
|
Py_UCS4 code = stack[--stackptr];
|
||||||
/* Hangul Decomposition adds three characters in
|
/* Hangul Decomposition adds three characters in
|
||||||
a single step, so we need atleast that much room. */
|
a single step, so we need atleast that much room. */
|
||||||
if (space < 3) {
|
if (space < 3) {
|
||||||
Py_ssize_t newsize = PyUnicode_GET_SIZE(result) + 10;
|
osize += 10;
|
||||||
space += 10;
|
space += 10;
|
||||||
if (PyUnicode_Resize(&result, newsize) == -1)
|
output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
|
||||||
|
if (output == NULL) {
|
||||||
|
PyErr_NoMemory();
|
||||||
return NULL;
|
return NULL;
|
||||||
o = PyUnicode_AS_UNICODE(result) + newsize - space;
|
}
|
||||||
}
|
}
|
||||||
/* Hangul Decomposition. */
|
/* Hangul Decomposition. */
|
||||||
if (SBase <= code && code < (SBase+SCount)) {
|
if (SBase <= code && code < (SBase+SCount)) {
|
||||||
|
@ -531,11 +539,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
|
||||||
int L = LBase + SIndex / NCount;
|
int L = LBase + SIndex / NCount;
|
||||||
int V = VBase + (SIndex % NCount) / TCount;
|
int V = VBase + (SIndex % NCount) / TCount;
|
||||||
int T = TBase + SIndex % TCount;
|
int T = TBase + SIndex % TCount;
|
||||||
*o++ = L;
|
output[o++] = L;
|
||||||
*o++ = V;
|
output[o++] = V;
|
||||||
space -= 2;
|
space -= 2;
|
||||||
if (T != TBase) {
|
if (T != TBase) {
|
||||||
*o++ = T;
|
output[o++] = T;
|
||||||
space --;
|
space --;
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
|
@ -555,7 +563,7 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
|
||||||
/* Copy character if it is not decomposable, or has a
|
/* Copy character if it is not decomposable, or has a
|
||||||
compatibility decomposition, but we do NFD. */
|
compatibility decomposition, but we do NFD. */
|
||||||
if (!count || (prefix && !k)) {
|
if (!count || (prefix && !k)) {
|
||||||
*o++ = code;
|
output[o++] = code;
|
||||||
space--;
|
space--;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -568,15 +576,20 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Drop overallocation. Cannot fail. */
|
result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
|
||||||
PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
|
output, o);
|
||||||
|
PyMem_Free(output);
|
||||||
|
if (!result)
|
||||||
|
return NULL;
|
||||||
|
/* result is guaranteed to be ready, as it is compact. */
|
||||||
|
kind = PyUnicode_KIND(result);
|
||||||
|
data = PyUnicode_DATA(result);
|
||||||
|
|
||||||
/* Sort canonically. */
|
/* Sort canonically. */
|
||||||
i = PyUnicode_AS_UNICODE(result);
|
i = 0;
|
||||||
prev = _getrecord_ex(*i)->combining;
|
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
|
||||||
end = i + PyUnicode_GET_SIZE(result);
|
for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
|
||||||
for (i++; i < end; i++) {
|
cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
|
||||||
cur = _getrecord_ex(*i)->combining;
|
|
||||||
if (prev == 0 || cur == 0 || prev <= cur) {
|
if (prev == 0 || cur == 0 || prev <= cur) {
|
||||||
prev = cur;
|
prev = cur;
|
||||||
continue;
|
continue;
|
||||||
|
@ -584,23 +597,24 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
|
||||||
/* Non-canonical order. Need to switch *i with previous. */
|
/* Non-canonical order. Need to switch *i with previous. */
|
||||||
o = i - 1;
|
o = i - 1;
|
||||||
while (1) {
|
while (1) {
|
||||||
Py_UNICODE tmp = o[1];
|
Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
|
||||||
o[1] = o[0];
|
PyUnicode_WRITE(kind, data, o+1,
|
||||||
o[0] = tmp;
|
PyUnicode_READ(kind, data, o));
|
||||||
|
PyUnicode_WRITE(kind, data, o, tmp);
|
||||||
o--;
|
o--;
|
||||||
if (o < PyUnicode_AS_UNICODE(result))
|
if (o < 0)
|
||||||
break;
|
break;
|
||||||
prev = _getrecord_ex(*o)->combining;
|
prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
|
||||||
if (prev == 0 || prev <= cur)
|
if (prev == 0 || prev <= cur)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
prev = _getrecord_ex(*i)->combining;
|
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UNICODE code)
|
find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
|
||||||
{
|
{
|
||||||
int index;
|
int index;
|
||||||
for (index = 0; nfc[index].start; index++) {
|
for (index = 0; nfc[index].start; index++) {
|
||||||
|
@ -619,27 +633,36 @@ static PyObject*
|
||||||
nfc_nfkc(PyObject *self, PyObject *input, int k)
|
nfc_nfkc(PyObject *self, PyObject *input, int k)
|
||||||
{
|
{
|
||||||
PyObject *result;
|
PyObject *result;
|
||||||
Py_UNICODE *i, *i1, *o, *end;
|
int kind;
|
||||||
|
void *data;
|
||||||
|
Py_UCS4 *output;
|
||||||
|
Py_ssize_t i, i1, o, len;
|
||||||
int f,l,index,index1,comb;
|
int f,l,index,index1,comb;
|
||||||
Py_UNICODE code;
|
Py_UCS4 code;
|
||||||
Py_UNICODE *skipped[20];
|
Py_ssize_t skipped[20];
|
||||||
int cskipped = 0;
|
int cskipped = 0;
|
||||||
|
|
||||||
result = nfd_nfkd(self, input, k);
|
result = nfd_nfkd(self, input, k);
|
||||||
if (!result)
|
if (!result)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
/* result will be "ready". */
|
||||||
|
kind = PyUnicode_KIND(result);
|
||||||
|
data = PyUnicode_DATA(result);
|
||||||
|
len = PyUnicode_GET_LENGTH(result);
|
||||||
|
|
||||||
/* We are going to modify result in-place.
|
/* We allocate a buffer for the output.
|
||||||
If nfd_nfkd is changed to sometimes return the input,
|
If we find that we made no changes, we still return
|
||||||
this code needs to be reviewed. */
|
the NFD result. */
|
||||||
assert(result != input);
|
output = PyMem_Malloc(len * sizeof(Py_UCS4));
|
||||||
|
if (!output) {
|
||||||
i = PyUnicode_AS_UNICODE(result);
|
PyErr_NoMemory();
|
||||||
end = i + PyUnicode_GET_SIZE(result);
|
Py_DECREF(result);
|
||||||
o = PyUnicode_AS_UNICODE(result);
|
return 0;
|
||||||
|
}
|
||||||
|
i = o = 0;
|
||||||
|
|
||||||
again:
|
again:
|
||||||
while (i < end) {
|
while (i < len) {
|
||||||
for (index = 0; index < cskipped; index++) {
|
for (index = 0; index < cskipped; index++) {
|
||||||
if (skipped[index] == i) {
|
if (skipped[index] == i) {
|
||||||
/* *i character is skipped.
|
/* *i character is skipped.
|
||||||
|
@ -652,33 +675,41 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
|
||||||
}
|
}
|
||||||
/* Hangul Composition. We don't need to check for <LV,T>
|
/* Hangul Composition. We don't need to check for <LV,T>
|
||||||
pairs, since we always have decomposed data. */
|
pairs, since we always have decomposed data. */
|
||||||
if (LBase <= *i && *i < (LBase+LCount) &&
|
code = PyUnicode_READ(kind, data, i);
|
||||||
i + 1 < end &&
|
if (LBase <= code && code < (LBase+LCount) &&
|
||||||
VBase <= i[1] && i[1] <= (VBase+VCount)) {
|
i + 1 < len &&
|
||||||
|
VBase <= PyUnicode_READ(kind, data, i+1) &&
|
||||||
|
PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
|
||||||
int LIndex, VIndex;
|
int LIndex, VIndex;
|
||||||
LIndex = i[0] - LBase;
|
LIndex = code - LBase;
|
||||||
VIndex = i[1] - VBase;
|
VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
|
||||||
code = SBase + (LIndex*VCount+VIndex)*TCount;
|
code = SBase + (LIndex*VCount+VIndex)*TCount;
|
||||||
i+=2;
|
i+=2;
|
||||||
if (i < end &&
|
if (i < len &&
|
||||||
TBase <= *i && *i <= (TBase+TCount)) {
|
TBase <= PyUnicode_READ(kind, data, i) &&
|
||||||
code += *i-TBase;
|
PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
|
||||||
|
code += PyUnicode_READ(kind, data, i)-TBase;
|
||||||
i++;
|
i++;
|
||||||
}
|
}
|
||||||
*o++ = code;
|
output[o++] = code;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
f = find_nfc_index(self, nfc_first, *i);
|
/* code is still input[i] here */
|
||||||
|
f = find_nfc_index(self, nfc_first, code);
|
||||||
if (f == -1) {
|
if (f == -1) {
|
||||||
*o++ = *i++;
|
output[o++] = code;
|
||||||
|
i++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* Find next unblocked character. */
|
/* Find next unblocked character. */
|
||||||
i1 = i+1;
|
i1 = i+1;
|
||||||
comb = 0;
|
comb = 0;
|
||||||
while (i1 < end) {
|
/* output base character for now; might be updated later. */
|
||||||
int comb1 = _getrecord_ex(*i1)->combining;
|
output[o] = PyUnicode_READ(kind, data, i);
|
||||||
|
while (i1 < len) {
|
||||||
|
Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
|
||||||
|
int comb1 = _getrecord_ex(code1)->combining;
|
||||||
if (comb) {
|
if (comb) {
|
||||||
if (comb1 == 0)
|
if (comb1 == 0)
|
||||||
break;
|
break;
|
||||||
|
@ -688,8 +719,8 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
l = find_nfc_index(self, nfc_last, *i1);
|
l = find_nfc_index(self, nfc_last, code1);
|
||||||
/* *i1 cannot be combined with *i. If *i1
|
/* i1 cannot be combined with i. If i1
|
||||||
is a starter, we don't need to look further.
|
is a starter, we don't need to look further.
|
||||||
Otherwise, record the combining class. */
|
Otherwise, record the combining class. */
|
||||||
if (l == -1) {
|
if (l == -1) {
|
||||||
|
@ -708,19 +739,28 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
|
||||||
goto not_combinable;
|
goto not_combinable;
|
||||||
|
|
||||||
/* Replace the original character. */
|
/* Replace the original character. */
|
||||||
*i = code;
|
output[o] = code;
|
||||||
/* Mark the second character unused. */
|
/* Mark the second character unused. */
|
||||||
assert(cskipped < 20);
|
assert(cskipped < 20);
|
||||||
skipped[cskipped++] = i1;
|
skipped[cskipped++] = i1;
|
||||||
i1++;
|
i1++;
|
||||||
f = find_nfc_index(self, nfc_first, *i);
|
f = find_nfc_index(self, nfc_first, output[o]);
|
||||||
if (f == -1)
|
if (f == -1)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
*o++ = *i++;
|
/* Output character was already written.
|
||||||
|
Just advance the indices. */
|
||||||
|
o++; i++;
|
||||||
}
|
}
|
||||||
if (o != end)
|
if (o == len) {
|
||||||
PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
|
/* No changes. Return original string. */
|
||||||
|
PyMem_Free(output);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
Py_DECREF(result);
|
||||||
|
result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
|
||||||
|
output, o);
|
||||||
|
PyMem_Free(output);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -728,7 +768,9 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
|
||||||
static int
|
static int
|
||||||
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
|
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
|
||||||
{
|
{
|
||||||
Py_UNICODE *i, *end;
|
Py_ssize_t i, len;
|
||||||
|
int kind;
|
||||||
|
void *data;
|
||||||
unsigned char prev_combining = 0, quickcheck_mask;
|
unsigned char prev_combining = 0, quickcheck_mask;
|
||||||
|
|
||||||
/* An older version of the database is requested, quickchecks must be
|
/* An older version of the database is requested, quickchecks must be
|
||||||
|
@ -740,10 +782,13 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
|
||||||
as described in http://unicode.org/reports/tr15/#Annex8. */
|
as described in http://unicode.org/reports/tr15/#Annex8. */
|
||||||
quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
|
quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
|
||||||
|
|
||||||
i = PyUnicode_AS_UNICODE(input);
|
i = 0;
|
||||||
end = i + PyUnicode_GET_SIZE(input);
|
kind = PyUnicode_KIND(input);
|
||||||
while (i < end) {
|
data = PyUnicode_DATA(input);
|
||||||
const _PyUnicode_DatabaseRecord *record = _getrecord_ex(*i++);
|
len = PyUnicode_GET_LENGTH(input);
|
||||||
|
while (i < len) {
|
||||||
|
Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
|
||||||
|
const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
|
||||||
unsigned char combining = record->combining;
|
unsigned char combining = record->combining;
|
||||||
unsigned char quickcheck = record->normalization_quick_check;
|
unsigned char quickcheck = record->normalization_quick_check;
|
||||||
|
|
||||||
|
@ -772,7 +817,10 @@ unicodedata_normalize(PyObject *self, PyObject *args)
|
||||||
&form, &PyUnicode_Type, &input))
|
&form, &PyUnicode_Type, &input))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
if (PyUnicode_GetSize(input) == 0) {
|
if (PyUnicode_READY(input) == -1)
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
if (PyUnicode_GET_LENGTH(input) == 0) {
|
||||||
/* Special case empty input strings, since resizing
|
/* Special case empty input strings, since resizing
|
||||||
them later would cause internal errors. */
|
them later would cause internal errors. */
|
||||||
Py_INCREF(input);
|
Py_INCREF(input);
|
||||||
|
|
Loading…
Reference in New Issue