|
|
@ -93,29 +93,19 @@ static PyMemberDef DB_members[] = {
|
|
|
|
/* forward declaration */
|
|
|
|
/* forward declaration */
|
|
|
|
static PyTypeObject UCD_Type;
|
|
|
|
static PyTypeObject UCD_Type;
|
|
|
|
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
// Check if self is an unicodedata.UCD instance.
|
|
|
|
// Borrowed reference to &UCD_Type. It is used to prepare the code
|
|
|
|
// If self is NULL (when the PyCapsule C API is used), return 0.
|
|
|
|
// to convert the UCD_Type static type to a heap type.
|
|
|
|
// PyModule_Check() is used to avoid having to retrieve the ucd_type.
|
|
|
|
PyTypeObject *ucd_type;
|
|
|
|
// See unicodedata_functions comment to the rationale of this macro.
|
|
|
|
|
|
|
|
#define UCD_Check(self) (self != NULL && !PyModule_Check(self))
|
|
|
|
_PyUnicode_Name_CAPI capi;
|
|
|
|
|
|
|
|
} unicodedata_module_state;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// bpo-1635741: Temporary global state until the unicodedata module
|
|
|
|
|
|
|
|
// gets a real module state.
|
|
|
|
|
|
|
|
static unicodedata_module_state global_module_state;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Check if self is an instance of ucd_type.
|
|
|
|
|
|
|
|
// Return 0 if self is NULL (when the PyCapsule C API is used).
|
|
|
|
|
|
|
|
#define UCD_Check(self, ucd_type) (self != NULL && Py_IS_TYPE(self, ucd_type))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static PyObject*
|
|
|
|
static PyObject*
|
|
|
|
new_previous_version(unicodedata_module_state *state,
|
|
|
|
new_previous_version(PyTypeObject *ucd_type,
|
|
|
|
const char*name, const change_record* (*getrecord)(Py_UCS4),
|
|
|
|
const char*name, const change_record* (*getrecord)(Py_UCS4),
|
|
|
|
Py_UCS4 (*normalization)(Py_UCS4))
|
|
|
|
Py_UCS4 (*normalization)(Py_UCS4))
|
|
|
|
{
|
|
|
|
{
|
|
|
|
PreviousDBVersion *self;
|
|
|
|
PreviousDBVersion *self;
|
|
|
|
self = PyObject_New(PreviousDBVersion, state->ucd_type);
|
|
|
|
self = PyObject_New(PreviousDBVersion, ucd_type);
|
|
|
|
if (self == NULL)
|
|
|
|
if (self == NULL)
|
|
|
|
return NULL;
|
|
|
|
return NULL;
|
|
|
|
self->name = name;
|
|
|
|
self->name = name;
|
|
|
@ -147,12 +137,11 @@ unicodedata_UCD_decimal_impl(PyObject *self, int chr,
|
|
|
|
PyObject *default_value)
|
|
|
|
PyObject *default_value)
|
|
|
|
/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
|
|
|
|
/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
|
|
|
|
int have_old = 0;
|
|
|
|
int have_old = 0;
|
|
|
|
long rc;
|
|
|
|
long rc;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
|
|
|
|
|
|
|
|
if (UCD_Check(self, state->ucd_type)) {
|
|
|
|
if (UCD_Check(self)) {
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
if (old->category_changed == 0) {
|
|
|
|
if (old->category_changed == 0) {
|
|
|
|
/* unassigned */
|
|
|
|
/* unassigned */
|
|
|
@ -236,12 +225,11 @@ unicodedata_UCD_numeric_impl(PyObject *self, int chr,
|
|
|
|
PyObject *default_value)
|
|
|
|
PyObject *default_value)
|
|
|
|
/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
|
|
|
|
/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
|
|
|
|
int have_old = 0;
|
|
|
|
int have_old = 0;
|
|
|
|
double rc;
|
|
|
|
double rc;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
|
|
|
|
|
|
|
|
if (UCD_Check(self, state->ucd_type)) {
|
|
|
|
if (UCD_Check(self)) {
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
if (old->category_changed == 0) {
|
|
|
|
if (old->category_changed == 0) {
|
|
|
|
/* unassigned */
|
|
|
|
/* unassigned */
|
|
|
@ -283,11 +271,10 @@ static PyObject *
|
|
|
|
unicodedata_UCD_category_impl(PyObject *self, int chr)
|
|
|
|
unicodedata_UCD_category_impl(PyObject *self, int chr)
|
|
|
|
/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
|
|
|
|
/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
|
|
|
|
int index;
|
|
|
|
int index;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
index = (int) _getrecord_ex(c)->category;
|
|
|
|
index = (int) _getrecord_ex(c)->category;
|
|
|
|
if (UCD_Check(self, state->ucd_type)) {
|
|
|
|
if (UCD_Check(self)) {
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
if (old->category_changed != 0xFF)
|
|
|
|
if (old->category_changed != 0xFF)
|
|
|
|
index = old->category_changed;
|
|
|
|
index = old->category_changed;
|
|
|
@ -311,11 +298,10 @@ static PyObject *
|
|
|
|
unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
|
|
|
|
unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
|
|
|
|
/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
|
|
|
|
/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
|
|
|
|
int index;
|
|
|
|
int index;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
index = (int) _getrecord_ex(c)->bidirectional;
|
|
|
|
index = (int) _getrecord_ex(c)->bidirectional;
|
|
|
|
if (UCD_Check(self, state->ucd_type)) {
|
|
|
|
if (UCD_Check(self)) {
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
if (old->category_changed == 0)
|
|
|
|
if (old->category_changed == 0)
|
|
|
|
index = 0; /* unassigned */
|
|
|
|
index = 0; /* unassigned */
|
|
|
@ -341,11 +327,10 @@ static int
|
|
|
|
unicodedata_UCD_combining_impl(PyObject *self, int chr)
|
|
|
|
unicodedata_UCD_combining_impl(PyObject *self, int chr)
|
|
|
|
/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
|
|
|
|
/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
|
|
|
|
int index;
|
|
|
|
int index;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
index = (int) _getrecord_ex(c)->combining;
|
|
|
|
index = (int) _getrecord_ex(c)->combining;
|
|
|
|
if (UCD_Check(self, state->ucd_type)) {
|
|
|
|
if (UCD_Check(self)) {
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
if (old->category_changed == 0)
|
|
|
|
if (old->category_changed == 0)
|
|
|
|
index = 0; /* unassigned */
|
|
|
|
index = 0; /* unassigned */
|
|
|
@ -370,11 +355,10 @@ static int
|
|
|
|
unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
|
|
|
|
unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
|
|
|
|
/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
|
|
|
|
/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
|
|
|
|
int index;
|
|
|
|
int index;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
index = (int) _getrecord_ex(c)->mirrored;
|
|
|
|
index = (int) _getrecord_ex(c)->mirrored;
|
|
|
|
if (UCD_Check(self, state->ucd_type)) {
|
|
|
|
if (UCD_Check(self)) {
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
if (old->category_changed == 0)
|
|
|
|
if (old->category_changed == 0)
|
|
|
|
index = 0; /* unassigned */
|
|
|
|
index = 0; /* unassigned */
|
|
|
@ -398,11 +382,10 @@ static PyObject *
|
|
|
|
unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
|
|
|
|
unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
|
|
|
|
/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
|
|
|
|
/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
|
|
|
|
int index;
|
|
|
|
int index;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
index = (int) _getrecord_ex(c)->east_asian_width;
|
|
|
|
index = (int) _getrecord_ex(c)->east_asian_width;
|
|
|
|
if (UCD_Check(self, state->ucd_type)) {
|
|
|
|
if (UCD_Check(self)) {
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
if (old->category_changed == 0)
|
|
|
|
if (old->category_changed == 0)
|
|
|
|
index = 0; /* unassigned */
|
|
|
|
index = 0; /* unassigned */
|
|
|
@ -428,7 +411,6 @@ static PyObject *
|
|
|
|
unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
|
|
|
|
unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
|
|
|
|
/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
|
|
|
|
/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
|
|
|
|
char decomp[256];
|
|
|
|
char decomp[256];
|
|
|
|
int code, index, count;
|
|
|
|
int code, index, count;
|
|
|
|
size_t i;
|
|
|
|
size_t i;
|
|
|
@ -437,7 +419,7 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
|
|
|
|
|
|
|
|
|
|
|
|
code = (int)c;
|
|
|
|
code = (int)c;
|
|
|
|
|
|
|
|
|
|
|
|
if (UCD_Check(self, state->ucd_type)) {
|
|
|
|
if (UCD_Check(self)) {
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
const change_record *old = get_old_record(self, c);
|
|
|
|
if (old->category_changed == 0)
|
|
|
|
if (old->category_changed == 0)
|
|
|
|
return PyUnicode_FromString(""); /* unassigned */
|
|
|
|
return PyUnicode_FromString(""); /* unassigned */
|
|
|
@ -480,13 +462,14 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
static void
|
|
|
|
get_decomp_record(unicodedata_module_state *state, PyObject *self,
|
|
|
|
get_decomp_record(PyObject *self, Py_UCS4 code,
|
|
|
|
Py_UCS4 code, int *index, int *prefix, int *count)
|
|
|
|
int *index, int *prefix, int *count)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
if (code >= 0x110000) {
|
|
|
|
if (code >= 0x110000) {
|
|
|
|
*index = 0;
|
|
|
|
*index = 0;
|
|
|
|
} else if (UCD_Check(self, state->ucd_type) &&
|
|
|
|
}
|
|
|
|
get_old_record(self, code)->category_changed==0) {
|
|
|
|
else if (UCD_Check(self)
|
|
|
|
|
|
|
|
&& get_old_record(self, code)->category_changed==0) {
|
|
|
|
/* unassigned in old version */
|
|
|
|
/* unassigned in old version */
|
|
|
|
*index = 0;
|
|
|
|
*index = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -515,8 +498,7 @@ get_decomp_record(unicodedata_module_state *state, PyObject *self,
|
|
|
|
#define SCount (LCount*NCount)
|
|
|
|
#define SCount (LCount*NCount)
|
|
|
|
|
|
|
|
|
|
|
|
static PyObject*
|
|
|
|
static PyObject*
|
|
|
|
nfd_nfkd(unicodedata_module_state *state, PyObject *self,
|
|
|
|
nfd_nfkd(PyObject *self, PyObject *input, int k)
|
|
|
|
PyObject *input, int k)
|
|
|
|
|
|
|
|
{
|
|
|
|
{
|
|
|
|
PyObject *result;
|
|
|
|
PyObject *result;
|
|
|
|
Py_UCS4 *output;
|
|
|
|
Py_UCS4 *output;
|
|
|
@ -584,7 +566,7 @@ nfd_nfkd(unicodedata_module_state *state, PyObject *self,
|
|
|
|
continue;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* normalization changes */
|
|
|
|
/* normalization changes */
|
|
|
|
if (UCD_Check(self, state->ucd_type)) {
|
|
|
|
if (UCD_Check(self)) {
|
|
|
|
Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
|
|
|
|
Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
|
|
|
|
if (value != 0) {
|
|
|
|
if (value != 0) {
|
|
|
|
stack[stackptr++] = value;
|
|
|
|
stack[stackptr++] = value;
|
|
|
@ -593,7 +575,7 @@ nfd_nfkd(unicodedata_module_state *state, PyObject *self,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Other decompositions. */
|
|
|
|
/* Other decompositions. */
|
|
|
|
get_decomp_record(state, self, code, &index, &prefix, &count);
|
|
|
|
get_decomp_record(self, code, &index, &prefix, &count);
|
|
|
|
|
|
|
|
|
|
|
|
/* Copy character if it is not decomposable, or has a
|
|
|
|
/* Copy character if it is not decomposable, or has a
|
|
|
|
compatibility decomposition, but we do NFD. */
|
|
|
|
compatibility decomposition, but we do NFD. */
|
|
|
@ -665,7 +647,7 @@ find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static PyObject*
|
|
|
|
static PyObject*
|
|
|
|
nfc_nfkc(unicodedata_module_state *state, PyObject *self, PyObject *input, int k)
|
|
|
|
nfc_nfkc(PyObject *self, PyObject *input, int k)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
PyObject *result;
|
|
|
|
PyObject *result;
|
|
|
|
int kind;
|
|
|
|
int kind;
|
|
|
@ -677,7 +659,7 @@ nfc_nfkc(unicodedata_module_state *state, PyObject *self, PyObject *input, int k
|
|
|
|
Py_ssize_t skipped[20];
|
|
|
|
Py_ssize_t skipped[20];
|
|
|
|
int cskipped = 0;
|
|
|
|
int cskipped = 0;
|
|
|
|
|
|
|
|
|
|
|
|
result = nfd_nfkd(state, self, input, k);
|
|
|
|
result = nfd_nfkd(self, input, k);
|
|
|
|
if (!result)
|
|
|
|
if (!result)
|
|
|
|
return NULL;
|
|
|
|
return NULL;
|
|
|
|
/* result will be "ready". */
|
|
|
|
/* result will be "ready". */
|
|
|
@ -820,13 +802,13 @@ typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
|
|
|
|
* https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
|
|
|
|
* https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
static QuickcheckResult
|
|
|
|
static QuickcheckResult
|
|
|
|
is_normalized_quickcheck(unicodedata_module_state *state, PyObject *self,
|
|
|
|
is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
|
|
|
|
PyObject *input, bool nfc, bool k, bool yes_only)
|
|
|
|
bool yes_only)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
/* An older version of the database is requested, quickchecks must be
|
|
|
|
/* UCD 3.2.0 is requested, quickchecks must be disabled. */
|
|
|
|
disabled. */
|
|
|
|
if (UCD_Check(self)) {
|
|
|
|
if (UCD_Check(self, state->ucd_type))
|
|
|
|
|
|
|
|
return NO;
|
|
|
|
return NO;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
Py_ssize_t i, len;
|
|
|
|
Py_ssize_t i, len;
|
|
|
|
int kind;
|
|
|
|
int kind;
|
|
|
@ -885,7 +867,6 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
|
|
|
|
PyObject *input)
|
|
|
|
PyObject *input)
|
|
|
|
/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
|
|
|
|
/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
|
|
|
|
if (PyUnicode_READY(input) == -1) {
|
|
|
|
if (PyUnicode_READY(input) == -1) {
|
|
|
|
return NULL;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -921,10 +902,10 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
|
|
|
|
return NULL;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
m = is_normalized_quickcheck(state, self, input, nfc, k, false);
|
|
|
|
m = is_normalized_quickcheck(self, input, nfc, k, false);
|
|
|
|
|
|
|
|
|
|
|
|
if (m == MAYBE) {
|
|
|
|
if (m == MAYBE) {
|
|
|
|
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(state, self, input, k);
|
|
|
|
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
|
|
|
|
if (cmp == NULL) {
|
|
|
|
if (cmp == NULL) {
|
|
|
|
return NULL;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -959,7 +940,6 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
|
|
|
|
PyObject *input)
|
|
|
|
PyObject *input)
|
|
|
|
/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
|
|
|
|
/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
|
|
|
|
if (PyUnicode_GET_LENGTH(input) == 0) {
|
|
|
|
if (PyUnicode_GET_LENGTH(input) == 0) {
|
|
|
|
/* Special case empty input strings, since resizing
|
|
|
|
/* Special case empty input strings, since resizing
|
|
|
|
them later would cause internal errors. */
|
|
|
|
them later would cause internal errors. */
|
|
|
@ -968,36 +948,36 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
|
|
|
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
|
|
|
|
if (is_normalized_quickcheck(state, self, input,
|
|
|
|
if (is_normalized_quickcheck(self, input,
|
|
|
|
true, false, true) == YES) {
|
|
|
|
true, false, true) == YES) {
|
|
|
|
Py_INCREF(input);
|
|
|
|
Py_INCREF(input);
|
|
|
|
return input;
|
|
|
|
return input;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nfc_nfkc(state, self, input, 0);
|
|
|
|
return nfc_nfkc(self, input, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
|
|
|
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
|
|
|
|
if (is_normalized_quickcheck(state, self, input,
|
|
|
|
if (is_normalized_quickcheck(self, input,
|
|
|
|
true, true, true) == YES) {
|
|
|
|
true, true, true) == YES) {
|
|
|
|
Py_INCREF(input);
|
|
|
|
Py_INCREF(input);
|
|
|
|
return input;
|
|
|
|
return input;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nfc_nfkc(state, self, input, 1);
|
|
|
|
return nfc_nfkc(self, input, 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
|
|
|
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
|
|
|
|
if (is_normalized_quickcheck(state, self, input,
|
|
|
|
if (is_normalized_quickcheck(self, input,
|
|
|
|
false, false, true) == YES) {
|
|
|
|
false, false, true) == YES) {
|
|
|
|
Py_INCREF(input);
|
|
|
|
Py_INCREF(input);
|
|
|
|
return input;
|
|
|
|
return input;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nfd_nfkd(state, self, input, 0);
|
|
|
|
return nfd_nfkd(self, input, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
|
|
|
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
|
|
|
|
if (is_normalized_quickcheck(state, self, input,
|
|
|
|
if (is_normalized_quickcheck(self, input,
|
|
|
|
false, true, true) == YES) {
|
|
|
|
false, true, true) == YES) {
|
|
|
|
Py_INCREF(input);
|
|
|
|
Py_INCREF(input);
|
|
|
|
return input;
|
|
|
|
return input;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nfd_nfkd(state, self, input, 1);
|
|
|
|
return nfd_nfkd(self, input, 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
|
|
|
|
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
|
|
|
|
return NULL;
|
|
|
|
return NULL;
|
|
|
@ -1080,7 +1060,7 @@ is_unified_ideograph(Py_UCS4 code)
|
|
|
|
(cp < named_sequences_end))
|
|
|
|
(cp < named_sequences_end))
|
|
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
static int
|
|
|
|
_getucname(unicodedata_module_state *state, PyObject *self,
|
|
|
|
_getucname(PyObject *self,
|
|
|
|
Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
|
|
|
|
Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
/* Find the name associated with the given code point.
|
|
|
|
/* Find the name associated with the given code point.
|
|
|
@ -1098,7 +1078,7 @@ _getucname(unicodedata_module_state *state, PyObject *self,
|
|
|
|
if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
|
|
|
|
if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
|
|
if (UCD_Check(self, state->ucd_type)) {
|
|
|
|
if (UCD_Check(self)) {
|
|
|
|
/* in 3.2.0 there are no aliases and named sequences */
|
|
|
|
/* in 3.2.0 there are no aliases and named sequences */
|
|
|
|
const change_record *old;
|
|
|
|
const change_record *old;
|
|
|
|
if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
|
|
|
|
if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
|
|
|
@ -1182,23 +1162,21 @@ _getucname(unicodedata_module_state *state, PyObject *self,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
static int
|
|
|
|
capi_getucname(void *state_raw, PyObject *self, Py_UCS4 code,
|
|
|
|
capi_getucname(Py_UCS4 code,
|
|
|
|
char* buffer, int buflen,
|
|
|
|
char* buffer, int buflen,
|
|
|
|
int with_alias_and_seq)
|
|
|
|
int with_alias_and_seq)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = (unicodedata_module_state *)state_raw;
|
|
|
|
return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
|
|
|
|
return _getucname(state, self, code, buffer, buflen, with_alias_and_seq);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
static int
|
|
|
|
_cmpname(unicodedata_module_state *state, PyObject *self,
|
|
|
|
_cmpname(PyObject *self, int code, const char* name, int namelen)
|
|
|
|
int code, const char* name, int namelen)
|
|
|
|
|
|
|
|
{
|
|
|
|
{
|
|
|
|
/* check if code corresponds to the given name */
|
|
|
|
/* check if code corresponds to the given name */
|
|
|
|
int i;
|
|
|
|
int i;
|
|
|
|
char buffer[NAME_MAXLEN+1];
|
|
|
|
char buffer[NAME_MAXLEN+1];
|
|
|
|
if (!_getucname(state, self, code, buffer, NAME_MAXLEN, 1))
|
|
|
|
if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
for (i = 0; i < namelen; i++) {
|
|
|
|
for (i = 0; i < namelen; i++) {
|
|
|
|
if (Py_TOUPPER(name[i]) != buffer[i])
|
|
|
|
if (Py_TOUPPER(name[i]) != buffer[i])
|
|
|
@ -1243,7 +1221,7 @@ _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
static int
|
|
|
|
_getcode(unicodedata_module_state *state, PyObject* self,
|
|
|
|
_getcode(PyObject* self,
|
|
|
|
const char* name, int namelen, Py_UCS4* code, int with_named_seq)
|
|
|
|
const char* name, int namelen, Py_UCS4* code, int with_named_seq)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
/* Return the code point associated with the given name.
|
|
|
|
/* Return the code point associated with the given name.
|
|
|
@ -1305,7 +1283,7 @@ _getcode(unicodedata_module_state *state, PyObject* self,
|
|
|
|
v = code_hash[i];
|
|
|
|
v = code_hash[i];
|
|
|
|
if (!v)
|
|
|
|
if (!v)
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
if (_cmpname(state, self, v, name, namelen)) {
|
|
|
|
if (_cmpname(self, v, name, namelen)) {
|
|
|
|
return _check_alias_and_seq(v, code, with_named_seq);
|
|
|
|
return _check_alias_and_seq(v, code, with_named_seq);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
incr = (h ^ (h >> 3)) & mask;
|
|
|
|
incr = (h ^ (h >> 3)) & mask;
|
|
|
@ -1316,7 +1294,7 @@ _getcode(unicodedata_module_state *state, PyObject* self,
|
|
|
|
v = code_hash[i];
|
|
|
|
v = code_hash[i];
|
|
|
|
if (!v)
|
|
|
|
if (!v)
|
|
|
|
return 0;
|
|
|
|
return 0;
|
|
|
|
if (_cmpname(state, self, v, name, namelen)) {
|
|
|
|
if (_cmpname(self, v, name, namelen)) {
|
|
|
|
return _check_alias_and_seq(v, code, with_named_seq);
|
|
|
|
return _check_alias_and_seq(v, code, with_named_seq);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
incr = incr << 1;
|
|
|
|
incr = incr << 1;
|
|
|
@ -1326,15 +1304,20 @@ _getcode(unicodedata_module_state *state, PyObject* self,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
static int
|
|
|
|
capi_getcode(void *state_raw, PyObject* self,
|
|
|
|
capi_getcode(const char* name, int namelen, Py_UCS4* code,
|
|
|
|
const char* name, int namelen, Py_UCS4* code,
|
|
|
|
|
|
|
|
int with_named_seq)
|
|
|
|
int with_named_seq)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = (unicodedata_module_state *)state_raw;
|
|
|
|
return _getcode(NULL, name, namelen, code, with_named_seq);
|
|
|
|
return _getcode(state, self, name, namelen, code, with_named_seq);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static const _PyUnicode_Name_CAPI unicodedata_capi =
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
.getname = capi_getucname,
|
|
|
|
|
|
|
|
.getcode = capi_getcode,
|
|
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
|
|
/* Python bindings */
|
|
|
|
/* Python bindings */
|
|
|
|
|
|
|
|
|
|
|
@ -1356,11 +1339,10 @@ static PyObject *
|
|
|
|
unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
|
|
|
|
unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
|
|
|
|
/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
|
|
|
|
/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
|
|
|
|
char name[NAME_MAXLEN+1];
|
|
|
|
char name[NAME_MAXLEN+1];
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
Py_UCS4 c = (Py_UCS4)chr;
|
|
|
|
|
|
|
|
|
|
|
|
if (!_getucname(state, self, c, name, NAME_MAXLEN, 0)) {
|
|
|
|
if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
|
|
|
|
if (default_value == NULL) {
|
|
|
|
if (default_value == NULL) {
|
|
|
|
PyErr_SetString(PyExc_ValueError, "no such name");
|
|
|
|
PyErr_SetString(PyExc_ValueError, "no such name");
|
|
|
|
return NULL;
|
|
|
|
return NULL;
|
|
|
@ -1392,7 +1374,6 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
|
|
|
|
Py_ssize_clean_t name_length)
|
|
|
|
Py_ssize_clean_t name_length)
|
|
|
|
/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
|
|
|
|
/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
|
|
|
|
{
|
|
|
|
{
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
|
|
|
|
Py_UCS4 code;
|
|
|
|
Py_UCS4 code;
|
|
|
|
unsigned int index;
|
|
|
|
unsigned int index;
|
|
|
|
if (name_length > NAME_MAXLEN) {
|
|
|
|
if (name_length > NAME_MAXLEN) {
|
|
|
@ -1400,7 +1381,7 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
|
|
|
|
return NULL;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (!_getcode(state, self, name, (int)name_length, &code, 1)) {
|
|
|
|
if (!_getcode(self, name, (int)name_length, &code, 1)) {
|
|
|
|
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
|
|
|
|
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
|
|
|
|
return NULL;
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -1415,8 +1396,10 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
|
|
|
|
return PyUnicode_FromOrdinal(code);
|
|
|
|
return PyUnicode_FromOrdinal(code);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* XXX Add doc strings. */
|
|
|
|
// List of functions used to define module functions *AND* unicodedata.UCD
|
|
|
|
|
|
|
|
// methods. For module functions, self is the module. For UCD methods, self
|
|
|
|
|
|
|
|
// is an UCD instance. The UCD_Check() macro is used to check if self is
|
|
|
|
|
|
|
|
// an UCD instance.
|
|
|
|
static PyMethodDef unicodedata_functions[] = {
|
|
|
|
static PyMethodDef unicodedata_functions[] = {
|
|
|
|
UNICODEDATA_UCD_DECIMAL_METHODDEF
|
|
|
|
UNICODEDATA_UCD_DECIMAL_METHODDEF
|
|
|
|
UNICODEDATA_UCD_DIGIT_METHODDEF
|
|
|
|
UNICODEDATA_UCD_DIGIT_METHODDEF
|
|
|
@ -1501,41 +1484,64 @@ static struct PyModuleDef unicodedatamodule = {
|
|
|
|
NULL
|
|
|
|
NULL
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static int
|
|
|
|
|
|
|
|
unicodedata_exec(PyObject *module)
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
Py_SET_TYPE(&UCD_Type, &PyType_Type);
|
|
|
|
|
|
|
|
PyTypeObject *ucd_type = &UCD_Type;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (PyModule_AddType(module, ucd_type) < 0) {
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Previous versions */
|
|
|
|
|
|
|
|
PyObject *v;
|
|
|
|
|
|
|
|
v = new_previous_version(ucd_type, "3.2.0",
|
|
|
|
|
|
|
|
get_change_3_2_0, normalization_3_2_0);
|
|
|
|
|
|
|
|
if (v == NULL) {
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (PyModule_AddObject(module, "ucd_3_2_0", v) < 0) {
|
|
|
|
|
|
|
|
Py_DECREF(v);
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Export C API */
|
|
|
|
|
|
|
|
v = PyCapsule_New((void *)&unicodedata_capi, PyUnicodeData_CAPSULE_NAME,
|
|
|
|
|
|
|
|
NULL);
|
|
|
|
|
|
|
|
if (v == NULL) {
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if (PyModule_AddObject(module, "ucnhash_CAPI", v) < 0) {
|
|
|
|
|
|
|
|
Py_DECREF(v);
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
PyMODINIT_FUNC
|
|
|
|
PyMODINIT_FUNC
|
|
|
|
PyInit_unicodedata(void)
|
|
|
|
PyInit_unicodedata(void)
|
|
|
|
{
|
|
|
|
{
|
|
|
|
PyObject *m, *v;
|
|
|
|
PyObject *module = PyModule_Create(&unicodedatamodule);
|
|
|
|
unicodedata_module_state *state = &global_module_state;
|
|
|
|
if (!module) {
|
|
|
|
|
|
|
|
|
|
|
|
state->capi.size = sizeof(_PyUnicode_Name_CAPI);
|
|
|
|
|
|
|
|
state->capi.state = state;
|
|
|
|
|
|
|
|
state->capi.getname = capi_getucname;
|
|
|
|
|
|
|
|
state->capi.getcode = capi_getcode;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Py_SET_TYPE(&UCD_Type, &PyType_Type);
|
|
|
|
|
|
|
|
state->ucd_type = &UCD_Type;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
m = PyModule_Create(&unicodedatamodule);
|
|
|
|
|
|
|
|
if (!m)
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
|
|
|
|
if (unicodedata_exec(module) < 0) {
|
|
|
|
Py_INCREF(state->ucd_type);
|
|
|
|
Py_DECREF(module);
|
|
|
|
PyModule_AddObject(m, "UCD", (PyObject*)state->ucd_type);
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Previous versions */
|
|
|
|
return module;
|
|
|
|
v = new_previous_version(state, "3.2.0",
|
|
|
|
|
|
|
|
get_change_3_2_0, normalization_3_2_0);
|
|
|
|
|
|
|
|
if (v != NULL)
|
|
|
|
|
|
|
|
PyModule_AddObject(m, "ucd_3_2_0", v);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Export C API */
|
|
|
|
|
|
|
|
v = PyCapsule_New((void *)&state->capi, PyUnicodeData_CAPSULE_NAME, NULL);
|
|
|
|
|
|
|
|
if (v != NULL)
|
|
|
|
|
|
|
|
PyModule_AddObject(m, "ucnhash_CAPI", v);
|
|
|
|
|
|
|
|
return m;
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
/*
|
|
|
|
Local variables:
|
|
|
|
Local variables:
|
|
|
|
c-basic-offset: 4
|
|
|
|
c-basic-offset: 4
|
|
|
|