bpo-42157: unicodedata avoids references to UCD_Type (GH-22990)

* UCD_Check() uses PyModule_Check()
* Simplify the internal _PyUnicode_Name_CAPI structure:

  * Remove size and state members
  * Remove state and self parameters of getcode() and getname()
    functions

* Remove global_module_state
This commit is contained in:
Victor Stinner 2020-10-26 19:19:36 +01:00 committed by GitHub
parent 8374d2ee15
commit 920cb647ba
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 129 additions and 139 deletions

View File

@ -408,10 +408,8 @@ Porting to Python 3.10
(Contributed by Inada Naoki in :issue:`36346`.)
* The private ``_PyUnicode_Name_CAPI`` structure of the PyCapsule API
``unicodedata.ucnhash_CAPI`` moves to the internal C API. Moreover,
the structure gets a new ``state`` member which must be passed to the
``getcode()`` and ``getname()`` functions.
(Contributed by Victor Stinner in :issue:`1635741`.)
``unicodedata.ucnhash_CAPI`` moves to the internal C API.
(Contributed by Victor Stinner in :issue:`42157`.)
Deprecated
----------

View File

@ -15,25 +15,15 @@ extern "C" {
typedef struct {
/* Size of this struct */
int size;
// state which must be passed as the first parameter to getname()
// and getcode()
void *state;
/* Get name for a given character code. Returns non-zero if
success, zero if not. Does not set Python exceptions.
If self is NULL, data come from the default version of the database.
If it is not NULL, it should be a unicodedata.ucd_X_Y_Z object */
int (*getname)(void *state, PyObject *self, Py_UCS4 code,
char* buffer, int buflen,
/* Get name for a given character code.
Returns non-zero if success, zero if not.
Does not set Python exceptions. */
int (*getname)(Py_UCS4 code, char* buffer, int buflen,
int with_alias_and_seq);
/* Get character code for a given name. Same error handling
as for getname. */
int (*getcode)(void *state, PyObject *self,
const char* name, int namelen, Py_UCS4* code,
/* Get character code for a given name.
Same error handling as for getname(). */
int (*getcode)(const char* name, int namelen, Py_UCS4* code,
int with_named_seq);
} _PyUnicode_Name_CAPI;

View File

@ -1,4 +0,0 @@
The private ``_PyUnicode_Name_CAPI`` structure of the PyCapsule API
``unicodedata.ucnhash_CAPI`` moves to the internal C API. Moreover, the
structure gets a new ``state`` member which must be passed to the
``getcode()`` and ``getname()`` functions. Patch by Victor Stinner.

View File

@ -0,0 +1,3 @@
The private ``_PyUnicode_Name_CAPI`` structure of the PyCapsule API
``unicodedata.ucnhash_CAPI`` moves to the internal C API.
Patch by Victor Stinner.

View File

@ -93,29 +93,19 @@ static PyMemberDef DB_members[] = {
/* forward declaration */
static PyTypeObject UCD_Type;
typedef struct {
// Borrowed reference to &UCD_Type. It is used to prepare the code
// to convert the UCD_Type static type to a heap type.
PyTypeObject *ucd_type;
_PyUnicode_Name_CAPI capi;
} unicodedata_module_state;
// bpo-1635741: Temporary global state until the unicodedata module
// gets a real module state.
static unicodedata_module_state global_module_state;
// Check if self is an instance of ucd_type.
// Return 0 if self is NULL (when the PyCapsule C API is used).
#define UCD_Check(self, ucd_type) (self != NULL && Py_IS_TYPE(self, ucd_type))
// Check if self is a unicodedata.UCD instance.
// If self is NULL (when the PyCapsule C API is used), return 0.
// PyModule_Check() is used to avoid having to retrieve the ucd_type:
// the functions using this macro receive either the module or a UCD
// instance as self, so "not NULL and not a module" means "UCD instance".
// See the comment on unicodedata_functions for the rationale of this macro.
// The argument is parenthesized so that any expression can be passed safely;
// note that it is evaluated twice.
#define UCD_Check(self) ((self) != NULL && !PyModule_Check(self))
static PyObject*
new_previous_version(unicodedata_module_state *state,
new_previous_version(PyTypeObject *ucd_type,
const char*name, const change_record* (*getrecord)(Py_UCS4),
Py_UCS4 (*normalization)(Py_UCS4))
{
PreviousDBVersion *self;
self = PyObject_New(PreviousDBVersion, state->ucd_type);
self = PyObject_New(PreviousDBVersion, ucd_type);
if (self == NULL)
return NULL;
self->name = name;
@ -147,12 +137,11 @@ unicodedata_UCD_decimal_impl(PyObject *self, int chr,
PyObject *default_value)
/*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
{
unicodedata_module_state *state = &global_module_state;
int have_old = 0;
long rc;
Py_UCS4 c = (Py_UCS4)chr;
if (UCD_Check(self, state->ucd_type)) {
if (UCD_Check(self)) {
const change_record *old = get_old_record(self, c);
if (old->category_changed == 0) {
/* unassigned */
@ -236,12 +225,11 @@ unicodedata_UCD_numeric_impl(PyObject *self, int chr,
PyObject *default_value)
/*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
{
unicodedata_module_state *state = &global_module_state;
int have_old = 0;
double rc;
Py_UCS4 c = (Py_UCS4)chr;
if (UCD_Check(self, state->ucd_type)) {
if (UCD_Check(self)) {
const change_record *old = get_old_record(self, c);
if (old->category_changed == 0) {
/* unassigned */
@ -283,11 +271,10 @@ static PyObject *
unicodedata_UCD_category_impl(PyObject *self, int chr)
/*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
{
unicodedata_module_state *state = &global_module_state;
int index;
Py_UCS4 c = (Py_UCS4)chr;
index = (int) _getrecord_ex(c)->category;
if (UCD_Check(self, state->ucd_type)) {
if (UCD_Check(self)) {
const change_record *old = get_old_record(self, c);
if (old->category_changed != 0xFF)
index = old->category_changed;
@ -311,11 +298,10 @@ static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
/*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
{
unicodedata_module_state *state = &global_module_state;
int index;
Py_UCS4 c = (Py_UCS4)chr;
index = (int) _getrecord_ex(c)->bidirectional;
if (UCD_Check(self, state->ucd_type)) {
if (UCD_Check(self)) {
const change_record *old = get_old_record(self, c);
if (old->category_changed == 0)
index = 0; /* unassigned */
@ -341,11 +327,10 @@ static int
unicodedata_UCD_combining_impl(PyObject *self, int chr)
/*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
{
unicodedata_module_state *state = &global_module_state;
int index;
Py_UCS4 c = (Py_UCS4)chr;
index = (int) _getrecord_ex(c)->combining;
if (UCD_Check(self, state->ucd_type)) {
if (UCD_Check(self)) {
const change_record *old = get_old_record(self, c);
if (old->category_changed == 0)
index = 0; /* unassigned */
@ -370,11 +355,10 @@ static int
unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
/*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
{
unicodedata_module_state *state = &global_module_state;
int index;
Py_UCS4 c = (Py_UCS4)chr;
index = (int) _getrecord_ex(c)->mirrored;
if (UCD_Check(self, state->ucd_type)) {
if (UCD_Check(self)) {
const change_record *old = get_old_record(self, c);
if (old->category_changed == 0)
index = 0; /* unassigned */
@ -398,11 +382,10 @@ static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
/*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
{
unicodedata_module_state *state = &global_module_state;
int index;
Py_UCS4 c = (Py_UCS4)chr;
index = (int) _getrecord_ex(c)->east_asian_width;
if (UCD_Check(self, state->ucd_type)) {
if (UCD_Check(self)) {
const change_record *old = get_old_record(self, c);
if (old->category_changed == 0)
index = 0; /* unassigned */
@ -428,7 +411,6 @@ static PyObject *
unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
/*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
{
unicodedata_module_state *state = &global_module_state;
char decomp[256];
int code, index, count;
size_t i;
@ -437,7 +419,7 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
code = (int)c;
if (UCD_Check(self, state->ucd_type)) {
if (UCD_Check(self)) {
const change_record *old = get_old_record(self, c);
if (old->category_changed == 0)
return PyUnicode_FromString(""); /* unassigned */
@ -480,13 +462,14 @@ unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
}
static void
get_decomp_record(unicodedata_module_state *state, PyObject *self,
Py_UCS4 code, int *index, int *prefix, int *count)
get_decomp_record(PyObject *self, Py_UCS4 code,
int *index, int *prefix, int *count)
{
if (code >= 0x110000) {
*index = 0;
} else if (UCD_Check(self, state->ucd_type) &&
get_old_record(self, code)->category_changed==0) {
}
else if (UCD_Check(self)
&& get_old_record(self, code)->category_changed==0) {
/* unassigned in old version */
*index = 0;
}
@ -515,8 +498,7 @@ get_decomp_record(unicodedata_module_state *state, PyObject *self,
#define SCount (LCount*NCount)
static PyObject*
nfd_nfkd(unicodedata_module_state *state, PyObject *self,
PyObject *input, int k)
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
PyObject *result;
Py_UCS4 *output;
@ -584,7 +566,7 @@ nfd_nfkd(unicodedata_module_state *state, PyObject *self,
continue;
}
/* normalization changes */
if (UCD_Check(self, state->ucd_type)) {
if (UCD_Check(self)) {
Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
if (value != 0) {
stack[stackptr++] = value;
@ -593,7 +575,7 @@ nfd_nfkd(unicodedata_module_state *state, PyObject *self,
}
/* Other decompositions. */
get_decomp_record(state, self, code, &index, &prefix, &count);
get_decomp_record(self, code, &index, &prefix, &count);
/* Copy character if it is not decomposable, or has a
compatibility decomposition, but we do NFD. */
@ -665,7 +647,7 @@ find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
}
static PyObject*
nfc_nfkc(unicodedata_module_state *state, PyObject *self, PyObject *input, int k)
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
PyObject *result;
int kind;
@ -677,7 +659,7 @@ nfc_nfkc(unicodedata_module_state *state, PyObject *self, PyObject *input, int k
Py_ssize_t skipped[20];
int cskipped = 0;
result = nfd_nfkd(state, self, input, k);
result = nfd_nfkd(self, input, k);
if (!result)
return NULL;
/* result will be "ready". */
@ -820,13 +802,13 @@ typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
* https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
*/
static QuickcheckResult
is_normalized_quickcheck(unicodedata_module_state *state, PyObject *self,
PyObject *input, bool nfc, bool k, bool yes_only)
is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
bool yes_only)
{
/* An older version of the database is requested, quickchecks must be
disabled. */
if (UCD_Check(self, state->ucd_type))
/* UCD 3.2.0 is requested, quickchecks must be disabled. */
if (UCD_Check(self)) {
return NO;
}
Py_ssize_t i, len;
int kind;
@ -885,7 +867,6 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
PyObject *input)
/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
{
unicodedata_module_state *state = &global_module_state;
if (PyUnicode_READY(input) == -1) {
return NULL;
}
@ -921,10 +902,10 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
return NULL;
}
m = is_normalized_quickcheck(state, self, input, nfc, k, false);
m = is_normalized_quickcheck(self, input, nfc, k, false);
if (m == MAYBE) {
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(state, self, input, k);
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
if (cmp == NULL) {
return NULL;
}
@ -959,7 +940,6 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
PyObject *input)
/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
{
unicodedata_module_state *state = &global_module_state;
if (PyUnicode_GET_LENGTH(input) == 0) {
/* Special case empty input strings, since resizing
them later would cause internal errors. */
@ -968,36 +948,36 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
}
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
if (is_normalized_quickcheck(state, self, input,
if (is_normalized_quickcheck(self, input,
true, false, true) == YES) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(state, self, input, 0);
return nfc_nfkc(self, input, 0);
}
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
if (is_normalized_quickcheck(state, self, input,
if (is_normalized_quickcheck(self, input,
true, true, true) == YES) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(state, self, input, 1);
return nfc_nfkc(self, input, 1);
}
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
if (is_normalized_quickcheck(state, self, input,
if (is_normalized_quickcheck(self, input,
false, false, true) == YES) {
Py_INCREF(input);
return input;
}
return nfd_nfkd(state, self, input, 0);
return nfd_nfkd(self, input, 0);
}
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
if (is_normalized_quickcheck(state, self, input,
if (is_normalized_quickcheck(self, input,
false, true, true) == YES) {
Py_INCREF(input);
return input;
}
return nfd_nfkd(state, self, input, 1);
return nfd_nfkd(self, input, 1);
}
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
return NULL;
@ -1080,7 +1060,7 @@ is_unified_ideograph(Py_UCS4 code)
(cp < named_sequences_end))
static int
_getucname(unicodedata_module_state *state, PyObject *self,
_getucname(PyObject *self,
Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
{
/* Find the name associated with the given code point.
@ -1098,7 +1078,7 @@ _getucname(unicodedata_module_state *state, PyObject *self,
if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
return 0;
if (UCD_Check(self, state->ucd_type)) {
if (UCD_Check(self)) {
/* in 3.2.0 there are no aliases and named sequences */
const change_record *old;
if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
@ -1182,23 +1162,21 @@ _getucname(unicodedata_module_state *state, PyObject *self,
}
static int
capi_getucname(void *state_raw, PyObject *self, Py_UCS4 code,
capi_getucname(Py_UCS4 code,
char* buffer, int buflen,
int with_alias_and_seq)
{
unicodedata_module_state *state = (unicodedata_module_state *)state_raw;
return _getucname(state, self, code, buffer, buflen, with_alias_and_seq);
return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
}
static int
_cmpname(unicodedata_module_state *state, PyObject *self,
int code, const char* name, int namelen)
_cmpname(PyObject *self, int code, const char* name, int namelen)
{
/* check if code corresponds to the given name */
int i;
char buffer[NAME_MAXLEN+1];
if (!_getucname(state, self, code, buffer, NAME_MAXLEN, 1))
if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
return 0;
for (i = 0; i < namelen; i++) {
if (Py_TOUPPER(name[i]) != buffer[i])
@ -1243,7 +1221,7 @@ _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
}
static int
_getcode(unicodedata_module_state *state, PyObject* self,
_getcode(PyObject* self,
const char* name, int namelen, Py_UCS4* code, int with_named_seq)
{
/* Return the code point associated with the given name.
@ -1305,7 +1283,7 @@ _getcode(unicodedata_module_state *state, PyObject* self,
v = code_hash[i];
if (!v)
return 0;
if (_cmpname(state, self, v, name, namelen)) {
if (_cmpname(self, v, name, namelen)) {
return _check_alias_and_seq(v, code, with_named_seq);
}
incr = (h ^ (h >> 3)) & mask;
@ -1316,7 +1294,7 @@ _getcode(unicodedata_module_state *state, PyObject* self,
v = code_hash[i];
if (!v)
return 0;
if (_cmpname(state, self, v, name, namelen)) {
if (_cmpname(self, v, name, namelen)) {
return _check_alias_and_seq(v, code, with_named_seq);
}
incr = incr << 1;
@ -1326,15 +1304,20 @@ _getcode(unicodedata_module_state *state, PyObject* self,
}
static int
capi_getcode(void *state_raw, PyObject* self,
const char* name, int namelen, Py_UCS4* code,
capi_getcode(const char* name, int namelen, Py_UCS4* code,
int with_named_seq)
{
unicodedata_module_state *state = (unicodedata_module_state *)state_raw;
return _getcode(state, self, name, namelen, code, with_named_seq);
return _getcode(NULL, name, namelen, code, with_named_seq);
}
// Function table exported to other C code through the
// "unicodedata.ucnhash_CAPI" capsule (created in unicodedata_exec()).
// Both entry points forward to the internal helpers with self == NULL,
// so UCD_Check() is false for every call made through this table.
static const _PyUnicode_Name_CAPI unicodedata_capi =
{
// Code point -> character name.
.getname = capi_getucname,
// Character name -> code point.
.getcode = capi_getcode,
};
/* -------------------------------------------------------------------- */
/* Python bindings */
@ -1356,11 +1339,10 @@ static PyObject *
unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
/*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
{
unicodedata_module_state *state = &global_module_state;
char name[NAME_MAXLEN+1];
Py_UCS4 c = (Py_UCS4)chr;
if (!_getucname(state, self, c, name, NAME_MAXLEN, 0)) {
if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
if (default_value == NULL) {
PyErr_SetString(PyExc_ValueError, "no such name");
return NULL;
@ -1392,7 +1374,6 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
Py_ssize_clean_t name_length)
/*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
{
unicodedata_module_state *state = &global_module_state;
Py_UCS4 code;
unsigned int index;
if (name_length > NAME_MAXLEN) {
@ -1400,7 +1381,7 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
return NULL;
}
if (!_getcode(state, self, name, (int)name_length, &code, 1)) {
if (!_getcode(self, name, (int)name_length, &code, 1)) {
PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
return NULL;
}
@ -1415,8 +1396,10 @@ unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
return PyUnicode_FromOrdinal(code);
}
/* XXX Add doc strings. */
// List of functions used to define module functions *AND* unicodedata.UCD
// methods. For module functions, self is the module. For UCD methods, self
// is a UCD instance. The UCD_Check() macro is used to check if self is
// a UCD instance.
static PyMethodDef unicodedata_functions[] = {
UNICODEDATA_UCD_DECIMAL_METHODDEF
UNICODEDATA_UCD_DIGIT_METHODDEF
@ -1501,41 +1484,64 @@ static struct PyModuleDef unicodedatamodule = {
NULL
};
/* Fill in the attributes of a freshly created unicodedata module:
   the "unidata_version" string, the UCD type, the "ucd_3_2_0" previous
   database object and the "ucnhash_CAPI" capsule, in that order.

   Returns 0 on success and -1 as soon as any step fails. */
static int
unicodedata_exec(PyObject *module)
{
    Py_SET_TYPE(&UCD_Type, &PyType_Type);

    int rc = PyModule_AddStringConstant(module, "unidata_version",
                                        UNIDATA_VERSION);
    if (rc < 0) {
        return -1;
    }

    rc = PyModule_AddType(module, &UCD_Type);
    if (rc < 0) {
        return -1;
    }

    /* Object giving access to the Unicode 3.2.0 database. */
    PyObject *old_db = new_previous_version(&UCD_Type, "3.2.0",
                                            get_change_3_2_0,
                                            normalization_3_2_0);
    if (!old_db) {
        return -1;
    }
    if (PyModule_AddObject(module, "ucd_3_2_0", old_db) < 0) {
        /* The reference is only stolen on success: drop it ourselves. */
        Py_DECREF(old_db);
        return -1;
    }

    /* Capsule exposing the name <-> code point C API. */
    PyObject *capsule = PyCapsule_New((void *)&unicodedata_capi,
                                      PyUnicodeData_CAPSULE_NAME, NULL);
    if (!capsule) {
        return -1;
    }
    if (PyModule_AddObject(module, "ucnhash_CAPI", capsule) < 0) {
        /* Same ownership rule as above. */
        Py_DECREF(capsule);
        return -1;
    }

    return 0;
}
PyMODINIT_FUNC
PyInit_unicodedata(void)
{
PyObject *m, *v;
unicodedata_module_state *state = &global_module_state;
state->capi.size = sizeof(_PyUnicode_Name_CAPI);
state->capi.state = state;
state->capi.getname = capi_getucname;
state->capi.getcode = capi_getcode;
Py_SET_TYPE(&UCD_Type, &PyType_Type);
state->ucd_type = &UCD_Type;
m = PyModule_Create(&unicodedatamodule);
if (!m)
PyObject *module = PyModule_Create(&unicodedatamodule);
if (!module) {
return NULL;
}
PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
Py_INCREF(state->ucd_type);
PyModule_AddObject(m, "UCD", (PyObject*)state->ucd_type);
if (unicodedata_exec(module) < 0) {
Py_DECREF(module);
return NULL;
}
/* Previous versions */
v = new_previous_version(state, "3.2.0",
get_change_3_2_0, normalization_3_2_0);
if (v != NULL)
PyModule_AddObject(m, "ucd_3_2_0", v);
/* Export C API */
v = PyCapsule_New((void *)&state->capi, PyUnicodeData_CAPSULE_NAME, NULL);
if (v != NULL)
PyModule_AddObject(m, "ucnhash_CAPI", v);
return m;
return module;
}
/*
Local variables:
c-basic-offset: 4

View File

@ -6523,8 +6523,7 @@ _PyUnicode_DecodeUnicodeEscape(const char *s,
s++;
ch = 0xffffffff; /* in case 'getcode' messes up */
if (namelen <= INT_MAX &&
ucnhash_capi->getcode(ucnhash_capi->state, NULL,
start, (int)namelen,
ucnhash_capi->getcode(start, (int)namelen,
&ch, 0)) {
assert(ch <= MAX_UNICODE);
WRITE_CHAR(ch);

View File

@ -987,8 +987,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
for (i = start, ressize = 0; i < end; ++i) {
/* object is guaranteed to be "ready" */
c = PyUnicode_READ_CHAR(object, i);
if (ucnhash_capi->getname(ucnhash_capi->state, NULL,
c, buffer, sizeof(buffer), 1)) {
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
replsize = 1+1+1+(int)strlen(buffer)+1;
}
else if (c >= 0x10000) {
@ -1011,8 +1010,7 @@ PyObject *PyCodec_NameReplaceErrors(PyObject *exc)
i < end; ++i) {
c = PyUnicode_READ_CHAR(object, i);
*outp++ = '\\';
if (ucnhash_capi->getname(ucnhash_capi->state, NULL,
c, buffer, sizeof(buffer), 1)) {
if (ucnhash_capi->getname(c, buffer, sizeof(buffer), 1)) {
*outp++ = 'N';
*outp++ = '{';
strcpy((char *)outp, buffer);