mirror of https://github.com/python/cpython
closes bpo-32285: Add unicodedata.is_normalized. (GH-4806)
This commit is contained in:
parent
5d236cafd7
commit
2810dd7be9
|
@ -133,6 +133,13 @@ following functions:
|
||||||
a human reader, if one has combining characters and the other
|
a human reader, if one has combining characters and the other
|
||||||
doesn't, they may not compare equal.
|
doesn't, they may not compare equal.
|
||||||
|
|
||||||
|
.. function:: is_normalized(form, unistr)
|
||||||
|
|
||||||
|
Return whether the Unicode string *unistr* is in the normal form *form*. Valid
|
||||||
|
values for *form* are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
|
||||||
|
|
||||||
|
.. versionadded:: 3.8
|
||||||
|
|
||||||
|
|
||||||
In addition, the module exposes the following constant:
|
In addition, the module exposes the following constant:
|
||||||
|
|
||||||
|
|
|
@ -204,6 +204,13 @@ Added method :meth:`~tkinter.Canvas.moveto`
|
||||||
in the :class:`tkinter.Canvas` class.
|
in the :class:`tkinter.Canvas` class.
|
||||||
(Contributed by Juliette Monsel in :issue:`23831`.)
|
(Contributed by Juliette Monsel in :issue:`23831`.)
|
||||||
|
|
||||||
|
unicodedata
|
||||||
|
-----------
|
||||||
|
|
||||||
|
* New function :func:`~unicodedata.is_normalized` can be used to verify a string
|
||||||
|
is in a specific normal form. (Contributed by Max Belanger and David Euresti in
|
||||||
|
:issue:`32285`.)
|
||||||
|
|
||||||
venv
|
venv
|
||||||
----
|
----
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ import unittest
|
||||||
|
|
||||||
from http.client import HTTPException
|
from http.client import HTTPException
|
||||||
import sys
|
import sys
|
||||||
from unicodedata import normalize, unidata_version
|
from unicodedata import normalize, is_normalized, unidata_version
|
||||||
|
|
||||||
TESTDATAFILE = "NormalizationTest.txt"
|
TESTDATAFILE = "NormalizationTest.txt"
|
||||||
TESTDATAURL = "http://www.pythontest.net/unicode/" + unidata_version + "/" + TESTDATAFILE
|
TESTDATAURL = "http://www.pythontest.net/unicode/" + unidata_version + "/" + TESTDATAFILE
|
||||||
|
@ -88,6 +88,15 @@ class NormalizationTest(unittest.TestCase):
|
||||||
NFKD(c3) == NFKD(c4) == NFKD(c5),
|
NFKD(c3) == NFKD(c4) == NFKD(c5),
|
||||||
line)
|
line)
|
||||||
|
|
||||||
|
self.assertTrue(is_normalized("NFC", c2))
|
||||||
|
self.assertTrue(is_normalized("NFC", c4))
|
||||||
|
|
||||||
|
self.assertTrue(is_normalized("NFD", c3))
|
||||||
|
self.assertTrue(is_normalized("NFD", c5))
|
||||||
|
|
||||||
|
self.assertTrue(is_normalized("NFKC", c4))
|
||||||
|
self.assertTrue(is_normalized("NFKD", c5))
|
||||||
|
|
||||||
# Record part 1 data
|
# Record part 1 data
|
||||||
if part == "@Part1":
|
if part == "@Part1":
|
||||||
part1_data[c1] = 1
|
part1_data[c1] = 1
|
||||||
|
|
|
@ -0,0 +1,2 @@
|
||||||
|
New function unicodedata.is_normalized, which can check whether a string is
|
||||||
|
in a specific normal form.
|
|
@ -284,6 +284,38 @@ exit:
|
||||||
return return_value;
|
return return_value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PyDoc_STRVAR(unicodedata_UCD_is_normalized__doc__,
|
||||||
|
"is_normalized($self, form, unistr, /)\n"
|
||||||
|
"--\n"
|
||||||
|
"\n"
|
||||||
|
"Return whether the Unicode string unistr is in the normal form \'form\'.\n"
|
||||||
|
"\n"
|
||||||
|
"Valid values for form are \'NFC\', \'NFKC\', \'NFD\', and \'NFKD\'.");
|
||||||
|
|
||||||
|
#define UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF \
|
||||||
|
{"is_normalized", (PyCFunction)unicodedata_UCD_is_normalized, METH_FASTCALL, unicodedata_UCD_is_normalized__doc__},
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
|
||||||
|
PyObject *input);
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
unicodedata_UCD_is_normalized(PyObject *self, PyObject *const *args, Py_ssize_t nargs)
|
||||||
|
{
|
||||||
|
PyObject *return_value = NULL;
|
||||||
|
PyObject *form;
|
||||||
|
PyObject *input;
|
||||||
|
|
||||||
|
if (!_PyArg_ParseStack(args, nargs, "UU:is_normalized",
|
||||||
|
&form, &input)) {
|
||||||
|
goto exit;
|
||||||
|
}
|
||||||
|
return_value = unicodedata_UCD_is_normalized_impl(self, form, input);
|
||||||
|
|
||||||
|
exit:
|
||||||
|
return return_value;
|
||||||
|
}
|
||||||
|
|
||||||
PyDoc_STRVAR(unicodedata_UCD_normalize__doc__,
|
PyDoc_STRVAR(unicodedata_UCD_normalize__doc__,
|
||||||
"normalize($self, form, unistr, /)\n"
|
"normalize($self, form, unistr, /)\n"
|
||||||
"--\n"
|
"--\n"
|
||||||
|
@ -296,17 +328,17 @@ PyDoc_STRVAR(unicodedata_UCD_normalize__doc__,
|
||||||
{"normalize", (PyCFunction)unicodedata_UCD_normalize, METH_FASTCALL, unicodedata_UCD_normalize__doc__},
|
{"normalize", (PyCFunction)unicodedata_UCD_normalize, METH_FASTCALL, unicodedata_UCD_normalize__doc__},
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
|
unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
|
||||||
PyObject *input);
|
PyObject *input);
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
unicodedata_UCD_normalize(PyObject *self, PyObject *const *args, Py_ssize_t nargs)
|
unicodedata_UCD_normalize(PyObject *self, PyObject *const *args, Py_ssize_t nargs)
|
||||||
{
|
{
|
||||||
PyObject *return_value = NULL;
|
PyObject *return_value = NULL;
|
||||||
const char *form;
|
PyObject *form;
|
||||||
PyObject *input;
|
PyObject *input;
|
||||||
|
|
||||||
if (!_PyArg_ParseStack(args, nargs, "sU:normalize",
|
if (!_PyArg_ParseStack(args, nargs, "UU:normalize",
|
||||||
&form, &input)) {
|
&form, &input)) {
|
||||||
goto exit;
|
goto exit;
|
||||||
}
|
}
|
||||||
|
@ -379,4 +411,4 @@ unicodedata_UCD_lookup(PyObject *self, PyObject *arg)
|
||||||
exit:
|
exit:
|
||||||
return return_value;
|
return return_value;
|
||||||
}
|
}
|
||||||
/*[clinic end generated code: output=dc899bff0ecd14c1 input=a9049054013a1b77]*/
|
/*[clinic end generated code: output=2c5fbf597c18f6b8 input=a9049054013a1b77]*/
|
||||||
|
|
|
@ -19,6 +19,11 @@
|
||||||
#include "ucnhash.h"
|
#include "ucnhash.h"
|
||||||
#include "structmember.h"
|
#include "structmember.h"
|
||||||
|
|
||||||
|
_Py_IDENTIFIER(NFC);
|
||||||
|
_Py_IDENTIFIER(NFD);
|
||||||
|
_Py_IDENTIFIER(NFKC);
|
||||||
|
_Py_IDENTIFIER(NFKD);
|
||||||
|
|
||||||
/*[clinic input]
|
/*[clinic input]
|
||||||
module unicodedata
|
module unicodedata
|
||||||
class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
|
class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
|
||||||
|
@ -770,8 +775,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Return 1 if the input is certainly normalized, 0 if it might not be. */
|
typedef enum {YES, NO, MAYBE} NormalMode;
|
||||||
static int
|
|
||||||
|
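/* Return YES if the input is certainly normalized, MAYBE if the quick check
   cannot tell and a full normalization is needed to decide, and NO if the
   input is in non-canonical order (not normalized) or if quickchecks are
   disabled because an older version of the database is requested. */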
|
||||||
|
static NormalMode
|
||||||
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
|
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
|
||||||
{
|
{
|
||||||
Py_ssize_t i, len;
|
Py_ssize_t i, len;
|
||||||
|
@ -782,7 +789,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
|
||||||
/* An older version of the database is requested, quickchecks must be
|
/* An older version of the database is requested, quickchecks must be
|
||||||
disabled. */
|
disabled. */
|
||||||
if (self && UCD_Check(self))
|
if (self && UCD_Check(self))
|
||||||
return 0;
|
return NO;
|
||||||
|
|
||||||
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
|
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
|
||||||
as described in http://unicode.org/reports/tr15/#Annex8. */
|
as described in http://unicode.org/reports/tr15/#Annex8. */
|
||||||
|
@ -799,19 +806,92 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
|
||||||
unsigned char quickcheck = record->normalization_quick_check;
|
unsigned char quickcheck = record->normalization_quick_check;
|
||||||
|
|
||||||
if (quickcheck & quickcheck_mask)
|
if (quickcheck & quickcheck_mask)
|
||||||
return 0; /* this string might need normalization */
|
return MAYBE; /* this string might need normalization */
|
||||||
if (combining && prev_combining > combining)
|
if (combining && prev_combining > combining)
|
||||||
return 0; /* non-canonical sort order, not normalized */
|
return NO; /* non-canonical sort order, not normalized */
|
||||||
prev_combining = combining;
|
prev_combining = combining;
|
||||||
}
|
}
|
||||||
return 1; /* certainly normalized */
|
return YES; /* certainly normalized */
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*[clinic input]
|
||||||
|
unicodedata.UCD.is_normalized
|
||||||
|
|
||||||
|
self: self
|
||||||
|
form: unicode
|
||||||
|
unistr as input: unicode
|
||||||
|
/
|
||||||
|
|
||||||
|
Return whether the Unicode string unistr is in the normal form 'form'.
|
||||||
|
|
||||||
|
Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
|
||||||
|
[clinic start generated code]*/
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
|
||||||
|
PyObject *input)
|
||||||
|
/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
|
||||||
|
{
|
||||||
|
if (PyUnicode_READY(input) == -1) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (PyUnicode_GET_LENGTH(input) == 0) {
|
||||||
|
/* special case empty input strings. */
|
||||||
|
Py_RETURN_TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
PyObject *result;
|
||||||
|
int nfc = 0;
|
||||||
|
int k = 0;
|
||||||
|
NormalMode m;
|
||||||
|
|
||||||
|
PyObject *cmp;
|
||||||
|
int match = 0;
|
||||||
|
|
||||||
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
|
||||||
|
nfc = 1;
|
||||||
|
}
|
||||||
|
else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
|
||||||
|
nfc = 1;
|
||||||
|
k = 1;
|
||||||
|
}
|
||||||
|
else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
|
||||||
|
/* matches default values for `nfc` and `k` */
|
||||||
|
}
|
||||||
|
else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
|
||||||
|
k = 1;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
PyErr_SetString(PyExc_ValueError, "invalid normalization form");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
m = is_normalized(self, input, nfc, k);
|
||||||
|
|
||||||
|
if (m == MAYBE) {
|
||||||
|
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
|
||||||
|
if (cmp == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
match = PyUnicode_Compare(input, cmp);
|
||||||
|
Py_DECREF(cmp);
|
||||||
|
result = (match == 0) ? Py_True : Py_False;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
result = (m == YES) ? Py_True : Py_False;
|
||||||
|
}
|
||||||
|
|
||||||
|
Py_INCREF(result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*[clinic input]
|
/*[clinic input]
|
||||||
unicodedata.UCD.normalize
|
unicodedata.UCD.normalize
|
||||||
|
|
||||||
self: self
|
self: self
|
||||||
form: str
|
form: unicode
|
||||||
unistr as input: unicode
|
unistr as input: unicode
|
||||||
/
|
/
|
||||||
|
|
||||||
|
@ -821,9 +901,9 @@ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
|
||||||
[clinic start generated code]*/
|
[clinic start generated code]*/
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
|
unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
|
||||||
PyObject *input)
|
PyObject *input)
|
||||||
/*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]*/
|
/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
|
||||||
{
|
{
|
||||||
if (PyUnicode_GET_LENGTH(input) == 0) {
|
if (PyUnicode_GET_LENGTH(input) == 0) {
|
||||||
/* Special case empty input strings, since resizing
|
/* Special case empty input strings, since resizing
|
||||||
|
@ -832,29 +912,29 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
|
||||||
return input;
|
return input;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (strcmp(form, "NFC") == 0) {
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
|
||||||
if (is_normalized(self, input, 1, 0)) {
|
if (is_normalized(self, input, 1, 0) == YES) {
|
||||||
Py_INCREF(input);
|
Py_INCREF(input);
|
||||||
return input;
|
return input;
|
||||||
}
|
}
|
||||||
return nfc_nfkc(self, input, 0);
|
return nfc_nfkc(self, input, 0);
|
||||||
}
|
}
|
||||||
if (strcmp(form, "NFKC") == 0) {
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
|
||||||
if (is_normalized(self, input, 1, 1)) {
|
if (is_normalized(self, input, 1, 1) == YES) {
|
||||||
Py_INCREF(input);
|
Py_INCREF(input);
|
||||||
return input;
|
return input;
|
||||||
}
|
}
|
||||||
return nfc_nfkc(self, input, 1);
|
return nfc_nfkc(self, input, 1);
|
||||||
}
|
}
|
||||||
if (strcmp(form, "NFD") == 0) {
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
|
||||||
if (is_normalized(self, input, 0, 0)) {
|
if (is_normalized(self, input, 0, 0) == YES) {
|
||||||
Py_INCREF(input);
|
Py_INCREF(input);
|
||||||
return input;
|
return input;
|
||||||
}
|
}
|
||||||
return nfd_nfkd(self, input, 0);
|
return nfd_nfkd(self, input, 0);
|
||||||
}
|
}
|
||||||
if (strcmp(form, "NFKD") == 0) {
|
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
|
||||||
if (is_normalized(self, input, 0, 1)) {
|
if (is_normalized(self, input, 0, 1) == YES) {
|
||||||
Py_INCREF(input);
|
Py_INCREF(input);
|
||||||
return input;
|
return input;
|
||||||
}
|
}
|
||||||
|
@ -1271,6 +1351,7 @@ static PyMethodDef unicodedata_functions[] = {
|
||||||
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
|
UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
|
||||||
UNICODEDATA_UCD_NAME_METHODDEF
|
UNICODEDATA_UCD_NAME_METHODDEF
|
||||||
UNICODEDATA_UCD_LOOKUP_METHODDEF
|
UNICODEDATA_UCD_LOOKUP_METHODDEF
|
||||||
|
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
|
||||||
UNICODEDATA_UCD_NORMALIZE_METHODDEF
|
UNICODEDATA_UCD_NORMALIZE_METHODDEF
|
||||||
{NULL, NULL} /* sentinel */
|
{NULL, NULL} /* sentinel */
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue