mirror of https://github.com/python/cpython
gh-110289: C API: Add PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize() functions (GH-110297)
This commit is contained in:
parent
d1f7fae424
commit
eb50cd37ea
|
@ -1396,6 +1396,28 @@ They all return ``NULL`` or ``-1`` if an exception occurs.
|
|||
:c:func:`PyErr_Occurred` to check for errors.
|
||||
|
||||
|
||||
.. c:function:: int PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *string, Py_ssize_t size)
|
||||
|
||||
Compare a Unicode object with a char buffer which is interpreted as
|
||||
being UTF-8 or ASCII encoded and return true (``1``) if they are equal,
|
||||
or false (``0``) otherwise.
|
||||
If the Unicode object contains surrogate characters or
|
||||
the C string is not valid UTF-8, false (``0``) is returned.
|
||||
|
||||
This function does not raise exceptions.
|
||||
|
||||
.. versionadded:: 3.13
|
||||
|
||||
|
||||
.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string)
|
||||
|
||||
Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute *string*
|
||||
length using :c:func:`!strlen`.
|
||||
If the Unicode object contains null characters, false (``0``) is returned.
|
||||
|
||||
.. versionadded:: 3.13
|
||||
|
||||
|
||||
.. c:function:: int PyUnicode_CompareWithASCIIString(PyObject *uni, const char *string)
|
||||
|
||||
Compare a Unicode object, *uni*, with *string* and return ``-1``, ``0``, ``1`` for less
|
||||
|
|
|
@ -755,6 +755,8 @@ function,PyUnicode_DecodeUnicodeEscape,3.2,,
|
|||
function,PyUnicode_EncodeCodePage,3.7,on Windows,
|
||||
function,PyUnicode_EncodeFSDefault,3.2,,
|
||||
function,PyUnicode_EncodeLocale,3.7,,
|
||||
function,PyUnicode_EqualToUTF8,3.13,,
|
||||
function,PyUnicode_EqualToUTF8AndSize,3.13,,
|
||||
function,PyUnicode_FSConverter,3.2,,
|
||||
function,PyUnicode_FSDecoder,3.2,,
|
||||
function,PyUnicode_Find,3.2,,
|
||||
|
|
|
@ -1024,6 +1024,12 @@ New Features
|
|||
functions on Python 3.11 and 3.12.
|
||||
(Contributed by Victor Stinner in :gh:`107073`.)
|
||||
|
||||
* Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8`
|
||||
functions: compare a Unicode object with a :c:expr:`const char*` UTF-8 encoded
|
||||
string and return true (``1``) if they are equal, or false (``0``) otherwise.
|
||||
These functions do not raise exceptions.
|
||||
(Contributed by Serhiy Storchaka in :gh:`110289`.)
|
||||
|
||||
* Add :c:func:`PyThreadState_GetUnchecked()` function: similar to
|
||||
:c:func:`PyThreadState_Get()`, but don't kill the process with a fatal error
|
||||
if it is NULL. The caller is responsible to check if the result is NULL.
|
||||
|
|
|
@ -957,6 +957,15 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
|
|||
const char *right /* ASCII-encoded string */
|
||||
);
|
||||
|
||||
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000
|
||||
/* Compare a Unicode object with a UTF-8 encoded C string.
|
||||
Return 1 if they are equal, or 0 otherwise.
|
||||
This function does not raise exceptions. */
|
||||
|
||||
PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *);
|
||||
PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t);
|
||||
#endif
|
||||
|
||||
/* Rich compare two strings and return one of the following:
|
||||
|
||||
- NULL in case an exception was raised
|
||||
|
|
|
@ -1297,6 +1297,118 @@ class CAPITest(unittest.TestCase):
|
|||
# CRASHES comparewithasciistring([], b'abc')
|
||||
# CRASHES comparewithasciistring(NULL, b'abc')
|
||||
|
||||
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_equaltoutf8(self):
    """Exercise PyUnicode_EqualToUTF8() via _testcapi.unicode_equaltoutf8."""
    from _testcapi import unicode_equaltoutf8 as equaltoutf8
    from _testcapi import unicode_asutf8andsize as asutf8andsize

    def check(text, data, expected):
        # Single assertion point: compare *text* with the C string *data*.
        self.assertEqual(equaltoutf8(text, data), expected)

    samples = (
        'abc',
        '\xa1\xa2\xa3',
        '\u4f60\u597d\u4e16',
        '\U0001f600\U0001f601\U0001f602',
        '\U0010ffff',
    )
    for s in samples:
        # PyUnicode_AsUTF8AndSize() creates the UTF-8 encoded string and
        # caches it in the Unicode object.
        asutf8andsize(s, 0)
        b = s.encode()
        check(s, b, 1)  # Served from the UTF-8 cache.
        s2 = b.decode()  # Fresh Unicode object without the UTF-8 cache.
        check(s2, b, 1)
        check(s + 'x', b + b'x', 1)
        check(s + 'x', b + b'y', 0)
        # The C string is measured with strlen(), so it stops at the
        # first null byte.
        check(s, b + b'\0', 1)
        check(s2, b + b'\0', 1)
        check(s + '\0', b + b'\0', 0)
        check(s + '\0', b, 0)
        check(s2, b + b'x', 0)
        check(s2, b[:-1], 0)
        check(s2, b[:-1] + b'x', 0)

    check('', b'', 1)
    check('', b'\0', 1)

    # Embedded null characters/bytes.
    check('abc', b'abc\0def\0', 1)
    check('a\0bc', b'abc', 0)
    check('abc', b'a\0bc', 0)

    # Surrogate characters are always treated as not equal, whichever
    # error handler produced the bytes.
    surrogate_cases = (
        ('\udcfe', "surrogateescape"),
        ('\udcfe', "surrogatepass"),
        ('\ud801', "surrogatepass"),
    )
    for text, errors in surrogate_cases:
        check(text, text.encode("utf8", errors), 0)
|
||||
|
||||
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_equaltoutf8andsize(self):
    """Exercise PyUnicode_EqualToUTF8AndSize() via the _testcapi wrapper."""
    from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize
    from _testcapi import unicode_asutf8andsize as asutf8andsize

    def expect(result, *args):
        # *args* is (text, data) or (text, data, size); when size is
        # omitted the wrapper uses the full length of *data*.
        self.assertEqual(equaltoutf8andsize(*args), result)

    samples = (
        'abc',
        '\xa1\xa2\xa3',
        '\u4f60\u597d\u4e16',
        '\U0001f600\U0001f601\U0001f602',
        '\U0010ffff',
    )
    for s in samples:
        # PyUnicode_AsUTF8AndSize() creates the UTF-8 encoded string and
        # caches it in the Unicode object.
        asutf8andsize(s, 0)
        b = s.encode()
        expect(1, s, b)  # Served from the UTF-8 cache.
        s2 = b.decode()  # Fresh Unicode object without the UTF-8 cache.
        expect(1, s2, b)
        expect(1, s + 'x', b + b'x')
        expect(0, s + 'x', b + b'y')
        # With an explicit size, a trailing null byte is ordinary data.
        expect(0, s, b + b'\0')
        expect(0, s2, b + b'\0')
        expect(1, s + '\0', b + b'\0')
        expect(0, s + '\0', b)
        expect(0, s2, b + b'x')
        expect(0, s2, b[:-1])
        expect(0, s2, b[:-1] + b'x')
        # Buffer that is not null-terminated at the given size.
        expect(1, s, b + b'x', len(b))
        expect(1, s2, b + b'x', len(b))
        expect(1, s + '\0', b + b'\0x', len(b) + 1)
        expect(0, s2, b, len(b) - 1)

    expect(1, '', b'')
    expect(0, '', b'\0')
    expect(1, '', b'x', 0)

    # Embedded null characters/bytes.
    expect(1, 'abc\0def', b'abc\0def')
    expect(1, 'abc\0def\0', b'abc\0def\0')

    # Surrogate characters are always treated as not equal, whichever
    # error handler produced the bytes.
    surrogate_cases = (
        ('\udcfe', "surrogateescape"),
        ('\udcfe', "surrogatepass"),
        ('\ud801', "surrogatepass"),
    )
    for text, errors in surrogate_cases:
        expect(0, text, text.encode("utf8", errors))

    def check_not_equal_encoding(text, encoding):
        encoded = text.encode(encoding)
        expect(0, text, encoded)
        # Sanity check: the other encoding really yields different bytes.
        self.assertNotEqual(encoded, text.encode("utf8"))

    # Text encoded with another codec must not compare equal to the
    # string, because the bytes differ from its UTF-8 encoding.
    check_not_equal_encoding('Stéphane', 'latin1')
    check_not_equal_encoding('Stéphane', 'utf-16-le')  # embedded null characters
    check_not_equal_encoding('北京市', 'gbk')

    # CRASHES equaltoutf8andsize('abc', b'abc', -1)
    # CRASHES equaltoutf8andsize(b'abc', b'abc')
    # CRASHES equaltoutf8andsize([], b'abc')
    # CRASHES equaltoutf8andsize(NULL, b'abc')
    # CRASHES equaltoutf8andsize('abc', NULL)
|
||||
|
||||
@support.cpython_only
|
||||
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
|
||||
def test_richcompare(self):
|
||||
|
|
|
@ -770,6 +770,8 @@ SYMBOL_NAMES = (
|
|||
"PyUnicode_DecodeUnicodeEscape",
|
||||
"PyUnicode_EncodeFSDefault",
|
||||
"PyUnicode_EncodeLocale",
|
||||
"PyUnicode_EqualToUTF8",
|
||||
"PyUnicode_EqualToUTF8AndSize",
|
||||
"PyUnicode_FSConverter",
|
||||
"PyUnicode_FSDecoder",
|
||||
"PyUnicode_Find",
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8` functions.
|
|
@ -2462,3 +2462,7 @@
|
|||
added = '3.13'
|
||||
[function.Py_IsFinalizing]
|
||||
added = '3.13'
|
||||
[function.PyUnicode_EqualToUTF8]
|
||||
added = '3.13'
|
||||
[function.PyUnicode_EqualToUTF8AndSize]
|
||||
added = '3.13'
|
||||
|
|
|
@ -1429,6 +1429,48 @@ unicode_comparewithasciistring(PyObject *self, PyObject *args)
|
|||
return PyLong_FromLong(result);
|
||||
}
|
||||
|
||||
/* Test PyUnicode_EqualToUTF8() */
|
||||
static PyObject *
|
||||
unicode_equaltoutf8(PyObject *self, PyObject *args)
|
||||
{
|
||||
PyObject *left;
|
||||
const char *right = NULL;
|
||||
Py_ssize_t right_len;
|
||||
int result;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "Oz#", &left, &right, &right_len)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
NULLABLE(left);
|
||||
result = PyUnicode_EqualToUTF8(left, right);
|
||||
assert(!PyErr_Occurred());
|
||||
return PyLong_FromLong(result);
|
||||
}
|
||||
|
||||
/* Test PyUnicode_EqualToUTF8AndSize() */
|
||||
static PyObject *
|
||||
unicode_equaltoutf8andsize(PyObject *self, PyObject *args)
|
||||
{
|
||||
PyObject *left;
|
||||
const char *right = NULL;
|
||||
Py_ssize_t right_len;
|
||||
Py_ssize_t size = -100;
|
||||
int result;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "Oz#|n", &left, &right, &right_len, &size)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
NULLABLE(left);
|
||||
if (size == -100) {
|
||||
size = right_len;
|
||||
}
|
||||
result = PyUnicode_EqualToUTF8AndSize(left, right, size);
|
||||
assert(!PyErr_Occurred());
|
||||
return PyLong_FromLong(result);
|
||||
}
|
||||
|
||||
/* Test PyUnicode_RichCompare() */
|
||||
static PyObject *
|
||||
unicode_richcompare(PyObject *self, PyObject *args)
|
||||
|
@ -2044,6 +2086,8 @@ static PyMethodDef TestMethods[] = {
|
|||
{"unicode_replace", unicode_replace, METH_VARARGS},
|
||||
{"unicode_compare", unicode_compare, METH_VARARGS},
|
||||
{"unicode_comparewithasciistring",unicode_comparewithasciistring,METH_VARARGS},
|
||||
{"unicode_equaltoutf8", unicode_equaltoutf8, METH_VARARGS},
|
||||
{"unicode_equaltoutf8andsize",unicode_equaltoutf8andsize, METH_VARARGS},
|
||||
{"unicode_richcompare", unicode_richcompare, METH_VARARGS},
|
||||
{"unicode_format", unicode_format, METH_VARARGS},
|
||||
{"unicode_contains", unicode_contains, METH_VARARGS},
|
||||
|
|
|
@ -10673,6 +10673,82 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
|
|||
}
|
||||
}
|
||||
|
||||
int
|
||||
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
|
||||
{
|
||||
return PyUnicode_EqualToUTF8AndSize(unicode, str, strlen(str));
|
||||
}
|
||||
|
||||
int
|
||||
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
|
||||
{
|
||||
assert(_PyUnicode_CHECK(unicode));
|
||||
assert(str);
|
||||
|
||||
if (PyUnicode_IS_ASCII(unicode)) {
|
||||
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
|
||||
return size == len &&
|
||||
memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
|
||||
}
|
||||
if (PyUnicode_UTF8(unicode) != NULL) {
|
||||
Py_ssize_t len = PyUnicode_UTF8_LENGTH(unicode);
|
||||
return size == len &&
|
||||
memcmp(PyUnicode_UTF8(unicode), str, len) == 0;
|
||||
}
|
||||
|
||||
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
|
||||
if ((size_t)len >= (size_t)size || (size_t)len < (size_t)size / 4) {
|
||||
return 0;
|
||||
}
|
||||
const unsigned char *s = (const unsigned char *)str;
|
||||
const unsigned char *ends = s + (size_t)size;
|
||||
int kind = PyUnicode_KIND(unicode);
|
||||
const void *data = PyUnicode_DATA(unicode);
|
||||
/* Compare Unicode string and UTF-8 string */
|
||||
for (Py_ssize_t i = 0; i < len; i++) {
|
||||
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||
if (ch < 0x80) {
|
||||
if (ends == s || s[0] != ch) {
|
||||
return 0;
|
||||
}
|
||||
s += 1;
|
||||
}
|
||||
else if (ch < 0x800) {
|
||||
if ((ends - s) < 2 ||
|
||||
s[0] != (0xc0 | (ch >> 6)) ||
|
||||
s[1] != (0x80 | (ch & 0x3f)))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
s += 2;
|
||||
}
|
||||
else if (ch < 0x10000) {
|
||||
if (Py_UNICODE_IS_SURROGATE(ch) ||
|
||||
(ends - s) < 3 ||
|
||||
s[0] != (0xe0 | (ch >> 12)) ||
|
||||
s[1] != (0x80 | ((ch >> 6) & 0x3f)) ||
|
||||
s[2] != (0x80 | (ch & 0x3f)))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
s += 3;
|
||||
}
|
||||
else {
|
||||
assert(ch <= MAX_UNICODE);
|
||||
if ((ends - s) < 4 ||
|
||||
s[0] != (0xf0 | (ch >> 18)) ||
|
||||
s[1] != (0x80 | ((ch >> 12) & 0x3f)) ||
|
||||
s[2] != (0x80 | ((ch >> 6) & 0x3f)) ||
|
||||
s[3] != (0x80 | (ch & 0x3f)))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
s += 4;
|
||||
}
|
||||
}
|
||||
return s == ends;
|
||||
}
|
||||
|
||||
int
|
||||
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
|
||||
{
|
||||
|
|
|
@ -689,6 +689,8 @@ EXPORT_FUNC(PyUnicode_DecodeUTF8Stateful)
|
|||
EXPORT_FUNC(PyUnicode_EncodeCodePage)
|
||||
EXPORT_FUNC(PyUnicode_EncodeFSDefault)
|
||||
EXPORT_FUNC(PyUnicode_EncodeLocale)
|
||||
EXPORT_FUNC(PyUnicode_EqualToUTF8)
|
||||
EXPORT_FUNC(PyUnicode_EqualToUTF8AndSize)
|
||||
EXPORT_FUNC(PyUnicode_Find)
|
||||
EXPORT_FUNC(PyUnicode_FindChar)
|
||||
EXPORT_FUNC(PyUnicode_Format)
|
||||
|
|
Loading…
Reference in New Issue