gh-110289: C API: Add PyUnicode_EqualToUTF8() and PyUnicode_EqualToUTF8AndSize() functions (GH-110297)

This commit is contained in:
Serhiy Storchaka 2023-10-11 16:41:58 +03:00 committed by GitHub
parent d1f7fae424
commit eb50cd37ea
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 280 additions and 0 deletions

View File

@ -1396,6 +1396,28 @@ They all return ``NULL`` or ``-1`` if an exception occurs.
:c:func:`PyErr_Occurred` to check for errors. :c:func:`PyErr_Occurred` to check for errors.
.. c:function:: int PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *string, Py_ssize_t size)
Compare a Unicode object with a char buffer which is interpreted as
being UTF-8 or ASCII encoded and return true (``1``) if they are equal,
or false (``0``) otherwise.
If the Unicode object contains surrogate characters or
the C string is not valid UTF-8, false (``0``) is returned.
This function does not raise exceptions.
.. versionadded:: 3.13
.. c:function:: int PyUnicode_EqualToUTF8(PyObject *unicode, const char *string)
Similar to :c:func:`PyUnicode_EqualToUTF8AndSize`, but compute *string*
length using :c:func:`!strlen`.
If the Unicode object contains null characters, false (``0``) is returned.
.. versionadded:: 3.13
.. c:function:: int PyUnicode_CompareWithASCIIString(PyObject *uni, const char *string) .. c:function:: int PyUnicode_CompareWithASCIIString(PyObject *uni, const char *string)
Compare a Unicode object, *uni*, with *string* and return ``-1``, ``0``, ``1`` for less Compare a Unicode object, *uni*, with *string* and return ``-1``, ``0``, ``1`` for less

View File

@ -755,6 +755,8 @@ function,PyUnicode_DecodeUnicodeEscape,3.2,,
function,PyUnicode_EncodeCodePage,3.7,on Windows, function,PyUnicode_EncodeCodePage,3.7,on Windows,
function,PyUnicode_EncodeFSDefault,3.2,, function,PyUnicode_EncodeFSDefault,3.2,,
function,PyUnicode_EncodeLocale,3.7,, function,PyUnicode_EncodeLocale,3.7,,
function,PyUnicode_EqualToUTF8,3.13,,
function,PyUnicode_EqualToUTF8AndSize,3.13,,
function,PyUnicode_FSConverter,3.2,, function,PyUnicode_FSConverter,3.2,,
function,PyUnicode_FSDecoder,3.2,, function,PyUnicode_FSDecoder,3.2,,
function,PyUnicode_Find,3.2,, function,PyUnicode_Find,3.2,,

View File

@ -1024,6 +1024,12 @@ New Features
functions on Python 3.11 and 3.12. functions on Python 3.11 and 3.12.
(Contributed by Victor Stinner in :gh:`107073`.) (Contributed by Victor Stinner in :gh:`107073`.)
* Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8`
functions: compare a Unicode object with a :c:expr:`const char*` UTF-8 encoded
string and return true (``1``) if they are equal, or false (``0``) otherwise.
These functions do not raise exceptions.
(Contributed by Serhiy Storchaka in :gh:`110289`.)
* Add :c:func:`PyThreadState_GetUnchecked()` function: similar to * Add :c:func:`PyThreadState_GetUnchecked()` function: similar to
:c:func:`PyThreadState_Get()`, but don't kill the process with a fatal error :c:func:`PyThreadState_Get()`, but don't kill the process with a fatal error
if it is NULL. The caller is responsible to check if the result is NULL. if it is NULL. The caller is responsible to check if the result is NULL.

View File

@ -957,6 +957,15 @@ PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
const char *right /* ASCII-encoded string */ const char *right /* ASCII-encoded string */
); );
#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030D0000
/* Compare a Unicode object with a UTF-8 encoded C string.
   PyUnicode_EqualToUTF8() takes a null-terminated string and measures it
   with strlen(); PyUnicode_EqualToUTF8AndSize() takes an explicit length
   in bytes instead.
   Return 1 if they are equal, or 0 otherwise. 0 is also returned if the
   Unicode object contains surrogate characters or the C string is not
   valid UTF-8.
   These functions do not raise exceptions. */
PyAPI_FUNC(int) PyUnicode_EqualToUTF8(PyObject *, const char *);
PyAPI_FUNC(int) PyUnicode_EqualToUTF8AndSize(PyObject *, const char *, Py_ssize_t);
#endif
/* Rich compare two strings and return one of the following: /* Rich compare two strings and return one of the following:
- NULL in case an exception was raised - NULL in case an exception was raised

View File

@ -1297,6 +1297,118 @@ class CAPITest(unittest.TestCase):
# CRASHES comparewithasciistring([], b'abc') # CRASHES comparewithasciistring([], b'abc')
# CRASHES comparewithasciistring(NULL, b'abc') # CRASHES comparewithasciistring(NULL, b'abc')
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_equaltoutf8(self):
    # Test PyUnicode_EqualToUTF8()
    from _testcapi import unicode_equaltoutf8 as equaltoutf8
    from _testcapi import unicode_asutf8andsize as asutf8andsize

    # Samples whose characters need 1, 2, 3 and 4 bytes in UTF-8,
    # plus the highest code point.
    samples = (
        'abc',
        '\xa1\xa2\xa3',
        '\u4f60\u597d\u4e16',
        '\U0001f600\U0001f601\U0001f602',
        '\U0010ffff',
    )
    for cached in samples:
        # PyUnicode_AsUTF8AndSize() creates the UTF-8 encoded string
        # and caches it in the Unicode object.
        asutf8andsize(cached, 0)
        utf8 = cached.encode()
        # This comparison uses the UTF-8 cache.
        self.assertEqual(equaltoutf8(cached, utf8), 1)
        # A freshly decoded Unicode object has no UTF-8 cache.
        uncached = utf8.decode()
        self.assertEqual(equaltoutf8(uncached, utf8), 1)
        self.assertEqual(equaltoutf8(cached + 'x', utf8 + b'x'), 1)
        self.assertEqual(equaltoutf8(cached + 'x', utf8 + b'y'), 0)
        # The C string ends at the first null byte, so a trailing b'\0'
        # is not part of the compared data.
        self.assertEqual(equaltoutf8(cached, utf8 + b'\0'), 1)
        self.assertEqual(equaltoutf8(uncached, utf8 + b'\0'), 1)
        self.assertEqual(equaltoutf8(cached + '\0', utf8 + b'\0'), 0)
        self.assertEqual(equaltoutf8(cached + '\0', utf8), 0)
        self.assertEqual(equaltoutf8(uncached, utf8 + b'x'), 0)
        self.assertEqual(equaltoutf8(uncached, utf8[:-1]), 0)
        self.assertEqual(equaltoutf8(uncached, utf8[:-1] + b'x'), 0)

    # Empty string
    self.assertEqual(equaltoutf8('', b''), 1)
    self.assertEqual(equaltoutf8('', b'\0'), 1)

    # Embedded null characters and null bytes
    self.assertEqual(equaltoutf8('abc', b'abc\0def\0'), 1)
    self.assertEqual(equaltoutf8('a\0bc', b'abc'), 0)
    self.assertEqual(equaltoutf8('abc', b'a\0bc'), 0)

    # Surrogate characters never compare equal, whichever error
    # handler produced the bytes.
    self.assertEqual(equaltoutf8('\udcfe',
                     '\udcfe'.encode("utf8", "surrogateescape")), 0)
    self.assertEqual(equaltoutf8('\udcfe',
                     '\udcfe'.encode("utf8", "surrogatepass")), 0)
    self.assertEqual(equaltoutf8('\ud801',
                     '\ud801'.encode("utf8", "surrogatepass")), 0)
@support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_equaltoutf8andsize(self):
    # Test PyUnicode_EqualToUTF8AndSize()
    from _testcapi import unicode_equaltoutf8andsize as equaltoutf8andsize
    from _testcapi import unicode_asutf8andsize as asutf8andsize

    # Samples whose characters need 1, 2, 3 and 4 bytes in UTF-8,
    # plus the highest code point.
    samples = (
        'abc',
        '\xa1\xa2\xa3',
        '\u4f60\u597d\u4e16',
        '\U0001f600\U0001f601\U0001f602',
        '\U0010ffff',
    )
    for cached in samples:
        # PyUnicode_AsUTF8AndSize() creates the UTF-8 encoded string
        # and caches it in the Unicode object.
        asutf8andsize(cached, 0)
        utf8 = cached.encode()
        # This comparison uses the UTF-8 cache.
        self.assertEqual(equaltoutf8andsize(cached, utf8), 1)
        # A freshly decoded Unicode object has no UTF-8 cache.
        uncached = utf8.decode()
        self.assertEqual(equaltoutf8andsize(uncached, utf8), 1)
        self.assertEqual(equaltoutf8andsize(cached + 'x', utf8 + b'x'), 1)
        self.assertEqual(equaltoutf8andsize(cached + 'x', utf8 + b'y'), 0)
        # With no size argument the whole bytes object is compared,
        # so a trailing null byte is significant.
        self.assertEqual(equaltoutf8andsize(cached, utf8 + b'\0'), 0)
        self.assertEqual(equaltoutf8andsize(uncached, utf8 + b'\0'), 0)
        self.assertEqual(equaltoutf8andsize(cached + '\0', utf8 + b'\0'), 1)
        self.assertEqual(equaltoutf8andsize(cached + '\0', utf8), 0)
        self.assertEqual(equaltoutf8andsize(uncached, utf8 + b'x'), 0)
        self.assertEqual(equaltoutf8andsize(uncached, utf8[:-1]), 0)
        self.assertEqual(equaltoutf8andsize(uncached, utf8[:-1] + b'x'), 0)
        # Explicit size: the compared data is not null-terminated.
        self.assertEqual(equaltoutf8andsize(cached, utf8 + b'x', len(utf8)), 1)
        self.assertEqual(equaltoutf8andsize(uncached, utf8 + b'x', len(utf8)), 1)
        self.assertEqual(equaltoutf8andsize(cached + '\0', utf8 + b'\0x', len(utf8) + 1), 1)
        self.assertEqual(equaltoutf8andsize(uncached, utf8, len(utf8) - 1), 0)

    # Empty string
    self.assertEqual(equaltoutf8andsize('', b''), 1)
    self.assertEqual(equaltoutf8andsize('', b'\0'), 0)
    self.assertEqual(equaltoutf8andsize('', b'x', 0), 1)

    # Embedded null characters and null bytes
    self.assertEqual(equaltoutf8andsize('abc\0def', b'abc\0def'), 1)
    self.assertEqual(equaltoutf8andsize('abc\0def\0', b'abc\0def\0'), 1)

    # Surrogate characters never compare equal, whichever error
    # handler produced the bytes.
    self.assertEqual(equaltoutf8andsize('\udcfe',
                     '\udcfe'.encode("utf8", "surrogateescape")), 0)
    self.assertEqual(equaltoutf8andsize('\udcfe',
                     '\udcfe'.encode("utf8", "surrogatepass")), 0)
    self.assertEqual(equaltoutf8andsize('\ud801',
                     '\ud801'.encode("utf8", "surrogatepass")), 0)

    def check_not_equal_encoding(text, encoding):
        encoded = text.encode(encoding)
        self.assertEqual(equaltoutf8andsize(text, encoded), 0)
        self.assertNotEqual(text.encode(encoding), text.encode("utf8"))

    # A string encoded with another encoding is not equal to the
    # string itself when the bytes differ from its UTF-8 encoding.
    check_not_equal_encoding('Stéphane', 'latin1')
    check_not_equal_encoding('Stéphane', 'utf-16-le')  # embedded null characters
    check_not_equal_encoding('北京市', 'gbk')

    # CRASHES equaltoutf8andsize('abc', b'abc', -1)
    # CRASHES equaltoutf8andsize(b'abc', b'abc')
    # CRASHES equaltoutf8andsize([], b'abc')
    # CRASHES equaltoutf8andsize(NULL, b'abc')
    # CRASHES equaltoutf8andsize('abc', NULL)
@support.cpython_only @support.cpython_only
@unittest.skipIf(_testcapi is None, 'need _testcapi module') @unittest.skipIf(_testcapi is None, 'need _testcapi module')
def test_richcompare(self): def test_richcompare(self):

View File

@ -770,6 +770,8 @@ SYMBOL_NAMES = (
"PyUnicode_DecodeUnicodeEscape", "PyUnicode_DecodeUnicodeEscape",
"PyUnicode_EncodeFSDefault", "PyUnicode_EncodeFSDefault",
"PyUnicode_EncodeLocale", "PyUnicode_EncodeLocale",
"PyUnicode_EqualToUTF8",
"PyUnicode_EqualToUTF8AndSize",
"PyUnicode_FSConverter", "PyUnicode_FSConverter",
"PyUnicode_FSDecoder", "PyUnicode_FSDecoder",
"PyUnicode_Find", "PyUnicode_Find",

View File

@ -0,0 +1 @@
Add :c:func:`PyUnicode_EqualToUTF8AndSize` and :c:func:`PyUnicode_EqualToUTF8` functions.

View File

@ -2462,3 +2462,7 @@
added = '3.13' added = '3.13'
[function.Py_IsFinalizing] [function.Py_IsFinalizing]
added = '3.13' added = '3.13'
[function.PyUnicode_EqualToUTF8]
added = '3.13'
[function.PyUnicode_EqualToUTF8AndSize]
added = '3.13'

View File

@ -1429,6 +1429,48 @@ unicode_comparewithasciistring(PyObject *self, PyObject *args)
return PyLong_FromLong(result); return PyLong_FromLong(result);
} }
/* Test PyUnicode_EqualToUTF8().
   Arguments: an object and a bytes-like buffer (or None).
   Returns the C function's result as a Python int. */
static PyObject *
unicode_equaltoutf8(PyObject *self, PyObject *args)
{
    PyObject *unicode;
    const char *utf8 = NULL;
    /* Filled in by the "z#" format but otherwise unused: the function
       under test is given only the pointer. */
    Py_ssize_t utf8_len;

    if (!PyArg_ParseTuple(args, "Oz#", &unicode, &utf8, &utf8_len)) {
        return NULL;
    }
    NULLABLE(unicode);

    int equal = PyUnicode_EqualToUTF8(unicode, utf8);
    /* The function under test must never set an exception. */
    assert(!PyErr_Occurred());
    return PyLong_FromLong(equal);
}
/* Test PyUnicode_EqualToUTF8AndSize().
   Arguments: an object, a bytes-like buffer (or None) and an optional
   size. Returns the C function's result as a Python int. */
static PyObject *
unicode_equaltoutf8andsize(PyObject *self, PyObject *args)
{
    /* Marker value meaning "no explicit size argument was given". */
    const Py_ssize_t size_unset = -100;

    PyObject *unicode;
    const char *utf8 = NULL;
    Py_ssize_t utf8_len;
    Py_ssize_t size = size_unset;

    if (!PyArg_ParseTuple(args, "Oz#|n", &unicode, &utf8, &utf8_len, &size)) {
        return NULL;
    }
    NULLABLE(unicode);

    /* Without an explicit size, compare against the whole buffer. */
    Py_ssize_t nbytes = (size == size_unset) ? utf8_len : size;

    int equal = PyUnicode_EqualToUTF8AndSize(unicode, utf8, nbytes);
    /* The function under test must never set an exception. */
    assert(!PyErr_Occurred());
    return PyLong_FromLong(equal);
}
/* Test PyUnicode_RichCompare() */ /* Test PyUnicode_RichCompare() */
static PyObject * static PyObject *
unicode_richcompare(PyObject *self, PyObject *args) unicode_richcompare(PyObject *self, PyObject *args)
@ -2044,6 +2086,8 @@ static PyMethodDef TestMethods[] = {
{"unicode_replace", unicode_replace, METH_VARARGS}, {"unicode_replace", unicode_replace, METH_VARARGS},
{"unicode_compare", unicode_compare, METH_VARARGS}, {"unicode_compare", unicode_compare, METH_VARARGS},
{"unicode_comparewithasciistring",unicode_comparewithasciistring,METH_VARARGS}, {"unicode_comparewithasciistring",unicode_comparewithasciistring,METH_VARARGS},
{"unicode_equaltoutf8", unicode_equaltoutf8, METH_VARARGS},
{"unicode_equaltoutf8andsize",unicode_equaltoutf8andsize, METH_VARARGS},
{"unicode_richcompare", unicode_richcompare, METH_VARARGS}, {"unicode_richcompare", unicode_richcompare, METH_VARARGS},
{"unicode_format", unicode_format, METH_VARARGS}, {"unicode_format", unicode_format, METH_VARARGS},
{"unicode_contains", unicode_contains, METH_VARARGS}, {"unicode_contains", unicode_contains, METH_VARARGS},

View File

@ -10673,6 +10673,82 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
} }
} }
/* Compare a Unicode object with a null-terminated UTF-8 encoded C string.
   Return 1 if they are equal, or 0 otherwise. */
int
PyUnicode_EqualToUTF8(PyObject *unicode, const char *str)
{
    /* The compared data ends at the first null byte of str. */
    Py_ssize_t size = (Py_ssize_t)strlen(str);
    return PyUnicode_EqualToUTF8AndSize(unicode, str, size);
}
/* Compare a Unicode object with the first `size` bytes of `str`,
   interpreted as UTF-8.
   Return 1 if they are equal, or 0 otherwise. Strings containing
   surrogate characters never compare equal. */
int
PyUnicode_EqualToUTF8AndSize(PyObject *unicode, const char *str, Py_ssize_t size)
{
    assert(_PyUnicode_CHECK(unicode));
    assert(str);

    /* Fast path: ASCII data can be compared with the buffer directly. */
    if (PyUnicode_IS_ASCII(unicode)) {
        Py_ssize_t nbytes = PyUnicode_GET_LENGTH(unicode);
        if (size != nbytes) {
            return 0;
        }
        return memcmp(PyUnicode_1BYTE_DATA(unicode), str, nbytes) == 0;
    }

    /* Fast path: the object already holds its UTF-8 representation. */
    const char *cached = PyUnicode_UTF8(unicode);
    if (cached != NULL) {
        Py_ssize_t nbytes = PyUnicode_UTF8_LENGTH(unicode);
        if (size != nbytes) {
            return 0;
        }
        return memcmp(cached, str, nbytes) == 0;
    }

    /* Slow path, reached only for non-ASCII strings. Reject early any
       buffer that is not strictly longer than the string, or that is
       more than 4 bytes per character long. */
    Py_ssize_t nchars = PyUnicode_GET_LENGTH(unicode);
    size_t ulen = (size_t)nchars;
    size_t usize = (size_t)size;
    if (ulen >= usize || ulen < usize / 4) {
        return 0;
    }

    const unsigned char *cur = (const unsigned char *)str;
    const unsigned char *const stop = cur + usize;
    const int kind = PyUnicode_KIND(unicode);
    const void *const data = PyUnicode_DATA(unicode);

    /* Compare each code point with the UTF-8 sequence it would encode to. */
    for (Py_ssize_t idx = 0; idx < nchars; idx++) {
        const Py_UCS4 cp = PyUnicode_READ(kind, data, idx);
        const Py_ssize_t avail = stop - cur;

        if (cp < 0x80) {
            /* 1 byte: 0xxxxxxx */
            if (avail < 1 || cur[0] != cp) {
                return 0;
            }
            cur += 1;
            continue;
        }

        if (cp < 0x800) {
            /* 2 bytes: 110xxxxx 10xxxxxx */
            if (avail < 2 ||
                cur[0] != (0xc0 | (cp >> 6)) ||
                cur[1] != (0x80 | (cp & 0x3f)))
            {
                return 0;
            }
            cur += 2;
            continue;
        }

        if (cp < 0x10000) {
            /* Surrogates are always treated as not equal. */
            if (Py_UNICODE_IS_SURROGATE(cp)) {
                return 0;
            }
            /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
            if (avail < 3 ||
                cur[0] != (0xe0 | (cp >> 12)) ||
                cur[1] != (0x80 | ((cp >> 6) & 0x3f)) ||
                cur[2] != (0x80 | (cp & 0x3f)))
            {
                return 0;
            }
            cur += 3;
            continue;
        }

        /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
        assert(cp <= MAX_UNICODE);
        if (avail < 4 ||
            cur[0] != (0xf0 | (cp >> 18)) ||
            cur[1] != (0x80 | ((cp >> 12) & 0x3f)) ||
            cur[2] != (0x80 | ((cp >> 6) & 0x3f)) ||
            cur[3] != (0x80 | (cp & 0x3f)))
        {
            return 0;
        }
        cur += 4;
    }

    /* Equal only if the whole buffer has been consumed. */
    return cur == stop;
}
int int
_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str) _PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
{ {

2
PC/python3dll.c generated
View File

@ -689,6 +689,8 @@ EXPORT_FUNC(PyUnicode_DecodeUTF8Stateful)
EXPORT_FUNC(PyUnicode_EncodeCodePage) EXPORT_FUNC(PyUnicode_EncodeCodePage)
EXPORT_FUNC(PyUnicode_EncodeFSDefault) EXPORT_FUNC(PyUnicode_EncodeFSDefault)
EXPORT_FUNC(PyUnicode_EncodeLocale) EXPORT_FUNC(PyUnicode_EncodeLocale)
EXPORT_FUNC(PyUnicode_EqualToUTF8)
EXPORT_FUNC(PyUnicode_EqualToUTF8AndSize)
EXPORT_FUNC(PyUnicode_Find) EXPORT_FUNC(PyUnicode_Find)
EXPORT_FUNC(PyUnicode_FindChar) EXPORT_FUNC(PyUnicode_FindChar)
EXPORT_FUNC(PyUnicode_Format) EXPORT_FUNC(PyUnicode_Format)