From c7ad974d341d3edb6b9d2a2dcae4d3d4794ada6b Mon Sep 17 00:00:00 2001
From: Inada Naoki
Date: Sat, 14 Mar 2020 12:43:18 +0900
Subject: [PATCH] bpo-39087: Add _PyUnicode_GetUTF8Buffer() (GH-17659)

Co-authored-by: Victor Stinner
---
 Include/cpython/unicodeobject.h               |  19 +-
 Lib/test/test_unicode.py                      |  22 ++
 .../2019-12-19-21-19-53.bpo-39087.l4A11-.rst  |   2 +
 Modules/_testcapimodule.c                     | 212 ++++++++++++++++++
 Objects/unicodeobject.c                       |  35 +++
 5 files changed, 284 insertions(+), 6 deletions(-)
 create mode 100644 Misc/NEWS.d/next/C API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst

diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h
index 54a13e32ba2..be91d2d9fc6 100644
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -734,6 +734,19 @@ PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
 
 /* --- Manage the default encoding ---------------------------------------- */
 
+/* Get a buffer to the UTF-8 encoding of the Unicode object unicode.
+   Returns -1 on error.
+
+   Successful calls must be paired with
+   calls to PyBuffer_Release.
+*/
+
+PyAPI_FUNC(int) _PyUnicode_GetUTF8Buffer(
+    PyObject *unicode,          /* Unicode object */
+    const char *errors,         /* error handling */
+    Py_buffer *view             /* (out) buffer to the UTF-8 encoding */
+    );
+
 /* Returns a pointer to the default encoding (UTF-8) of the
    Unicode object unicode and the size of the encoded representation
    in bytes stored in *size.
@@ -746,12 +759,6 @@ PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
 
    _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
    support the previous internal function with the same behaviour.
-
-   *** This API is for interpreter INTERNAL USE ONLY and will likely
-   *** be removed or changed in the future.
-
-   *** If you need to access the Unicode object as UTF-8 bytes string,
-   *** please use PyUnicode_AsUTF8String() instead.
 */
 
 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py
index 28398896467..0522513777f 100644
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@@ -2830,6 +2830,28 @@ class CAPITest(unittest.TestCase):
         self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
         self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
 
+    # Test _PyUnicode_GetUTF8Buffer()
+    @support.cpython_only
+    def test_getutf8buffer(self):
+        from _testcapi import unicode_getutf8buffer, unicode_test_getutf8buffer
+
+        # Run the tests written in C. They raise an error when a check fails.
+        unicode_test_getutf8buffer()
+
+        ascii_ = "foo"
+        bmp = '\u0100'
+        bmp2 = '\uffff'
+        nonbmp = chr(0x10ffff)
+        surrogates = 'a\ud800b\udfffc'
+
+        self.assertEqual(unicode_getutf8buffer(ascii_), b'foo')
+        self.assertEqual(unicode_getutf8buffer(bmp), b'\xc4\x80')
+        self.assertEqual(unicode_getutf8buffer(bmp2), b'\xef\xbf\xbf')
+        self.assertEqual(unicode_getutf8buffer(nonbmp), b'\xf4\x8f\xbf\xbf')
+        self.assertRaises(UnicodeEncodeError, unicode_getutf8buffer, surrogates)
+        self.assertEqual(unicode_getutf8buffer(surrogates, "surrogatepass"),
+                         b'a\xed\xa0\x80b\xed\xbf\xbfc')
+
     # Test PyUnicode_AsUTF8()
     @support.cpython_only
     def test_asutf8(self):
diff --git a/Misc/NEWS.d/next/C API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst b/Misc/NEWS.d/next/C API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst
new file mode 100644
index 00000000000..2c2c85d93b2
--- /dev/null
+++ b/Misc/NEWS.d/next/C API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst
@@ -0,0 +1,2 @@
+Add new ``_PyUnicode_GetUTF8Buffer`` private API to get the UTF-8 encoding of
+the unicode object without cache or extra allocation.
diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c
index 3cc558689b6..09b77064de1 100644
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@@ -1967,6 +1967,225 @@ unicode_asutf8andsize(PyObject *self, PyObject *args)
     return Py_BuildValue("(Nn)", result, utf8_len);
 }
 
+/* unicode_getutf8buffer(unicode[, errors]) -> bytes
+   Call _PyUnicode_GetUTF8Buffer() and return a copy of the exported
+   buffer as a new bytes object. */
+static PyObject *
+unicode_getutf8buffer(PyObject *self, PyObject *args)
+{
+    PyObject *unicode;
+    const char *errors = NULL;
+    if (!PyArg_ParseTuple(args, "O|s", &unicode, &errors)) {
+        return NULL;
+    }
+
+    Py_buffer buffer;
+    if (_PyUnicode_GetUTF8Buffer(unicode, errors, &buffer) < 0) {
+        return NULL;
+    }
+
+    assert(buffer.obj != NULL);
+    assert(buffer.obj == unicode || PyBytes_CheckExact(buffer.obj));
+
+    PyObject *result = PyBytes_FromStringAndSize(buffer.buf, buffer.len);
+    PyBuffer_Release(&buffer);
+    return result;
+}
+
+/* Run the C-level checks of _PyUnicode_GetUTF8Buffer(): which object backs
+   the view, the buffer length and NUL terminator, and the reference count
+   of the string while the view is held and after it is released.
+   Returns None on success; a failed check sets TestError and returns NULL. */
+static PyObject *
+unicode_test_getutf8buffer(PyObject *self, PyObject *Py_UNUSED(ignored))
+{
+    Py_buffer buf;
+
+    // Test 1: ASCII string
+    PyObject *str = PyUnicode_FromString("hello");
+    if (str == NULL) {
+        return NULL;
+    }
+    Py_ssize_t refcnt = Py_REFCNT(str);
+
+    // _PyUnicode_GetUTF8Buffer() must not fail for ASCII string.
+    int ret = _PyUnicode_GetUTF8Buffer(str, NULL, &buf);
+    assert(ret == 0);
+
+    if (buf.obj != str) {
+        PyErr_Format(TestError,
+                     "buf.obj must be equal to str. (%s:%d)",
+                     __FILE__, __LINE__);
+        PyBuffer_Release(&buf);
+        Py_DECREF(str);
+        return NULL;
+    }
+
+    if (buf.len != PyUnicode_GET_LENGTH(str)) {
+        PyErr_Format(TestError,
+                     "buf.len must be equal to len(str). (%s:%d)",
+                     __FILE__, __LINE__);
+        PyBuffer_Release(&buf);
+        Py_DECREF(str);
+        return NULL;
+    }
+    assert(((const char*)buf.buf)[5] == '\0');
+
+    if ((Py_UCS1*)buf.buf != PyUnicode_1BYTE_DATA(str)) {
+        PyErr_Format(TestError,
+                     "buf.buf must be equal to PyUnicode_1BYTE_DATA(str). (%s:%d)",
+                     __FILE__, __LINE__);
+        PyBuffer_Release(&buf);
+        Py_DECREF(str);
+        return NULL;
+    }
+
+    if (refcnt + 1 != Py_REFCNT(str)) {
+        PyErr_Format(TestError,
+                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
+                     refcnt + 1, Py_REFCNT(str),
+                     __FILE__, __LINE__);
+        PyBuffer_Release(&buf);
+        Py_DECREF(str);
+        return NULL;
+    }
+
+    PyBuffer_Release(&buf);
+
+    if (refcnt != Py_REFCNT(str)) {
+        PyErr_Format(TestError,
+                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
+                     refcnt, Py_REFCNT(str),
+                     __FILE__, __LINE__);
+        Py_DECREF(str);
+        return NULL;
+    }
+
+    Py_DECREF(str);
+
+    // Test 2: non-ASCII string
+
+    // "hello" in Japanese. len(str)==5, len(str.encode()) == 15.
+    str = PyUnicode_FromString("\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf");
+    if (str == NULL) {
+        return NULL;
+    }
+    refcnt = Py_REFCNT(str);
+    assert(PyUnicode_GET_LENGTH(str) == 5);
+
+    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) {
+        Py_DECREF(str);
+        if (!PyErr_Occurred()) {
+            PyErr_Format(TestError,
+                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
+                         "without exception set. (%s:%d)",
+                         __FILE__, __LINE__);
+        }
+        return NULL;
+    }
+
+    if (!PyBytes_CheckExact(buf.obj)) {
+        PyErr_Format(TestError,
+                     "buf.obj must be a bytes object, got %R (%s:%d)",
+                     buf.obj, __FILE__, __LINE__);
+        PyBuffer_Release(&buf);
+        Py_DECREF(str);
+        return NULL;
+    }
+
+    if (buf.len != 15) {
+        PyErr_Format(TestError,
+                     "Expected buf.len == 15, actual %zd (%s:%d)",
+                     buf.len, __FILE__, __LINE__);
+        PyBuffer_Release(&buf);
+        Py_DECREF(str);
+        return NULL;
+    }
+    assert(((const char*)buf.buf)[15] == '\0');
+
+    if (refcnt != Py_REFCNT(str)) {
+        PyErr_Format(TestError,
+                     "Py_REFCNT(str) must not be changed. (%s:%d)",
+                     __FILE__, __LINE__);
+        PyBuffer_Release(&buf);  // buf.obj is a bytes object here, not str.
+        // Do not DECREF here because refcnt is broken.
+        return NULL;
+    }
+
+    PyBuffer_Release(&buf);
+
+    // Test 3: There is a UTF-8 cache
+    // Reuse str of the previous test.
+
+    const char *cache = PyUnicode_AsUTF8(str);
+    if (cache == NULL) {
+        Py_DECREF(str);
+        return NULL;
+    }
+
+    if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) {
+        Py_DECREF(str);
+        if (!PyErr_Occurred()) {
+            PyErr_Format(TestError,
+                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
+                         "without exception set. (%s:%d)",
+                         __FILE__, __LINE__);
+        }
+        return NULL;
+    }
+
+    if (buf.obj != str) {
+        PyErr_Format(TestError,
+                     "buf.obj must be equal to str. (%s:%d)",
+                     __FILE__, __LINE__);
+        PyBuffer_Release(&buf);
+        Py_DECREF(str);
+        return NULL;
+    }
+
+    if (buf.buf != cache) {
+        PyErr_Format(TestError,
+                     "buf.buf must be equal to the UTF-8 cache (%s:%d)",
+                     __FILE__, __LINE__);
+        PyBuffer_Release(&buf);
+        Py_DECREF(str);
+        return NULL;
+    }
+
+    if (buf.len != 15) {
+        PyErr_Format(TestError,
+                     "Expected buf.len == 15, actual %zd (%s:%d)",
+                     buf.len, __FILE__, __LINE__);
+        PyBuffer_Release(&buf);
+        Py_DECREF(str);
+        return NULL;
+    }
+    assert(((const char*)buf.buf)[15] == '\0');
+
+    if (refcnt + 1 != Py_REFCNT(str)) {
+        PyErr_Format(TestError,
+                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
+                     refcnt + 1, Py_REFCNT(str),
+                     __FILE__, __LINE__);
+        // Do not DECREF here because refcnt is broken.
+        return NULL;
+    }
+
+    PyBuffer_Release(&buf);
+
+    if (refcnt != Py_REFCNT(str)) {
+        PyErr_Format(TestError,
+                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
+                     refcnt, Py_REFCNT(str),
+                     __FILE__, __LINE__);
+        // Do not DECREF here because refcnt is broken.
+        return NULL;
+    }
+
+    Py_DECREF(str);
+    Py_RETURN_NONE;
+}
+
 static PyObject *
 unicode_findchar(PyObject *self, PyObject *args)
 {
@@ -5392,6 +5611,8 @@ static PyMethodDef TestMethods[] = {
     {"unicode_asucs4",           unicode_asucs4,                 METH_VARARGS},
     {"unicode_asutf8",           unicode_asutf8,                 METH_VARARGS},
     {"unicode_asutf8andsize",    unicode_asutf8andsize,          METH_VARARGS},
+    {"unicode_getutf8buffer",    unicode_getutf8buffer,          METH_VARARGS},
+    {"unicode_test_getutf8buffer", unicode_test_getutf8buffer,   METH_NOARGS},
     {"unicode_findchar",         unicode_findchar,               METH_VARARGS},
     {"unicode_copycharacters",   unicode_copycharacters,         METH_VARARGS},
     {"unicode_encodedecimal",    unicode_encodedecimal,          METH_VARARGS},
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index 3d99f11ecff..0fea435599b 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -3991,6 +3991,53 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
 }
 
 
+/* Export the UTF-8 encoding of unicode through *view.
+
+   When unicode already stores its UTF-8 representation (and its type
+   defines no buffer protocol of its own), the view points into unicode
+   itself.  Otherwise unicode is encoded with the errors handler into a
+   new bytes object that backs the view; no UTF-8 cache is created.
+
+   Returns 0 on success and -1 on error.  On success the caller must call
+   PyBuffer_Release(view). */
+int
+_PyUnicode_GetUTF8Buffer(PyObject *unicode, const char *errors,
+                         Py_buffer *view)
+{
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        return -1;
+    }
+    if (PyUnicode_READY(unicode) == -1) {
+        return -1;
+    }
+
+    if (PyUnicode_UTF8(unicode) != NULL
+        && Py_TYPE(unicode)->tp_as_buffer == NULL) {
+        return PyBuffer_FillInfo(view, unicode,
+                                 PyUnicode_UTF8(unicode),
+                                 PyUnicode_UTF8_LENGTH(unicode),
+                                 /* readonly */ 1, PyBUF_SIMPLE);
+    }
+
+    // Unlike PyUnicode_AsUTF8AndSize(), this function doesn't
+    // create a UTF-8 cache for speed and efficiency.
+    PyObject *bytes = _PyUnicode_AsUTF8String(unicode, errors);
+    if (bytes == NULL) {
+        return -1;
+    }
+    assert(PyBytes_CheckExact(bytes));
+    if (PyObject_GetBuffer(bytes, view, PyBUF_SIMPLE) < 0) {
+        Py_DECREF(bytes);
+        return -1;
+    }
+    // PyObject_GetBuffer() stored its own strong reference in view->obj.
+    // Drop ours so that PyBuffer_Release() frees the temporary bytes object.
+    Py_DECREF(bytes);
+    return 0;
+}
+
+
 static int unicode_fill_utf8(PyObject *unicode);
 
 const char *