From 3a8c56295d6272ad2177d2de8af4c3f824f3ef92 Mon Sep 17 00:00:00 2001 From: Inada Naoki Date: Sat, 14 Mar 2020 15:59:27 +0900 Subject: [PATCH] Revert "bpo-39087: Add _PyUnicode_GetUTF8Buffer()" (GH-18985) * Revert "bpo-39087: Add _PyUnicode_GetUTF8Buffer() (GH-17659)" This reverts commit c7ad974d341d3edb6b9d2a2dcae4d3d4794ada6b. * Update unicodeobject.h --- Include/cpython/unicodeobject.h | 13 -- Lib/test/test_unicode.py | 22 -- .../2019-12-19-21-19-53.bpo-39087.l4A11-.rst | 2 - Modules/_testcapimodule.c | 212 ------------------ Objects/unicodeobject.c | 35 --- 5 files changed, 284 deletions(-) delete mode 100644 Misc/NEWS.d/next/C API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h index be91d2d9fc6..0df64790c1c 100644 --- a/Include/cpython/unicodeobject.h +++ b/Include/cpython/unicodeobject.h @@ -734,19 +734,6 @@ PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind); /* --- Manage the default encoding ---------------------------------------- */ -/* Get a buffer to the UTF-8 encoding of the Unicode object unicode. - Returns -1 on error. - - Successful calls must be paired to - calls to PyBuffer_Release. -*/ - -PyAPI_FUNC(int) _PyUnicode_GetUTF8Buffer( - PyObject *unicode, /* Unicode object */ - const char *errors, /* error handling */ - Py_buffer *view /* (out) buffer to the UTF-8 encoding */ - ); - /* Returns a pointer to the default encoding (UTF-8) of the Unicode object unicode and the size of the encoded representation in bytes stored in *size. diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 0522513777f..28398896467 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -2830,28 +2830,6 @@ class CAPITest(unittest.TestCase): self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0') self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff') - # Test _PyUnicode_GetUTF8Buffer() - @support.cpython_only - def test_getutf8buffer(self): - from _testcapi import unicode_getutf8buffer, unicode_test_getutf8buffer - - # Run tests wrtten in C. Raise an error when test failed. - unicode_test_getutf8buffer() - - ascii_ = "foo" - bmp = '\u0100' - bmp2 = '\uffff' - nonbmp = chr(0x10ffff) - surrogates = 'a\ud800b\udfffc' - - self.assertEqual(unicode_getutf8buffer(ascii_), b'foo') - self.assertEqual(unicode_getutf8buffer(bmp), b'\xc4\x80') - self.assertEqual(unicode_getutf8buffer(bmp2), b'\xef\xbf\xbf') - self.assertEqual(unicode_getutf8buffer(nonbmp), b'\xf4\x8f\xbf\xbf') - self.assertRaises(UnicodeEncodeError, unicode_getutf8buffer, surrogates) - self.assertEqual(unicode_getutf8buffer(surrogates, "surrogatepass"), - b'a\xed\xa0\x80b\xed\xbf\xbfc') - # Test PyUnicode_AsUTF8() @support.cpython_only def test_asutf8(self): diff --git a/Misc/NEWS.d/next/C API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst b/Misc/NEWS.d/next/C API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst deleted file mode 100644 index 2c2c85d93b2..00000000000 --- a/Misc/NEWS.d/next/C API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst +++ /dev/null @@ -1,2 +0,0 @@ -Add new ``_PyUnicode_GetUTF8Buffer`` private API to get UTF-8 encode of the -unicode object without cache or extra allocation. diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c index 09b77064de1..3cc558689b6 100644 --- a/Modules/_testcapimodule.c +++ b/Modules/_testcapimodule.c @@ -1967,216 +1967,6 @@ unicode_asutf8andsize(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", result, utf8_len); } -static PyObject * -unicode_getutf8buffer(PyObject *self, PyObject *args) -{ - PyObject *unicode; - const char *errors = NULL; - if(!PyArg_ParseTuple(args, "O|s", &unicode, &errors)) { - return NULL; - } - - Py_buffer buffer; - if (_PyUnicode_GetUTF8Buffer(unicode, errors, &buffer) < 0) { - return NULL; - } - - assert(buffer.obj != NULL); - assert(buffer.obj == unicode || PyBytes_CheckExact(buffer.obj)); - - PyObject *result = PyBytes_FromStringAndSize(buffer.buf, buffer.len); - PyBuffer_Release(&buffer); - return result; -} - -static PyObject * -unicode_test_getutf8buffer(PyObject *self, PyObject *Py_UNUSED(ignored)) -{ - Py_buffer buf; - - // Test 1: ASCII string - PyObject *str = PyUnicode_FromString("hello"); - if (str == NULL) { - return NULL; - } - Py_ssize_t refcnt = Py_REFCNT(str); - - // _PyUnicode_GetUTF8Buffer() must not fail for ASCII string. - int ret = _PyUnicode_GetUTF8Buffer(str, NULL, &buf); - assert(ret == 0); - - if (buf.obj != str) { - PyErr_Format(TestError, - "buf.obj must be equal to str. (%s:%d)", - __FILE__, __LINE__); - PyBuffer_Release(&buf); - Py_DECREF(str); - return NULL; - } - - if (buf.len != PyUnicode_GET_LENGTH(str)) { - PyErr_Format(TestError, - "buf.len must be equal to len(str). (%s:%d)", - __FILE__, __LINE__); - PyBuffer_Release(&buf); - Py_DECREF(str); - return NULL; - } - assert(((const char*)buf.buf)[5] == '\0'); - - if ((Py_UCS1*)buf.buf != PyUnicode_1BYTE_DATA(str)) { - PyErr_Format(TestError, - "buf.buf must be equal to PyUnicode_1BYTE_DATA(str). (%s:%d)", - __FILE__, __LINE__); - PyBuffer_Release(&buf); - Py_DECREF(str); - return NULL; - } - - if (refcnt + 1 != Py_REFCNT(str)) { - PyErr_Format(TestError, - "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)", - refcnt + 1, Py_REFCNT(str), - __FILE__, __LINE__); - PyBuffer_Release(&buf); - Py_DECREF(str); - return NULL; - } - - PyBuffer_Release(&buf); - - if (refcnt != Py_REFCNT(str)) { - PyErr_Format(TestError, - "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)", - refcnt, Py_REFCNT(str), - __FILE__, __LINE__); - Py_DECREF(str); - return NULL; - } - - Py_DECREF(str); - - // Test 2: non-ASCII string - - // "hello" in Japanese. len(str)==5, len(str.encode()) == 15. - str = PyUnicode_FromString("\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf"); - if (str == NULL) { - return NULL; - } - refcnt = Py_REFCNT(str); - assert(PyUnicode_GET_LENGTH(str) == 5); - - if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) { - Py_DECREF(str); - if (!PyErr_Occurred()) { - PyErr_Format(TestError, - "_PyUnicode_GetUTF8Buffer() returned nonzero " - "without exception set. (%s:%d)", - __FILE__, __LINE__); - } - return NULL; - } - - if (!PyBytes_CheckExact(buf.obj)) { - PyErr_Format(TestError, - "buf.obj must be a bytes object, got %R (%s:%d)", - buf.obj, __FILE__, __LINE__); - PyBuffer_Release(&buf); - Py_DECREF(str); - return NULL; - } - - if (buf.len != 15) { - PyErr_Format(TestError, - "Expected buf.len == 15, actual %zd (%s:%d)", - buf.len, __FILE__, __LINE__); - PyBuffer_Release(&buf); - Py_DECREF(str); - return NULL; - } - assert(((const char*)buf.buf)[15] == '\0'); - - if (refcnt != Py_REFCNT(str)) { - PyErr_Format(TestError, - "Py_REFCNT(str) must not be changed. (%s:%d)", - __FILE__, __LINE__); - // Do not DECREF here because refcnt is broken. - return NULL; - } - - PyBuffer_Release(&buf); - - // Test 3: There is a UTF-8 cache - // Reuse str of the previoss test. - - const char *cache = PyUnicode_AsUTF8(str); - if (cache == NULL) { - return NULL; - } - - if (_PyUnicode_GetUTF8Buffer(str, NULL, &buf) < 0) { - Py_DECREF(str); - if (!PyErr_Occurred()) { - PyErr_Format(TestError, - "_PyUnicode_GetUTF8Buffer() returned nonzero " - "without exception set. (%s:%d)", - __FILE__, __LINE__); - } - return NULL; - } - - if (buf.obj != str) { - PyErr_Format(TestError, - "buf.obj must be equal to str. (%s:%d)", - __FILE__, __LINE__); - PyBuffer_Release(&buf); - Py_DECREF(str); - return NULL; - } - - if (buf.buf != cache) { - PyErr_Format(TestError, - "buf.buf must be equal to the UTF-8 cache (%s:%d)", - __FILE__, __LINE__); - PyBuffer_Release(&buf); - Py_DECREF(str); - return NULL; - } - - if (buf.len != 15) { - PyErr_Format(TestError, - "Expected buf.len == 15, actual %zd (%s:%d)", - buf.len, __FILE__, __LINE__); - PyBuffer_Release(&buf); - Py_DECREF(str); - return NULL; - } - assert(((const char*)buf.buf)[15] == '\0'); - - if (refcnt + 1 != Py_REFCNT(str)) { - PyErr_Format(TestError, - "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)", - refcnt + 1, Py_REFCNT(str), - __FILE__, __LINE__); - // Do not DECREF here because refcnt is broken. - return NULL; - } - - PyBuffer_Release(&buf); - - if (refcnt != Py_REFCNT(str)) { - PyErr_Format(TestError, - "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)", - refcnt, Py_REFCNT(str), - __FILE__, __LINE__); - // Do not DECREF here because refcnt is broken. - return NULL; - } - - Py_DECREF(str); - Py_RETURN_NONE; -} - static PyObject * unicode_findchar(PyObject *self, PyObject *args) { @@ -5602,8 +5392,6 @@ static PyMethodDef TestMethods[] = { {"unicode_asucs4", unicode_asucs4, METH_VARARGS}, {"unicode_asutf8", unicode_asutf8, METH_VARARGS}, {"unicode_asutf8andsize", unicode_asutf8andsize, METH_VARARGS}, - {"unicode_getutf8buffer", unicode_getutf8buffer, METH_VARARGS}, - {"unicode_test_getutf8buffer", unicode_test_getutf8buffer, METH_NOARGS}, {"unicode_findchar", unicode_findchar, METH_VARARGS}, {"unicode_copycharacters", unicode_copycharacters, METH_VARARGS}, {"unicode_encodedecimal", unicode_encodedecimal, METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 0fea435599b..3d99f11ecff 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3991,41 +3991,6 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr) } -int -_PyUnicode_GetUTF8Buffer(PyObject *unicode, const char *errors, - Py_buffer *view) -{ - if (!PyUnicode_Check(unicode)) { - PyErr_BadArgument(); - return -1; - } - if (PyUnicode_READY(unicode) == -1) { - return -1; - } - - if (PyUnicode_UTF8(unicode) != NULL - && Py_TYPE(unicode)->tp_as_buffer == NULL) { - return PyBuffer_FillInfo(view, unicode, - PyUnicode_UTF8(unicode), - PyUnicode_UTF8_LENGTH(unicode), - /* readonly */ 1, PyBUF_SIMPLE); - } - - // Unlike PyUnicode_AsUTF8AndSize(), this function doesn't - // create a UTF-8 cache for speed and efficiency. - PyObject *bytes = _PyUnicode_AsUTF8String(unicode, errors); - if (bytes == NULL) { - return -1; - } - assert(PyBytes_CheckExact(bytes)); - if (PyObject_GetBuffer(bytes, view, PyBUF_SIMPLE) < 0) { - Py_DECREF(bytes); - return -1; - } - return 0; -} - - static int unicode_fill_utf8(PyObject *unicode); const char *