diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index a669f8e3df0..1ed20582351 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1806,6 +1806,36 @@ class UnicodeTest(string_tests.CommonTest, s += "4" self.assertEqual(s, "3") + def test_encode_decimal(self): + from _testcapi import unicode_encodedecimal + self.assertEqual(unicode_encodedecimal('123'), + b'123') + self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'), + b'3.14') + self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"), + b' 3.14 ') + self.assertRaises(UnicodeEncodeError, + unicode_encodedecimal, "123\u20ac", "strict") + self.assertEqual(unicode_encodedecimal("123\u20ac", "replace"), + b'123?') + self.assertEqual(unicode_encodedecimal("123\u20ac", "ignore"), + b'123') + self.assertEqual(unicode_encodedecimal("123\u20ac", "xmlcharrefreplace"), + b'123&#8364;') + self.assertEqual(unicode_encodedecimal("123\u20ac", "backslashreplace"), + b'123\\u20ac') + + def test_transform_decimal(self): + from _testcapi import unicode_transformdecimaltoascii as transform_decimal + self.assertEqual(transform_decimal('123'), + '123') + self.assertEqual(transform_decimal('\u0663.\u0661\u0664'), + '3.14') + self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"), + "\N{EM SPACE}3.14\N{EN SPACE}") + self.assertEqual(transform_decimal('123\u20ac'), + '123\u20ac') + class StringModuleTest(unittest.TestCase): def test_formatter_parser(self): diff --git a/Modules/_testcapimodule.c b/Modules/_testcapimodule.c index 3a6dcd6a853..962f10b5397 100644 --- a/Modules/_testcapimodule.c +++ b/Modules/_testcapimodule.c @@ -1499,6 +1499,51 @@ unicode_aswidecharstring(PyObject *self, PyObject *args) return Py_BuildValue("(Nn)", result, size); } +static PyObject * +unicode_encodedecimal(PyObject *self, PyObject *args) /* test wrapper: (str[, errors]) -> bytes via PyUnicode_EncodeDecimal(); NULL with an exception set on failure */ +{ + Py_UNICODE *unicode; + Py_ssize_t length; + char *errors = NULL; + PyObject *decimal; + Py_ssize_t decimal_length, new_length; + int res; + + if 
(!PyArg_ParseTuple(args, "u#|s", &unicode, &length, &errors)) + return NULL; + + decimal_length = length * 10; /* worst case for the built-in error handlers: 10 bytes per character, e.g. "&#1114111;" (xmlcharrefreplace) or a \UXXXXXXXX escape (backslashreplace) */ + decimal = PyBytes_FromStringAndSize(NULL, decimal_length); + if (decimal == NULL) + return NULL; + + res = PyUnicode_EncodeDecimal(unicode, length, + PyBytes_AS_STRING(decimal), + errors); + if (res < 0) { + Py_DECREF(decimal); + return NULL; + } + + new_length = strlen(PyBytes_AS_STRING(decimal)); /* the encoder reports no length, so measure the output it wrote (assumes it is NUL-terminated -- TODO confirm against PyUnicode_EncodeDecimal) */ + assert(new_length <= decimal_length); + res = _PyBytes_Resize(&decimal, new_length); + if (res < 0) + return NULL; + + return decimal; +} + +static PyObject * +unicode_transformdecimaltoascii(PyObject *self, PyObject *args) /* test wrapper: (str) -> result of PyUnicode_TransformDecimalToASCII() */ +{ + Py_UNICODE *unicode; + Py_ssize_t length; + if (!PyArg_ParseTuple(args, "u#", &unicode, &length)) /* exactly one argument: the former "|s" unit had no matching output pointer */ + return NULL; + return PyUnicode_TransformDecimalToASCII(unicode, length); +} + static PyObject * getargs_w_star(PyObject *self, PyObject *args) { @@ -2384,8 +2429,10 @@ static PyMethodDef TestMethods[] = { {"test_u_code", (PyCFunction)test_u_code, METH_NOARGS}, {"test_Z_code", (PyCFunction)test_Z_code, METH_NOARGS}, {"test_widechar", (PyCFunction)test_widechar, METH_NOARGS}, - {"unicode_aswidechar", unicode_aswidechar, METH_VARARGS}, - {"unicode_aswidecharstring",unicode_aswidecharstring, METH_VARARGS}, + {"unicode_aswidechar", unicode_aswidechar, METH_VARARGS}, + {"unicode_aswidecharstring",unicode_aswidecharstring, METH_VARARGS}, + {"unicode_encodedecimal", unicode_encodedecimal, METH_VARARGS}, + {"unicode_transformdecimaltoascii", unicode_transformdecimaltoascii, METH_VARARGS}, #ifdef WITH_THREAD {"_test_thread_state", test_thread_state, METH_VARARGS}, {"_pending_threadfunc", pending_threadfunc, METH_VARARGS}, diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 9dedf0b1b48..bcd5b6438e4 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -8829,7 +8829,6 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s, char *output, const char *errors) { - Py_UNICODE *p, *end; PyObject *errorHandler = NULL; 
PyObject *exc = NULL; PyObject *unicode; @@ -8838,47 +8837,50 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s, /* the following variable is used for caching string comparisons * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ int known_errorHandler = -1; + Py_ssize_t i, j; + enum PyUnicode_Kind kind; + void *data; if (output == NULL) { PyErr_BadArgument(); return -1; } - p = s; - end = s + length; - while (p < end) { - register Py_UNICODE ch = *p; + unicode = PyUnicode_FromUnicode(s, length); + if (unicode == NULL) + return -1; + + if (PyUnicode_READY(unicode) < 0) + goto onError; + kind = PyUnicode_KIND(unicode); + data = PyUnicode_DATA(unicode); + + for (i=0; i < length; i++) { + Py_UCS4 ch = PyUnicode_READ(kind, data, i); int decimal; - PyObject *repunicode; - Py_ssize_t repsize; - Py_ssize_t newpos; - Py_UNICODE *uni2; - Py_UNICODE *collstart; - Py_UNICODE *collend; + Py_ssize_t startpos, endpos; if (Py_UNICODE_ISSPACE(ch)) { *output++ = ' '; - ++p; continue; } decimal = Py_UNICODE_TODECIMAL(ch); if (decimal >= 0) { *output++ = '0' + decimal; - ++p; continue; } if (0 < ch && ch < 256) { *output++ = (char)ch; - ++p; continue; } /* All other characters are considered unencodable */ - collstart = p; - collend = p+1; - while (collend < end) { - if ((0 < *collend && *collend < 256) || - !Py_UNICODE_ISSPACE(*collend) || - Py_UNICODE_TODECIMAL(*collend)) + startpos = i; + endpos = i+1; + for (; endpos < length; endpos++) { + ch = PyUnicode_READ(kind, data, endpos); + if ((0 < ch && ch < 256) || + !Py_UNICODE_ISSPACE(ch) || + Py_UNICODE_TODECIMAL(ch)) break; } /* cache callback name lookup @@ -8897,33 +8899,33 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s, } switch (known_errorHandler) { case 1: /* strict */ - unicode = PyUnicode_FromUnicode(s, length); - if (unicode == NULL) - goto onError; - raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason); - Py_DECREF(unicode); + raise_encode_exception(&exc, encoding, unicode, 
startpos, endpos, reason); goto onError; case 2: /* replace */ - for (p = collstart; p < collend; ++p) + for (j=startpos; j < endpos; j++) *output++ = '?'; /* fall through */ case 3: /* ignore */ - p = collend; + i = endpos; break; case 4: /* xmlcharrefreplace */ - /* generate replacement (temporarily (mis)uses p) */ - for (p = collstart; p < collend; ++p) - output += sprintf(output, "&#%d;", (int)*p); - p = collend; + /* generate replacement */ + for (j=startpos; j < endpos; j++) { + ch = PyUnicode_READ(kind, data, i); + output += sprintf(output, "&#%d;", (int)ch); + i++; + } break; default: - unicode = PyUnicode_FromUnicode(s, length); - if (unicode == NULL) - goto onError; + { + PyObject *repunicode; + Py_ssize_t repsize, newpos, k; + enum PyUnicode_Kind repkind; + void *repdata; + repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, encoding, reason, unicode, &exc, - collstart-s, collend-s, &newpos); - Py_DECREF(unicode); + startpos, endpos, &newpos); if (repunicode == NULL) goto onError; if (!PyUnicode_Check(repunicode)) { @@ -8932,10 +8934,17 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s, Py_DECREF(repunicode); goto onError; } + if (PyUnicode_READY(repunicode) < 0) { + Py_DECREF(repunicode); + goto onError; + } + repkind = PyUnicode_KIND(repunicode); + repdata = PyUnicode_DATA(repunicode); + /* generate replacement */ repsize = PyUnicode_GET_SIZE(repunicode); - for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { - Py_UNICODE ch = *uni2; + for (k=0; k