Rewrite PyUnicode_EncodeDecimal() to use the new Unicode API
Add tests for PyUnicode_EncodeDecimal() and PyUnicode_TransformDecimalToASCII().
This commit is contained in:
parent
6dd381eb62
commit
42bf77537e
|
@ -1806,6 +1806,36 @@ class UnicodeTest(string_tests.CommonTest,
|
||||||
s += "4"
|
s += "4"
|
||||||
self.assertEqual(s, "3")
|
self.assertEqual(s, "3")
|
||||||
|
|
||||||
|
def test_encode_decimal(self):
    """Exercise PyUnicode_EncodeDecimal() via the _testcapi wrapper.

    Covers plain ASCII digits, non-ASCII decimal digits (converted to
    their ASCII equivalents), Unicode spaces (converted to b' '), and
    each built-in error handler applied to an unencodable character
    (U+20AC EURO SIGN).
    """
    # Imported locally: the helper only exists in the C test extension.
    from _testcapi import unicode_encodedecimal
    self.assertEqual(unicode_encodedecimal('123'),
                     b'123')
    self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
                     b'3.14')
    self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                     b' 3.14 ')
    self.assertRaises(UnicodeEncodeError,
                      unicode_encodedecimal, "123\u20ac", "strict")
    self.assertEqual(unicode_encodedecimal("123\u20ac", "replace"),
                     b'123?')
    self.assertEqual(unicode_encodedecimal("123\u20ac", "ignore"),
                     b'123')
    # xmlcharrefreplace emits the decimal character reference for U+20AC
    # (0x20AC == 8364).  It must be spelled out in ASCII: a bytes literal
    # cannot contain the euro sign itself.
    self.assertEqual(unicode_encodedecimal("123\u20ac", "xmlcharrefreplace"),
                     b'123&#8364;')
    self.assertEqual(unicode_encodedecimal("123\u20ac", "backslashreplace"),
                     b'123\\u20ac')
|
||||||
|
|
||||||
|
def test_transform_decimal(self):
    """Exercise PyUnicode_TransformDecimalToASCII() via _testcapi.

    Each case pairs an input string with the string the transform is
    expected to return: decimal digits become ASCII digits, while
    spaces and other characters are expected back unchanged.
    """
    # Imported locally: the helper only exists in the C test extension.
    from _testcapi import unicode_transformdecimaltoascii as transform_decimal
    cases = (
        ('123', '123'),
        ('\u0663.\u0661\u0664', '3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", "\N{EM SPACE}3.14\N{EN SPACE}"),
        ('123\u20ac', '123\u20ac'),
    )
    for text, expected in cases:
        self.assertEqual(transform_decimal(text), expected)
|
||||||
|
|
||||||
|
|
||||||
class StringModuleTest(unittest.TestCase):
|
class StringModuleTest(unittest.TestCase):
|
||||||
def test_formatter_parser(self):
|
def test_formatter_parser(self):
|
||||||
|
|
|
@ -1499,6 +1499,51 @@ unicode_aswidecharstring(PyObject *self, PyObject *args)
|
||||||
return Py_BuildValue("(Nn)", result, size);
|
return Py_BuildValue("(Nn)", result, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Test helper exposing PyUnicode_EncodeDecimal() to Python.
 *
 * Arguments: a str, and optionally the name of an error handler
 * (left as NULL when omitted).
 * Returns: a new bytes object holding the encoded text, or NULL with an
 * exception set if argument parsing, allocation or encoding fails.
 */
static PyObject *
unicode_encodedecimal(PyObject *self, PyObject *args)
{
    Py_UNICODE *unicode;
    Py_ssize_t length;
    char *errors = NULL;
    PyObject *decimal;
    Py_ssize_t decimal_length, new_length;
    int res;

    if (!PyArg_ParseTuple(args, "u#|s", &unicode, &length, &errors))
        return NULL;

    /* PyUnicode_EncodeDecimal() is given no output size, so the buffer
     * must cover the worst case up front.  The longest expansion of one
     * input unit by a built-in handler is 10 bytes:
     * len('&#1114111;') == len('\\U0010ffff') == 10, reachable where
     * Py_UNICODE is wide enough to hold a non-BMP code point.  The
     * previous budget of 7, i.e. len('&#8364;'), only covered the BMP.
     * NOTE: a custom error handler may still return a longer
     * replacement; the API offers no way to bound that. */
    decimal_length = length * 10;
    decimal = PyBytes_FromStringAndSize(NULL, decimal_length);
    if (decimal == NULL)
        return NULL;

    res = PyUnicode_EncodeDecimal(unicode, length,
                                  PyBytes_AS_STRING(decimal),
                                  errors);
    if (res < 0) {
        Py_DECREF(decimal);
        return NULL;
    }

    /* On success the encoder NUL-terminates its output, so strlen()
     * yields the number of bytes actually written. */
    new_length = strlen(PyBytes_AS_STRING(decimal));
    assert(new_length <= decimal_length);
    /* Shrink to the real length; on failure `decimal` is no longer
     * usable here, so just propagate the error. */
    res = _PyBytes_Resize(&decimal, new_length);
    if (res < 0)
        return NULL;

    return decimal;
}
|
||||||
|
|
||||||
|
/* Test helper exposing PyUnicode_TransformDecimalToASCII() to Python.
 *
 * Arguments: a single str.
 * Returns: whatever PyUnicode_TransformDecimalToASCII() returns for that
 * text, or NULL with an exception set if argument parsing fails.
 */
static PyObject *
unicode_transformdecimaltoascii(PyObject *self, PyObject *args)
{
    Py_UNICODE *unicode;
    Py_ssize_t length;

    /* The format takes exactly one argument.  It previously read "u#|s"
     * without a matching char ** for the optional "s" unit, so a call
     * with a second argument made PyArg_ParseTuple() store through an
     * indeterminate pointer.  The transform has no error-handler
     * parameter, so the optional unit is dropped, not wired up. */
    if (!PyArg_ParseTuple(args, "u#", &unicode, &length))
        return NULL;
    return PyUnicode_TransformDecimalToASCII(unicode, length);
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
getargs_w_star(PyObject *self, PyObject *args)
|
getargs_w_star(PyObject *self, PyObject *args)
|
||||||
{
|
{
|
||||||
|
@ -2384,8 +2429,10 @@ static PyMethodDef TestMethods[] = {
|
||||||
{"test_u_code", (PyCFunction)test_u_code, METH_NOARGS},
|
{"test_u_code", (PyCFunction)test_u_code, METH_NOARGS},
|
||||||
{"test_Z_code", (PyCFunction)test_Z_code, METH_NOARGS},
|
{"test_Z_code", (PyCFunction)test_Z_code, METH_NOARGS},
|
||||||
{"test_widechar", (PyCFunction)test_widechar, METH_NOARGS},
|
{"test_widechar", (PyCFunction)test_widechar, METH_NOARGS},
|
||||||
{"unicode_aswidechar", unicode_aswidechar, METH_VARARGS},
|
{"unicode_aswidechar", unicode_aswidechar, METH_VARARGS},
|
||||||
{"unicode_aswidecharstring",unicode_aswidecharstring, METH_VARARGS},
|
{"unicode_aswidecharstring",unicode_aswidecharstring, METH_VARARGS},
|
||||||
|
{"unicode_encodedecimal", unicode_encodedecimal, METH_VARARGS},
|
||||||
|
{"unicode_transformdecimaltoascii", unicode_transformdecimaltoascii, METH_VARARGS},
|
||||||
#ifdef WITH_THREAD
|
#ifdef WITH_THREAD
|
||||||
{"_test_thread_state", test_thread_state, METH_VARARGS},
|
{"_test_thread_state", test_thread_state, METH_VARARGS},
|
||||||
{"_pending_threadfunc", pending_threadfunc, METH_VARARGS},
|
{"_pending_threadfunc", pending_threadfunc, METH_VARARGS},
|
||||||
|
|
|
@ -8829,7 +8829,6 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
|
||||||
char *output,
|
char *output,
|
||||||
const char *errors)
|
const char *errors)
|
||||||
{
|
{
|
||||||
Py_UNICODE *p, *end;
|
|
||||||
PyObject *errorHandler = NULL;
|
PyObject *errorHandler = NULL;
|
||||||
PyObject *exc = NULL;
|
PyObject *exc = NULL;
|
||||||
PyObject *unicode;
|
PyObject *unicode;
|
||||||
|
@ -8838,47 +8837,50 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
|
||||||
/* the following variable is used for caching string comparisons
|
/* the following variable is used for caching string comparisons
|
||||||
* -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
|
* -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
|
||||||
int known_errorHandler = -1;
|
int known_errorHandler = -1;
|
||||||
|
Py_ssize_t i, j;
|
||||||
|
enum PyUnicode_Kind kind;
|
||||||
|
void *data;
|
||||||
|
|
||||||
if (output == NULL) {
|
if (output == NULL) {
|
||||||
PyErr_BadArgument();
|
PyErr_BadArgument();
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
p = s;
|
unicode = PyUnicode_FromUnicode(s, length);
|
||||||
end = s + length;
|
if (unicode == NULL)
|
||||||
while (p < end) {
|
return -1;
|
||||||
register Py_UNICODE ch = *p;
|
|
||||||
|
if (PyUnicode_READY(unicode) < 0)
|
||||||
|
goto onError;
|
||||||
|
kind = PyUnicode_KIND(unicode);
|
||||||
|
data = PyUnicode_DATA(unicode);
|
||||||
|
|
||||||
|
for (i=0; i < length; i++) {
|
||||||
|
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||||
int decimal;
|
int decimal;
|
||||||
PyObject *repunicode;
|
Py_ssize_t startpos, endpos;
|
||||||
Py_ssize_t repsize;
|
|
||||||
Py_ssize_t newpos;
|
|
||||||
Py_UNICODE *uni2;
|
|
||||||
Py_UNICODE *collstart;
|
|
||||||
Py_UNICODE *collend;
|
|
||||||
|
|
||||||
if (Py_UNICODE_ISSPACE(ch)) {
|
if (Py_UNICODE_ISSPACE(ch)) {
|
||||||
*output++ = ' ';
|
*output++ = ' ';
|
||||||
++p;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
decimal = Py_UNICODE_TODECIMAL(ch);
|
decimal = Py_UNICODE_TODECIMAL(ch);
|
||||||
if (decimal >= 0) {
|
if (decimal >= 0) {
|
||||||
*output++ = '0' + decimal;
|
*output++ = '0' + decimal;
|
||||||
++p;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (0 < ch && ch < 256) {
|
if (0 < ch && ch < 256) {
|
||||||
*output++ = (char)ch;
|
*output++ = (char)ch;
|
||||||
++p;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
/* All other characters are considered unencodable */
|
/* All other characters are considered unencodable */
|
||||||
collstart = p;
|
startpos = i;
|
||||||
collend = p+1;
|
endpos = i+1;
|
||||||
while (collend < end) {
|
for (; endpos < length; endpos++) {
|
||||||
if ((0 < *collend && *collend < 256) ||
|
ch = PyUnicode_READ(kind, data, endpos);
|
||||||
!Py_UNICODE_ISSPACE(*collend) ||
|
if ((0 < ch && ch < 256) ||
|
||||||
Py_UNICODE_TODECIMAL(*collend))
|
!Py_UNICODE_ISSPACE(ch) ||
|
||||||
|
Py_UNICODE_TODECIMAL(ch))
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* cache callback name lookup
|
/* cache callback name lookup
|
||||||
|
@ -8897,33 +8899,33 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
|
||||||
}
|
}
|
||||||
switch (known_errorHandler) {
|
switch (known_errorHandler) {
|
||||||
case 1: /* strict */
|
case 1: /* strict */
|
||||||
unicode = PyUnicode_FromUnicode(s, length);
|
raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
|
||||||
if (unicode == NULL)
|
|
||||||
goto onError;
|
|
||||||
raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason);
|
|
||||||
Py_DECREF(unicode);
|
|
||||||
goto onError;
|
goto onError;
|
||||||
case 2: /* replace */
|
case 2: /* replace */
|
||||||
for (p = collstart; p < collend; ++p)
|
for (j=startpos; j < endpos; j++)
|
||||||
*output++ = '?';
|
*output++ = '?';
|
||||||
/* fall through */
|
/* fall through */
|
||||||
case 3: /* ignore */
|
case 3: /* ignore */
|
||||||
p = collend;
|
i = endpos;
|
||||||
break;
|
break;
|
||||||
case 4: /* xmlcharrefreplace */
|
case 4: /* xmlcharrefreplace */
|
||||||
/* generate replacement (temporarily (mis)uses p) */
|
/* generate replacement */
|
||||||
for (p = collstart; p < collend; ++p)
|
for (j=startpos; j < endpos; j++) {
|
||||||
output += sprintf(output, "&#%d;", (int)*p);
|
ch = PyUnicode_READ(kind, data, i);
|
||||||
p = collend;
|
output += sprintf(output, "&#%d;", (int)ch);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
unicode = PyUnicode_FromUnicode(s, length);
|
{
|
||||||
if (unicode == NULL)
|
PyObject *repunicode;
|
||||||
goto onError;
|
Py_ssize_t repsize, newpos, k;
|
||||||
|
enum PyUnicode_Kind repkind;
|
||||||
|
void *repdata;
|
||||||
|
|
||||||
repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
|
repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
|
||||||
encoding, reason, unicode, &exc,
|
encoding, reason, unicode, &exc,
|
||||||
collstart-s, collend-s, &newpos);
|
startpos, endpos, &newpos);
|
||||||
Py_DECREF(unicode);
|
|
||||||
if (repunicode == NULL)
|
if (repunicode == NULL)
|
||||||
goto onError;
|
goto onError;
|
||||||
if (!PyUnicode_Check(repunicode)) {
|
if (!PyUnicode_Check(repunicode)) {
|
||||||
|
@ -8932,10 +8934,17 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
|
||||||
Py_DECREF(repunicode);
|
Py_DECREF(repunicode);
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
|
if (PyUnicode_READY(repunicode) < 0) {
|
||||||
|
Py_DECREF(repunicode);
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
repkind = PyUnicode_KIND(repunicode);
|
||||||
|
repdata = PyUnicode_DATA(repunicode);
|
||||||
|
|
||||||
/* generate replacement */
|
/* generate replacement */
|
||||||
repsize = PyUnicode_GET_SIZE(repunicode);
|
repsize = PyUnicode_GET_SIZE(repunicode);
|
||||||
for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
|
for (k=0; k<repsize; k++) {
|
||||||
Py_UNICODE ch = *uni2;
|
ch = PyUnicode_READ(repkind, repdata, k);
|
||||||
if (Py_UNICODE_ISSPACE(ch))
|
if (Py_UNICODE_ISSPACE(ch))
|
||||||
*output++ = ' ';
|
*output++ = ' ';
|
||||||
else {
|
else {
|
||||||
|
@ -8946,29 +8955,29 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
|
||||||
*output++ = (char)ch;
|
*output++ = (char)ch;
|
||||||
else {
|
else {
|
||||||
Py_DECREF(repunicode);
|
Py_DECREF(repunicode);
|
||||||
unicode = PyUnicode_FromUnicode(s, length);
|
|
||||||
if (unicode == NULL)
|
|
||||||
goto onError;
|
|
||||||
raise_encode_exception(&exc, encoding,
|
raise_encode_exception(&exc, encoding,
|
||||||
unicode, collstart-s, collend-s, reason);
|
unicode, startpos, endpos,
|
||||||
Py_DECREF(unicode);
|
reason);
|
||||||
goto onError;
|
goto onError;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
p = s + newpos;
|
i = newpos;
|
||||||
Py_DECREF(repunicode);
|
Py_DECREF(repunicode);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
/* 0-terminate the output string */
|
/* 0-terminate the output string */
|
||||||
*output++ = '\0';
|
*output++ = '\0';
|
||||||
Py_XDECREF(exc);
|
Py_XDECREF(exc);
|
||||||
Py_XDECREF(errorHandler);
|
Py_XDECREF(errorHandler);
|
||||||
|
Py_DECREF(unicode);
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
onError:
|
onError:
|
||||||
Py_XDECREF(exc);
|
Py_XDECREF(exc);
|
||||||
Py_XDECREF(errorHandler);
|
Py_XDECREF(errorHandler);
|
||||||
|
Py_DECREF(unicode);
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue