Rewrite PyUnicode_EncodeDecimal() to use the new Unicode API

Add tests for PyUnicode_EncodeDecimal() and
PyUnicode_TransformDecimalToASCII().
This commit is contained in:
Victor Stinner 2011-11-21 22:52:58 +01:00
parent 6dd381eb62
commit 42bf77537e
3 changed files with 132 additions and 46 deletions

View File

@ -1806,6 +1806,36 @@ class UnicodeTest(string_tests.CommonTest,
s += "4" s += "4"
self.assertEqual(s, "3") self.assertEqual(s, "3")
def test_encode_decimal(self):
    # Exercise PyUnicode_EncodeDecimal() through its _testcapi wrapper:
    # decimal digits become ASCII digits, whitespace becomes an ASCII
    # space, and any other non-Latin-1 character is routed through the
    # requested error handler.
    from _testcapi import unicode_encodedecimal
    self.assertEqual(unicode_encodedecimal('123'),
                     b'123')
    self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
                     b'3.14')
    self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
                     b' 3.14 ')
    self.assertRaises(UnicodeEncodeError,
                      unicode_encodedecimal, "123\u20ac", "strict")
    self.assertEqual(unicode_encodedecimal("123\u20ac", "replace"),
                     b'123?')
    self.assertEqual(unicode_encodedecimal("123\u20ac", "ignore"),
                     b'123')
    # U+20AC is code point 8364, so the XML character reference is
    # "&#8364;".  It must be spelled out in ASCII: a bytes literal cannot
    # contain the euro sign itself.
    self.assertEqual(unicode_encodedecimal("123\u20ac", "xmlcharrefreplace"),
                     b'123&#8364;')
    self.assertEqual(unicode_encodedecimal("123\u20ac", "backslashreplace"),
                     b'123\\u20ac')
def test_transform_decimal(self):
    # Check PyUnicode_TransformDecimalToASCII() through its _testcapi
    # wrapper.  Each pair is (input, expected result): non-ASCII decimal
    # digits are mapped to ASCII digits, while spaces and characters that
    # are not decimal digits come back unchanged.
    from _testcapi import unicode_transformdecimaltoascii as to_ascii
    cases = (
        ('123', '123'),
        ('\u0663.\u0661\u0664', '3.14'),
        ("\N{EM SPACE}3.14\N{EN SPACE}", "\N{EM SPACE}3.14\N{EN SPACE}"),
        ('123\u20ac', '123\u20ac'),
    )
    for text, expected in cases:
        self.assertEqual(to_ascii(text), expected)
class StringModuleTest(unittest.TestCase): class StringModuleTest(unittest.TestCase):
def test_formatter_parser(self): def test_formatter_parser(self):

View File

@ -1499,6 +1499,51 @@ unicode_aswidecharstring(PyObject *self, PyObject *args)
return Py_BuildValue("(Nn)", result, size); return Py_BuildValue("(Nn)", result, size);
} }
/* Test wrapper around PyUnicode_EncodeDecimal().
 *
 * Python signature: unicode_encodedecimal(text[, errors]) -> bytes
 *
 * Encodes `text` into a temporary bytes buffer with
 * PyUnicode_EncodeDecimal() and returns the buffer truncated at the NUL
 * terminator the encoder writes.  Returns NULL with an exception set on
 * argument, allocation or encoding failure.
 */
static PyObject *
unicode_encodedecimal(PyObject *self, PyObject *args)
{
    Py_UNICODE *unicode;
    Py_ssize_t length;
    char *errors = NULL;
    PyObject *decimal;
    Py_ssize_t decimal_length, new_length;
    int res;

    if (!PyArg_ParseTuple(args, "u#|s", &unicode, &length, &errors))
        return NULL;

    /* PyUnicode_EncodeDecimal() takes no output size, so the buffer must
     * cover the worst case up front.  The longest replacement a built-in
     * error handler emits for one character is 10 bytes:
     * len('&#1114111;') == len('\\U0010ffff') == 10.  The previous factor
     * of 7 (len('&#8364;')) was already too small for any code point
     * >= 10000, e.g. '&#20013;'.
     * NOTE(review): a custom error handler may return a longer
     * replacement; this wrapper cannot bound that case. */
    if (length > PY_SSIZE_T_MAX / 10) {
        /* length * 10 would overflow Py_ssize_t */
        PyErr_NoMemory();
        return NULL;
    }
    decimal_length = length * 10;

    decimal = PyBytes_FromStringAndSize(NULL, decimal_length);
    if (decimal == NULL)
        return NULL;

    res = PyUnicode_EncodeDecimal(unicode, length,
                                  PyBytes_AS_STRING(decimal),
                                  errors);
    if (res < 0) {
        Py_DECREF(decimal);
        return NULL;
    }

    /* The encoder NUL-terminates its output; shrink to the bytes used. */
    new_length = (Py_ssize_t)strlen(PyBytes_AS_STRING(decimal));
    assert(new_length <= decimal_length);
    res = _PyBytes_Resize(&decimal, new_length);
    if (res < 0)
        return NULL;

    return decimal;
}
/* Test wrapper around PyUnicode_TransformDecimalToASCII().
 *
 * Python signature: unicode_transformdecimaltoascii(text) -> str
 *
 * Returns whatever PyUnicode_TransformDecimalToASCII() returns for the
 * given text, or NULL with an exception set if argument parsing fails.
 */
static PyObject *
unicode_transformdecimaltoascii(PyObject *self, PyObject *args)
{
    Py_UNICODE *unicode;
    Py_ssize_t length;

    /* The format is "u#" only.  The former "u#|s" advertised an optional
     * string argument without supplying a char ** to receive it, so a call
     * with two arguments made PyArg_ParseTuple() store through an
     * indeterminate pointer.  The wrapped function has no such parameter. */
    if (!PyArg_ParseTuple(args, "u#", &unicode, &length))
        return NULL;
    return PyUnicode_TransformDecimalToASCII(unicode, length);
}
static PyObject * static PyObject *
getargs_w_star(PyObject *self, PyObject *args) getargs_w_star(PyObject *self, PyObject *args)
{ {
@ -2384,8 +2429,10 @@ static PyMethodDef TestMethods[] = {
{"test_u_code", (PyCFunction)test_u_code, METH_NOARGS}, {"test_u_code", (PyCFunction)test_u_code, METH_NOARGS},
{"test_Z_code", (PyCFunction)test_Z_code, METH_NOARGS}, {"test_Z_code", (PyCFunction)test_Z_code, METH_NOARGS},
{"test_widechar", (PyCFunction)test_widechar, METH_NOARGS}, {"test_widechar", (PyCFunction)test_widechar, METH_NOARGS},
{"unicode_aswidechar", unicode_aswidechar, METH_VARARGS}, {"unicode_aswidechar", unicode_aswidechar, METH_VARARGS},
{"unicode_aswidecharstring",unicode_aswidecharstring, METH_VARARGS}, {"unicode_aswidecharstring",unicode_aswidecharstring, METH_VARARGS},
{"unicode_encodedecimal", unicode_encodedecimal, METH_VARARGS},
{"unicode_transformdecimaltoascii", unicode_transformdecimaltoascii, METH_VARARGS},
#ifdef WITH_THREAD #ifdef WITH_THREAD
{"_test_thread_state", test_thread_state, METH_VARARGS}, {"_test_thread_state", test_thread_state, METH_VARARGS},
{"_pending_threadfunc", pending_threadfunc, METH_VARARGS}, {"_pending_threadfunc", pending_threadfunc, METH_VARARGS},

View File

@ -8829,7 +8829,6 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
char *output, char *output,
const char *errors) const char *errors)
{ {
Py_UNICODE *p, *end;
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
PyObject *unicode; PyObject *unicode;
@ -8838,47 +8837,50 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
/* the following variable is used for caching string comparisons /* the following variable is used for caching string comparisons
* -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
int known_errorHandler = -1; int known_errorHandler = -1;
Py_ssize_t i, j;
enum PyUnicode_Kind kind;
void *data;
if (output == NULL) { if (output == NULL) {
PyErr_BadArgument(); PyErr_BadArgument();
return -1; return -1;
} }
p = s; unicode = PyUnicode_FromUnicode(s, length);
end = s + length; if (unicode == NULL)
while (p < end) { return -1;
register Py_UNICODE ch = *p;
if (PyUnicode_READY(unicode) < 0)
goto onError;
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
for (i=0; i < length; i++) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
int decimal; int decimal;
PyObject *repunicode; Py_ssize_t startpos, endpos;
Py_ssize_t repsize;
Py_ssize_t newpos;
Py_UNICODE *uni2;
Py_UNICODE *collstart;
Py_UNICODE *collend;
if (Py_UNICODE_ISSPACE(ch)) { if (Py_UNICODE_ISSPACE(ch)) {
*output++ = ' '; *output++ = ' ';
++p;
continue; continue;
} }
decimal = Py_UNICODE_TODECIMAL(ch); decimal = Py_UNICODE_TODECIMAL(ch);
if (decimal >= 0) { if (decimal >= 0) {
*output++ = '0' + decimal; *output++ = '0' + decimal;
++p;
continue; continue;
} }
if (0 < ch && ch < 256) { if (0 < ch && ch < 256) {
*output++ = (char)ch; *output++ = (char)ch;
++p;
continue; continue;
} }
/* All other characters are considered unencodable */ /* All other characters are considered unencodable */
collstart = p; startpos = i;
collend = p+1; endpos = i+1;
while (collend < end) { for (; endpos < length; endpos++) {
if ((0 < *collend && *collend < 256) || ch = PyUnicode_READ(kind, data, endpos);
!Py_UNICODE_ISSPACE(*collend) || if ((0 < ch && ch < 256) ||
Py_UNICODE_TODECIMAL(*collend)) !Py_UNICODE_ISSPACE(ch) ||
Py_UNICODE_TODECIMAL(ch))
break; break;
} }
/* cache callback name lookup /* cache callback name lookup
@ -8897,33 +8899,33 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
} }
switch (known_errorHandler) { switch (known_errorHandler) {
case 1: /* strict */ case 1: /* strict */
unicode = PyUnicode_FromUnicode(s, length); raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
if (unicode == NULL)
goto onError;
raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason);
Py_DECREF(unicode);
goto onError; goto onError;
case 2: /* replace */ case 2: /* replace */
for (p = collstart; p < collend; ++p) for (j=startpos; j < endpos; j++)
*output++ = '?'; *output++ = '?';
/* fall through */ /* fall through */
case 3: /* ignore */ case 3: /* ignore */
p = collend; i = endpos;
break; break;
case 4: /* xmlcharrefreplace */ case 4: /* xmlcharrefreplace */
/* generate replacement (temporarily (mis)uses p) */ /* generate replacement */
for (p = collstart; p < collend; ++p) for (j=startpos; j < endpos; j++) {
output += sprintf(output, "&#%d;", (int)*p); ch = PyUnicode_READ(kind, data, i);
p = collend; output += sprintf(output, "&#%d;", (int)ch);
i++;
}
break; break;
default: default:
unicode = PyUnicode_FromUnicode(s, length); {
if (unicode == NULL) PyObject *repunicode;
goto onError; Py_ssize_t repsize, newpos, k;
enum PyUnicode_Kind repkind;
void *repdata;
repunicode = unicode_encode_call_errorhandler(errors, &errorHandler, repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
encoding, reason, unicode, &exc, encoding, reason, unicode, &exc,
collstart-s, collend-s, &newpos); startpos, endpos, &newpos);
Py_DECREF(unicode);
if (repunicode == NULL) if (repunicode == NULL)
goto onError; goto onError;
if (!PyUnicode_Check(repunicode)) { if (!PyUnicode_Check(repunicode)) {
@ -8932,10 +8934,17 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
Py_DECREF(repunicode); Py_DECREF(repunicode);
goto onError; goto onError;
} }
if (PyUnicode_READY(repunicode) < 0) {
Py_DECREF(repunicode);
goto onError;
}
repkind = PyUnicode_KIND(repunicode);
repdata = PyUnicode_DATA(repunicode);
/* generate replacement */ /* generate replacement */
repsize = PyUnicode_GET_SIZE(repunicode); repsize = PyUnicode_GET_SIZE(repunicode);
for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) { for (k=0; k<repsize; k++) {
Py_UNICODE ch = *uni2; ch = PyUnicode_READ(repkind, repdata, k);
if (Py_UNICODE_ISSPACE(ch)) if (Py_UNICODE_ISSPACE(ch))
*output++ = ' '; *output++ = ' ';
else { else {
@ -8946,29 +8955,29 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
*output++ = (char)ch; *output++ = (char)ch;
else { else {
Py_DECREF(repunicode); Py_DECREF(repunicode);
unicode = PyUnicode_FromUnicode(s, length);
if (unicode == NULL)
goto onError;
raise_encode_exception(&exc, encoding, raise_encode_exception(&exc, encoding,
unicode, collstart-s, collend-s, reason); unicode, startpos, endpos,
Py_DECREF(unicode); reason);
goto onError; goto onError;
} }
} }
} }
p = s + newpos; i = newpos;
Py_DECREF(repunicode); Py_DECREF(repunicode);
} }
}
} }
/* 0-terminate the output string */ /* 0-terminate the output string */
*output++ = '\0'; *output++ = '\0';
Py_XDECREF(exc); Py_XDECREF(exc);
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_DECREF(unicode);
return 0; return 0;
onError: onError:
Py_XDECREF(exc); Py_XDECREF(exc);
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_DECREF(unicode);
return -1; return -1;
} }