Rewrite PyUnicode_EncodeDecimal() to use the new Unicode API

Add tests for PyUnicode_EncodeDecimal() and PyUnicode_TransformDecimalToASCII().
2011-11-21 22:52:58 +01:00 · 2011-11-21 22:52:58 +01:00 · 42bf77537e
parent 6dd381eb62
commit 42bf77537e
3 changed files with 132 additions and 46 deletions
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -1806,6 +1806,36 @@ class UnicodeTest(string_tests.CommonTest,
        s += "4"
        self.assertEqual(s, "3")

+    def test_encode_decimal(self):
+        from _testcapi import unicode_encodedecimal
+        self.assertEqual(unicode_encodedecimal('123'),
+                         b'123')
+        self.assertEqual(unicode_encodedecimal('\u0663.\u0661\u0664'),
+                         b'3.14')
+        self.assertEqual(unicode_encodedecimal("\N{EM SPACE}3.14\N{EN SPACE}"),
+                         b' 3.14 ')
+        self.assertRaises(UnicodeEncodeError,
+                          unicode_encodedecimal, "123\u20ac", "strict")
+        self.assertEqual(unicode_encodedecimal("123\u20ac", "replace"),
+                         b'123?')
+        self.assertEqual(unicode_encodedecimal("123\u20ac", "ignore"),
+                         b'123')
+        self.assertEqual(unicode_encodedecimal("123\u20ac", "xmlcharrefreplace"),
+                         b'123&#8364;')
+        self.assertEqual(unicode_encodedecimal("123\u20ac", "backslashreplace"),
+                         b'123\\u20ac')
+
+    def test_transform_decimal(self):
+        from _testcapi import unicode_transformdecimaltoascii as transform_decimal
+        self.assertEqual(transform_decimal('123'),
+                         '123')
+        self.assertEqual(transform_decimal('\u0663.\u0661\u0664'),
+                         '3.14')
+        self.assertEqual(transform_decimal("\N{EM SPACE}3.14\N{EN SPACE}"),
+                         "\N{EM SPACE}3.14\N{EN SPACE}")
+        self.assertEqual(transform_decimal('123\u20ac'),
+                         '123\u20ac')
+

 class StringModuleTest(unittest.TestCase):
    def test_formatter_parser(self):
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@ -1499,6 +1499,51 @@ unicode_aswidecharstring(PyObject *self, PyObject *args)
    return Py_BuildValue("(Nn)", result, size);
 }

+/* Test helper exposing PyUnicode_EncodeDecimal() to Python.
+
+   Arguments: a str and an optional error handler name (NULL when omitted).
+   Returns a new bytes object holding the encoded text, or NULL with an
+   exception set on failure. */
+static PyObject *
+unicode_encodedecimal(PyObject *self, PyObject *args)
+{
+    Py_UNICODE *unicode;
+    Py_ssize_t length;
+    char *errors = NULL;
+    PyObject *decimal;
+    Py_ssize_t decimal_length, new_length;
+    int res;
+
+    if (!PyArg_ParseTuple(args, "u#|s", &unicode, &length, &errors))
+        return NULL;
+
+    /* PyUnicode_EncodeDecimal() is not told how large the output buffer is,
+       so reserve room for the longest replacement a built-in error handler
+       produces for a single character:
+       len('&#1114111;') == len('\\U0010ffff') == 10.
+       NOTE(review): a custom error handler may still return a longer
+       replacement; the API offers no way to guard against that. */
+    if (length > PY_SSIZE_T_MAX / 10)
+        return PyErr_NoMemory();
+    decimal_length = length * 10;
+
+    /* The bytes object is allocated with one extra byte beyond
+       decimal_length, which holds the terminating NUL written by
+       PyUnicode_EncodeDecimal(). */
+    decimal = PyBytes_FromStringAndSize(NULL, decimal_length);
+    if (decimal == NULL)
+        return NULL;
+
+    res = PyUnicode_EncodeDecimal(unicode, length,
+                                  PyBytes_AS_STRING(decimal),
+                                  errors);
+    if (res < 0) {
+        Py_DECREF(decimal);
+        return NULL;
+    }
+
+    /* The output is NUL-terminated and never contains an embedded NUL
+       (U+0000 is treated as unencodable), so strlen() gives its length. */
+    new_length = strlen(PyBytes_AS_STRING(decimal));
+    assert(new_length <= decimal_length);
+
+    /* Only trim when there is something to trim.  For empty input the
+       zero-length bytes object may be a shared singleton, and
+       _PyBytes_Resize() requires an object nobody else references. */
+    if (new_length != decimal_length) {
+        res = _PyBytes_Resize(&decimal, new_length);
+        if (res < 0)
+            return NULL;  /* _PyBytes_Resize() already released decimal */
+    }
+
+    return decimal;
+}
+
+/* Test helper exposing PyUnicode_TransformDecimalToASCII() to Python.
+
+   Takes a single str argument and returns the object produced by
+   PyUnicode_TransformDecimalToASCII(), or NULL with an exception set. */
+static PyObject *
+unicode_transformdecimaltoascii(PyObject *self, PyObject *args)
+{
+    Py_UNICODE *unicode;
+    Py_ssize_t length;
+
+    /* Only "u#" is parsed: the previous "u#|s" format declared an optional
+       string argument without supplying a pointer to store it in, so a
+       second argument made PyArg_ParseTuple() write through garbage. */
+    if (!PyArg_ParseTuple(args, "u#", &unicode, &length))
+        return NULL;
+    return PyUnicode_TransformDecimalToASCII(unicode, length);
+}
+
 static PyObject *
 getargs_w_star(PyObject *self, PyObject *args)
 {
@ -2384,8 +2429,10 @@ static PyMethodDef TestMethods[] = {
    {"test_u_code",             (PyCFunction)test_u_code,        METH_NOARGS},
    {"test_Z_code",             (PyCFunction)test_Z_code,        METH_NOARGS},
    {"test_widechar",           (PyCFunction)test_widechar,      METH_NOARGS},
-    {"unicode_aswidechar",      unicode_aswidechar,                 METH_VARARGS},
-    {"unicode_aswidecharstring",unicode_aswidecharstring,           METH_VARARGS},
+    {"unicode_aswidechar",      unicode_aswidechar,              METH_VARARGS},
+    {"unicode_aswidecharstring",unicode_aswidecharstring,        METH_VARARGS},
+    {"unicode_encodedecimal",   unicode_encodedecimal,           METH_VARARGS},
+    {"unicode_transformdecimaltoascii", unicode_transformdecimaltoascii, METH_VARARGS},
 #ifdef WITH_THREAD
    {"_test_thread_state",      test_thread_state,               METH_VARARGS},
    {"_pending_threadfunc",     pending_threadfunc,              METH_VARARGS},
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -8829,7 +8829,6 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
                        char *output,
                        const char *errors)
 {
-    Py_UNICODE *p, *end;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *unicode;
@ -8838,47 +8837,50 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;
+    Py_ssize_t i, j;
+    enum PyUnicode_Kind kind;
+    void *data;

    if (output == NULL) {
        PyErr_BadArgument();
        return -1;
    }

-    p = s;
-    end = s + length;
-    while (p < end) {
-        register Py_UNICODE ch = *p;
+    unicode = PyUnicode_FromUnicode(s, length);
+    if (unicode == NULL)
+        return -1;
+
+    if (PyUnicode_READY(unicode) < 0)
+        goto onError;
+    kind = PyUnicode_KIND(unicode);
+    data = PyUnicode_DATA(unicode);
+
+    for (i=0; i < length; i++) {
+        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        int decimal;
-        PyObject *repunicode;
-        Py_ssize_t repsize;
-        Py_ssize_t newpos;
-        Py_UNICODE *uni2;
-        Py_UNICODE *collstart;
-        Py_UNICODE *collend;
+        Py_ssize_t startpos, endpos;

        if (Py_UNICODE_ISSPACE(ch)) {
            *output++ = ' ';
-            ++p;
            continue;
        }
        decimal = Py_UNICODE_TODECIMAL(ch);
        if (decimal >= 0) {
            *output++ = '0' + decimal;
-            ++p;
            continue;
        }
        if (0 < ch && ch < 256) {
            *output++ = (char)ch;
-            ++p;
            continue;
        }
        /* All other characters are considered unencodable */
-        collstart = p;
-        collend = p+1;
-        while (collend < end) {
-            if ((0 < *collend && *collend < 256) ||
-                !Py_UNICODE_ISSPACE(*collend) ||
-                Py_UNICODE_TODECIMAL(*collend))
+        startpos = i;
+        endpos = i+1;
+        for (; endpos < length; endpos++) {
+            ch = PyUnicode_READ(kind, data, endpos);
+            if ((0 < ch && ch < 256) ||
+                !Py_UNICODE_ISSPACE(ch) ||
+                Py_UNICODE_TODECIMAL(ch))
                break;
        }
        /* cache callback name lookup
@ -8897,33 +8899,33 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
        }
        switch (known_errorHandler) {
        case 1: /* strict */
-            unicode = PyUnicode_FromUnicode(s, length);
-            if (unicode == NULL)
-                goto onError;
-            raise_encode_exception(&exc, encoding, unicode, collstart-s, collend-s, reason);
-            Py_DECREF(unicode);
+            raise_encode_exception(&exc, encoding, unicode, startpos, endpos, reason);
            goto onError;
        case 2: /* replace */
-            for (p = collstart; p < collend; ++p)
+            for (j=startpos; j < endpos; j++)
                *output++ = '?';
            /* fall through */
        case 3: /* ignore */
-            p = collend;
+            i = endpos;
            break;
        case 4: /* xmlcharrefreplace */
-            /* generate replacement (temporarily (mis)uses p) */
-            for (p = collstart; p < collend; ++p)
-                output += sprintf(output, "&#%d;", (int)*p);
-            p = collend;
+            /* generate replacement */
+            for (j=startpos; j < endpos; j++) {
+                ch = PyUnicode_READ(kind, data, i);
+                output += sprintf(output, "&#%d;", (int)ch);
+                i++;
+            }
            break;
        default:
-            unicode = PyUnicode_FromUnicode(s, length);
-            if (unicode == NULL)
-                goto onError;
+        {
+            PyObject *repunicode;
+            Py_ssize_t repsize, newpos, k;
+            enum PyUnicode_Kind repkind;
+            void *repdata;
+
            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
                                                          encoding, reason, unicode, &exc,
-                                                          collstart-s, collend-s, &newpos);
-            Py_DECREF(unicode);
+                                                          startpos, endpos, &newpos);
            if (repunicode == NULL)
                goto onError;
            if (!PyUnicode_Check(repunicode)) {
@ -8932,10 +8934,17 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
                Py_DECREF(repunicode);
                goto onError;
            }
+            if (PyUnicode_READY(repunicode) < 0) {
+                Py_DECREF(repunicode);
+                goto onError;
+            }
+            repkind = PyUnicode_KIND(repunicode);
+            repdata = PyUnicode_DATA(repunicode);
+
            /* generate replacement  */
            repsize = PyUnicode_GET_SIZE(repunicode);
-            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
-                Py_UNICODE ch = *uni2;
+            for (k=0; k<repsize; k++) {
+                ch = PyUnicode_READ(repkind, repdata, k);
                if (Py_UNICODE_ISSPACE(ch))
                    *output++ = ' ';
                else {
@ -8946,29 +8955,29 @@ PyUnicode_EncodeDecimal(Py_UNICODE *s,
                        *output++ = (char)ch;
                    else {
                        Py_DECREF(repunicode);
-                        unicode = PyUnicode_FromUnicode(s, length);
-                        if (unicode == NULL)
-                            goto onError;
                        raise_encode_exception(&exc, encoding,
-                                               unicode, collstart-s, collend-s, reason);
-                        Py_DECREF(unicode);
+                                               unicode, startpos, endpos,
+                                               reason);
                        goto onError;
                    }
                }
            }
-            p = s + newpos;
+            i = newpos;
            Py_DECREF(repunicode);
        }
+        }
    }
    /* 0-terminate the output string */
    *output++ = '\0';
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
+    Py_DECREF(unicode);
    return 0;

  onError:
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
+    Py_DECREF(unicode);
    return -1;
 }