bpo-31979: Simplify transforming decimals to ASCII (#4336)

in int(), float() and complex() parsers. This also speeds up parsing non-ASCII numbers by around 20%.
2017-11-13 21:23:48 +02:00 · 2017-11-13 21:23:48 +02:00 · 9b6c60cbce
parent ce12629c84
commit 9b6c60cbce
7 changed files with 63 additions and 139 deletions
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -1723,6 +1723,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
 #endif /* MS_WINDOWS */
 #ifndef Py_LIMITED_API
 /* --- Decimal Encoder ---------------------------------------------------- */
 /* Takes a Unicode string holding a decimal value and writes it into
@ -1747,14 +1748,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
 */
 #ifndef Py_LIMITED_API
 PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
    Py_UNICODE *s,              /* Unicode buffer */
    Py_ssize_t length,          /* Number of Py_UNICODE chars to encode */
    char *output,               /* Output buffer; must have size >= length */
    const char *errors          /* error handling */
    ) /* Py_DEPRECATED(3.3) */;
 #endif
 /* Transforms code points that have decimal digit property to the
   corresponding ASCII digit code points.
@ -1762,19 +1761,18 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
   Returns a new Unicode string on success, NULL on failure.
 */
 #ifndef Py_LIMITED_API
 PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
    Py_UNICODE *s,              /* Unicode buffer */
    Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
    ) /* Py_DEPRECATED(3.3) */;
 #endif
-/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
+/* Converts a Unicode object holding a decimal value to an ASCII string
-   as argument instead of a raw buffer and length.  This function additionally
+   for use in int, float and complex parsers.
-   transforms spaces to ASCII because this is what the callers in longobject,
+   Transforms code points that have decimal digit property to the
-   floatobject, and complexobject did anyways. */
+   corresponding ASCII digit code points.  Transforms spaces to ASCII.
   Transforms code points starting from the first non-ASCII code point that
   is neither a decimal digit nor a space to the end into '?'. */
 #ifndef Py_LIMITED_API
 PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
    PyObject *unicode           /* Unicode object */
    );
--- a/Lib/test/test_float.py
+++ b/Lib/test/test_float.py
@ -51,7 +51,7 @@ class GeneralFloatCases(unittest.TestCase):
        self.assertRaises(TypeError, float, {})
        self.assertRaisesRegex(TypeError, "not 'dict'", float, {})
        # Lone surrogate
-        self.assertRaises(UnicodeEncodeError, float, '\uD8F0')
+        self.assertRaises(ValueError, float, '\uD8F0')
        # check that we don't accept alternate exponent markers
        self.assertRaises(ValueError, float, "-1.7d29")
        self.assertRaises(ValueError, float, "3D-14")
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -2068,11 +2068,14 @@ class UnicodeTest(string_tests.CommonTest,
        # Error handling (wrong arguments)
        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
-        # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
+        # Error handling (lone surrogate in
-        self.assertRaises(UnicodeError, float, "\ud800")
+        # _PyUnicode_TransformDecimalAndSpaceToASCII())
-        self.assertRaises(UnicodeError, float, "\udf00")
+        self.assertRaises(ValueError, int, "\ud800")
-        self.assertRaises(UnicodeError, complex, "\ud800")
+        self.assertRaises(ValueError, int, "\udf00")
-        self.assertRaises(UnicodeError, complex, "\udf00")
+        self.assertRaises(ValueError, float, "\ud800")
        self.assertRaises(ValueError, float, "\udf00")
        self.assertRaises(ValueError, complex, "\ud800")
        self.assertRaises(ValueError, complex, "\udf00")
    def test_codecs(self):
        # Encoding
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@ -914,10 +914,10 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
        if (s_buffer == NULL) {
            return NULL;
        }
        assert(PyUnicode_IS_ASCII(s_buffer));
        /* Simply get a pointer to existing ASCII characters. */
        s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
-        if (s == NULL) {
+        assert(s != NULL);
            goto exit;
        }
    }
    else {
        PyErr_Format(PyExc_TypeError,
@ -928,7 +928,6 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
    result = _Py_string_to_number_with_underscores(s, len, "complex", v, type,
                                                   complex_from_string_inner);
  exit:
    Py_DECREF(s_buffer);
    return result;
 }
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@ -176,11 +176,10 @@ PyFloat_FromString(PyObject *v)
        s_buffer = _PyUnicode_TransformDecimalAndSpaceToASCII(v);
        if (s_buffer == NULL)
            return NULL;
        assert(PyUnicode_IS_ASCII(s_buffer));
        /* Simply get a pointer to existing ASCII characters. */
        s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
-        if (s == NULL) {
+        assert(s != NULL);
            Py_DECREF(s_buffer);
            return NULL;
        }
    }
    else if (PyBytes_Check(v)) {
        s = PyBytes_AS_STRING(v);
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@ -2509,21 +2509,18 @@ PyLong_FromUnicodeObject(PyObject *u, int base)
    asciidig = _PyUnicode_TransformDecimalAndSpaceToASCII(u);
    if (asciidig == NULL)
        return NULL;
    assert(PyUnicode_IS_ASCII(asciidig));
    /* Simply get a pointer to existing ASCII characters. */
    buffer = PyUnicode_AsUTF8AndSize(asciidig, &buflen);
-    if (buffer == NULL) {
+    assert(buffer != NULL);
    result = PyLong_FromString(buffer, &end, base);
    if (end == NULL || (result != NULL && end == buffer + buflen)) {
        Py_DECREF(asciidig);
-        if (!PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
+        return result;
            return NULL;
    }
    else {
        result = PyLong_FromString(buffer, &end, base);
        if (end == NULL || (result != NULL && end == buffer + buflen)) {
            Py_DECREF(asciidig);
            return result;
        }
        Py_DECREF(asciidig);
        Py_XDECREF(result);
    }
    Py_DECREF(asciidig);
    Py_XDECREF(result);
    PyErr_Format(PyExc_ValueError,
                 "invalid literal for int() with base %d: %.200R",
                 base, u);
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)
 /* --- Unicode Object ----------------------------------------------------- */
 static PyObject *
 fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
 static inline Py_ssize_t
 findchar(const void *s, int kind,
         Py_ssize_t size, Py_UCS4 ch,
@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,
    return _PyUnicode_TranslateCharmap(str, mapping, errors);
 }
 /* Fix-up callback: rewrites, in place, every code point above 127 in `self`
    that is either whitespace (replaced by ' ') or has a decimal digit value
    (replaced by the ASCII digit '0' + value).  Code points <= 127 and
    non-ASCII code points that are neither spaces nor decimal digits are
    left untouched.

    Returns 0 when nothing was rewritten; otherwise returns the largest code
    point seen among the non-ASCII positions after rewriting, with a floor
    of 127. */
 static Py_UCS4
 fix_decimal_and_space_to_ascii(PyObject *self)
 {
    /* No need to call PyUnicode_READY(self) because this function is only
       called as a callback from fixup() which does it already. */
    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    const int kind = PyUnicode_KIND(self);
    void *data = PyUnicode_DATA(self);
    /* maxchar starts at 127, so the returned maximum is never below the
       ASCII range even if every non-ASCII code point gets replaced. */
    Py_UCS4 maxchar = 127, ch, fixed;
    int modified = 0;
    Py_ssize_t i;
    for (i = 0; i < len; ++i) {
        ch = PyUnicode_READ(kind, data, i);
        /* fixed == 0 means "no replacement for this code point". */
        fixed = 0;
        if (ch > 127) {
            /* The space test takes priority over the decimal lookup. */
            if (Py_UNICODE_ISSPACE(ch))
                fixed = ' ';
            else {
                const int decimal = Py_UNICODE_TODECIMAL(ch);
                /* A negative value means ch has no decimal digit value. */
                if (decimal >= 0)
                    fixed = '0' + decimal;
            }
            if (fixed != 0) {
                modified = 1;
                maxchar = Py_MAX(maxchar, fixed);
                /* Overwrite the code point in the string's own buffer. */
                PyUnicode_WRITE(kind, data, i, fixed);
            }
            else
                /* Unreplaced non-ASCII code point: it still counts toward
                   the maximum reported to the caller. */
                maxchar = Py_MAX(maxchar, ch);
        }
    }
    /* 0 signals "unchanged" to the caller. */
    return (modified) ? maxchar : 0;
 }
 PyObject *
 _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
 {
@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
    }
    if (PyUnicode_READY(unicode) == -1)
        return NULL;
-    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
+    if (PyUnicode_IS_ASCII(unicode)) {
        /* If the string is already ASCII, just return the same string */
        Py_INCREF(unicode);
        return unicode;
    }
-    return fixup(unicode, fix_decimal_and_space_to_ascii);
+
    Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
    PyObject *result = PyUnicode_New(len, 127);
    if (result == NULL) {
        return NULL;
    }
    Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
    int kind = PyUnicode_KIND(unicode);
    const void *data = PyUnicode_DATA(unicode);
    Py_ssize_t i;
    for (i = 0; i < len; ++i) {
        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
        if (ch < 127) {
            out[i] = ch;
        }
        else if (Py_UNICODE_ISSPACE(ch)) {
            out[i] = ' ';
        }
        else {
            int decimal = Py_UNICODE_TODECIMAL(ch);
            if (decimal < 0) {
                out[i] = '?';
                _PyUnicode_LENGTH(result) = i + 1;
                break;
            }
            out[i] = '0' + decimal;
        }
    }
    return result;
 }
 PyObject *
@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,
    return tailmatch(str, substr, start, end, direction);
 }
 /* Apply fixfct filter to the Unicode object self and return a
    reference to the modified object.

    fixfct is run on a copy of self, never on self directly.  It must return
    0 if it changed nothing, or otherwise the maximum character of the
    string it produced.

    Returns a new reference on success, or NULL if copying self or
    allocating the converted string fails. */
 static PyObject *
 fixup(PyObject *self,
       Py_UCS4 (*fixfct)(PyObject *s))
 {
    PyObject *u;
    Py_UCS4 maxchar_old, maxchar_new = 0;
    PyObject *v;
    /* Work on a copy so that self is never mutated. */
    u = _PyUnicode_Copy(self);
    if (u == NULL)
        return NULL;
    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
    /* fix functions return the new maximum character in a string,
       if the kind of the resulting unicode object does not change,
       everything is fine.  Otherwise we need to change the string kind
       and re-run the fix function. */
    maxchar_new = fixfct(u);
    if (maxchar_new == 0) {
        /* no changes */;
        if (PyUnicode_CheckExact(self)) {
            /* Nothing changed and self passes the exact-type check: drop
               the copy and hand back self with a new reference. */
            Py_DECREF(u);
            Py_INCREF(self);
            return self;
        }
        else
            /* Otherwise return the (unmodified) copy instead of self. */
            return u;
    }
    /* NOTE(review): align_maxchar() is defined elsewhere; presumably it
       rounds the value up to the limit of its storage kind so that it can be
       compared with PyUnicode_MAX_CHAR_VALUE() -- confirm. */
    maxchar_new = align_maxchar(maxchar_new);
    if (maxchar_new == maxchar_old)
        /* Same maximum as before: the fixed copy can be returned as is. */
        return u;
    /* In case the maximum character changed, we need to
       convert the string to the new category. */
    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
    if (v == NULL) {
        Py_DECREF(u);
        return NULL;
    }
    if (maxchar_new > maxchar_old) {
        /* If the maxchar increased so that the kind changed, not all
           characters are representable anymore and we need to fix the
           string again. This only happens in very few cases. */
        /* Note: the characters are taken from the original self here, not
           from the already-fixed copy u, and fixfct is re-run on v. */
        _PyUnicode_FastCopyCharacters(v, 0,
                                      self, 0, PyUnicode_GET_LENGTH(self));
        maxchar_old = fixfct(v);
        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
    }
    else {
        /* The maximum shrank: copy the already-fixed characters from u
           into the newly allocated string. */
        _PyUnicode_FastCopyCharacters(v, 0,
                                      u, 0, PyUnicode_GET_LENGTH(self));
    }
    /* The intermediate copy is no longer needed on either path. */
    Py_DECREF(u);
    assert(_PyUnicode_CheckConsistency(v, 1));
    return v;
 }
 static PyObject *
 ascii_upper_or_lower(PyObject *self, int lower)
 {