Issue #10557: Fixed error messages from float() and other numeric types.

Added a new API function, PyUnicode_TransformDecimalToASCII(), which transforms non-ASCII decimal digits in a Unicode string to their ASCII equivalents.
2010-12-04 03:38:46 +00:00 · 2010-12-04 03:38:46 +00:00 · 942af5a9a4
parent 36526bf3d9
commit 942af5a9a4
11 changed files with 169 additions and 52 deletions
--- a/Doc/c-api/unicode.rst
+++ b/Doc/c-api/unicode.rst
@ -328,6 +328,13 @@ APIs:
   Identical to :c:func:`PyUnicode_FromFormat` except that it takes exactly two
   arguments.
 .. c:function:: PyObject* PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, Py_ssize_t size)
   Create a Unicode object by replacing all decimal digits in the
   :c:type:`Py_UNICODE` buffer *s* of the given *size* by ASCII digits 0--9
   according to their decimal value.  Return *NULL* if an exception
   occurs.
 .. c:function:: Py_UNICODE* PyUnicode_AsUnicode(PyObject *unicode)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -1225,6 +1225,17 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
    );
 #endif
 /* Transforms code points that have decimal digit property to the
   corresponding ASCII digit code points.
   Returns a new Unicode string on success, NULL on failure.
 */
 PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
    Py_UNICODE *s,              /* Unicode buffer */
    Py_ssize_t length           /* Number of Py_UNICODE chars to transform */
    );
 /* --- File system encoding ---------------------------------------------- */
 /* ParseTuple converter: encode str objects to bytes using
--- a/Lib/test/test_complex.py
+++ b/Lib/test/test_complex.py
@ -220,6 +220,7 @@ class ComplexTest(unittest.TestCase):
        self.assertEqual(complex(NS(1+10j)), 1+10j)
        self.assertRaises(TypeError, complex, OS(None))
        self.assertRaises(TypeError, complex, NS(None))
        self.assertRaises(TypeError, complex, {})
        self.assertAlmostEqual(complex("1+10j"), 1+10j)
        self.assertAlmostEqual(complex(10), 10+0j)
@ -325,6 +326,8 @@ class ComplexTest(unittest.TestCase):
        # check that complex accepts long unicode strings
        self.assertEqual(type(complex("1"*500)), complex)
        # check whitespace processing
        self.assertEqual(complex('\N{EM SPACE}(\N{EN SPACE}1+1j ) '), 1+1j)
        class EvilExc(Exception):
            pass
--- a/Lib/test/test_float.py
+++ b/Lib/test/test_float.py
@ -43,14 +43,30 @@ class GeneralFloatCases(unittest.TestCase):
        self.assertRaises(ValueError, float, "+.inf")
        self.assertRaises(ValueError, float, ".")
        self.assertRaises(ValueError, float, "-.")
        self.assertRaises(ValueError, float, b"-")
        self.assertRaises(TypeError, float, {})
        # Lone surrogate
        self.assertRaises(UnicodeEncodeError, float, '\uD8F0')
        # check that we don't accept alternate exponent markers
        self.assertRaises(ValueError, float, "-1.7d29")
        self.assertRaises(ValueError, float, "3D-14")
-        self.assertEqual(float(b"  \u0663.\u0661\u0664  ".decode('raw-unicode-escape')), 3.14)
+        self.assertEqual(float("  \u0663.\u0661\u0664  "), 3.14)
        self.assertEqual(float("\N{EM SPACE}3.14\N{EN SPACE}"), 3.14)
        # extra long strings should not be a problem
        float(b'.' + b'1'*1000)
        float('.' + '1'*1000)
    def test_error_message(self):
        testlist = ('\xbd', '123\xbd', '  123 456  ')
        for s in testlist:
            try:
                float(s)
            except ValueError as e:
                self.assertIn(s.strip(), e.args[0])
            else:
                self.fail("Expected int(%r) to raise a ValueError", s)
    @support.run_with_locale('LC_NUMERIC', 'fr_FR', 'de_DE')
    def test_float_with_comma(self):
        # set locale to something that doesn't use '.' for the decimal point
--- a/Lib/test/test_int.py
+++ b/Lib/test/test_int.py
@ -20,7 +20,8 @@ L = [
        ('  1\02  ', ValueError),
        ('', ValueError),
        (' ', ValueError),
-        ('  \t\t  ', ValueError)
+        ('  \t\t  ', ValueError),
        ("\u0200", ValueError)
 ]
 class IntTestCases(unittest.TestCase):
@ -35,6 +36,8 @@ class IntTestCases(unittest.TestCase):
        self.assertEqual(int(3.5), 3)
        self.assertEqual(int(-3.5), -3)
        self.assertEqual(int("-3"), -3)
        self.assertEqual(int(" -3 "), -3)
        self.assertEqual(int("\N{EM SPACE}-3\N{EN SPACE}"), -3)
        # Different base:
        self.assertEqual(int("10",16), 16)
        # Test conversion from strings and various anomalies
@ -302,6 +305,16 @@ class IntTestCases(unittest.TestCase):
                    self.fail("Failed to raise TypeError with %s" %
                              ((base, trunc_result_base),))
    def test_error_message(self):
        testlist = ('\xbd', '123\xbd', '  123 456  ')
        for s in testlist:
            try:
                int(s)
            except ValueError as e:
                self.assertIn(s.strip(), e.args[0])
            else:
                self.fail("Expected int(%r) to raise a ValueError", s)
 def test_main():
    run_unittest(IntTestCases)
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -1168,8 +1168,13 @@ class UnicodeTest(string_tests.CommonTest,
        # Error handling (wrong arguments)
        self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
-        # Error handling (PyUnicode_EncodeDecimal())
+        # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
-        self.assertRaises(UnicodeError, int, "\u0200")
+        self.assertRaises(UnicodeError, int, "\ud800")
        self.assertRaises(UnicodeError, int, "\udf00")
        self.assertRaises(UnicodeError, float, "\ud800")
        self.assertRaises(UnicodeError, float, "\udf00")
        self.assertRaises(UnicodeError, complex, "\ud800")
        self.assertRaises(UnicodeError, complex, "\udf00")
    def test_codecs(self):
        # Encoding
--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -222,6 +222,10 @@ Library
 C-API
 -----
 - Issue #10557: Added a new API function, PyUnicode_TransformDecimalToASCII(),
  which transforms non-ASCII decimal digits in a Unicode string to their
  ASCII equivalents. 
 - Issue #9518: Extend the PyModuleDef_HEAD_INIT macro to explicitly
  zero-initialize all fields, fixing compiler warnings seen when building
  extension modules with gcc with "-Wmissing-field-initializers" (implied by
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@ -766,20 +766,26 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
    char *end;
    double x=0.0, y=0.0, z;
    int got_bracket=0;
-    char *s_buffer = NULL;
+    PyObject *s_buffer = NULL;
    Py_ssize_t len;
    if (PyUnicode_Check(v)) {
-        s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v) + 1);
+        Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v);
        Py_UNICODE *bufptr;
        s_buffer = PyUnicode_TransformDecimalToASCII(
            PyUnicode_AS_UNICODE(v), buflen);
        if (s_buffer == NULL)
-            return PyErr_NoMemory();
+            return NULL;
-        if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v),
+        /* Replace non-ASCII whitespace with ' ' */
-                                    PyUnicode_GET_SIZE(v),
+        bufptr = PyUnicode_AS_UNICODE(s_buffer);
-                                    s_buffer,
+        for (i = 0; i < buflen; i++) {
-                                    NULL))
+            Py_UNICODE ch = bufptr[i];
            if (ch > 127 && Py_UNICODE_ISSPACE(ch))
                bufptr[i] = ' ';
        }
        s = _PyUnicode_AsStringAndSize(s_buffer, &len);
        if (s == NULL)
            goto error;
        s = s_buffer;
        len = strlen(s);
    }
    else if (PyObject_AsCharBuffer(v, &s, &len)) {
        PyErr_SetString(PyExc_TypeError,
@ -894,16 +900,14 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
    if (s-start != len)
        goto parse_error;
-    if (s_buffer)
+    Py_XDECREF(s_buffer);
        PyMem_FREE(s_buffer);
    return complex_subtype_from_doubles(type, x, y);
  parse_error:
    PyErr_SetString(PyExc_ValueError,
                    "complex() arg is a malformed string");
  error:
-    if (s_buffer)
+    Py_XDECREF(s_buffer);
        PyMem_FREE(s_buffer);
    return NULL;
 }
--- a/Objects/floatobject.c
+++ b/Objects/floatobject.c
@ -174,22 +174,30 @@ PyFloat_FromString(PyObject *v)
 {
    const char *s, *last, *end;
    double x;
-    char buffer[256]; /* for errors */
+    PyObject *s_buffer = NULL;
    char *s_buffer = NULL;
    Py_ssize_t len;
    PyObject *result = NULL;
    if (PyUnicode_Check(v)) {
-        s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v)+1);
+        Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v);
        Py_UNICODE *bufptr;
        s_buffer = PyUnicode_TransformDecimalToASCII(
            PyUnicode_AS_UNICODE(v), buflen);
        if (s_buffer == NULL)
-            return PyErr_NoMemory();
+            return NULL;
-        if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v),
+        /* Replace non-ASCII whitespace with ' ' */
-                                    PyUnicode_GET_SIZE(v),
+        bufptr = PyUnicode_AS_UNICODE(s_buffer);
-                                    s_buffer,
+        for (i = 0; i < buflen; i++) {
-                                    NULL))
+            Py_UNICODE ch = bufptr[i];
-            goto error;
+            if (ch > 127 && Py_UNICODE_ISSPACE(ch))
-        s = s_buffer;
+                bufptr[i] = ' ';
-        len = strlen(s);
+        }
        s = _PyUnicode_AsStringAndSize(s_buffer, &len);
        if (s == NULL) {
            Py_DECREF(s_buffer);
            return NULL;
        }
        last = s + len;
    }
    else if (PyObject_AsCharBuffer(v, &s, &len)) {
        PyErr_SetString(PyExc_TypeError,
@ -197,29 +205,27 @@ PyFloat_FromString(PyObject *v)
        return NULL;
    }
    last = s + len;
-
+    /* strip space */
-    while (Py_ISSPACE(*s))
+    while (s < last && Py_ISSPACE(*s))
        s++;
    while (s < last - 1 && Py_ISSPACE(last[-1]))
        last--;
    /* We don't care about overflow or underflow.  If the platform
     * supports them, infinities and signed zeroes (on underflow) are
     * fine. */
    x = PyOS_string_to_double(s, (char **)&end, NULL);
-    if (x == -1.0 && PyErr_Occurred())
+    if (end != last) {
-        goto error;
+        PyErr_Format(PyExc_ValueError,
-    while (Py_ISSPACE(*end))
+                     "could not convert string to float: "
-        end++;
+                     "%R", v);
    if (end == last)
        result = PyFloat_FromDouble(x);
    else {
        PyOS_snprintf(buffer, sizeof(buffer),
                      "invalid literal for float(): %.200s", s);
        PyErr_SetString(PyExc_ValueError, buffer);
        result = NULL;
    }
    else if (x == -1.0 && PyErr_Occurred())
        result = NULL;
    else
        result = PyFloat_FromDouble(x);
-  error:
+    Py_XDECREF(s_buffer);
    if (s_buffer)
        PyMem_FREE(s_buffer);
    return result;
 }
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@ -2133,17 +2133,34 @@ PyObject *
 PyLong_FromUnicode(Py_UNICODE *u, Py_ssize_t length, int base)
 {
    PyObject *result;
-    char *buffer = (char *)PyMem_MALLOC(length+1);
+    PyObject *asciidig;
    char *buffer, *end;
    Py_ssize_t i, buflen;
    Py_UNICODE *ptr;
-    if (buffer == NULL)
+    asciidig = PyUnicode_TransformDecimalToASCII(u, length);
    if (asciidig == NULL)
        return NULL;
-
+    /* Replace non-ASCII whitespace with ' ' */
-    if (PyUnicode_EncodeDecimal(u, length, buffer, NULL)) {
+    ptr = PyUnicode_AS_UNICODE(asciidig);
-        PyMem_FREE(buffer);
+    for (i = 0; i < length; i++) {
      Py_UNICODE ch = ptr[i];
      if (ch > 127 && Py_UNICODE_ISSPACE(ch))
        ptr[i] = ' ';
    }
    buffer = _PyUnicode_AsStringAndSize(asciidig, &buflen);
    if (buffer == NULL) {
        Py_DECREF(asciidig);
        return NULL;
    }
-    result = PyLong_FromString(buffer, NULL, base);
+    result = PyLong_FromString(buffer, &end, base);
-    PyMem_FREE(buffer);
+    if (result != NULL && end != buffer + buflen) {
        PyErr_SetString(PyExc_ValueError,
                        "null byte in argument for int()");
        Py_DECREF(result);
        result = NULL;
    }
    Py_DECREF(asciidig);
    return result;
 }
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -6206,6 +6206,30 @@ PyObject *PyUnicode_Translate(PyObject *str,
    return NULL;
 }
 /* Return a new Unicode object holding a copy of the `length` Py_UNICODE
    characters at `s`, in which every non-ASCII character that has a decimal
    digit value (Py_UNICODE_TODECIMAL(ch) >= 0) is replaced by the matching
    ASCII digit '0'..'9'.  All other characters are copied unchanged.

    Returns NULL if the result object cannot be allocated. */
 PyObject *
 PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
                                  Py_ssize_t length)
 {
    PyObject *result;
    Py_UNICODE *p; /* write pointer into result */
    Py_ssize_t i;
    /* Allocate the result; this must be checked before its buffer is
       touched, otherwise a failed allocation is dereferenced below. */
    result = (PyObject *)_PyUnicode_New(length);
    if (result == NULL)
        return NULL;
    /* Copy to the new string */
    p = PyUnicode_AS_UNICODE(result);
    Py_UNICODE_COPY(p, s, length);
    /* Iterate over code points */
    for (i = 0; i < length; i++) {
        Py_UNICODE ch = s[i];
        /* ASCII characters are never rewritten, so skip the lookup. */
        if (ch > 127) {
            int decimal = Py_UNICODE_TODECIMAL(ch);
            if (decimal >= 0)
                p[i] = '0' + decimal;
        }
    }
    return result;
 }
 /* --- Decimal Encoder ---------------------------------------------------- */
 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
@ -8967,6 +8991,13 @@ unicode_freelistsize(PyUnicodeObject *self)
 {
    return PyLong_FromLong(numfree);
 }
 /* Debugging helper: run PyUnicode_TransformDecimalToASCII() over the
    characters of `self` and return whatever it returns. */
 static PyObject *
 unicode__decimal2ascii(PyObject *self)
 {
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
 }
 #endif
 PyDoc_STRVAR(startswith__doc__,
@ -9108,7 +9139,6 @@ unicode_getnewargs(PyUnicodeObject *v)
    return Py_BuildValue("(u#)", v->str, v->length);
 }
 static PyMethodDef unicode_methods[] = {
    /* Order is according to common usage: often used methods should
@ -9170,8 +9200,9 @@ static PyMethodDef unicode_methods[] = {
 #endif
 #if 0
-    /* This one is just used for debugging the implementation. */
+    /* These methods are just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
 #endif
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},