diff --git a/Doc/c-api/unicode.rst b/Doc/c-api/unicode.rst index 45304222e9e..9edbcbb8c7c 100644 --- a/Doc/c-api/unicode.rst +++ b/Doc/c-api/unicode.rst @@ -328,6 +328,13 @@ APIs: Identical to :c:func:`PyUnicode_FromFormat` except that it takes exactly two arguments. +.. c:function:: PyObject* PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, Py_ssize_t size) + + Create a Unicode object by replacing all decimal digits in + :c:type:`Py_UNICODE` buffer of the given size by ASCII digits 0--9 + according to their decimal value. Return *NULL* if an exception + occurs. + .. c:function:: Py_UNICODE* PyUnicode_AsUnicode(PyObject *unicode) diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 116bb8258fa..abd286db4f7 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1225,6 +1225,17 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal( ); #endif +/* Transforms code points that have decimal digit property to the + corresponding ASCII digit code points. + + Returns a new Unicode string on success, NULL on failure. +*/ + +PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( + Py_UNICODE *s, /* Unicode buffer */ + Py_ssize_t length /* Number of Py_UNICODE chars to transform */ + ); + /* --- File system encoding ---------------------------------------------- */ /* ParseTuple converter: encode str objects to bytes using diff --git a/Lib/test/test_complex.py b/Lib/test/test_complex.py index cc21aa7a622..2352ef1e2c7 100644 --- a/Lib/test/test_complex.py +++ b/Lib/test/test_complex.py @@ -220,6 +220,7 @@ class ComplexTest(unittest.TestCase): self.assertEqual(complex(NS(1+10j)), 1+10j) self.assertRaises(TypeError, complex, OS(None)) self.assertRaises(TypeError, complex, NS(None)) + self.assertRaises(TypeError, complex, {}) self.assertAlmostEqual(complex("1+10j"), 1+10j) self.assertAlmostEqual(complex(10), 10+0j) @@ -325,6 +326,8 @@ class ComplexTest(unittest.TestCase): # check that complex accepts long unicode strings self.assertEqual(type(complex("1"*500)), complex) + # check whitespace processing + self.assertEqual(complex('\N{EM SPACE}(\N{EN SPACE}1+1j ) '), 1+1j) class EvilExc(Exception): pass diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py index 0072133aae3..9bcd63dd230 100644 --- a/Lib/test/test_float.py +++ b/Lib/test/test_float.py @@ -43,14 +43,30 @@ class GeneralFloatCases(unittest.TestCase): self.assertRaises(ValueError, float, "+.inf") self.assertRaises(ValueError, float, ".") self.assertRaises(ValueError, float, "-.") + self.assertRaises(ValueError, float, b"-") + self.assertRaises(TypeError, float, {}) + # Lone surrogate + self.assertRaises(UnicodeEncodeError, float, '\uD8F0') # check that we don't accept alternate exponent markers self.assertRaises(ValueError, float, "-1.7d29") self.assertRaises(ValueError, float, "3D-14") - self.assertEqual(float(b" \u0663.\u0661\u0664 ".decode('raw-unicode-escape')), 3.14) + self.assertEqual(float(" \u0663.\u0661\u0664 "), 3.14) + self.assertEqual(float("\N{EM SPACE}3.14\N{EN SPACE}"), 3.14) # extra long strings should not be a problem float(b'.' + b'1'*1000) float('.' + '1'*1000) + def test_error_message(self): + testlist = ('\xbd', '123\xbd', ' 123 456 ') + for s in testlist: + try: + float(s) + except ValueError as e: + self.assertIn(s.strip(), e.args[0]) + else: + self.fail("Expected int(%r) to raise a ValueError", s) + + @support.run_with_locale('LC_NUMERIC', 'fr_FR', 'de_DE') def test_float_with_comma(self): # set locale to something that doesn't use '.' for the decimal point diff --git a/Lib/test/test_int.py b/Lib/test/test_int.py index 86c4dd79156..437e323cbcc 100644 --- a/Lib/test/test_int.py +++ b/Lib/test/test_int.py @@ -20,7 +20,8 @@ L = [ (' 1\02 ', ValueError), ('', ValueError), (' ', ValueError), - (' \t\t ', ValueError) + (' \t\t ', ValueError), + ("\u0200", ValueError) ] class IntTestCases(unittest.TestCase): @@ -35,6 +36,8 @@ class IntTestCases(unittest.TestCase): self.assertEqual(int(3.5), 3) self.assertEqual(int(-3.5), -3) self.assertEqual(int("-3"), -3) + self.assertEqual(int(" -3 "), -3) + self.assertEqual(int("\N{EM SPACE}-3\N{EN SPACE}"), -3) # Different base: self.assertEqual(int("10",16), 16) # Test conversion from strings and various anomalies @@ -302,6 +305,16 @@ class IntTestCases(unittest.TestCase): self.fail("Failed to raise TypeError with %s" % ((base, trunc_result_base),)) + def test_error_message(self): + testlist = ('\xbd', '123\xbd', ' 123 456 ') + for s in testlist: + try: + int(s) + except ValueError as e: + self.assertIn(s.strip(), e.args[0]) + else: + self.fail("Expected int(%r) to raise a ValueError", s) + def test_main(): run_unittest(IntTestCases) diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index c5a0f803a0c..2de9e7f1c5e 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1168,8 +1168,13 @@ class UnicodeTest(string_tests.CommonTest, # Error handling (wrong arguments) self.assertRaises(TypeError, "hello".encode, 42, 42, 42) - # Error handling (PyUnicode_EncodeDecimal()) - self.assertRaises(UnicodeError, int, "\u0200") + # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII()) + self.assertRaises(UnicodeError, int, "\ud800") + self.assertRaises(UnicodeError, int, "\udf00") + self.assertRaises(UnicodeError, float, "\ud800") + self.assertRaises(UnicodeError, float, "\udf00") + self.assertRaises(UnicodeError, complex, "\ud800") + self.assertRaises(UnicodeError, complex, "\udf00") def test_codecs(self): # Encoding diff --git a/Misc/NEWS b/Misc/NEWS index 59946bdc289..f53a4868929 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -222,6 +222,10 @@ Library C-API ----- +- Issue #10557: Added a new API function, PyUnicode_TransformDecimalToASCII(), + which transforms non-ASCII decimal digits in a Unicode string to their + ASCII equivalents. + - Issue #9518: Extend the PyModuleDef_HEAD_INIT macro to explicitly zero-initialize all fields, fixing compiler warnings seen when building extension modules with gcc with "-Wmissing-field-initializers" (implied by diff --git a/Objects/complexobject.c b/Objects/complexobject.c index 59997962cef..ec529d5bbf9 100644 --- a/Objects/complexobject.c +++ b/Objects/complexobject.c @@ -766,20 +766,26 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v) char *end; double x=0.0, y=0.0, z; int got_bracket=0; - char *s_buffer = NULL; + PyObject *s_buffer = NULL; Py_ssize_t len; if (PyUnicode_Check(v)) { - s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v) + 1); + Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v); + Py_UNICODE *bufptr; + s_buffer = PyUnicode_TransformDecimalToASCII( + PyUnicode_AS_UNICODE(v), buflen); if (s_buffer == NULL) - return PyErr_NoMemory(); - if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v), - PyUnicode_GET_SIZE(v), - s_buffer, - NULL)) + return NULL; + /* Replace non-ASCII whitespace with ' ' */ + bufptr = PyUnicode_AS_UNICODE(s_buffer); + for (i = 0; i < buflen; i++) { + Py_UNICODE ch = bufptr[i]; + if (ch > 127 && Py_UNICODE_ISSPACE(ch)) + bufptr[i] = ' '; + } + s = _PyUnicode_AsStringAndSize(s_buffer, &len); + if (s == NULL) goto error; - s = s_buffer; - len = strlen(s); } else if (PyObject_AsCharBuffer(v, &s, &len)) { PyErr_SetString(PyExc_TypeError, @@ -894,16 +900,14 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v) if (s-start != len) goto parse_error; - if (s_buffer) - PyMem_FREE(s_buffer); + Py_XDECREF(s_buffer); return complex_subtype_from_doubles(type, x, y); parse_error: PyErr_SetString(PyExc_ValueError, "complex() arg is a malformed string"); error: - if (s_buffer) - PyMem_FREE(s_buffer); + Py_XDECREF(s_buffer); return NULL; } diff --git a/Objects/floatobject.c b/Objects/floatobject.c index 4decb0b6278..8409f0a13aa 100644 --- a/Objects/floatobject.c +++ b/Objects/floatobject.c @@ -174,22 +174,30 @@ PyFloat_FromString(PyObject *v) { const char *s, *last, *end; double x; - char buffer[256]; /* for errors */ - char *s_buffer = NULL; + PyObject *s_buffer = NULL; Py_ssize_t len; PyObject *result = NULL; if (PyUnicode_Check(v)) { - s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v)+1); + Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v); + Py_UNICODE *bufptr; + s_buffer = PyUnicode_TransformDecimalToASCII( + PyUnicode_AS_UNICODE(v), buflen); if (s_buffer == NULL) - return PyErr_NoMemory(); - if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v), - PyUnicode_GET_SIZE(v), - s_buffer, - NULL)) - goto error; - s = s_buffer; - len = strlen(s); + return NULL; + /* Replace non-ASCII whitespace with ' ' */ + bufptr = PyUnicode_AS_UNICODE(s_buffer); + for (i = 0; i < buflen; i++) { + Py_UNICODE ch = bufptr[i]; + if (ch > 127 && Py_UNICODE_ISSPACE(ch)) + bufptr[i] = ' '; + } + s = _PyUnicode_AsStringAndSize(s_buffer, &len); + if (s == NULL) { + Py_DECREF(s_buffer); + return NULL; + } + last = s + len; } else if (PyObject_AsCharBuffer(v, &s, &len)) { PyErr_SetString(PyExc_TypeError, @@ -197,29 +205,27 @@ PyFloat_FromString(PyObject *v) return NULL; } last = s + len; - - while (Py_ISSPACE(*s)) + /* strip space */ + while (s < last && Py_ISSPACE(*s)) s++; + while (s < last - 1 && Py_ISSPACE(last[-1])) + last--; /* We don't care about overflow or underflow. If the platform * supports them, infinities and signed zeroes (on underflow) are * fine. */ x = PyOS_string_to_double(s, (char **)&end, NULL); - if (x == -1.0 && PyErr_Occurred()) - goto error; - while (Py_ISSPACE(*end)) - end++; - if (end == last) - result = PyFloat_FromDouble(x); - else { - PyOS_snprintf(buffer, sizeof(buffer), - "invalid literal for float(): %.200s", s); - PyErr_SetString(PyExc_ValueError, buffer); + if (end != last) { + PyErr_Format(PyExc_ValueError, + "could not convert string to float: " + "%R", v); result = NULL; } + else if (x == -1.0 && PyErr_Occurred()) + result = NULL; + else + result = PyFloat_FromDouble(x); - error: - if (s_buffer) - PyMem_FREE(s_buffer); + Py_XDECREF(s_buffer); return result; } diff --git a/Objects/longobject.c b/Objects/longobject.c index e8a728489b6..534e52dfd38 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -2133,17 +2133,34 @@ PyObject * PyLong_FromUnicode(Py_UNICODE *u, Py_ssize_t length, int base) { PyObject *result; - char *buffer = (char *)PyMem_MALLOC(length+1); + PyObject *asciidig; + char *buffer, *end; + Py_ssize_t i, buflen; + Py_UNICODE *ptr; - if (buffer == NULL) + asciidig = PyUnicode_TransformDecimalToASCII(u, length); + if (asciidig == NULL) return NULL; - - if (PyUnicode_EncodeDecimal(u, length, buffer, NULL)) { - PyMem_FREE(buffer); + /* Replace non-ASCII whitespace with ' ' */ + ptr = PyUnicode_AS_UNICODE(asciidig); + for (i = 0; i < length; i++) { + Py_UNICODE ch = ptr[i]; + if (ch > 127 && Py_UNICODE_ISSPACE(ch)) + ptr[i] = ' '; + } + buffer = _PyUnicode_AsStringAndSize(asciidig, &buflen); + if (buffer == NULL) { + Py_DECREF(asciidig); return NULL; } - result = PyLong_FromString(buffer, NULL, base); - PyMem_FREE(buffer); + result = PyLong_FromString(buffer, &end, base); + if (result != NULL && end != buffer + buflen) { + PyErr_SetString(PyExc_ValueError, + "null byte in argument for int()"); + Py_DECREF(result); + result = NULL; + } + Py_DECREF(asciidig); return result; } diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index d3a2d1b715c..751da30e423 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -6206,6 +6206,30 @@ PyObject *PyUnicode_Translate(PyObject *str, return NULL; } +PyObject * +PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, + Py_ssize_t length) +{ + PyObject *result; + Py_UNICODE *p; /* write pointer into result */ + Py_ssize_t i; + /* Copy to a new string */ + result = (PyObject *)_PyUnicode_New(length); + Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length); + if (result == NULL) + return result; + p = PyUnicode_AS_UNICODE(result); + /* Iterate over code points */ + for (i = 0; i < length; i++) { + Py_UNICODE ch =s[i]; + if (ch > 127) { + int decimal = Py_UNICODE_TODECIMAL(ch); + if (decimal >= 0) + p[i] = '0' + decimal; + } + } + return result; +} /* --- Decimal Encoder ---------------------------------------------------- */ int PyUnicode_EncodeDecimal(Py_UNICODE *s, @@ -8967,6 +8991,13 @@ unicode_freelistsize(PyUnicodeObject *self) { return PyLong_FromLong(numfree); } + +static PyObject * +unicode__decimal2ascii(PyObject *self) +{ + return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self), + PyUnicode_GET_SIZE(self)); +} #endif PyDoc_STRVAR(startswith__doc__, @@ -9108,7 +9139,6 @@ unicode_getnewargs(PyUnicodeObject *v) return Py_BuildValue("(u#)", v->str, v->length); } - static PyMethodDef unicode_methods[] = { /* Order is according to common usage: often used methods should @@ -9170,8 +9200,9 @@ static PyMethodDef unicode_methods[] = { #endif #if 0 - /* This one is just used for debugging the implementation. */ + /* These methods are just used for debugging the implementation. */ {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, + {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS}, #endif {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},