Issue #10557: Fixed error messages from float() and other numeric

types.  Added a new API function, PyUnicode_TransformDecimalToASCII(),
which transforms non-ASCII decimal digits in a Unicode string to their
ASCII equivalents.
This commit is contained in:
Alexander Belopolsky 2010-12-04 03:38:46 +00:00
parent 36526bf3d9
commit 942af5a9a4
11 changed files with 169 additions and 52 deletions

View File

@ -328,6 +328,13 @@ APIs:
Identical to :c:func:`PyUnicode_FromFormat` except that it takes exactly two Identical to :c:func:`PyUnicode_FromFormat` except that it takes exactly two
arguments. arguments.
.. c:function:: PyObject* PyUnicode_TransformDecimalToASCII(Py_UNICODE *s, Py_ssize_t size)
Create a Unicode object by replacing all decimal digits in the
:c:type:`Py_UNICODE` buffer of the given size with the ASCII digits 0--9
that correspond to their decimal value. Return *NULL* if an exception
occurs.
.. c:function:: Py_UNICODE* PyUnicode_AsUnicode(PyObject *unicode) .. c:function:: Py_UNICODE* PyUnicode_AsUnicode(PyObject *unicode)

View File

@ -1225,6 +1225,17 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
); );
#endif #endif
/* Transforms code points that have the decimal digit property into the
   corresponding ASCII digit code points ('0'-'9'); all other code points
   are copied unchanged.

   Returns a new Unicode string on success, NULL on failure.
*/
PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
Py_UNICODE *s, /* Unicode buffer */
Py_ssize_t length /* Number of Py_UNICODE chars to transform */
);
/* --- File system encoding ---------------------------------------------- */ /* --- File system encoding ---------------------------------------------- */
/* ParseTuple converter: encode str objects to bytes using /* ParseTuple converter: encode str objects to bytes using

View File

@ -220,6 +220,7 @@ class ComplexTest(unittest.TestCase):
self.assertEqual(complex(NS(1+10j)), 1+10j) self.assertEqual(complex(NS(1+10j)), 1+10j)
self.assertRaises(TypeError, complex, OS(None)) self.assertRaises(TypeError, complex, OS(None))
self.assertRaises(TypeError, complex, NS(None)) self.assertRaises(TypeError, complex, NS(None))
self.assertRaises(TypeError, complex, {})
self.assertAlmostEqual(complex("1+10j"), 1+10j) self.assertAlmostEqual(complex("1+10j"), 1+10j)
self.assertAlmostEqual(complex(10), 10+0j) self.assertAlmostEqual(complex(10), 10+0j)
@ -325,6 +326,8 @@ class ComplexTest(unittest.TestCase):
# check that complex accepts long unicode strings # check that complex accepts long unicode strings
self.assertEqual(type(complex("1"*500)), complex) self.assertEqual(type(complex("1"*500)), complex)
# check whitespace processing
self.assertEqual(complex('\N{EM SPACE}(\N{EN SPACE}1+1j ) '), 1+1j)
class EvilExc(Exception): class EvilExc(Exception):
pass pass

View File

@ -43,14 +43,30 @@ class GeneralFloatCases(unittest.TestCase):
self.assertRaises(ValueError, float, "+.inf") self.assertRaises(ValueError, float, "+.inf")
self.assertRaises(ValueError, float, ".") self.assertRaises(ValueError, float, ".")
self.assertRaises(ValueError, float, "-.") self.assertRaises(ValueError, float, "-.")
self.assertRaises(ValueError, float, b"-")
self.assertRaises(TypeError, float, {})
# Lone surrogate
self.assertRaises(UnicodeEncodeError, float, '\uD8F0')
# check that we don't accept alternate exponent markers # check that we don't accept alternate exponent markers
self.assertRaises(ValueError, float, "-1.7d29") self.assertRaises(ValueError, float, "-1.7d29")
self.assertRaises(ValueError, float, "3D-14") self.assertRaises(ValueError, float, "3D-14")
self.assertEqual(float(b" \u0663.\u0661\u0664 ".decode('raw-unicode-escape')), 3.14) self.assertEqual(float(" \u0663.\u0661\u0664 "), 3.14)
self.assertEqual(float("\N{EM SPACE}3.14\N{EN SPACE}"), 3.14)
# extra long strings should not be a problem # extra long strings should not be a problem
float(b'.' + b'1'*1000) float(b'.' + b'1'*1000)
float('.' + '1'*1000) float('.' + '1'*1000)
def test_error_message(self):
    """Check that float()'s ValueError message quotes the offending input.

    Each sample is an invalid float literal; the stripped form of the
    input must appear in the first argument of the raised ValueError.
    """
    testlist = ('\xbd', '123\xbd', ' 123 456 ')
    for s in testlist:
        try:
            float(s)
        except ValueError as e:
            self.assertIn(s.strip(), e.args[0])
        else:
            # TestCase.fail() accepts a single message, so the value has
            # to be interpolated here rather than passed as an extra
            # positional argument (which would raise TypeError instead of
            # reporting a test failure).
            self.fail("Expected float(%r) to raise a ValueError" % s)
@support.run_with_locale('LC_NUMERIC', 'fr_FR', 'de_DE') @support.run_with_locale('LC_NUMERIC', 'fr_FR', 'de_DE')
def test_float_with_comma(self): def test_float_with_comma(self):
# set locale to something that doesn't use '.' for the decimal point # set locale to something that doesn't use '.' for the decimal point

View File

@ -20,7 +20,8 @@ L = [
(' 1\02 ', ValueError), (' 1\02 ', ValueError),
('', ValueError), ('', ValueError),
(' ', ValueError), (' ', ValueError),
(' \t\t ', ValueError) (' \t\t ', ValueError),
("\u0200", ValueError)
] ]
class IntTestCases(unittest.TestCase): class IntTestCases(unittest.TestCase):
@ -35,6 +36,8 @@ class IntTestCases(unittest.TestCase):
self.assertEqual(int(3.5), 3) self.assertEqual(int(3.5), 3)
self.assertEqual(int(-3.5), -3) self.assertEqual(int(-3.5), -3)
self.assertEqual(int("-3"), -3) self.assertEqual(int("-3"), -3)
self.assertEqual(int(" -3 "), -3)
self.assertEqual(int("\N{EM SPACE}-3\N{EN SPACE}"), -3)
# Different base: # Different base:
self.assertEqual(int("10",16), 16) self.assertEqual(int("10",16), 16)
# Test conversion from strings and various anomalies # Test conversion from strings and various anomalies
@ -302,6 +305,16 @@ class IntTestCases(unittest.TestCase):
self.fail("Failed to raise TypeError with %s" % self.fail("Failed to raise TypeError with %s" %
((base, trunc_result_base),)) ((base, trunc_result_base),))
def test_error_message(self):
    """Check that int()'s ValueError message quotes the offending input.

    Each sample is an invalid integer literal; the stripped form of the
    input must appear in the first argument of the raised ValueError.
    """
    testlist = ('\xbd', '123\xbd', ' 123 456 ')
    for s in testlist:
        try:
            int(s)
        except ValueError as e:
            self.assertIn(s.strip(), e.args[0])
        else:
            # TestCase.fail() accepts a single message, so the value has
            # to be interpolated here rather than passed as an extra
            # positional argument (which would raise TypeError instead of
            # reporting a test failure).
            self.fail("Expected int(%r) to raise a ValueError" % s)
def test_main(): def test_main():
run_unittest(IntTestCases) run_unittest(IntTestCases)

View File

@ -1168,8 +1168,13 @@ class UnicodeTest(string_tests.CommonTest,
# Error handling (wrong arguments) # Error handling (wrong arguments)
self.assertRaises(TypeError, "hello".encode, 42, 42, 42) self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
# Error handling (PyUnicode_EncodeDecimal()) # Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
self.assertRaises(UnicodeError, int, "\u0200") self.assertRaises(UnicodeError, int, "\ud800")
self.assertRaises(UnicodeError, int, "\udf00")
self.assertRaises(UnicodeError, float, "\ud800")
self.assertRaises(UnicodeError, float, "\udf00")
self.assertRaises(UnicodeError, complex, "\ud800")
self.assertRaises(UnicodeError, complex, "\udf00")
def test_codecs(self): def test_codecs(self):
# Encoding # Encoding

View File

@ -222,6 +222,10 @@ Library
C-API C-API
----- -----
- Issue #10557: Added a new API function, PyUnicode_TransformDecimalToASCII(),
which transforms non-ASCII decimal digits in a Unicode string to their
ASCII equivalents.
- Issue #9518: Extend the PyModuleDef_HEAD_INIT macro to explicitly - Issue #9518: Extend the PyModuleDef_HEAD_INIT macro to explicitly
zero-initialize all fields, fixing compiler warnings seen when building zero-initialize all fields, fixing compiler warnings seen when building
extension modules with gcc with "-Wmissing-field-initializers" (implied by extension modules with gcc with "-Wmissing-field-initializers" (implied by

View File

@ -766,20 +766,26 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
char *end; char *end;
double x=0.0, y=0.0, z; double x=0.0, y=0.0, z;
int got_bracket=0; int got_bracket=0;
char *s_buffer = NULL; PyObject *s_buffer = NULL;
Py_ssize_t len; Py_ssize_t len;
if (PyUnicode_Check(v)) { if (PyUnicode_Check(v)) {
s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v) + 1); Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v);
Py_UNICODE *bufptr;
s_buffer = PyUnicode_TransformDecimalToASCII(
PyUnicode_AS_UNICODE(v), buflen);
if (s_buffer == NULL) if (s_buffer == NULL)
return PyErr_NoMemory(); return NULL;
if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v), /* Replace non-ASCII whitespace with ' ' */
PyUnicode_GET_SIZE(v), bufptr = PyUnicode_AS_UNICODE(s_buffer);
s_buffer, for (i = 0; i < buflen; i++) {
NULL)) Py_UNICODE ch = bufptr[i];
if (ch > 127 && Py_UNICODE_ISSPACE(ch))
bufptr[i] = ' ';
}
s = _PyUnicode_AsStringAndSize(s_buffer, &len);
if (s == NULL)
goto error; goto error;
s = s_buffer;
len = strlen(s);
} }
else if (PyObject_AsCharBuffer(v, &s, &len)) { else if (PyObject_AsCharBuffer(v, &s, &len)) {
PyErr_SetString(PyExc_TypeError, PyErr_SetString(PyExc_TypeError,
@ -894,16 +900,14 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
if (s-start != len) if (s-start != len)
goto parse_error; goto parse_error;
if (s_buffer) Py_XDECREF(s_buffer);
PyMem_FREE(s_buffer);
return complex_subtype_from_doubles(type, x, y); return complex_subtype_from_doubles(type, x, y);
parse_error: parse_error:
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"complex() arg is a malformed string"); "complex() arg is a malformed string");
error: error:
if (s_buffer) Py_XDECREF(s_buffer);
PyMem_FREE(s_buffer);
return NULL; return NULL;
} }

View File

@ -174,22 +174,30 @@ PyFloat_FromString(PyObject *v)
{ {
const char *s, *last, *end; const char *s, *last, *end;
double x; double x;
char buffer[256]; /* for errors */ PyObject *s_buffer = NULL;
char *s_buffer = NULL;
Py_ssize_t len; Py_ssize_t len;
PyObject *result = NULL; PyObject *result = NULL;
if (PyUnicode_Check(v)) { if (PyUnicode_Check(v)) {
s_buffer = (char *)PyMem_MALLOC(PyUnicode_GET_SIZE(v)+1); Py_ssize_t i, buflen = PyUnicode_GET_SIZE(v);
Py_UNICODE *bufptr;
s_buffer = PyUnicode_TransformDecimalToASCII(
PyUnicode_AS_UNICODE(v), buflen);
if (s_buffer == NULL) if (s_buffer == NULL)
return PyErr_NoMemory(); return NULL;
if (PyUnicode_EncodeDecimal(PyUnicode_AS_UNICODE(v), /* Replace non-ASCII whitespace with ' ' */
PyUnicode_GET_SIZE(v), bufptr = PyUnicode_AS_UNICODE(s_buffer);
s_buffer, for (i = 0; i < buflen; i++) {
NULL)) Py_UNICODE ch = bufptr[i];
goto error; if (ch > 127 && Py_UNICODE_ISSPACE(ch))
s = s_buffer; bufptr[i] = ' ';
len = strlen(s); }
s = _PyUnicode_AsStringAndSize(s_buffer, &len);
if (s == NULL) {
Py_DECREF(s_buffer);
return NULL;
}
last = s + len;
} }
else if (PyObject_AsCharBuffer(v, &s, &len)) { else if (PyObject_AsCharBuffer(v, &s, &len)) {
PyErr_SetString(PyExc_TypeError, PyErr_SetString(PyExc_TypeError,
@ -197,29 +205,27 @@ PyFloat_FromString(PyObject *v)
return NULL; return NULL;
} }
last = s + len; last = s + len;
/* strip space */
while (Py_ISSPACE(*s)) while (s < last && Py_ISSPACE(*s))
s++; s++;
while (s < last - 1 && Py_ISSPACE(last[-1]))
last--;
/* We don't care about overflow or underflow. If the platform /* We don't care about overflow or underflow. If the platform
* supports them, infinities and signed zeroes (on underflow) are * supports them, infinities and signed zeroes (on underflow) are
* fine. */ * fine. */
x = PyOS_string_to_double(s, (char **)&end, NULL); x = PyOS_string_to_double(s, (char **)&end, NULL);
if (x == -1.0 && PyErr_Occurred()) if (end != last) {
goto error; PyErr_Format(PyExc_ValueError,
while (Py_ISSPACE(*end)) "could not convert string to float: "
end++; "%R", v);
if (end == last)
result = PyFloat_FromDouble(x);
else {
PyOS_snprintf(buffer, sizeof(buffer),
"invalid literal for float(): %.200s", s);
PyErr_SetString(PyExc_ValueError, buffer);
result = NULL; result = NULL;
} }
else if (x == -1.0 && PyErr_Occurred())
result = NULL;
else
result = PyFloat_FromDouble(x);
error: Py_XDECREF(s_buffer);
if (s_buffer)
PyMem_FREE(s_buffer);
return result; return result;
} }

View File

@ -2133,17 +2133,34 @@ PyObject *
PyLong_FromUnicode(Py_UNICODE *u, Py_ssize_t length, int base) PyLong_FromUnicode(Py_UNICODE *u, Py_ssize_t length, int base)
{ {
PyObject *result; PyObject *result;
char *buffer = (char *)PyMem_MALLOC(length+1); PyObject *asciidig;
char *buffer, *end;
Py_ssize_t i, buflen;
Py_UNICODE *ptr;
if (buffer == NULL) asciidig = PyUnicode_TransformDecimalToASCII(u, length);
if (asciidig == NULL)
return NULL; return NULL;
/* Replace non-ASCII whitespace with ' ' */
if (PyUnicode_EncodeDecimal(u, length, buffer, NULL)) { ptr = PyUnicode_AS_UNICODE(asciidig);
PyMem_FREE(buffer); for (i = 0; i < length; i++) {
Py_UNICODE ch = ptr[i];
if (ch > 127 && Py_UNICODE_ISSPACE(ch))
ptr[i] = ' ';
}
buffer = _PyUnicode_AsStringAndSize(asciidig, &buflen);
if (buffer == NULL) {
Py_DECREF(asciidig);
return NULL; return NULL;
} }
result = PyLong_FromString(buffer, NULL, base); result = PyLong_FromString(buffer, &end, base);
PyMem_FREE(buffer); if (result != NULL && end != buffer + buflen) {
PyErr_SetString(PyExc_ValueError,
"null byte in argument for int()");
Py_DECREF(result);
result = NULL;
}
Py_DECREF(asciidig);
return result; return result;
} }

View File

@ -6206,6 +6206,30 @@ PyObject *PyUnicode_Translate(PyObject *str,
return NULL; return NULL;
} }
/* Return a new Unicode object holding a copy of the first `length`
   characters of `s` in which every non-ASCII code point that has a decimal
   digit value is replaced by the ASCII digit '0'..'9' with that value.
   All other characters are copied unchanged.

   Returns NULL if the result object cannot be created. */
PyObject *
PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
                                  Py_ssize_t length)
{
    PyObject *result;
    Py_UNICODE *p; /* write pointer into result */
    Py_ssize_t i;

    /* Allocate the result and check for failure *before* touching its
       buffer; copying first would dereference a NULL object. */
    result = (PyObject *)_PyUnicode_New(length);
    if (result == NULL)
        return NULL;
    p = PyUnicode_AS_UNICODE(result);
    /* Start from a verbatim copy; only decimal digits are rewritten below. */
    Py_UNICODE_COPY(p, s, length);
    /* Iterate over code points */
    for (i = 0; i < length; i++) {
        Py_UNICODE ch = s[i];
        /* ASCII characters (including '0'-'9') never need translation. */
        if (ch > 127) {
            int decimal = Py_UNICODE_TODECIMAL(ch);
            /* Only characters with a non-negative decimal value are
               replaced; everything else keeps its copied value. */
            if (decimal >= 0)
                p[i] = '0' + decimal;
        }
    }
    return result;
}
/* --- Decimal Encoder ---------------------------------------------------- */ /* --- Decimal Encoder ---------------------------------------------------- */
int PyUnicode_EncodeDecimal(Py_UNICODE *s, int PyUnicode_EncodeDecimal(Py_UNICODE *s,
@ -8967,6 +8991,13 @@ unicode_freelistsize(PyUnicodeObject *self)
{ {
return PyLong_FromLong(numfree); return PyLong_FromLong(numfree);
} }
/* Debugging-only method: run PyUnicode_TransformDecimalToASCII() over the
   characters of `self` and return whatever it returns. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    Py_UNICODE *chars = PyUnicode_AS_UNICODE(self);
    Py_ssize_t nchars = PyUnicode_GET_SIZE(self);

    return PyUnicode_TransformDecimalToASCII(chars, nchars);
}
#endif #endif
PyDoc_STRVAR(startswith__doc__, PyDoc_STRVAR(startswith__doc__,
@ -9108,7 +9139,6 @@ unicode_getnewargs(PyUnicodeObject *v)
return Py_BuildValue("(u#)", v->str, v->length); return Py_BuildValue("(u#)", v->str, v->length);
} }
static PyMethodDef unicode_methods[] = { static PyMethodDef unicode_methods[] = {
/* Order is according to common usage: often used methods should /* Order is according to common usage: often used methods should
@ -9170,8 +9200,9 @@ static PyMethodDef unicode_methods[] = {
#endif #endif
#if 0 #if 0
/* This one is just used for debugging the implementation. */ /* These methods are just used for debugging the implementation. */
{"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS}, {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
{"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif #endif
{"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS}, {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},