bpo-31979: Simplify transforming decimals to ASCII (#4336)

in int(), float() and complex() parsers.

This also speeds up parsing non-ASCII numbers by around 20%.
This commit is contained in:
Serhiy Storchaka 2017-11-13 21:23:48 +02:00 committed by GitHub
parent ce12629c84
commit 9b6c60cbce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 63 additions and 139 deletions

View File

@ -1723,6 +1723,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
#endif /* MS_WINDOWS */ #endif /* MS_WINDOWS */
#ifndef Py_LIMITED_API
/* --- Decimal Encoder ---------------------------------------------------- */ /* --- Decimal Encoder ---------------------------------------------------- */
/* Takes a Unicode string holding a decimal value and writes it into /* Takes a Unicode string holding a decimal value and writes it into
@ -1747,14 +1748,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
*/ */
#ifndef Py_LIMITED_API
PyAPI_FUNC(int) PyUnicode_EncodeDecimal( PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Py_UNICODE *s, /* Unicode buffer */ Py_UNICODE *s, /* Unicode buffer */
Py_ssize_t length, /* Number of Py_UNICODE chars to encode */ Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
char *output, /* Output buffer; must have size >= length */ char *output, /* Output buffer; must have size >= length */
const char *errors /* error handling */ const char *errors /* error handling */
) /* Py_DEPRECATED(3.3) */; ) /* Py_DEPRECATED(3.3) */;
#endif
/* Transforms code points that have decimal digit property to the /* Transforms code points that have decimal digit property to the
corresponding ASCII digit code points. corresponding ASCII digit code points.
@ -1762,19 +1761,18 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Returns a new Unicode string on success, NULL on failure. Returns a new Unicode string on success, NULL on failure.
*/ */
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII( PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
Py_UNICODE *s, /* Unicode buffer */ Py_UNICODE *s, /* Unicode buffer */
Py_ssize_t length /* Number of Py_UNICODE chars to transform */ Py_ssize_t length /* Number of Py_UNICODE chars to transform */
) /* Py_DEPRECATED(3.3) */; ) /* Py_DEPRECATED(3.3) */;
#endif
/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject /* Converts a Unicode object holding a decimal value to an ASCII string
as argument instead of a raw buffer and length. This function additionally for use in int, float and complex parsers.
transforms spaces to ASCII because this is what the callers in longobject, Transforms code points that have decimal digit property to the
floatobject, and complexobject did anyways. */ corresponding ASCII digit code points. Transforms spaces to ASCII.
The first non-ASCII code point that is neither a decimal digit nor a space
is replaced with '?' and the result is truncated right after it. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII( PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
PyObject *unicode /* Unicode object */ PyObject *unicode /* Unicode object */
); );

View File

@ -51,7 +51,7 @@ class GeneralFloatCases(unittest.TestCase):
self.assertRaises(TypeError, float, {}) self.assertRaises(TypeError, float, {})
self.assertRaisesRegex(TypeError, "not 'dict'", float, {}) self.assertRaisesRegex(TypeError, "not 'dict'", float, {})
# Lone surrogate # Lone surrogate
self.assertRaises(UnicodeEncodeError, float, '\uD8F0') self.assertRaises(ValueError, float, '\uD8F0')
# check that we don't accept alternate exponent markers # check that we don't accept alternate exponent markers
self.assertRaises(ValueError, float, "-1.7d29") self.assertRaises(ValueError, float, "-1.7d29")
self.assertRaises(ValueError, float, "3D-14") self.assertRaises(ValueError, float, "3D-14")

View File

@ -2068,11 +2068,14 @@ class UnicodeTest(string_tests.CommonTest,
# Error handling (wrong arguments) # Error handling (wrong arguments)
self.assertRaises(TypeError, "hello".encode, 42, 42, 42) self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
# Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII()) # Error handling (lone surrogate in
self.assertRaises(UnicodeError, float, "\ud800") # _PyUnicode_TransformDecimalAndSpaceToASCII())
self.assertRaises(UnicodeError, float, "\udf00") self.assertRaises(ValueError, int, "\ud800")
self.assertRaises(UnicodeError, complex, "\ud800") self.assertRaises(ValueError, int, "\udf00")
self.assertRaises(UnicodeError, complex, "\udf00") self.assertRaises(ValueError, float, "\ud800")
self.assertRaises(ValueError, float, "\udf00")
self.assertRaises(ValueError, complex, "\ud800")
self.assertRaises(ValueError, complex, "\udf00")
def test_codecs(self): def test_codecs(self):
# Encoding # Encoding

View File

@ -914,10 +914,10 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
if (s_buffer == NULL) { if (s_buffer == NULL) {
return NULL; return NULL;
} }
assert(PyUnicode_IS_ASCII(s_buffer));
/* Simply get a pointer to existing ASCII characters. */
s = PyUnicode_AsUTF8AndSize(s_buffer, &len); s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
if (s == NULL) { assert(s != NULL);
goto exit;
}
} }
else { else {
PyErr_Format(PyExc_TypeError, PyErr_Format(PyExc_TypeError,
@ -928,7 +928,6 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
result = _Py_string_to_number_with_underscores(s, len, "complex", v, type, result = _Py_string_to_number_with_underscores(s, len, "complex", v, type,
complex_from_string_inner); complex_from_string_inner);
exit:
Py_DECREF(s_buffer); Py_DECREF(s_buffer);
return result; return result;
} }

View File

@ -176,11 +176,10 @@ PyFloat_FromString(PyObject *v)
s_buffer = _PyUnicode_TransformDecimalAndSpaceToASCII(v); s_buffer = _PyUnicode_TransformDecimalAndSpaceToASCII(v);
if (s_buffer == NULL) if (s_buffer == NULL)
return NULL; return NULL;
assert(PyUnicode_IS_ASCII(s_buffer));
/* Simply get a pointer to existing ASCII characters. */
s = PyUnicode_AsUTF8AndSize(s_buffer, &len); s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
if (s == NULL) { assert(s != NULL);
Py_DECREF(s_buffer);
return NULL;
}
} }
else if (PyBytes_Check(v)) { else if (PyBytes_Check(v)) {
s = PyBytes_AS_STRING(v); s = PyBytes_AS_STRING(v);

View File

@ -2509,21 +2509,18 @@ PyLong_FromUnicodeObject(PyObject *u, int base)
asciidig = _PyUnicode_TransformDecimalAndSpaceToASCII(u); asciidig = _PyUnicode_TransformDecimalAndSpaceToASCII(u);
if (asciidig == NULL) if (asciidig == NULL)
return NULL; return NULL;
assert(PyUnicode_IS_ASCII(asciidig));
/* Simply get a pointer to existing ASCII characters. */
buffer = PyUnicode_AsUTF8AndSize(asciidig, &buflen); buffer = PyUnicode_AsUTF8AndSize(asciidig, &buflen);
if (buffer == NULL) { assert(buffer != NULL);
result = PyLong_FromString(buffer, &end, base);
if (end == NULL || (result != NULL && end == buffer + buflen)) {
Py_DECREF(asciidig); Py_DECREF(asciidig);
if (!PyErr_ExceptionMatches(PyExc_UnicodeEncodeError)) return result;
return NULL;
}
else {
result = PyLong_FromString(buffer, &end, base);
if (end == NULL || (result != NULL && end == buffer + buflen)) {
Py_DECREF(asciidig);
return result;
}
Py_DECREF(asciidig);
Py_XDECREF(result);
} }
Py_DECREF(asciidig);
Py_XDECREF(result);
PyErr_Format(PyExc_ValueError, PyErr_Format(PyExc_ValueError,
"invalid literal for int() with base %d: %.200R", "invalid literal for int() with base %d: %.200R",
base, u); base, u);

View File

@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)
/* --- Unicode Object ----------------------------------------------------- */ /* --- Unicode Object ----------------------------------------------------- */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
static inline Py_ssize_t static inline Py_ssize_t
findchar(const void *s, int kind, findchar(const void *s, int kind,
Py_ssize_t size, Py_UCS4 ch, Py_ssize_t size, Py_UCS4 ch,
@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,
return _PyUnicode_TranslateCharmap(str, mapping, errors); return _PyUnicode_TranslateCharmap(str, mapping, errors);
} }
static Py_UCS4
fix_decimal_and_space_to_ascii(PyObject *self)
{
/* No need to call PyUnicode_READY(self) because this function is only
called as a callback from fixup() which does it already. */
const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
const int kind = PyUnicode_KIND(self);
void *data = PyUnicode_DATA(self);
Py_UCS4 maxchar = 127, ch, fixed;
int modified = 0;
Py_ssize_t i;
for (i = 0; i < len; ++i) {
ch = PyUnicode_READ(kind, data, i);
fixed = 0;
if (ch > 127) {
if (Py_UNICODE_ISSPACE(ch))
fixed = ' ';
else {
const int decimal = Py_UNICODE_TODECIMAL(ch);
if (decimal >= 0)
fixed = '0' + decimal;
}
if (fixed != 0) {
modified = 1;
maxchar = Py_MAX(maxchar, fixed);
PyUnicode_WRITE(kind, data, i, fixed);
}
else
maxchar = Py_MAX(maxchar, ch);
}
}
return (modified) ? maxchar : 0;
}
PyObject * PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode) _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{ {
@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
} }
if (PyUnicode_READY(unicode) == -1) if (PyUnicode_READY(unicode) == -1)
return NULL; return NULL;
if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) { if (PyUnicode_IS_ASCII(unicode)) {
/* If the string is already ASCII, just return the same string */ /* If the string is already ASCII, just return the same string */
Py_INCREF(unicode); Py_INCREF(unicode);
return unicode; return unicode;
} }
return fixup(unicode, fix_decimal_and_space_to_ascii);
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
PyObject *result = PyUnicode_New(len, 127);
if (result == NULL) {
return NULL;
}
Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
int kind = PyUnicode_KIND(unicode);
const void *data = PyUnicode_DATA(unicode);
Py_ssize_t i;
for (i = 0; i < len; ++i) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
if (ch < 127) {
out[i] = ch;
}
else if (Py_UNICODE_ISSPACE(ch)) {
out[i] = ' ';
}
else {
int decimal = Py_UNICODE_TODECIMAL(ch);
if (decimal < 0) {
out[i] = '?';
_PyUnicode_LENGTH(result) = i + 1;
break;
}
out[i] = '0' + decimal;
}
}
return result;
} }
PyObject * PyObject *
@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,
return tailmatch(str, substr, start, end, direction); return tailmatch(str, substr, start, end, direction);
} }
/* Apply fixfct filter to the Unicode object self and return a
reference to the modified object */
static PyObject *
fixup(PyObject *self,
Py_UCS4 (*fixfct)(PyObject *s))
{
PyObject *u;
Py_UCS4 maxchar_old, maxchar_new = 0;
PyObject *v;
u = _PyUnicode_Copy(self);
if (u == NULL)
return NULL;
maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
/* fix functions return the new maximum character in a string,
if the kind of the resulting unicode object does not change,
everything is fine. Otherwise we need to change the string kind
and re-run the fix function. */
maxchar_new = fixfct(u);
if (maxchar_new == 0) {
/* no changes */;
if (PyUnicode_CheckExact(self)) {
Py_DECREF(u);
Py_INCREF(self);
return self;
}
else
return u;
}
maxchar_new = align_maxchar(maxchar_new);
if (maxchar_new == maxchar_old)
return u;
/* In case the maximum character changed, we need to
convert the string to the new category. */
v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
if (v == NULL) {
Py_DECREF(u);
return NULL;
}
if (maxchar_new > maxchar_old) {
/* If the maxchar increased so that the kind changed, not all
characters are representable anymore and we need to fix the
string again. This only happens in very few cases. */
_PyUnicode_FastCopyCharacters(v, 0,
self, 0, PyUnicode_GET_LENGTH(self));
maxchar_old = fixfct(v);
assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
}
else {
_PyUnicode_FastCopyCharacters(v, 0,
u, 0, PyUnicode_GET_LENGTH(self));
}
Py_DECREF(u);
assert(_PyUnicode_CheckConsistency(v, 1));
return v;
}
static PyObject * static PyObject *
ascii_upper_or_lower(PyObject *self, int lower) ascii_upper_or_lower(PyObject *self, int lower)
{ {