bpo-31979: Simplify transforming decimals to ASCII (#4336)

in int(), float() and complex() parsers.

This also speeds up parsing non-ASCII numbers by around 20%.
This commit is contained in:
Serhiy Storchaka 2017-11-13 21:23:48 +02:00 committed by GitHub
parent ce12629c84
commit 9b6c60cbce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 63 additions and 139 deletions

View File

@ -1723,6 +1723,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
#endif /* MS_WINDOWS */
#ifndef Py_LIMITED_API
/* --- Decimal Encoder ---------------------------------------------------- */
/* Takes a Unicode string holding a decimal value and writes it into
@ -1747,14 +1748,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
*/
#ifndef Py_LIMITED_API
PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Py_UNICODE *s, /* Unicode buffer */
Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
char *output, /* Output buffer; must have size >= length */
const char *errors /* error handling */
) /* Py_DEPRECATED(3.3) */;
#endif
/* Transforms code points that have decimal digit property to the
corresponding ASCII digit code points.
@ -1762,19 +1761,18 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
Returns a new Unicode string on success, NULL on failure.
*/
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
Py_UNICODE *s, /* Unicode buffer */
Py_ssize_t length /* Number of Py_UNICODE chars to transform */
) /* Py_DEPRECATED(3.3) */;
#endif
/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
as argument instead of a raw buffer and length. This function additionally
transforms spaces to ASCII because this is what the callers in longobject,
floatobject, and complexobject did anyways. */
/* Coverts a Unicode object holding a decimal value to an ASCII string
for using in int, float and complex parsers.
Transforms code points that have decimal digit property to the
corresponding ASCII digit code points. Transforms spaces to ASCII.
Transforms code points starting from the first non-ASCII code point that
is neither a decimal digit nor a space to the end into '?'. */
#ifndef Py_LIMITED_API
PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
PyObject *unicode /* Unicode object */
);

View File

@ -51,7 +51,7 @@ class GeneralFloatCases(unittest.TestCase):
self.assertRaises(TypeError, float, {})
self.assertRaisesRegex(TypeError, "not 'dict'", float, {})
# Lone surrogate
self.assertRaises(UnicodeEncodeError, float, '\uD8F0')
self.assertRaises(ValueError, float, '\uD8F0')
# check that we don't accept alternate exponent markers
self.assertRaises(ValueError, float, "-1.7d29")
self.assertRaises(ValueError, float, "3D-14")

View File

@ -2068,11 +2068,14 @@ class UnicodeTest(string_tests.CommonTest,
# Error handling (wrong arguments)
self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
# Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
self.assertRaises(UnicodeError, float, "\ud800")
self.assertRaises(UnicodeError, float, "\udf00")
self.assertRaises(UnicodeError, complex, "\ud800")
self.assertRaises(UnicodeError, complex, "\udf00")
# Error handling (lone surrogate in
# _PyUnicode_TransformDecimalAndSpaceToASCII())
self.assertRaises(ValueError, int, "\ud800")
self.assertRaises(ValueError, int, "\udf00")
self.assertRaises(ValueError, float, "\ud800")
self.assertRaises(ValueError, float, "\udf00")
self.assertRaises(ValueError, complex, "\ud800")
self.assertRaises(ValueError, complex, "\udf00")
def test_codecs(self):
# Encoding

View File

@ -914,10 +914,10 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
if (s_buffer == NULL) {
return NULL;
}
assert(PyUnicode_IS_ASCII(s_buffer));
/* Simply get a pointer to existing ASCII characters. */
s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
if (s == NULL) {
goto exit;
}
assert(s != NULL);
}
else {
PyErr_Format(PyExc_TypeError,
@ -928,7 +928,6 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
result = _Py_string_to_number_with_underscores(s, len, "complex", v, type,
complex_from_string_inner);
exit:
Py_DECREF(s_buffer);
return result;
}

View File

@ -176,11 +176,10 @@ PyFloat_FromString(PyObject *v)
s_buffer = _PyUnicode_TransformDecimalAndSpaceToASCII(v);
if (s_buffer == NULL)
return NULL;
assert(PyUnicode_IS_ASCII(s_buffer));
/* Simply get a pointer to existing ASCII characters. */
s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
if (s == NULL) {
Py_DECREF(s_buffer);
return NULL;
}
assert(s != NULL);
}
else if (PyBytes_Check(v)) {
s = PyBytes_AS_STRING(v);

View File

@ -2509,13 +2509,11 @@ PyLong_FromUnicodeObject(PyObject *u, int base)
asciidig = _PyUnicode_TransformDecimalAndSpaceToASCII(u);
if (asciidig == NULL)
return NULL;
assert(PyUnicode_IS_ASCII(asciidig));
/* Simply get a pointer to existing ASCII characters. */
buffer = PyUnicode_AsUTF8AndSize(asciidig, &buflen);
if (buffer == NULL) {
Py_DECREF(asciidig);
if (!PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
return NULL;
}
else {
assert(buffer != NULL);
result = PyLong_FromString(buffer, &end, base);
if (end == NULL || (result != NULL && end == buffer + buflen)) {
Py_DECREF(asciidig);
@ -2523,7 +2521,6 @@ PyLong_FromUnicodeObject(PyObject *u, int base)
}
Py_DECREF(asciidig);
Py_XDECREF(result);
}
PyErr_Format(PyExc_ValueError,
"invalid literal for int() with base %d: %.200R",
base, u);

View File

@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)
/* --- Unicode Object ----------------------------------------------------- */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
static inline Py_ssize_t
findchar(const void *s, int kind,
Py_ssize_t size, Py_UCS4 ch,
@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,
return _PyUnicode_TranslateCharmap(str, mapping, errors);
}
static Py_UCS4
fix_decimal_and_space_to_ascii(PyObject *self)
{
/* No need to call PyUnicode_READY(self) because this function is only
called as a callback from fixup() which does it already. */
const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
const int kind = PyUnicode_KIND(self);
void *data = PyUnicode_DATA(self);
Py_UCS4 maxchar = 127, ch, fixed;
int modified = 0;
Py_ssize_t i;
for (i = 0; i < len; ++i) {
ch = PyUnicode_READ(kind, data, i);
fixed = 0;
if (ch > 127) {
if (Py_UNICODE_ISSPACE(ch))
fixed = ' ';
else {
const int decimal = Py_UNICODE_TODECIMAL(ch);
if (decimal >= 0)
fixed = '0' + decimal;
}
if (fixed != 0) {
modified = 1;
maxchar = Py_MAX(maxchar, fixed);
PyUnicode_WRITE(kind, data, i, fixed);
}
else
maxchar = Py_MAX(maxchar, ch);
}
}
return (modified) ? maxchar : 0;
}
PyObject *
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
{
@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
}
if (PyUnicode_READY(unicode) == -1)
return NULL;
if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
if (PyUnicode_IS_ASCII(unicode)) {
/* If the string is already ASCII, just return the same string */
Py_INCREF(unicode);
return unicode;
}
return fixup(unicode, fix_decimal_and_space_to_ascii);
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
PyObject *result = PyUnicode_New(len, 127);
if (result == NULL) {
return NULL;
}
Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
int kind = PyUnicode_KIND(unicode);
const void *data = PyUnicode_DATA(unicode);
Py_ssize_t i;
for (i = 0; i < len; ++i) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
if (ch < 127) {
out[i] = ch;
}
else if (Py_UNICODE_ISSPACE(ch)) {
out[i] = ' ';
}
else {
int decimal = Py_UNICODE_TODECIMAL(ch);
if (decimal < 0) {
out[i] = '?';
_PyUnicode_LENGTH(result) = i + 1;
break;
}
out[i] = '0' + decimal;
}
}
return result;
}
PyObject *
@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,
return tailmatch(str, substr, start, end, direction);
}
/* Apply fixfct filter to the Unicode object self and return a
reference to the modified object */
static PyObject *
fixup(PyObject *self,
Py_UCS4 (*fixfct)(PyObject *s))
{
PyObject *u;
Py_UCS4 maxchar_old, maxchar_new = 0;
PyObject *v;
u = _PyUnicode_Copy(self);
if (u == NULL)
return NULL;
maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
/* fix functions return the new maximum character in a string,
if the kind of the resulting unicode object does not change,
everything is fine. Otherwise we need to change the string kind
and re-run the fix function. */
maxchar_new = fixfct(u);
if (maxchar_new == 0) {
/* no changes */;
if (PyUnicode_CheckExact(self)) {
Py_DECREF(u);
Py_INCREF(self);
return self;
}
else
return u;
}
maxchar_new = align_maxchar(maxchar_new);
if (maxchar_new == maxchar_old)
return u;
/* In case the maximum character changed, we need to
convert the string to the new category. */
v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
if (v == NULL) {
Py_DECREF(u);
return NULL;
}
if (maxchar_new > maxchar_old) {
/* If the maxchar increased so that the kind changed, not all
characters are representable anymore and we need to fix the
string again. This only happens in very few cases. */
_PyUnicode_FastCopyCharacters(v, 0,
self, 0, PyUnicode_GET_LENGTH(self));
maxchar_old = fixfct(v);
assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
}
else {
_PyUnicode_FastCopyCharacters(v, 0,
u, 0, PyUnicode_GET_LENGTH(self));
}
Py_DECREF(u);
assert(_PyUnicode_CheckConsistency(v, 1));
return v;
}
static PyObject *
ascii_upper_or_lower(PyObject *self, int lower)
{