mirror of https://github.com/python/cpython
bpo-31979: Simplify transforming decimals to ASCII (#4336)
in int(), float() and complex() parsers. This also speeds up parsing non-ASCII numbers by around 20%.
This commit is contained in:
parent
ce12629c84
commit
9b6c60cbce
|
@ -1723,6 +1723,7 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
|
||||||
|
|
||||||
#endif /* MS_WINDOWS */
|
#endif /* MS_WINDOWS */
|
||||||
|
|
||||||
|
#ifndef Py_LIMITED_API
|
||||||
/* --- Decimal Encoder ---------------------------------------------------- */
|
/* --- Decimal Encoder ---------------------------------------------------- */
|
||||||
|
|
||||||
/* Takes a Unicode string holding a decimal value and writes it into
|
/* Takes a Unicode string holding a decimal value and writes it into
|
||||||
|
@ -1747,14 +1748,12 @@ PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
|
||||||
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef Py_LIMITED_API
|
|
||||||
PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
|
PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
|
||||||
Py_UNICODE *s, /* Unicode buffer */
|
Py_UNICODE *s, /* Unicode buffer */
|
||||||
Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
|
Py_ssize_t length, /* Number of Py_UNICODE chars to encode */
|
||||||
char *output, /* Output buffer; must have size >= length */
|
char *output, /* Output buffer; must have size >= length */
|
||||||
const char *errors /* error handling */
|
const char *errors /* error handling */
|
||||||
) /* Py_DEPRECATED(3.3) */;
|
) /* Py_DEPRECATED(3.3) */;
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Transforms code points that have decimal digit property to the
|
/* Transforms code points that have decimal digit property to the
|
||||||
corresponding ASCII digit code points.
|
corresponding ASCII digit code points.
|
||||||
|
@ -1762,19 +1761,18 @@ PyAPI_FUNC(int) PyUnicode_EncodeDecimal(
|
||||||
Returns a new Unicode string on success, NULL on failure.
|
Returns a new Unicode string on success, NULL on failure.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#ifndef Py_LIMITED_API
|
|
||||||
PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
|
PyAPI_FUNC(PyObject*) PyUnicode_TransformDecimalToASCII(
|
||||||
Py_UNICODE *s, /* Unicode buffer */
|
Py_UNICODE *s, /* Unicode buffer */
|
||||||
Py_ssize_t length /* Number of Py_UNICODE chars to transform */
|
Py_ssize_t length /* Number of Py_UNICODE chars to transform */
|
||||||
) /* Py_DEPRECATED(3.3) */;
|
) /* Py_DEPRECATED(3.3) */;
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Similar to PyUnicode_TransformDecimalToASCII(), but takes a PyObject
|
/* Converts a Unicode object holding a decimal value to an ASCII string
|
||||||
as argument instead of a raw buffer and length. This function additionally
|
for use in the int, float and complex parsers.
|
||||||
transforms spaces to ASCII because this is what the callers in longobject,
|
Transforms code points that have decimal digit property to the
|
||||||
floatobject, and complexobject did anyways. */
|
corresponding ASCII digit code points. Transforms spaces to ASCII.
|
||||||
|
Transforms code points starting from the first non-ASCII code point that
|
||||||
|
is neither a decimal digit nor a space to the end into '?'. */
|
||||||
|
|
||||||
#ifndef Py_LIMITED_API
|
|
||||||
PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
|
PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
|
||||||
PyObject *unicode /* Unicode object */
|
PyObject *unicode /* Unicode object */
|
||||||
);
|
);
|
||||||
|
|
|
@ -51,7 +51,7 @@ class GeneralFloatCases(unittest.TestCase):
|
||||||
self.assertRaises(TypeError, float, {})
|
self.assertRaises(TypeError, float, {})
|
||||||
self.assertRaisesRegex(TypeError, "not 'dict'", float, {})
|
self.assertRaisesRegex(TypeError, "not 'dict'", float, {})
|
||||||
# Lone surrogate
|
# Lone surrogate
|
||||||
self.assertRaises(UnicodeEncodeError, float, '\uD8F0')
|
self.assertRaises(ValueError, float, '\uD8F0')
|
||||||
# check that we don't accept alternate exponent markers
|
# check that we don't accept alternate exponent markers
|
||||||
self.assertRaises(ValueError, float, "-1.7d29")
|
self.assertRaises(ValueError, float, "-1.7d29")
|
||||||
self.assertRaises(ValueError, float, "3D-14")
|
self.assertRaises(ValueError, float, "3D-14")
|
||||||
|
|
|
@ -2068,11 +2068,14 @@ class UnicodeTest(string_tests.CommonTest,
|
||||||
# Error handling (wrong arguments)
|
# Error handling (wrong arguments)
|
||||||
self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
|
self.assertRaises(TypeError, "hello".encode, 42, 42, 42)
|
||||||
|
|
||||||
# Error handling (lone surrogate in PyUnicode_TransformDecimalToASCII())
|
# Error handling (lone surrogate in
|
||||||
self.assertRaises(UnicodeError, float, "\ud800")
|
# _PyUnicode_TransformDecimalAndSpaceToASCII())
|
||||||
self.assertRaises(UnicodeError, float, "\udf00")
|
self.assertRaises(ValueError, int, "\ud800")
|
||||||
self.assertRaises(UnicodeError, complex, "\ud800")
|
self.assertRaises(ValueError, int, "\udf00")
|
||||||
self.assertRaises(UnicodeError, complex, "\udf00")
|
self.assertRaises(ValueError, float, "\ud800")
|
||||||
|
self.assertRaises(ValueError, float, "\udf00")
|
||||||
|
self.assertRaises(ValueError, complex, "\ud800")
|
||||||
|
self.assertRaises(ValueError, complex, "\udf00")
|
||||||
|
|
||||||
def test_codecs(self):
|
def test_codecs(self):
|
||||||
# Encoding
|
# Encoding
|
||||||
|
|
|
@ -914,10 +914,10 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
|
||||||
if (s_buffer == NULL) {
|
if (s_buffer == NULL) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
assert(PyUnicode_IS_ASCII(s_buffer));
|
||||||
|
/* Simply get a pointer to existing ASCII characters. */
|
||||||
s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
|
s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
|
||||||
if (s == NULL) {
|
assert(s != NULL);
|
||||||
goto exit;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
PyErr_Format(PyExc_TypeError,
|
PyErr_Format(PyExc_TypeError,
|
||||||
|
@ -928,7 +928,6 @@ complex_subtype_from_string(PyTypeObject *type, PyObject *v)
|
||||||
|
|
||||||
result = _Py_string_to_number_with_underscores(s, len, "complex", v, type,
|
result = _Py_string_to_number_with_underscores(s, len, "complex", v, type,
|
||||||
complex_from_string_inner);
|
complex_from_string_inner);
|
||||||
exit:
|
|
||||||
Py_DECREF(s_buffer);
|
Py_DECREF(s_buffer);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
|
@ -176,11 +176,10 @@ PyFloat_FromString(PyObject *v)
|
||||||
s_buffer = _PyUnicode_TransformDecimalAndSpaceToASCII(v);
|
s_buffer = _PyUnicode_TransformDecimalAndSpaceToASCII(v);
|
||||||
if (s_buffer == NULL)
|
if (s_buffer == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
assert(PyUnicode_IS_ASCII(s_buffer));
|
||||||
|
/* Simply get a pointer to existing ASCII characters. */
|
||||||
s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
|
s = PyUnicode_AsUTF8AndSize(s_buffer, &len);
|
||||||
if (s == NULL) {
|
assert(s != NULL);
|
||||||
Py_DECREF(s_buffer);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else if (PyBytes_Check(v)) {
|
else if (PyBytes_Check(v)) {
|
||||||
s = PyBytes_AS_STRING(v);
|
s = PyBytes_AS_STRING(v);
|
||||||
|
|
|
@ -2509,21 +2509,18 @@ PyLong_FromUnicodeObject(PyObject *u, int base)
|
||||||
asciidig = _PyUnicode_TransformDecimalAndSpaceToASCII(u);
|
asciidig = _PyUnicode_TransformDecimalAndSpaceToASCII(u);
|
||||||
if (asciidig == NULL)
|
if (asciidig == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
assert(PyUnicode_IS_ASCII(asciidig));
|
||||||
|
/* Simply get a pointer to existing ASCII characters. */
|
||||||
buffer = PyUnicode_AsUTF8AndSize(asciidig, &buflen);
|
buffer = PyUnicode_AsUTF8AndSize(asciidig, &buflen);
|
||||||
if (buffer == NULL) {
|
assert(buffer != NULL);
|
||||||
|
|
||||||
|
result = PyLong_FromString(buffer, &end, base);
|
||||||
|
if (end == NULL || (result != NULL && end == buffer + buflen)) {
|
||||||
Py_DECREF(asciidig);
|
Py_DECREF(asciidig);
|
||||||
if (!PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
|
return result;
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
result = PyLong_FromString(buffer, &end, base);
|
|
||||||
if (end == NULL || (result != NULL && end == buffer + buflen)) {
|
|
||||||
Py_DECREF(asciidig);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
Py_DECREF(asciidig);
|
|
||||||
Py_XDECREF(result);
|
|
||||||
}
|
}
|
||||||
|
Py_DECREF(asciidig);
|
||||||
|
Py_XDECREF(result);
|
||||||
PyErr_Format(PyExc_ValueError,
|
PyErr_Format(PyExc_ValueError,
|
||||||
"invalid literal for int() with base %d: %.200R",
|
"invalid literal for int() with base %d: %.200R",
|
||||||
base, u);
|
base, u);
|
||||||
|
|
|
@ -840,9 +840,6 @@ ensure_unicode(PyObject *obj)
|
||||||
|
|
||||||
/* --- Unicode Object ----------------------------------------------------- */
|
/* --- Unicode Object ----------------------------------------------------- */
|
||||||
|
|
||||||
static PyObject *
|
|
||||||
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
|
|
||||||
|
|
||||||
static inline Py_ssize_t
|
static inline Py_ssize_t
|
||||||
findchar(const void *s, int kind,
|
findchar(const void *s, int kind,
|
||||||
Py_ssize_t size, Py_UCS4 ch,
|
Py_ssize_t size, Py_UCS4 ch,
|
||||||
|
@ -9062,42 +9059,6 @@ PyUnicode_Translate(PyObject *str,
|
||||||
return _PyUnicode_TranslateCharmap(str, mapping, errors);
|
return _PyUnicode_TranslateCharmap(str, mapping, errors);
|
||||||
}
|
}
|
||||||
|
|
||||||
static Py_UCS4
|
|
||||||
fix_decimal_and_space_to_ascii(PyObject *self)
|
|
||||||
{
|
|
||||||
/* No need to call PyUnicode_READY(self) because this function is only
|
|
||||||
called as a callback from fixup() which does it already. */
|
|
||||||
const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
|
|
||||||
const int kind = PyUnicode_KIND(self);
|
|
||||||
void *data = PyUnicode_DATA(self);
|
|
||||||
Py_UCS4 maxchar = 127, ch, fixed;
|
|
||||||
int modified = 0;
|
|
||||||
Py_ssize_t i;
|
|
||||||
|
|
||||||
for (i = 0; i < len; ++i) {
|
|
||||||
ch = PyUnicode_READ(kind, data, i);
|
|
||||||
fixed = 0;
|
|
||||||
if (ch > 127) {
|
|
||||||
if (Py_UNICODE_ISSPACE(ch))
|
|
||||||
fixed = ' ';
|
|
||||||
else {
|
|
||||||
const int decimal = Py_UNICODE_TODECIMAL(ch);
|
|
||||||
if (decimal >= 0)
|
|
||||||
fixed = '0' + decimal;
|
|
||||||
}
|
|
||||||
if (fixed != 0) {
|
|
||||||
modified = 1;
|
|
||||||
maxchar = Py_MAX(maxchar, fixed);
|
|
||||||
PyUnicode_WRITE(kind, data, i, fixed);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
maxchar = Py_MAX(maxchar, ch);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return (modified) ? maxchar : 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
|
_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
|
||||||
{
|
{
|
||||||
|
@ -9107,12 +9068,42 @@ _PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
|
||||||
}
|
}
|
||||||
if (PyUnicode_READY(unicode) == -1)
|
if (PyUnicode_READY(unicode) == -1)
|
||||||
return NULL;
|
return NULL;
|
||||||
if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
|
if (PyUnicode_IS_ASCII(unicode)) {
|
||||||
/* If the string is already ASCII, just return the same string */
|
/* If the string is already ASCII, just return the same string */
|
||||||
Py_INCREF(unicode);
|
Py_INCREF(unicode);
|
||||||
return unicode;
|
return unicode;
|
||||||
}
|
}
|
||||||
return fixup(unicode, fix_decimal_and_space_to_ascii);
|
|
||||||
|
Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
|
||||||
|
PyObject *result = PyUnicode_New(len, 127);
|
||||||
|
if (result == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
|
||||||
|
int kind = PyUnicode_KIND(unicode);
|
||||||
|
const void *data = PyUnicode_DATA(unicode);
|
||||||
|
Py_ssize_t i;
|
||||||
|
for (i = 0; i < len; ++i) {
|
||||||
|
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||||
|
if (ch < 127) {
|
||||||
|
out[i] = ch;
|
||||||
|
}
|
||||||
|
else if (Py_UNICODE_ISSPACE(ch)) {
|
||||||
|
out[i] = ' ';
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
int decimal = Py_UNICODE_TODECIMAL(ch);
|
||||||
|
if (decimal < 0) {
|
||||||
|
out[i] = '?';
|
||||||
|
_PyUnicode_LENGTH(result) = i + 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
out[i] = '0' + decimal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
|
@ -9588,69 +9579,6 @@ PyUnicode_Tailmatch(PyObject *str,
|
||||||
return tailmatch(str, substr, start, end, direction);
|
return tailmatch(str, substr, start, end, direction);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Apply fixfct filter to the Unicode object self and return a
|
|
||||||
reference to the modified object */
|
|
||||||
|
|
||||||
static PyObject *
|
|
||||||
fixup(PyObject *self,
|
|
||||||
Py_UCS4 (*fixfct)(PyObject *s))
|
|
||||||
{
|
|
||||||
PyObject *u;
|
|
||||||
Py_UCS4 maxchar_old, maxchar_new = 0;
|
|
||||||
PyObject *v;
|
|
||||||
|
|
||||||
u = _PyUnicode_Copy(self);
|
|
||||||
if (u == NULL)
|
|
||||||
return NULL;
|
|
||||||
maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
|
|
||||||
|
|
||||||
/* fix functions return the new maximum character in a string,
|
|
||||||
if the kind of the resulting unicode object does not change,
|
|
||||||
everything is fine. Otherwise we need to change the string kind
|
|
||||||
and re-run the fix function. */
|
|
||||||
maxchar_new = fixfct(u);
|
|
||||||
|
|
||||||
if (maxchar_new == 0) {
|
|
||||||
/* no changes */;
|
|
||||||
if (PyUnicode_CheckExact(self)) {
|
|
||||||
Py_DECREF(u);
|
|
||||||
Py_INCREF(self);
|
|
||||||
return self;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
return u;
|
|
||||||
}
|
|
||||||
|
|
||||||
maxchar_new = align_maxchar(maxchar_new);
|
|
||||||
|
|
||||||
if (maxchar_new == maxchar_old)
|
|
||||||
return u;
|
|
||||||
|
|
||||||
/* In case the maximum character changed, we need to
|
|
||||||
convert the string to the new category. */
|
|
||||||
v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
|
|
||||||
if (v == NULL) {
|
|
||||||
Py_DECREF(u);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
if (maxchar_new > maxchar_old) {
|
|
||||||
/* If the maxchar increased so that the kind changed, not all
|
|
||||||
characters are representable anymore and we need to fix the
|
|
||||||
string again. This only happens in very few cases. */
|
|
||||||
_PyUnicode_FastCopyCharacters(v, 0,
|
|
||||||
self, 0, PyUnicode_GET_LENGTH(self));
|
|
||||||
maxchar_old = fixfct(v);
|
|
||||||
assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
_PyUnicode_FastCopyCharacters(v, 0,
|
|
||||||
u, 0, PyUnicode_GET_LENGTH(self));
|
|
||||||
}
|
|
||||||
Py_DECREF(u);
|
|
||||||
assert(_PyUnicode_CheckConsistency(v, 1));
|
|
||||||
return v;
|
|
||||||
}
|
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
ascii_upper_or_lower(PyObject *self, int lower)
|
ascii_upper_or_lower(PyObject *self, int lower)
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue