bpo-37587: json: Use _PyUnicodeWriter when scanning string. (GH-15591)

This commit is contained in:
Inada Naoki 2019-10-17 16:12:41 +09:00 committed by GitHub
parent a661392f8f
commit 9c11029bb4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 25 additions and 57 deletions

View File

@ -0,0 +1,2 @@
``_json.scanstring`` is now up to 3x faster when there are many
backslash-escaped characters in the JSON string.

View File

@ -73,19 +73,6 @@ static PyMemberDef encoder_members[] = {
{NULL} {NULL}
}; };
/* Join the items of `lst` with an empty separator, i.e. ''.join(lst).
 *
 * The empty separator string is created on the first call and cached in a
 * function-local static for later calls.
 *
 * Returns the result of PyUnicode_Join(), or NULL if the separator could
 * not be created.
 */
static PyObject *
join_list_unicode(PyObject *lst)
{
    static PyObject *empty_sep = NULL;

    if (empty_sep == NULL) {
        empty_sep = PyUnicode_FromStringAndSize("", 0);
    }
    if (empty_sep == NULL) {
        /* Creating the separator failed; nothing to join with. */
        return NULL;
    }
    return PyUnicode_Join(empty_sep, lst);
}
/* Forward decls */ /* Forward decls */
static PyObject * static PyObject *
@ -385,21 +372,6 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
return tpl; return tpl;
} }
/* Flush the pending `chunk` into the `chunks` list.
 *
 * Expands in place inside a function that provides the variables `chunk`
 * and `chunks` and a `bail` label. When `chunk` is non-NULL:
 *   - `chunks` is created as an empty list on first use (jumping to `bail`
 *     if that fails);
 *   - `chunk` is appended to `chunks`;
 *   - `chunk` is cleared whether or not the append succeeded, and a failed
 *     append jumps to `bail`.
 * When `chunk` is NULL the macro does nothing.
 *
 * The expansion is a bare if-statement (no do/while wrapper), and the use
 * sites write it without a trailing semicolon.
 */
#define APPEND_OLD_CHUNK \
if (chunk != NULL) { \
if (chunks == NULL) { \
chunks = PyList_New(0); \
if (chunks == NULL) { \
goto bail; \
} \
} \
if (PyList_Append(chunks, chunk)) { \
Py_CLEAR(chunk); \
goto bail; \
} \
Py_CLEAR(chunk); \
}
static PyObject * static PyObject *
scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr) scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr)
{ {
@ -417,12 +389,14 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
Py_ssize_t next /* = begin */; Py_ssize_t next /* = begin */;
const void *buf; const void *buf;
int kind; int kind;
PyObject *chunks = NULL;
PyObject *chunk = NULL;
if (PyUnicode_READY(pystr) == -1) if (PyUnicode_READY(pystr) == -1)
return 0; return 0;
_PyUnicodeWriter writer;
_PyUnicodeWriter_Init(&writer);
writer.overallocate = 1;
len = PyUnicode_GET_LENGTH(pystr); len = PyUnicode_GET_LENGTH(pystr);
buf = PyUnicode_DATA(pystr); buf = PyUnicode_DATA(pystr);
kind = PyUnicode_KIND(pystr); kind = PyUnicode_KIND(pystr);
@ -449,18 +423,26 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
} }
c = d; c = d;
} }
if (!(c == '"' || c == '\\')) {
if (c == '"') {
// Fast path for simple case.
if (writer.buffer == NULL) {
PyObject *ret = PyUnicode_Substring(pystr, end, next);
if (ret == NULL) {
goto bail;
}
*next_end_ptr = next + 1;;
return ret;
}
}
else if (c != '\\') {
raise_errmsg("Unterminated string starting at", pystr, begin); raise_errmsg("Unterminated string starting at", pystr, begin);
goto bail; goto bail;
} }
/* Pick up this chunk if it's not zero length */ /* Pick up this chunk if it's not zero length */
if (next != end) { if (next != end) {
APPEND_OLD_CHUNK if (_PyUnicodeWriter_WriteSubstring(&writer, pystr, end, next) < 0) {
chunk = PyUnicode_FromKindAndData(
kind,
(char*)buf + kind * end,
next - end);
if (chunk == NULL) {
goto bail; goto bail;
} }
} }
@ -551,34 +533,18 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
end -= 6; end -= 6;
} }
} }
APPEND_OLD_CHUNK if (_PyUnicodeWriter_WriteChar(&writer, c) < 0) {
chunk = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, &c, 1);
if (chunk == NULL) {
goto bail; goto bail;
} }
} }
if (chunks == NULL) { rval = _PyUnicodeWriter_Finish(&writer);
if (chunk != NULL)
rval = chunk;
else
rval = PyUnicode_FromStringAndSize("", 0);
}
else {
APPEND_OLD_CHUNK
rval = join_list_unicode(chunks);
if (rval == NULL) {
goto bail;
}
Py_CLEAR(chunks);
}
*next_end_ptr = end; *next_end_ptr = end;
return rval; return rval;
bail: bail:
*next_end_ptr = -1; *next_end_ptr = -1;
Py_XDECREF(chunks); _PyUnicodeWriter_Dealloc(&writer);
Py_XDECREF(chunk);
return NULL; return NULL;
} }