Issue #7451: Improve decoding performance of JSON objects, and reduce
the memory consumption of said decoded objects when they use the same strings as keys.
This commit is contained in:
parent
d9107aaded
commit
7d6e076f6d
|
@ -147,10 +147,14 @@ WHITESPACE_STR = ' \t\n\r'
|
|||
|
||||
|
||||
def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
|
||||
_w=WHITESPACE.match, _ws=WHITESPACE_STR):
|
||||
memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
|
||||
s, end = s_and_end
|
||||
pairs = []
|
||||
pairs_append = pairs.append
|
||||
# Backwards compatibility
|
||||
if memo is None:
|
||||
memo = {}
|
||||
memo_get = memo.setdefault
|
||||
# Use a slice to prevent IndexError from being raised, the following
|
||||
# check will raise a more specific ValueError if the string is empty
|
||||
nextchar = s[end:end + 1]
|
||||
|
@ -167,6 +171,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
|
|||
end += 1
|
||||
while True:
|
||||
key, end = scanstring(s, end, strict)
|
||||
key = memo_get(key, key)
|
||||
# To skip some function call overhead we optimize the fast paths where
|
||||
# the JSON key separator is ": " or just ":".
|
||||
if s[end:end + 1] != ':':
|
||||
|
@ -214,7 +219,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
|
|||
pairs = object_hook(pairs)
|
||||
return pairs, end
|
||||
|
||||
def JSONArray(s_and_end, scan_once, context, _w=WHITESPACE.match):
|
||||
def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
|
||||
s, end = s_and_end
|
||||
values = []
|
||||
nextchar = s[end:end + 1]
|
||||
|
@ -314,6 +319,7 @@ class JSONDecoder(object):
|
|||
self.parse_object = JSONObject
|
||||
self.parse_array = JSONArray
|
||||
self.parse_string = scanstring
|
||||
self.memo = {}
|
||||
self.scan_once = make_scanner(self)
|
||||
|
||||
|
||||
|
|
|
@ -22,6 +22,8 @@ def py_make_scanner(context):
|
|||
parse_int = context.parse_int
|
||||
parse_constant = context.parse_constant
|
||||
object_hook = context.object_hook
|
||||
object_pairs_hook = context.object_pairs_hook
|
||||
memo = context.memo
|
||||
|
||||
def _scan_once(string, idx):
|
||||
try:
|
||||
|
@ -33,7 +35,7 @@ def py_make_scanner(context):
|
|||
return parse_string(string, idx + 1, strict)
|
||||
elif nextchar == '{':
|
||||
return parse_object((string, idx + 1), strict,
|
||||
_scan_once, object_hook, object_pairs_hook)
|
||||
_scan_once, object_hook, object_pairs_hook, memo)
|
||||
elif nextchar == '[':
|
||||
return parse_array((string, idx + 1), _scan_once)
|
||||
elif nextchar == 'n' and string[idx:idx + 4] == 'null':
|
||||
|
@ -60,6 +62,12 @@ def py_make_scanner(context):
|
|||
else:
|
||||
raise StopIteration
|
||||
|
||||
def scan_once(string, idx):
|
||||
try:
|
||||
return _scan_once(string, idx)
|
||||
finally:
|
||||
memo.clear()
|
||||
|
||||
return _scan_once
|
||||
|
||||
make_scanner = c_make_scanner or py_make_scanner
|
||||
|
|
|
@ -1,10 +1,25 @@
|
|||
import decimal
|
||||
from unittest import TestCase
|
||||
from io import StringIO
|
||||
from contextlib import contextmanager
|
||||
|
||||
import json
|
||||
import json.decoder
|
||||
import json.scanner
|
||||
from collections import OrderedDict
|
||||
|
||||
|
||||
@contextmanager
|
||||
def use_python_scanner():
|
||||
py_scanner = json.scanner.py_make_scanner
|
||||
old_scanner = json.decoder.make_scanner
|
||||
json.decoder.make_scanner = py_scanner
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
json.decoder.make_scanner = old_scanner
|
||||
|
||||
|
||||
class TestDecode(TestCase):
|
||||
def test_decimal(self):
|
||||
rval = json.loads('1.1', parse_float=decimal.Decimal)
|
||||
|
@ -39,3 +54,16 @@ class TestDecode(TestCase):
|
|||
# exercise the uncommon cases. The array cases are already covered.
|
||||
rval = json.loads('{ "key" : "value" , "k":"v" }')
|
||||
self.assertEquals(rval, {"key":"value", "k":"v"})
|
||||
|
||||
def check_keys_reuse(self, source, loads):
|
||||
rval = loads(source)
|
||||
(a, b), (c, d) = sorted(rval[0]), sorted(rval[1])
|
||||
self.assertIs(a, c)
|
||||
self.assertIs(b, d)
|
||||
|
||||
def test_keys_reuse(self):
|
||||
s = '[{"a_key": 1, "b_\xe9": 2}, {"a_key": 3, "b_\xe9": 4}]'
|
||||
self.check_keys_reuse(s, json.loads)
|
||||
# Disabled: the pure Python version of json simply doesn't work
|
||||
with use_python_scanner():
|
||||
self.check_keys_reuse(s, json.decoder.JSONDecoder().decode)
|
||||
|
|
|
@ -165,6 +165,10 @@ Extensions
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #7451: Improve decoding performance of JSON objects, and reduce
|
||||
the memory consumption of said decoded objects when they use the same
|
||||
strings as keys.
|
||||
|
||||
- Issue #1100562: Fix deep-copying of objects derived from the list and
|
||||
dict types. Patch by Michele Orrù and Björn Lindqvist.
|
||||
|
||||
|
|
125
Modules/_json.c
125
Modules/_json.c
|
@ -36,6 +36,7 @@ typedef struct _PyScannerObject {
|
|||
PyObject *parse_float;
|
||||
PyObject *parse_int;
|
||||
PyObject *parse_constant;
|
||||
PyObject *memo;
|
||||
} PyScannerObject;
|
||||
|
||||
static PyMemberDef scanner_members[] = {
|
||||
|
@ -305,6 +306,21 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
|
|||
return tpl;
|
||||
}
|
||||
|
||||
#define APPEND_OLD_CHUNK \
|
||||
if (chunk != NULL) { \
|
||||
if (chunks == NULL) { \
|
||||
chunks = PyList_New(0); \
|
||||
if (chunks == NULL) { \
|
||||
goto bail; \
|
||||
} \
|
||||
} \
|
||||
if (PyList_Append(chunks, chunk)) { \
|
||||
Py_DECREF(chunk); \
|
||||
goto bail; \
|
||||
} \
|
||||
Py_CLEAR(chunk); \
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr)
|
||||
{
|
||||
|
@ -316,15 +332,14 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
|
|||
|
||||
Return value is a new PyUnicode
|
||||
*/
|
||||
PyObject *rval;
|
||||
PyObject *rval = NULL;
|
||||
Py_ssize_t len = PyUnicode_GET_SIZE(pystr);
|
||||
Py_ssize_t begin = end - 1;
|
||||
Py_ssize_t next = begin;
|
||||
const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr);
|
||||
PyObject *chunks = PyList_New(0);
|
||||
if (chunks == NULL) {
|
||||
goto bail;
|
||||
}
|
||||
PyObject *chunks = NULL;
|
||||
PyObject *chunk = NULL;
|
||||
|
||||
if (end < 0 || len <= end) {
|
||||
PyErr_SetString(PyExc_ValueError, "end is out of bounds");
|
||||
goto bail;
|
||||
|
@ -332,7 +347,6 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
|
|||
while (1) {
|
||||
/* Find the end of the string or the next escape */
|
||||
Py_UNICODE c = 0;
|
||||
PyObject *chunk = NULL;
|
||||
for (next = end; next < len; next++) {
|
||||
c = buf[next];
|
||||
if (c == '"' || c == '\\') {
|
||||
|
@ -349,15 +363,11 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
|
|||
}
|
||||
/* Pick up this chunk if it's not zero length */
|
||||
if (next != end) {
|
||||
APPEND_OLD_CHUNK
|
||||
chunk = PyUnicode_FromUnicode(&buf[end], next - end);
|
||||
if (chunk == NULL) {
|
||||
goto bail;
|
||||
}
|
||||
if (PyList_Append(chunks, chunk)) {
|
||||
Py_DECREF(chunk);
|
||||
goto bail;
|
||||
}
|
||||
Py_DECREF(chunk);
|
||||
}
|
||||
next++;
|
||||
if (c == '"') {
|
||||
|
@ -459,27 +469,34 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next
|
|||
}
|
||||
#endif
|
||||
}
|
||||
APPEND_OLD_CHUNK
|
||||
chunk = PyUnicode_FromUnicode(&c, 1);
|
||||
if (chunk == NULL) {
|
||||
goto bail;
|
||||
}
|
||||
if (PyList_Append(chunks, chunk)) {
|
||||
Py_DECREF(chunk);
|
||||
goto bail;
|
||||
}
|
||||
Py_DECREF(chunk);
|
||||
}
|
||||
|
||||
if (chunks == NULL) {
|
||||
if (chunk != NULL)
|
||||
rval = chunk;
|
||||
else
|
||||
rval = PyUnicode_FromStringAndSize("", 0);
|
||||
}
|
||||
else {
|
||||
APPEND_OLD_CHUNK
|
||||
rval = join_list_unicode(chunks);
|
||||
if (rval == NULL) {
|
||||
goto bail;
|
||||
}
|
||||
Py_DECREF(chunks);
|
||||
Py_CLEAR(chunks);
|
||||
}
|
||||
|
||||
*next_end_ptr = end;
|
||||
return rval;
|
||||
bail:
|
||||
*next_end_ptr = -1;
|
||||
Py_XDECREF(chunks);
|
||||
Py_XDECREF(chunk);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -578,6 +595,7 @@ scanner_clear(PyObject *self)
|
|||
Py_CLEAR(s->parse_float);
|
||||
Py_CLEAR(s->parse_int);
|
||||
Py_CLEAR(s->parse_constant);
|
||||
Py_CLEAR(s->memo);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -593,10 +611,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
|
|||
Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr);
|
||||
Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1;
|
||||
PyObject *val = NULL;
|
||||
PyObject *rval = PyList_New(0);
|
||||
PyObject *rval = NULL;
|
||||
PyObject *key = NULL;
|
||||
int strict = PyObject_IsTrue(s->strict);
|
||||
int has_pairs_hook = (s->object_pairs_hook != Py_None);
|
||||
Py_ssize_t next_idx;
|
||||
|
||||
if (has_pairs_hook)
|
||||
rval = PyList_New(0);
|
||||
else
|
||||
rval = PyDict_New();
|
||||
if (rval == NULL)
|
||||
return NULL;
|
||||
|
||||
|
@ -606,6 +630,8 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
|
|||
/* only loop if the object is non-empty */
|
||||
if (idx <= end_idx && str[idx] != '}') {
|
||||
while (idx <= end_idx) {
|
||||
PyObject *memokey;
|
||||
|
||||
/* read key */
|
||||
if (str[idx] != '"') {
|
||||
raise_errmsg("Expecting property name", pystr, idx);
|
||||
|
@ -614,6 +640,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
|
|||
key = scanstring_unicode(pystr, idx + 1, strict, &next_idx);
|
||||
if (key == NULL)
|
||||
goto bail;
|
||||
memokey = PyDict_GetItem(s->memo, key);
|
||||
if (memokey != NULL) {
|
||||
Py_INCREF(memokey);
|
||||
Py_DECREF(key);
|
||||
key = memokey;
|
||||
}
|
||||
else {
|
||||
if (PyDict_SetItem(s->memo, key, key) < 0)
|
||||
goto bail;
|
||||
}
|
||||
idx = next_idx;
|
||||
|
||||
/* skip whitespace between key and : delimiter, read :, skip whitespace */
|
||||
|
@ -630,19 +666,24 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
|
|||
if (val == NULL)
|
||||
goto bail;
|
||||
|
||||
{
|
||||
PyObject *tuple = PyTuple_Pack(2, key, val);
|
||||
if (tuple == NULL)
|
||||
if (has_pairs_hook) {
|
||||
PyObject *item = PyTuple_Pack(2, key, val);
|
||||
if (item == NULL)
|
||||
goto bail;
|
||||
if (PyList_Append(rval, tuple) == -1) {
|
||||
Py_DECREF(tuple);
|
||||
goto bail;
|
||||
}
|
||||
Py_DECREF(tuple);
|
||||
}
|
||||
|
||||
Py_CLEAR(key);
|
||||
Py_CLEAR(val);
|
||||
if (PyList_Append(rval, item) == -1) {
|
||||
Py_DECREF(item);
|
||||
goto bail;
|
||||
}
|
||||
Py_DECREF(item);
|
||||
}
|
||||
else {
|
||||
if (PyDict_SetItem(rval, key, val) < 0)
|
||||
goto bail;
|
||||
Py_CLEAR(key);
|
||||
Py_CLEAR(val);
|
||||
}
|
||||
idx = next_idx;
|
||||
|
||||
/* skip whitespace before } or , */
|
||||
|
@ -672,36 +713,23 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss
|
|||
|
||||
*next_idx_ptr = idx + 1;
|
||||
|
||||
if (s->object_pairs_hook != Py_None) {
|
||||
if (has_pairs_hook) {
|
||||
val = PyObject_CallFunctionObjArgs(s->object_pairs_hook, rval, NULL);
|
||||
if (val == NULL)
|
||||
goto bail;
|
||||
Py_DECREF(rval);
|
||||
return val;
|
||||
}
|
||||
|
||||
val = PyDict_New();
|
||||
if (val == NULL)
|
||||
goto bail;
|
||||
if (PyDict_MergeFromSeq2(val, rval, 1) == -1)
|
||||
goto bail;
|
||||
Py_DECREF(rval);
|
||||
rval = val;
|
||||
|
||||
/* if object_hook is not None: rval = object_hook(rval) */
|
||||
if (s->object_hook != Py_None) {
|
||||
val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL);
|
||||
if (val == NULL)
|
||||
goto bail;
|
||||
Py_DECREF(rval);
|
||||
rval = val;
|
||||
val = NULL;
|
||||
return val;
|
||||
}
|
||||
return rval;
|
||||
bail:
|
||||
Py_XDECREF(key);
|
||||
Py_XDECREF(val);
|
||||
Py_DECREF(rval);
|
||||
Py_XDECREF(rval);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -988,6 +1016,9 @@ scanner_call(PyObject *self, PyObject *args, PyObject *kwds)
|
|||
Py_TYPE(pystr)->tp_name);
|
||||
return NULL;
|
||||
}
|
||||
PyDict_Clear(s->memo);
|
||||
if (rval == NULL)
|
||||
return NULL;
|
||||
return _build_rval_index_tuple(rval, next_idx);
|
||||
}
|
||||
|
||||
|
@ -1021,6 +1052,12 @@ scanner_init(PyObject *self, PyObject *args, PyObject *kwds)
|
|||
if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx))
|
||||
return -1;
|
||||
|
||||
if (s->memo == NULL) {
|
||||
s->memo = PyDict_New();
|
||||
if (s->memo == NULL)
|
||||
goto bail;
|
||||
}
|
||||
|
||||
/* All of these will fail "gracefully" so we don't need to verify them */
|
||||
s->strict = PyObject_GetAttrString(ctx, "strict");
|
||||
if (s->strict == NULL)
|
||||
|
|
Loading…
Reference in New Issue