From 7d6e076f6d8dd48cfd748b02dad17dbeb0b346a3 Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Sat, 4 Sep 2010 20:16:53 +0000 Subject: [PATCH] Issue #7451: Improve decoding performance of JSON objects, and reduce the memory consumption of said decoded objects when they use the same strings as keys. --- Lib/json/decoder.py | 10 ++- Lib/json/scanner.py | 10 ++- Lib/json/tests/test_decode.py | 28 ++++++++ Misc/NEWS | 4 ++ Modules/_json.c | 129 ++++++++++++++++++++++------------ 5 files changed, 132 insertions(+), 49 deletions(-) diff --git a/Lib/json/decoder.py b/Lib/json/decoder.py index 3e7405b220c..6596154d873 100644 --- a/Lib/json/decoder.py +++ b/Lib/json/decoder.py @@ -147,10 +147,14 @@ WHITESPACE_STR = ' \t\n\r' def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook, - _w=WHITESPACE.match, _ws=WHITESPACE_STR): + memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR): s, end = s_and_end pairs = [] pairs_append = pairs.append + # Backwards compatibility + if memo is None: + memo = {} + memo_get = memo.setdefault # Use a slice to prevent IndexError from being raised, the following # check will raise a more specific ValueError if the string is empty nextchar = s[end:end + 1] @@ -167,6 +171,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook, end += 1 while True: key, end = scanstring(s, end, strict) + key = memo_get(key, key) # To skip some function call overhead we optimize the fast paths where # the JSON key separator is ": " or just ":". if s[end:end + 1] != ':': @@ -214,7 +219,7 @@ def JSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook, pairs = object_hook(pairs) return pairs, end -def JSONArray(s_and_end, scan_once, context, _w=WHITESPACE.match): +def JSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR): s, end = s_and_end values = [] nextchar = s[end:end + 1] @@ -314,6 +319,7 @@ class JSONDecoder(object): self.parse_object = JSONObject self.parse_array = JSONArray self.parse_string = scanstring + self.memo = {} self.scan_once = make_scanner(self) diff --git a/Lib/json/scanner.py b/Lib/json/scanner.py index b4f356146fe..23eef61b94b 100644 --- a/Lib/json/scanner.py +++ b/Lib/json/scanner.py @@ -22,6 +22,8 @@ def py_make_scanner(context): parse_int = context.parse_int parse_constant = context.parse_constant object_hook = context.object_hook + object_pairs_hook = context.object_pairs_hook + memo = context.memo def _scan_once(string, idx): try: @@ -33,7 +35,7 @@ def py_make_scanner(context): return parse_string(string, idx + 1, strict) elif nextchar == '{': return parse_object((string, idx + 1), strict, - _scan_once, object_hook, object_pairs_hook) + _scan_once, object_hook, object_pairs_hook, memo) elif nextchar == '[': return parse_array((string, idx + 1), _scan_once) elif nextchar == 'n' and string[idx:idx + 4] == 'null': @@ -60,6 +62,12 @@ def py_make_scanner(context): else: raise StopIteration + def scan_once(string, idx): + try: + return _scan_once(string, idx) + finally: + memo.clear() + return _scan_once make_scanner = c_make_scanner or py_make_scanner diff --git a/Lib/json/tests/test_decode.py b/Lib/json/tests/test_decode.py index 4610c6cadbd..beb82a7f44e 100644 --- a/Lib/json/tests/test_decode.py +++ b/Lib/json/tests/test_decode.py @@ -1,10 +1,25 @@ import decimal from unittest import TestCase from io import StringIO +from contextlib import contextmanager import json +import json.decoder +import json.scanner from collections import OrderedDict + +@contextmanager +def use_python_scanner(): + py_scanner = json.scanner.py_make_scanner + old_scanner = json.decoder.make_scanner + json.decoder.make_scanner = py_scanner + try: + yield + finally: + json.decoder.make_scanner = old_scanner + + class TestDecode(TestCase): def test_decimal(self): rval = json.loads('1.1', parse_float=decimal.Decimal) @@ -39,3 +54,16 @@ class TestDecode(TestCase): # exercise the uncommon cases. The array cases are already covered. rval = json.loads('{ "key" : "value" , "k":"v" }') self.assertEquals(rval, {"key":"value", "k":"v"}) + + def check_keys_reuse(self, source, loads): + rval = loads(source) + (a, b), (c, d) = sorted(rval[0]), sorted(rval[1]) + self.assertIs(a, c) + self.assertIs(b, d) + + def test_keys_reuse(self): + s = '[{"a_key": 1, "b_\xe9": 2}, {"a_key": 3, "b_\xe9": 4}]' + self.check_keys_reuse(s, json.loads) + # Disabled: the pure Python version of json simply doesn't work + with use_python_scanner(): + self.check_keys_reuse(s, json.decoder.JSONDecoder().decode) diff --git a/Misc/NEWS b/Misc/NEWS index 77e9dd5a3d5..f413da2e5a6 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -165,6 +165,10 @@ Extensions Library ------- +- Issue #7451: Improve decoding performance of JSON objects, and reduce + the memory consumption of said decoded objects when they use the same + strings as keys. + - Issue #1100562: Fix deep-copying of objects derived from the list and dict types. Patch by Michele Orrù and Björn Lindqvist. diff --git a/Modules/_json.c b/Modules/_json.c index 2e395255f0a..bbaf7dd4a71 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -36,6 +36,7 @@ typedef struct _PyScannerObject { PyObject *parse_float; PyObject *parse_int; PyObject *parse_constant; + PyObject *memo; } PyScannerObject; static PyMemberDef scanner_members[] = { @@ -305,6 +306,21 @@ _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) { return tpl; } +#define APPEND_OLD_CHUNK \ + if (chunk != NULL) { \ + if (chunks == NULL) { \ + chunks = PyList_New(0); \ + if (chunks == NULL) { \ + goto bail; \ + } \ + } \ + if (PyList_Append(chunks, chunk)) { \ + Py_DECREF(chunk); \ + goto bail; \ + } \ + Py_CLEAR(chunk); \ + } + static PyObject * scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next_end_ptr) { @@ -316,15 +332,14 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next Return value is a new PyUnicode */ - PyObject *rval; + PyObject *rval = NULL; Py_ssize_t len = PyUnicode_GET_SIZE(pystr); Py_ssize_t begin = end - 1; Py_ssize_t next = begin; const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr); - PyObject *chunks = PyList_New(0); - if (chunks == NULL) { - goto bail; - } + PyObject *chunks = NULL; + PyObject *chunk = NULL; + if (end < 0 || len <= end) { PyErr_SetString(PyExc_ValueError, "end is out of bounds"); goto bail; @@ -332,7 +347,6 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next while (1) { /* Find the end of the string or the next escape */ Py_UNICODE c = 0; - PyObject *chunk = NULL; for (next = end; next < len; next++) { c = buf[next]; if (c == '"' || c == '\\') { @@ -349,15 +363,11 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next } /* Pick up this chunk if it's not zero length */ if (next != end) { + APPEND_OLD_CHUNK chunk = PyUnicode_FromUnicode(&buf[end], next - end); if (chunk == NULL) { goto bail; } - if (PyList_Append(chunks, chunk)) { - Py_DECREF(chunk); - goto bail; - } - Py_DECREF(chunk); } next++; if (c == '"') { @@ -459,27 +469,34 @@ scanstring_unicode(PyObject *pystr, Py_ssize_t end, int strict, Py_ssize_t *next } #endif } + APPEND_OLD_CHUNK chunk = PyUnicode_FromUnicode(&c, 1); if (chunk == NULL) { goto bail; } - if (PyList_Append(chunks, chunk)) { - Py_DECREF(chunk); - goto bail; - } - Py_DECREF(chunk); } - rval = join_list_unicode(chunks); - if (rval == NULL) { - goto bail; + if (chunks == NULL) { + if (chunk != NULL) + rval = chunk; + else + rval = PyUnicode_FromStringAndSize("", 0); } - Py_DECREF(chunks); + else { + APPEND_OLD_CHUNK + rval = join_list_unicode(chunks); + if (rval == NULL) { + goto bail; + } + Py_CLEAR(chunks); + } + *next_end_ptr = end; return rval; bail: *next_end_ptr = -1; Py_XDECREF(chunks); + Py_XDECREF(chunk); return NULL; } @@ -578,6 +595,7 @@ scanner_clear(PyObject *self) Py_CLEAR(s->parse_float); Py_CLEAR(s->parse_int); Py_CLEAR(s->parse_constant); + Py_CLEAR(s->memo); return 0; } @@ -593,10 +611,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; PyObject *val = NULL; - PyObject *rval = PyList_New(0); + PyObject *rval = NULL; PyObject *key = NULL; int strict = PyObject_IsTrue(s->strict); + int has_pairs_hook = (s->object_pairs_hook != Py_None); Py_ssize_t next_idx; + + if (has_pairs_hook) + rval = PyList_New(0); + else + rval = PyDict_New(); if (rval == NULL) return NULL; @@ -606,6 +630,8 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss /* only loop if the object is non-empty */ if (idx <= end_idx && str[idx] != '}') { while (idx <= end_idx) { + PyObject *memokey; + /* read key */ if (str[idx] != '"') { raise_errmsg("Expecting property name", pystr, idx); @@ -614,6 +640,16 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss key = scanstring_unicode(pystr, idx + 1, strict, &next_idx); if (key == NULL) goto bail; + memokey = PyDict_GetItem(s->memo, key); + if (memokey != NULL) { + Py_INCREF(memokey); + Py_DECREF(key); + key = memokey; + } + else { + if (PyDict_SetItem(s->memo, key, key) < 0) + goto bail; + } idx = next_idx; /* skip whitespace between key and : delimiter, read :, skip whitespace */ @@ -630,19 +666,24 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss if (val == NULL) goto bail; - { - PyObject *tuple = PyTuple_Pack(2, key, val); - if (tuple == NULL) + if (has_pairs_hook) { + PyObject *item = PyTuple_Pack(2, key, val); + if (item == NULL) goto bail; - if (PyList_Append(rval, tuple) == -1) { - Py_DECREF(tuple); + Py_CLEAR(key); + Py_CLEAR(val); + if (PyList_Append(rval, item) == -1) { + Py_DECREF(item); goto bail; } - Py_DECREF(tuple); + Py_DECREF(item); + } + else { + if (PyDict_SetItem(rval, key, val) < 0) + goto bail; + Py_CLEAR(key); + Py_CLEAR(val); } - - Py_CLEAR(key); - Py_CLEAR(val); idx = next_idx; /* skip whitespace before } or , */ @@ -672,36 +713,23 @@ _parse_object_unicode(PyScannerObject *s, PyObject *pystr, Py_ssize_t idx, Py_ss *next_idx_ptr = idx + 1; - if (s->object_pairs_hook != Py_None) { + if (has_pairs_hook) { val = PyObject_CallFunctionObjArgs(s->object_pairs_hook, rval, NULL); - if (val == NULL) - goto bail; Py_DECREF(rval); return val; } - val = PyDict_New(); - if (val == NULL) - goto bail; - if (PyDict_MergeFromSeq2(val, rval, 1) == -1) - goto bail; - Py_DECREF(rval); - rval = val; - /* if object_hook is not None: rval = object_hook(rval) */ if (s->object_hook != Py_None) { val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL); - if (val == NULL) - goto bail; Py_DECREF(rval); - rval = val; - val = NULL; + return val; } return rval; bail: Py_XDECREF(key); Py_XDECREF(val); - Py_DECREF(rval); + Py_XDECREF(rval); return NULL; } @@ -988,6 +1016,9 @@ scanner_call(PyObject *self, PyObject *args, PyObject *kwds) Py_TYPE(pystr)->tp_name); return NULL; } + PyDict_Clear(s->memo); + if (rval == NULL) + return NULL; return _build_rval_index_tuple(rval, next_idx); } @@ -1021,6 +1052,12 @@ scanner_init(PyObject *self, PyObject *args, PyObject *kwds) if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx)) return -1; + if (s->memo == NULL) { + s->memo = PyDict_New(); + if (s->memo == NULL) + goto bail; + } + /* All of these will fail "gracefully" so we don't need to verify them */ s->strict = PyObject_GetAttrString(ctx, "strict"); if (s->strict == NULL)