diff --git a/Lib/test/json_tests/test_dump.py b/Lib/test/json_tests/test_dump.py index 083c11f08c8..f1c1b0478ea 100644 --- a/Lib/test/json_tests/test_dump.py +++ b/Lib/test/json_tests/test_dump.py @@ -1,6 +1,7 @@ from io import StringIO from test.json_tests import PyTest, CTest +from test.support import precisionbigmemtest, _1G class TestDump: def test_dump(self): @@ -21,4 +22,20 @@ class TestDump: class TestPyDump(TestDump, PyTest): pass -class TestCDump(TestDump, CTest): pass + +class TestCDump(TestDump, CTest): + + # The size requirement here is hopefully over-estimated (actual + # memory consumption depending on implementation details, and also + # system memory management, since this may allocate a lot of + # small objects). + + @precisionbigmemtest(size=_1G, memuse=1) + def test_large_list(self, size): + N = int(30 * 1024 * 1024 * (size / _1G)) + l = [1] * N + encoded = self.dumps(l) + self.assertEqual(len(encoded), N * 3) + self.assertEqual(encoded[:1], "[") + self.assertEqual(encoded[-2:], "1]") + self.assertEqual(encoded[1:-2], "1, " * (N - 1)) diff --git a/Misc/NEWS b/Misc/NEWS index b96bb3da17c..36c9b8b181d 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -262,6 +262,9 @@ Core and Builtins Library ------- +- Issue #12778: Reduce memory consumption when JSON-encoding a large + container of many small objects. + - Issue #12650: Fix a race condition where a subprocess.Popen could leak resources (FD/zombie) when killed at the wrong time. diff --git a/Modules/_json.c b/Modules/_json.c index a392c973266..7c5e5e43ed8 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -75,6 +75,121 @@ static PyMemberDef encoder_members[] = { {NULL} }; +/* + * A two-level accumulator of unicode objects that avoids both the overhead + * of keeping a huge number of small separate objects, and the quadratic + * behaviour of using a naive repeated concatenation scheme. + */ + +typedef struct { + PyObject *large; /* A list of previously accumulated large strings */ + PyObject *small; /* Pending small strings */ +} accumulator; + +static PyObject * +join_list_unicode(PyObject *lst) +{ + /* return u''.join(lst) */ + static PyObject *sep = NULL; + if (sep == NULL) { + sep = PyUnicode_FromStringAndSize("", 0); + if (sep == NULL) + return NULL; + } + return PyUnicode_Join(sep, lst); +} + +static int +init_accumulator(accumulator *acc) +{ + acc->large = PyList_New(0); + if (acc->large == NULL) + return -1; + acc->small = PyList_New(0); + if (acc->small == NULL) { + Py_CLEAR(acc->large); + return -1; + } + return 0; +} + +static int +flush_accumulator(accumulator *acc) +{ + Py_ssize_t nsmall = PyList_GET_SIZE(acc->small); + if (nsmall) { + int ret; + PyObject *joined = join_list_unicode(acc->small); + if (joined == NULL) + return -1; + if (PyList_SetSlice(acc->small, 0, nsmall, NULL)) { + Py_DECREF(joined); + return -1; + } + ret = PyList_Append(acc->large, joined); + Py_DECREF(joined); + return ret; + } + return 0; +} + +static int +accumulate_unicode(accumulator *acc, PyObject *obj) +{ + int ret; + Py_ssize_t nsmall; + assert(PyUnicode_Check(obj)); + + if (PyList_Append(acc->small, obj)) + return -1; + nsmall = PyList_GET_SIZE(acc->small); + /* Each item in a list of unicode objects has an overhead (in 64-bit + * builds) of: + * - 8 bytes for the list slot + * - 56 bytes for the header of the unicode object + * that is, 64 bytes. 100000 such objects waste more than 6MB + * compared to a single concatenated string. + */ + if (nsmall < 100000) + return 0; + PyObject *joined = join_list_unicode(acc->small); + if (joined == NULL) + return -1; + if (PyList_SetSlice(acc->small, 0, nsmall, NULL)) { + Py_DECREF(joined); + return -1; + } + ret = PyList_Append(acc->large, joined); + Py_DECREF(joined); + return ret; +} + +static PyObject * +finish_accumulator(accumulator *acc) +{ + int ret; + PyObject *res; + + ret = flush_accumulator(acc); + Py_CLEAR(acc->small); + if (ret) { + Py_CLEAR(acc->large); + return NULL; + } + res = acc->large; + acc->large = NULL; + return res; +} + +static void +destroy_accumulator(accumulator *acc) +{ + Py_CLEAR(acc->small); + Py_CLEAR(acc->large); +} + +/* Forward decls */ + static PyObject * ascii_escape_unicode(PyObject *pystr); static PyObject * @@ -101,11 +216,11 @@ encoder_dealloc(PyObject *self); static int encoder_clear(PyObject *self); static int -encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level); +encoder_listencode_list(PyEncoderObject *s, accumulator *acc, PyObject *seq, Py_ssize_t indent_level); static int -encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level); +encoder_listencode_obj(PyEncoderObject *s, accumulator *acc, PyObject *obj, Py_ssize_t indent_level); static int -encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level); +encoder_listencode_dict(PyEncoderObject *s, accumulator *acc, PyObject *dct, Py_ssize_t indent_level); static PyObject * _encoded_const(PyObject *obj); static void @@ -266,19 +381,6 @@ raise_errmsg(char *msg, PyObject *s, Py_ssize_t end) } } -static PyObject * -join_list_unicode(PyObject *lst) -{ - /* return u''.join(lst) */ - static PyObject *sep = NULL; - if (sep == NULL) { - sep = PyUnicode_FromStringAndSize("", 0); - if (sep == NULL) - return NULL; - } - return PyUnicode_Join(sep, lst); -} - static PyObject * _build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) { /* return (rval, idx) tuple, stealing reference to rval */ @@ -1226,22 +1328,22 @@ encoder_call(PyObject *self, PyObject *args, PyObject *kwds) /* Python callable interface to encode_listencode_obj */ static char *kwlist[] = {"obj", "_current_indent_level", NULL}; PyObject *obj; - PyObject *rval; Py_ssize_t indent_level; PyEncoderObject *s; + accumulator acc; + assert(PyEncoder_Check(self)); s = (PyEncoderObject *)self; if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:_iterencode", kwlist, &obj, _convertPyInt_AsSsize_t, &indent_level)) return NULL; - rval = PyList_New(0); - if (rval == NULL) + if (init_accumulator(&acc)) return NULL; - if (encoder_listencode_obj(s, rval, obj, indent_level)) { - Py_DECREF(rval); + if (encoder_listencode_obj(s, &acc, obj, indent_level)) { + destroy_accumulator(&acc); return NULL; } - return rval; + return finish_accumulator(&acc); } static PyObject * @@ -1313,18 +1415,19 @@ encoder_encode_string(PyEncoderObject *s, PyObject *obj) } static int -_steal_list_append(PyObject *lst, PyObject *stolen) +_steal_accumulate(accumulator *acc, PyObject *stolen) { /* Append stolen and then decrement its reference count */ - int rval = PyList_Append(lst, stolen); + int rval = accumulate_unicode(acc, stolen); Py_DECREF(stolen); return rval; } static int -encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level) +encoder_listencode_obj(PyEncoderObject *s, accumulator *acc, + PyObject *obj, Py_ssize_t indent_level) { - /* Encode Python object obj to a JSON term, rval is a PyList */ + /* Encode Python object obj to a JSON term */ PyObject *newobj; int rv; @@ -1332,38 +1435,38 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi PyObject *cstr = _encoded_const(obj); if (cstr == NULL) return -1; - return _steal_list_append(rval, cstr); + return _steal_accumulate(acc, cstr); } else if (PyUnicode_Check(obj)) { PyObject *encoded = encoder_encode_string(s, obj); if (encoded == NULL) return -1; - return _steal_list_append(rval, encoded); + return _steal_accumulate(acc, encoded); } else if (PyLong_Check(obj)) { PyObject *encoded = PyObject_Str(obj); if (encoded == NULL) return -1; - return _steal_list_append(rval, encoded); + return _steal_accumulate(acc, encoded); } else if (PyFloat_Check(obj)) { PyObject *encoded = encoder_encode_float(s, obj); if (encoded == NULL) return -1; - return _steal_list_append(rval, encoded); + return _steal_accumulate(acc, encoded); } else if (PyList_Check(obj) || PyTuple_Check(obj)) { if (Py_EnterRecursiveCall(" while encoding a JSON object")) return -1; - rv = encoder_listencode_list(s, rval, obj, indent_level); + rv = encoder_listencode_list(s, acc, obj, indent_level); Py_LeaveRecursiveCall(); return rv; } else if (PyDict_Check(obj)) { if (Py_EnterRecursiveCall(" while encoding a JSON object")) return -1; - rv = encoder_listencode_dict(s, rval, obj, indent_level); + rv = encoder_listencode_dict(s, acc, obj, indent_level); Py_LeaveRecursiveCall(); return rv; } @@ -1394,7 +1497,7 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi if (Py_EnterRecursiveCall(" while encoding a JSON object")) return -1; - rv = encoder_listencode_obj(s, rval, newobj, indent_level); + rv = encoder_listencode_obj(s, acc, newobj, indent_level); Py_LeaveRecursiveCall(); Py_DECREF(newobj); @@ -1414,9 +1517,10 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi } static int -encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level) +encoder_listencode_dict(PyEncoderObject *s, accumulator *acc, + PyObject *dct, Py_ssize_t indent_level) { - /* Encode Python dict dct a JSON term, rval is a PyList */ + /* Encode Python dict dct a JSON term */ static PyObject *open_dict = NULL; static PyObject *close_dict = NULL; static PyObject *empty_dict = NULL; @@ -1436,7 +1540,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss return -1; } if (Py_SIZE(dct) == 0) - return PyList_Append(rval, empty_dict); + return accumulate_unicode(acc, empty_dict); if (s->markers != Py_None) { int has_key; @@ -1454,7 +1558,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss } } - if (PyList_Append(rval, open_dict)) + if (accumulate_unicode(acc, open_dict)) goto bail; if (s->indent != Py_None) { @@ -1541,7 +1645,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss } if (idx) { - if (PyList_Append(rval, s->item_separator)) + if (accumulate_unicode(acc, s->item_separator)) goto bail; } @@ -1549,16 +1653,16 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss Py_CLEAR(kstr); if (encoded == NULL) goto bail; - if (PyList_Append(rval, encoded)) { + if (accumulate_unicode(acc, encoded)) { Py_DECREF(encoded); goto bail; } Py_DECREF(encoded); - if (PyList_Append(rval, s->key_separator)) + if (accumulate_unicode(acc, s->key_separator)) goto bail; value = PyTuple_GET_ITEM(item, 1); - if (encoder_listencode_obj(s, rval, value, indent_level)) + if (encoder_listencode_obj(s, acc, value, indent_level)) goto bail; idx += 1; Py_DECREF(item); @@ -1578,7 +1682,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss yield '\n' + (' ' * (_indent * _current_indent_level)) }*/ - if (PyList_Append(rval, close_dict)) + if (accumulate_unicode(acc, close_dict)) goto bail; return 0; @@ -1592,9 +1696,10 @@ bail: static int -encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level) +encoder_listencode_list(PyEncoderObject *s, accumulator *acc, + PyObject *seq, Py_ssize_t indent_level) { - /* Encode Python list seq to a JSON term, rval is a PyList */ + /* Encode Python list seq to a JSON term */ static PyObject *open_array = NULL; static PyObject *close_array = NULL; static PyObject *empty_array = NULL; @@ -1618,7 +1723,7 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss num_items = PySequence_Fast_GET_SIZE(s_fast); if (num_items == 0) { Py_DECREF(s_fast); - return PyList_Append(rval, empty_array); + return accumulate_unicode(acc, empty_array); } if (s->markers != Py_None) { @@ -1638,7 +1743,7 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss } seq_items = PySequence_Fast_ITEMS(s_fast); - if (PyList_Append(rval, open_array)) + if (accumulate_unicode(acc, open_array)) goto bail; if (s->indent != Py_None) { /* TODO: DOES NOT RUN */ @@ -1652,10 +1757,10 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss for (i = 0; i < num_items; i++) { PyObject *obj = seq_items[i]; if (i) { - if (PyList_Append(rval, s->item_separator)) + if (accumulate_unicode(acc, s->item_separator)) goto bail; } - if (encoder_listencode_obj(s, rval, obj, indent_level)) + if (encoder_listencode_obj(s, acc, obj, indent_level)) goto bail; } if (ident != NULL) { @@ -1670,7 +1775,7 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss yield '\n' + (' ' * (_indent * _current_indent_level)) }*/ - if (PyList_Append(rval, close_array)) + if (accumulate_unicode(acc, close_array)) goto bail; Py_DECREF(s_fast); return 0;