Issue #12778: Reduce memory consumption when JSON-encoding a large container of many small objects.
This commit is contained in:
parent
9bb6dfe0a8
commit
df7fc9dda7
|
@ -1,6 +1,7 @@
|
|||
from io import StringIO
|
||||
from test.json_tests import PyTest, CTest
|
||||
|
||||
from test.support import precisionbigmemtest, _1G
|
||||
|
||||
class TestDump:
|
||||
def test_dump(self):
|
||||
|
@ -21,4 +22,20 @@ class TestDump:
|
|||
|
||||
|
||||
class TestPyDump(TestDump, PyTest): pass
|
||||
class TestCDump(TestDump, CTest): pass
|
||||
|
||||
class TestCDump(TestDump, CTest):
|
||||
|
||||
# The size requirement here is hopefully over-estimated (actual
|
||||
# memory consumption depending on implementation details, and also
|
||||
# system memory management, since this may allocate a lot of
|
||||
# small objects).
|
||||
|
||||
@precisionbigmemtest(size=_1G, memuse=1)
|
||||
def test_large_list(self, size):
|
||||
N = int(30 * 1024 * 1024 * (size / _1G))
|
||||
l = [1] * N
|
||||
encoded = self.dumps(l)
|
||||
self.assertEqual(len(encoded), N * 3)
|
||||
self.assertEqual(encoded[:1], "[")
|
||||
self.assertEqual(encoded[-2:], "1]")
|
||||
self.assertEqual(encoded[1:-2], "1, " * (N - 1))
|
||||
|
|
|
@ -262,6 +262,9 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #12778: Reduce memory consumption when JSON-encoding a large
|
||||
container of many small objects.
|
||||
|
||||
- Issue #12650: Fix a race condition where a subprocess.Popen could leak
|
||||
resources (FD/zombie) when killed at the wrong time.
|
||||
|
||||
|
|
203
Modules/_json.c
203
Modules/_json.c
|
@ -75,6 +75,121 @@ static PyMemberDef encoder_members[] = {
|
|||
{NULL}
|
||||
};
|
||||
|
||||
/*
|
||||
* A two-level accumulator of unicode objects that avoids both the overhead
|
||||
* of keeping a huge number of small separate objects, and the quadratic
|
||||
* behaviour of using a naive repeated concatenation scheme.
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
PyObject *large; /* A list of previously accumulated large strings */
|
||||
PyObject *small; /* Pending small strings */
|
||||
} accumulator;
|
||||
|
||||
static PyObject *
|
||||
join_list_unicode(PyObject *lst)
|
||||
{
|
||||
/* return u''.join(lst) */
|
||||
static PyObject *sep = NULL;
|
||||
if (sep == NULL) {
|
||||
sep = PyUnicode_FromStringAndSize("", 0);
|
||||
if (sep == NULL)
|
||||
return NULL;
|
||||
}
|
||||
return PyUnicode_Join(sep, lst);
|
||||
}
|
||||
|
||||
static int
|
||||
init_accumulator(accumulator *acc)
|
||||
{
|
||||
acc->large = PyList_New(0);
|
||||
if (acc->large == NULL)
|
||||
return -1;
|
||||
acc->small = PyList_New(0);
|
||||
if (acc->small == NULL) {
|
||||
Py_CLEAR(acc->large);
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
flush_accumulator(accumulator *acc)
|
||||
{
|
||||
Py_ssize_t nsmall = PyList_GET_SIZE(acc->small);
|
||||
if (nsmall) {
|
||||
int ret;
|
||||
PyObject *joined = join_list_unicode(acc->small);
|
||||
if (joined == NULL)
|
||||
return -1;
|
||||
if (PyList_SetSlice(acc->small, 0, nsmall, NULL)) {
|
||||
Py_DECREF(joined);
|
||||
return -1;
|
||||
}
|
||||
ret = PyList_Append(acc->large, joined);
|
||||
Py_DECREF(joined);
|
||||
return ret;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
accumulate_unicode(accumulator *acc, PyObject *obj)
|
||||
{
|
||||
int ret;
|
||||
Py_ssize_t nsmall;
|
||||
assert(PyUnicode_Check(obj));
|
||||
|
||||
if (PyList_Append(acc->small, obj))
|
||||
return -1;
|
||||
nsmall = PyList_GET_SIZE(acc->small);
|
||||
/* Each item in a list of unicode objects has an overhead (in 64-bit
|
||||
* builds) of:
|
||||
* - 8 bytes for the list slot
|
||||
* - 56 bytes for the header of the unicode object
|
||||
* that is, 64 bytes. 100000 such objects waste more than 6MB
|
||||
* compared to a single concatenated string.
|
||||
*/
|
||||
if (nsmall < 100000)
|
||||
return 0;
|
||||
PyObject *joined = join_list_unicode(acc->small);
|
||||
if (joined == NULL)
|
||||
return -1;
|
||||
if (PyList_SetSlice(acc->small, 0, nsmall, NULL)) {
|
||||
Py_DECREF(joined);
|
||||
return -1;
|
||||
}
|
||||
ret = PyList_Append(acc->large, joined);
|
||||
Py_DECREF(joined);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
finish_accumulator(accumulator *acc)
|
||||
{
|
||||
int ret;
|
||||
PyObject *res;
|
||||
|
||||
ret = flush_accumulator(acc);
|
||||
Py_CLEAR(acc->small);
|
||||
if (ret) {
|
||||
Py_CLEAR(acc->large);
|
||||
return NULL;
|
||||
}
|
||||
res = acc->large;
|
||||
acc->large = NULL;
|
||||
return res;
|
||||
}
|
||||
|
||||
static void
|
||||
destroy_accumulator(accumulator *acc)
|
||||
{
|
||||
Py_CLEAR(acc->small);
|
||||
Py_CLEAR(acc->large);
|
||||
}
|
||||
|
||||
/* Forward decls */
|
||||
|
||||
static PyObject *
|
||||
ascii_escape_unicode(PyObject *pystr);
|
||||
static PyObject *
|
||||
|
@ -101,11 +216,11 @@ encoder_dealloc(PyObject *self);
|
|||
static int
|
||||
encoder_clear(PyObject *self);
|
||||
static int
|
||||
encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level);
|
||||
encoder_listencode_list(PyEncoderObject *s, accumulator *acc, PyObject *seq, Py_ssize_t indent_level);
|
||||
static int
|
||||
encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level);
|
||||
encoder_listencode_obj(PyEncoderObject *s, accumulator *acc, PyObject *obj, Py_ssize_t indent_level);
|
||||
static int
|
||||
encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level);
|
||||
encoder_listencode_dict(PyEncoderObject *s, accumulator *acc, PyObject *dct, Py_ssize_t indent_level);
|
||||
static PyObject *
|
||||
_encoded_const(PyObject *obj);
|
||||
static void
|
||||
|
@ -266,19 +381,6 @@ raise_errmsg(char *msg, PyObject *s, Py_ssize_t end)
|
|||
}
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
join_list_unicode(PyObject *lst)
|
||||
{
|
||||
/* return u''.join(lst) */
|
||||
static PyObject *sep = NULL;
|
||||
if (sep == NULL) {
|
||||
sep = PyUnicode_FromStringAndSize("", 0);
|
||||
if (sep == NULL)
|
||||
return NULL;
|
||||
}
|
||||
return PyUnicode_Join(sep, lst);
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
_build_rval_index_tuple(PyObject *rval, Py_ssize_t idx) {
|
||||
/* return (rval, idx) tuple, stealing reference to rval */
|
||||
|
@ -1226,22 +1328,22 @@ encoder_call(PyObject *self, PyObject *args, PyObject *kwds)
|
|||
/* Python callable interface to encode_listencode_obj */
|
||||
static char *kwlist[] = {"obj", "_current_indent_level", NULL};
|
||||
PyObject *obj;
|
||||
PyObject *rval;
|
||||
Py_ssize_t indent_level;
|
||||
PyEncoderObject *s;
|
||||
accumulator acc;
|
||||
|
||||
assert(PyEncoder_Check(self));
|
||||
s = (PyEncoderObject *)self;
|
||||
if (!PyArg_ParseTupleAndKeywords(args, kwds, "OO&:_iterencode", kwlist,
|
||||
&obj, _convertPyInt_AsSsize_t, &indent_level))
|
||||
return NULL;
|
||||
rval = PyList_New(0);
|
||||
if (rval == NULL)
|
||||
if (init_accumulator(&acc))
|
||||
return NULL;
|
||||
if (encoder_listencode_obj(s, rval, obj, indent_level)) {
|
||||
Py_DECREF(rval);
|
||||
if (encoder_listencode_obj(s, &acc, obj, indent_level)) {
|
||||
destroy_accumulator(&acc);
|
||||
return NULL;
|
||||
}
|
||||
return rval;
|
||||
return finish_accumulator(&acc);
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
|
@ -1313,18 +1415,19 @@ encoder_encode_string(PyEncoderObject *s, PyObject *obj)
|
|||
}
|
||||
|
||||
static int
|
||||
_steal_list_append(PyObject *lst, PyObject *stolen)
|
||||
_steal_accumulate(accumulator *acc, PyObject *stolen)
|
||||
{
|
||||
/* Append stolen and then decrement its reference count */
|
||||
int rval = PyList_Append(lst, stolen);
|
||||
int rval = accumulate_unicode(acc, stolen);
|
||||
Py_DECREF(stolen);
|
||||
return rval;
|
||||
}
|
||||
|
||||
static int
|
||||
encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssize_t indent_level)
|
||||
encoder_listencode_obj(PyEncoderObject *s, accumulator *acc,
|
||||
PyObject *obj, Py_ssize_t indent_level)
|
||||
{
|
||||
/* Encode Python object obj to a JSON term, rval is a PyList */
|
||||
/* Encode Python object obj to a JSON term */
|
||||
PyObject *newobj;
|
||||
int rv;
|
||||
|
||||
|
@ -1332,38 +1435,38 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi
|
|||
PyObject *cstr = _encoded_const(obj);
|
||||
if (cstr == NULL)
|
||||
return -1;
|
||||
return _steal_list_append(rval, cstr);
|
||||
return _steal_accumulate(acc, cstr);
|
||||
}
|
||||
else if (PyUnicode_Check(obj))
|
||||
{
|
||||
PyObject *encoded = encoder_encode_string(s, obj);
|
||||
if (encoded == NULL)
|
||||
return -1;
|
||||
return _steal_list_append(rval, encoded);
|
||||
return _steal_accumulate(acc, encoded);
|
||||
}
|
||||
else if (PyLong_Check(obj)) {
|
||||
PyObject *encoded = PyObject_Str(obj);
|
||||
if (encoded == NULL)
|
||||
return -1;
|
||||
return _steal_list_append(rval, encoded);
|
||||
return _steal_accumulate(acc, encoded);
|
||||
}
|
||||
else if (PyFloat_Check(obj)) {
|
||||
PyObject *encoded = encoder_encode_float(s, obj);
|
||||
if (encoded == NULL)
|
||||
return -1;
|
||||
return _steal_list_append(rval, encoded);
|
||||
return _steal_accumulate(acc, encoded);
|
||||
}
|
||||
else if (PyList_Check(obj) || PyTuple_Check(obj)) {
|
||||
if (Py_EnterRecursiveCall(" while encoding a JSON object"))
|
||||
return -1;
|
||||
rv = encoder_listencode_list(s, rval, obj, indent_level);
|
||||
rv = encoder_listencode_list(s, acc, obj, indent_level);
|
||||
Py_LeaveRecursiveCall();
|
||||
return rv;
|
||||
}
|
||||
else if (PyDict_Check(obj)) {
|
||||
if (Py_EnterRecursiveCall(" while encoding a JSON object"))
|
||||
return -1;
|
||||
rv = encoder_listencode_dict(s, rval, obj, indent_level);
|
||||
rv = encoder_listencode_dict(s, acc, obj, indent_level);
|
||||
Py_LeaveRecursiveCall();
|
||||
return rv;
|
||||
}
|
||||
|
@ -1394,7 +1497,7 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi
|
|||
|
||||
if (Py_EnterRecursiveCall(" while encoding a JSON object"))
|
||||
return -1;
|
||||
rv = encoder_listencode_obj(s, rval, newobj, indent_level);
|
||||
rv = encoder_listencode_obj(s, acc, newobj, indent_level);
|
||||
Py_LeaveRecursiveCall();
|
||||
|
||||
Py_DECREF(newobj);
|
||||
|
@ -1414,9 +1517,10 @@ encoder_listencode_obj(PyEncoderObject *s, PyObject *rval, PyObject *obj, Py_ssi
|
|||
}
|
||||
|
||||
static int
|
||||
encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ssize_t indent_level)
|
||||
encoder_listencode_dict(PyEncoderObject *s, accumulator *acc,
|
||||
PyObject *dct, Py_ssize_t indent_level)
|
||||
{
|
||||
/* Encode Python dict dct a JSON term, rval is a PyList */
|
||||
/* Encode Python dict dct a JSON term */
|
||||
static PyObject *open_dict = NULL;
|
||||
static PyObject *close_dict = NULL;
|
||||
static PyObject *empty_dict = NULL;
|
||||
|
@ -1436,7 +1540,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss
|
|||
return -1;
|
||||
}
|
||||
if (Py_SIZE(dct) == 0)
|
||||
return PyList_Append(rval, empty_dict);
|
||||
return accumulate_unicode(acc, empty_dict);
|
||||
|
||||
if (s->markers != Py_None) {
|
||||
int has_key;
|
||||
|
@ -1454,7 +1558,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss
|
|||
}
|
||||
}
|
||||
|
||||
if (PyList_Append(rval, open_dict))
|
||||
if (accumulate_unicode(acc, open_dict))
|
||||
goto bail;
|
||||
|
||||
if (s->indent != Py_None) {
|
||||
|
@ -1541,7 +1645,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss
|
|||
}
|
||||
|
||||
if (idx) {
|
||||
if (PyList_Append(rval, s->item_separator))
|
||||
if (accumulate_unicode(acc, s->item_separator))
|
||||
goto bail;
|
||||
}
|
||||
|
||||
|
@ -1549,16 +1653,16 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss
|
|||
Py_CLEAR(kstr);
|
||||
if (encoded == NULL)
|
||||
goto bail;
|
||||
if (PyList_Append(rval, encoded)) {
|
||||
if (accumulate_unicode(acc, encoded)) {
|
||||
Py_DECREF(encoded);
|
||||
goto bail;
|
||||
}
|
||||
Py_DECREF(encoded);
|
||||
if (PyList_Append(rval, s->key_separator))
|
||||
if (accumulate_unicode(acc, s->key_separator))
|
||||
goto bail;
|
||||
|
||||
value = PyTuple_GET_ITEM(item, 1);
|
||||
if (encoder_listencode_obj(s, rval, value, indent_level))
|
||||
if (encoder_listencode_obj(s, acc, value, indent_level))
|
||||
goto bail;
|
||||
idx += 1;
|
||||
Py_DECREF(item);
|
||||
|
@ -1578,7 +1682,7 @@ encoder_listencode_dict(PyEncoderObject *s, PyObject *rval, PyObject *dct, Py_ss
|
|||
|
||||
yield '\n' + (' ' * (_indent * _current_indent_level))
|
||||
}*/
|
||||
if (PyList_Append(rval, close_dict))
|
||||
if (accumulate_unicode(acc, close_dict))
|
||||
goto bail;
|
||||
return 0;
|
||||
|
||||
|
@ -1592,9 +1696,10 @@ bail:
|
|||
|
||||
|
||||
static int
|
||||
encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ssize_t indent_level)
|
||||
encoder_listencode_list(PyEncoderObject *s, accumulator *acc,
|
||||
PyObject *seq, Py_ssize_t indent_level)
|
||||
{
|
||||
/* Encode Python list seq to a JSON term, rval is a PyList */
|
||||
/* Encode Python list seq to a JSON term */
|
||||
static PyObject *open_array = NULL;
|
||||
static PyObject *close_array = NULL;
|
||||
static PyObject *empty_array = NULL;
|
||||
|
@ -1618,7 +1723,7 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss
|
|||
num_items = PySequence_Fast_GET_SIZE(s_fast);
|
||||
if (num_items == 0) {
|
||||
Py_DECREF(s_fast);
|
||||
return PyList_Append(rval, empty_array);
|
||||
return accumulate_unicode(acc, empty_array);
|
||||
}
|
||||
|
||||
if (s->markers != Py_None) {
|
||||
|
@ -1638,7 +1743,7 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss
|
|||
}
|
||||
|
||||
seq_items = PySequence_Fast_ITEMS(s_fast);
|
||||
if (PyList_Append(rval, open_array))
|
||||
if (accumulate_unicode(acc, open_array))
|
||||
goto bail;
|
||||
if (s->indent != Py_None) {
|
||||
/* TODO: DOES NOT RUN */
|
||||
|
@ -1652,10 +1757,10 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss
|
|||
for (i = 0; i < num_items; i++) {
|
||||
PyObject *obj = seq_items[i];
|
||||
if (i) {
|
||||
if (PyList_Append(rval, s->item_separator))
|
||||
if (accumulate_unicode(acc, s->item_separator))
|
||||
goto bail;
|
||||
}
|
||||
if (encoder_listencode_obj(s, rval, obj, indent_level))
|
||||
if (encoder_listencode_obj(s, acc, obj, indent_level))
|
||||
goto bail;
|
||||
}
|
||||
if (ident != NULL) {
|
||||
|
@ -1670,7 +1775,7 @@ encoder_listencode_list(PyEncoderObject *s, PyObject *rval, PyObject *seq, Py_ss
|
|||
|
||||
yield '\n' + (' ' * (_indent * _current_indent_level))
|
||||
}*/
|
||||
if (PyList_Append(rval, close_array))
|
||||
if (accumulate_unicode(acc, close_array))
|
||||
goto bail;
|
||||
Py_DECREF(s_fast);
|
||||
return 0;
|
||||
|
|
Loading…
Reference in New Issue