From 05adfbba2abafcdd271bf144a7b3f80bcd927288 Mon Sep 17 00:00:00 2001 From: Pieter Eendebak Date: Mon, 6 May 2024 10:04:39 +0200 Subject: [PATCH] gh-95382: Improve performance of json encoder with indent (GH-118105) --- Lib/json/encoder.py | 14 +- ...4-05-03-18-01-26.gh-issue-95382.73FSEv.rst | 2 + Modules/_json.c | 136 ++++++++++++------ 3 files changed, 105 insertions(+), 47 deletions(-) create mode 100644 Misc/NEWS.d/next/Core and Builtins/2024-05-03-18-01-26.gh-issue-95382.73FSEv.rst diff --git a/Lib/json/encoder.py b/Lib/json/encoder.py index 597849eca05..323332f064e 100644 --- a/Lib/json/encoder.py +++ b/Lib/json/encoder.py @@ -244,15 +244,18 @@ class JSONEncoder(object): return text - if (_one_shot and c_make_encoder is not None - and self.indent is None): + if self.indent is None or isinstance(self.indent, str): + indent = self.indent + else: + indent = ' ' * self.indent + if _one_shot and c_make_encoder is not None: _iterencode = c_make_encoder( - markers, self.default, _encoder, self.indent, + markers, self.default, _encoder, indent, self.key_separator, self.item_separator, self.sort_keys, self.skipkeys, self.allow_nan) else: _iterencode = _make_iterencode( - markers, self.default, _encoder, self.indent, floatstr, + markers, self.default, _encoder, indent, floatstr, self.key_separator, self.item_separator, self.sort_keys, self.skipkeys, _one_shot) return _iterencode(o, 0) @@ -272,9 +275,6 @@ def _make_iterencode(markers, _default, _encoder, _indent, _floatstr, _intstr=int.__repr__, ): - if _indent is not None and not isinstance(_indent, str): - _indent = ' ' * _indent - def _iterencode_list(lst, _current_indent_level): if not lst: yield '[]' diff --git a/Misc/NEWS.d/next/Core and Builtins/2024-05-03-18-01-26.gh-issue-95382.73FSEv.rst b/Misc/NEWS.d/next/Core and Builtins/2024-05-03-18-01-26.gh-issue-95382.73FSEv.rst new file mode 100644 index 00000000000..097a663e3f5 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2024-05-03-18-01-26.gh-issue-95382.73FSEv.rst @@ -0,0 +1,2 @@ +Improve performance of :func:`json.dumps` and :func:`json.dump` when using the argument *indent*. Depending on the data the encoding using +:func:`json.dumps` with *indent* can be up to 2 to 3 times faster. diff --git a/Modules/_json.c b/Modules/_json.c index fc39f624b72..e33ef1f5eea 100644 --- a/Modules/_json.c +++ b/Modules/_json.c @@ -85,11 +85,11 @@ encoder_dealloc(PyObject *self); static int encoder_clear(PyEncoderObject *self); static int -encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *seq, Py_ssize_t indent_level); +encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *seq, PyObject *newline_indent); static int -encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *obj, Py_ssize_t indent_level); +encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *obj, PyObject *newline_indent); static int -encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *dct, Py_ssize_t indent_level); +encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, PyObject *dct, PyObject *newline_indent); static PyObject * _encoded_const(PyObject *obj); static void @@ -1251,6 +1251,17 @@ encoder_new(PyTypeObject *type, PyObject *args, PyObject *kwds) return (PyObject *)s; } +static PyObject * +_create_newline_indent(PyObject *indent, Py_ssize_t indent_level) +{ + PyObject *newline_indent = PyUnicode_FromOrdinal('\n'); + if (newline_indent != NULL && indent_level) { + PyUnicode_AppendAndDel(&newline_indent, + PySequence_Repeat(indent, indent_level)); + } + return newline_indent; +} + static PyObject * encoder_call(PyEncoderObject *self, PyObject *args, PyObject *kwds) { @@ -1267,10 +1278,20 @@ encoder_call(PyEncoderObject *self, PyObject *args, PyObject *kwds) _PyUnicodeWriter_Init(&writer); writer.overallocate = 1; - if (encoder_listencode_obj(self, &writer, obj, indent_level)) { + PyObject *newline_indent = NULL; + if (self->indent != Py_None) { + newline_indent = _create_newline_indent(self->indent, indent_level); + if (newline_indent == NULL) { + _PyUnicodeWriter_Dealloc(&writer); + return NULL; + } + } + if (encoder_listencode_obj(self, &writer, obj, newline_indent)) { _PyUnicodeWriter_Dealloc(&writer); + Py_XDECREF(newline_indent); return NULL; } + Py_XDECREF(newline_indent); result = PyTuple_New(1); if (result == NULL || @@ -1358,7 +1379,7 @@ _steal_accumulate(_PyUnicodeWriter *writer, PyObject *stolen) static int encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, - PyObject *obj, Py_ssize_t indent_level) + PyObject *obj, PyObject *newline_indent) { /* Encode Python object obj to a JSON term */ PyObject *newobj; @@ -1394,14 +1415,14 @@ encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, else if (PyList_Check(obj) || PyTuple_Check(obj)) { if (_Py_EnterRecursiveCall(" while encoding a JSON object")) return -1; - rv = encoder_listencode_list(s, writer, obj, indent_level); + rv = encoder_listencode_list(s, writer, obj, newline_indent); _Py_LeaveRecursiveCall(); return rv; } else if (PyDict_Check(obj)) { if (_Py_EnterRecursiveCall(" while encoding a JSON object")) return -1; - rv = encoder_listencode_dict(s, writer, obj, indent_level); + rv = encoder_listencode_dict(s, writer, obj, newline_indent); _Py_LeaveRecursiveCall(); return rv; } @@ -1435,7 +1456,7 @@ encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, Py_XDECREF(ident); return -1; } - rv = encoder_listencode_obj(s, writer, newobj, indent_level); + rv = encoder_listencode_obj(s, writer, newobj, newline_indent); _Py_LeaveRecursiveCall(); Py_DECREF(newobj); @@ -1456,7 +1477,9 @@ encoder_listencode_obj(PyEncoderObject *s, _PyUnicodeWriter *writer, static int encoder_encode_key_value(PyEncoderObject *s, _PyUnicodeWriter *writer, bool *first, - PyObject *key, PyObject *value, Py_ssize_t indent_level) + PyObject *key, PyObject *value, + PyObject *newline_indent, + PyObject *item_separator) { PyObject *keystr = NULL; PyObject *encoded; @@ -1493,7 +1516,7 @@ encoder_encode_key_value(PyEncoderObject *s, _PyUnicodeWriter *writer, bool *fir *first = false; } else { - if (_PyUnicodeWriter_WriteStr(writer, s->item_separator) < 0) { + if (_PyUnicodeWriter_WriteStr(writer, item_separator) < 0) { Py_DECREF(keystr); return -1; } @@ -1511,7 +1534,7 @@ encoder_encode_key_value(PyEncoderObject *s, _PyUnicodeWriter *writer, bool *fir if (_PyUnicodeWriter_WriteStr(writer, s->key_separator) < 0) { return -1; } - if (encoder_listencode_obj(s, writer, value, indent_level) < 0) { + if (encoder_listencode_obj(s, writer, value, newline_indent) < 0) { return -1; } return 0; @@ -1519,13 +1542,15 @@ encoder_encode_key_value(PyEncoderObject *s, _PyUnicodeWriter *writer, bool *fir static int encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, - PyObject *dct, Py_ssize_t indent_level) + PyObject *dct, PyObject *newline_indent) { /* Encode Python dict dct a JSON term */ PyObject *ident = NULL; PyObject *items = NULL; PyObject *key, *value; bool first = true; + PyObject *new_newline_indent = NULL; + PyObject *separator_indent = NULL; if (PyDict_GET_SIZE(dct) == 0) /* Fast path */ return _PyUnicodeWriter_WriteASCIIString(writer, "{}", 2); @@ -1549,14 +1574,21 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, if (_PyUnicodeWriter_WriteChar(writer, '{')) goto bail; + PyObject *current_item_separator = s->item_separator; // borrowed reference if (s->indent != Py_None) { - /* TODO: DOES NOT RUN */ - indent_level += 1; - /* - newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) - separator = _item_separator + newline_indent - buf += newline_indent - */ + new_newline_indent = PyUnicode_Concat(newline_indent, s->indent); + if (new_newline_indent == NULL) { + goto bail; + } + separator_indent = PyUnicode_Concat(current_item_separator, new_newline_indent); + if (separator_indent == NULL) { + goto bail; + } + // update item separator with a borrowed reference + current_item_separator = separator_indent; + if (_PyUnicodeWriter_WriteStr(writer, new_newline_indent) < 0) { + goto bail; + } } if (s->sort_keys || !PyDict_CheckExact(dct)) { @@ -1574,7 +1606,9 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, key = PyTuple_GET_ITEM(item, 0); value = PyTuple_GET_ITEM(item, 1); - if (encoder_encode_key_value(s, writer, &first, key, value, indent_level) < 0) + if (encoder_encode_key_value(s, writer, &first, key, value, + new_newline_indent, + current_item_separator) < 0) goto bail; } Py_CLEAR(items); @@ -1582,7 +1616,9 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, } else { Py_ssize_t pos = 0; while (PyDict_Next(dct, &pos, &key, &value)) { - if (encoder_encode_key_value(s, writer, &first, key, value, indent_level) < 0) + if (encoder_encode_key_value(s, writer, &first, key, value, + new_newline_indent, + current_item_separator) < 0) goto bail; } } @@ -1592,12 +1628,15 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, goto bail; Py_CLEAR(ident); } - /* TODO DOES NOT RUN; dead code if (s->indent != Py_None) { - indent_level -= 1; + Py_CLEAR(new_newline_indent); + Py_CLEAR(separator_indent); + + if (_PyUnicodeWriter_WriteStr(writer, newline_indent) < 0) { + goto bail; + } + } - yield '\n' + (' ' * (_indent * _current_indent_level)) - }*/ if (_PyUnicodeWriter_WriteChar(writer, '}')) goto bail; return 0; @@ -1605,16 +1644,20 @@ encoder_listencode_dict(PyEncoderObject *s, _PyUnicodeWriter *writer, bail: Py_XDECREF(items); Py_XDECREF(ident); + Py_XDECREF(separator_indent); + Py_XDECREF(new_newline_indent); return -1; } static int encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, - PyObject *seq, Py_ssize_t indent_level) + PyObject *seq, PyObject *newline_indent) { PyObject *ident = NULL; PyObject *s_fast = NULL; Py_ssize_t i; + PyObject *new_newline_indent = NULL; + PyObject *separator_indent = NULL; ident = NULL; s_fast = PySequence_Fast(seq, "_iterencode_list needs a sequence"); @@ -1643,22 +1686,31 @@ encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, if (_PyUnicodeWriter_WriteChar(writer, '[')) goto bail; + + PyObject *separator = s->item_separator; // borrowed reference if (s->indent != Py_None) { - /* TODO: DOES NOT RUN */ - indent_level += 1; - /* - newline_indent = '\n' + (' ' * (_indent * _current_indent_level)) - separator = _item_separator + newline_indent - buf += newline_indent - */ + new_newline_indent = PyUnicode_Concat(newline_indent, s->indent); + if (new_newline_indent == NULL) { + goto bail; + } + + if (_PyUnicodeWriter_WriteStr(writer, new_newline_indent) < 0) { + goto bail; + } + + separator_indent = PyUnicode_Concat(separator, new_newline_indent); + if (separator_indent == NULL) { + goto bail; + } + separator = separator_indent; // assign separator with borrowed reference } for (i = 0; i < PySequence_Fast_GET_SIZE(s_fast); i++) { PyObject *obj = PySequence_Fast_GET_ITEM(s_fast, i); if (i) { - if (_PyUnicodeWriter_WriteStr(writer, s->item_separator)) + if (_PyUnicodeWriter_WriteStr(writer, separator) < 0) goto bail; } - if (encoder_listencode_obj(s, writer, obj, indent_level)) + if (encoder_listencode_obj(s, writer, obj, new_newline_indent)) goto bail; } if (ident != NULL) { @@ -1667,12 +1719,14 @@ encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, Py_CLEAR(ident); } - /* TODO: DOES NOT RUN if (s->indent != Py_None) { - indent_level -= 1; + Py_CLEAR(new_newline_indent); + Py_CLEAR(separator_indent); + if (_PyUnicodeWriter_WriteStr(writer, newline_indent) < 0) { + goto bail; + } + } - yield '\n' + (' ' * (_indent * _current_indent_level)) - }*/ if (_PyUnicodeWriter_WriteChar(writer, ']')) goto bail; Py_DECREF(s_fast); @@ -1681,6 +1735,8 @@ encoder_listencode_list(PyEncoderObject *s, _PyUnicodeWriter *writer, bail: Py_XDECREF(ident); Py_DECREF(s_fast); + Py_XDECREF(separator_indent); + Py_XDECREF(new_newline_indent); return -1; } @@ -1721,7 +1777,7 @@ encoder_clear(PyEncoderObject *self) return 0; } -PyDoc_STRVAR(encoder_doc, "_iterencode(obj, _current_indent_level) -> iterable"); +PyDoc_STRVAR(encoder_doc, "Encoder(markers, default, encoder, indent, key_separator, item_separator, sort_keys, skipkeys, allow_nan)"); static PyType_Slot PyEncoderType_slots[] = { {Py_tp_doc, (void *)encoder_doc},