From fb75d015f487e50079e8d2ea7859750684b124e4 Mon Sep 17 00:00:00 2001 From: Dong-hee Na Date: Mon, 1 Aug 2022 22:15:07 +0900 Subject: [PATCH] gh-91146: Further reduce allocation size of list from str.split/rsplit (gh-95493) Co-authored-by: Inada Naoki --- ...2-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst | 2 +- Objects/unicodeobject.c | 31 +++++++++++++------ 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst b/Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst index 52568dbedd1..9172ca298e8 100644 --- a/Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst +++ b/Misc/NEWS.d/next/Core and Builtins/2022-07-31-03-22-58.gh-issue-91146.Y2Hziy.rst @@ -1,2 +1,2 @@ Reduce allocation size of :class:`list` from :meth:`str.split` -and :meth:`str.rsplit`. Patch by Dong-hee Na. +and :meth:`str.rsplit`. Patch by Dong-hee Na and Inada Naoki. diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 355d74fe3bb..7ff79953257 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -9698,11 +9698,11 @@ split(PyObject *self, PyObject* out; len1 = PyUnicode_GET_LENGTH(self); kind1 = PyUnicode_KIND(self); - if (maxcount < 0) { - maxcount = len1; - } - if (substring == NULL) + if (substring == NULL) { + if (maxcount < 0) { + maxcount = (len1 - 1) / 2 + 1; + } switch (kind1) { case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self)) @@ -9728,9 +9728,16 @@ split(PyObject *self, default: Py_UNREACHABLE(); } + } kind2 = PyUnicode_KIND(substring); len2 = PyUnicode_GET_LENGTH(substring); + if (maxcount < 0) { + // if len2 == 0, it will raise ValueError. + maxcount = len2 == 0 ? 0 : (len1 / len2) + 1; + // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1 + maxcount = maxcount < 0 ? 
len1 : maxcount; + } if (kind1 < kind2 || len1 < len2) { out = PyList_New(1); if (out == NULL) @@ -9785,11 +9792,11 @@ rsplit(PyObject *self, len1 = PyUnicode_GET_LENGTH(self); kind1 = PyUnicode_KIND(self); - if (maxcount < 0) { - maxcount = len1; - } - if (substring == NULL) + if (substring == NULL) { + if (maxcount < 0) { + maxcount = (len1 - 1) / 2 + 1; + } switch (kind1) { case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self)) @@ -9815,9 +9822,15 @@ rsplit(PyObject *self, default: Py_UNREACHABLE(); } - + } kind2 = PyUnicode_KIND(substring); len2 = PyUnicode_GET_LENGTH(substring); + if (maxcount < 0) { + // if len2 == 0, it will raise ValueError. + maxcount = len2 == 0 ? 0 : (len1 / len2) + 1; + // handle expected overflow case: (Py_SSIZE_T_MAX / 1) + 1 + maxcount = maxcount < 0 ? len1 : maxcount; + } if (kind1 < kind2 || len1 < len2) { out = PyList_New(1); if (out == NULL)