Issue #16061: Speed up str.replace() for replacing 1-character strings.
This commit is contained in:
parent
a707f299cb
commit
e2cef885a2
|
@ -726,6 +726,7 @@ UNICODE_DEPS = \
|
||||||
$(srcdir)/Objects/stringlib/find_max_char.h \
|
$(srcdir)/Objects/stringlib/find_max_char.h \
|
||||||
$(srcdir)/Objects/stringlib/localeutil.h \
|
$(srcdir)/Objects/stringlib/localeutil.h \
|
||||||
$(srcdir)/Objects/stringlib/partition.h \
|
$(srcdir)/Objects/stringlib/partition.h \
|
||||||
|
$(srcdir)/Objects/stringlib/replace.h \
|
||||||
$(srcdir)/Objects/stringlib/split.h \
|
$(srcdir)/Objects/stringlib/split.h \
|
||||||
$(srcdir)/Objects/stringlib/ucs1lib.h \
|
$(srcdir)/Objects/stringlib/ucs1lib.h \
|
||||||
$(srcdir)/Objects/stringlib/ucs2lib.h \
|
$(srcdir)/Objects/stringlib/ucs2lib.h \
|
||||||
|
|
|
@ -10,6 +10,8 @@ What's New in Python 3.4.0 Alpha 1?
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #16061: Speed up str.replace() for replacing 1-character strings.
|
||||||
|
|
||||||
- Issue #17715: Fix segmentation fault from raising an exception in a __trunc__
|
- Issue #17715: Fix segmentation fault from raising an exception in a __trunc__
|
||||||
method.
|
method.
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,53 @@
|
||||||
|
/* stringlib: replace implementation */
|
||||||
|
|
||||||
|
#ifndef STRINGLIB_FASTSEARCH_H
|
||||||
|
#error must include "stringlib/fastsearch.h" before including this module
|
||||||
|
#endif
|
||||||
|
|
||||||
|
Py_LOCAL_INLINE(void)
|
||||||
|
STRINGLIB(replace_1char_inplace)(STRINGLIB_CHAR* s, STRINGLIB_CHAR* end,
|
||||||
|
Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
|
||||||
|
{
|
||||||
|
*s = u2;
|
||||||
|
while (--maxcount && ++s != end) {
|
||||||
|
/* Find the next character to be replaced.
|
||||||
|
|
||||||
|
If it occurs often, it is faster to scan for it using an inline
|
||||||
|
loop. If it occurs seldom, it is faster to scan for it using a
|
||||||
|
function call; the overhead of the function call is amortized
|
||||||
|
across the many characters that call covers. We start with an
|
||||||
|
inline loop and use a heuristic to determine whether to fall back
|
||||||
|
to a function call. */
|
||||||
|
if (*s != u1) {
|
||||||
|
int attempts = 10;
|
||||||
|
/* search u1 in a dummy loop */
|
||||||
|
while (1) {
|
||||||
|
if (++s == end)
|
||||||
|
return;
|
||||||
|
if (*s == u1)
|
||||||
|
break;
|
||||||
|
if (!--attempts) {
|
||||||
|
/* if u1 was not found for attempts iterations,
|
||||||
|
use FASTSEARCH() or memchr() */
|
||||||
|
#if STRINGLIB_SIZEOF_CHAR == 1
|
||||||
|
s++;
|
||||||
|
s = memchr(s, u1, end - s);
|
||||||
|
if (s == NULL)
|
||||||
|
return;
|
||||||
|
#else
|
||||||
|
Py_ssize_t i;
|
||||||
|
STRINGLIB_CHAR ch1 = (STRINGLIB_CHAR) u1;
|
||||||
|
s++;
|
||||||
|
i = FASTSEARCH(s, end - s, &ch1, 1, 0, FAST_SEARCH);
|
||||||
|
if (i < 0)
|
||||||
|
return;
|
||||||
|
s += i;
|
||||||
|
#endif
|
||||||
|
/* restart the dummy loop */
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*s = u2;
|
||||||
|
}
|
||||||
|
}
|
|
@ -605,6 +605,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
|
||||||
#include "stringlib/split.h"
|
#include "stringlib/split.h"
|
||||||
#include "stringlib/count.h"
|
#include "stringlib/count.h"
|
||||||
#include "stringlib/find.h"
|
#include "stringlib/find.h"
|
||||||
|
#include "stringlib/replace.h"
|
||||||
#include "stringlib/find_max_char.h"
|
#include "stringlib/find_max_char.h"
|
||||||
#include "stringlib/localeutil.h"
|
#include "stringlib/localeutil.h"
|
||||||
#include "stringlib/undef.h"
|
#include "stringlib/undef.h"
|
||||||
|
@ -615,6 +616,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
|
||||||
#include "stringlib/split.h"
|
#include "stringlib/split.h"
|
||||||
#include "stringlib/count.h"
|
#include "stringlib/count.h"
|
||||||
#include "stringlib/find.h"
|
#include "stringlib/find.h"
|
||||||
|
#include "stringlib/replace.h"
|
||||||
#include "stringlib/find_max_char.h"
|
#include "stringlib/find_max_char.h"
|
||||||
#include "stringlib/localeutil.h"
|
#include "stringlib/localeutil.h"
|
||||||
#include "stringlib/undef.h"
|
#include "stringlib/undef.h"
|
||||||
|
@ -625,6 +627,7 @@ make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
|
||||||
#include "stringlib/split.h"
|
#include "stringlib/split.h"
|
||||||
#include "stringlib/count.h"
|
#include "stringlib/count.h"
|
||||||
#include "stringlib/find.h"
|
#include "stringlib/find.h"
|
||||||
|
#include "stringlib/replace.h"
|
||||||
#include "stringlib/find_max_char.h"
|
#include "stringlib/find_max_char.h"
|
||||||
#include "stringlib/localeutil.h"
|
#include "stringlib/localeutil.h"
|
||||||
#include "stringlib/undef.h"
|
#include "stringlib/undef.h"
|
||||||
|
@ -9927,6 +9930,31 @@ anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
replace_1char_inplace(PyObject *u, Py_ssize_t pos,
|
||||||
|
Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
|
||||||
|
{
|
||||||
|
int kind = PyUnicode_KIND(u);
|
||||||
|
void *data = PyUnicode_DATA(u);
|
||||||
|
Py_ssize_t len = PyUnicode_GET_LENGTH(u);
|
||||||
|
if (kind == PyUnicode_1BYTE_KIND) {
|
||||||
|
ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
|
||||||
|
(Py_UCS1 *)data + len,
|
||||||
|
u1, u2, maxcount);
|
||||||
|
}
|
||||||
|
else if (kind == PyUnicode_2BYTE_KIND) {
|
||||||
|
ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
|
||||||
|
(Py_UCS2 *)data + len,
|
||||||
|
u1, u2, maxcount);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
assert(kind == PyUnicode_4BYTE_KIND);
|
||||||
|
ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
|
||||||
|
(Py_UCS4 *)data + len,
|
||||||
|
u1, u2, maxcount);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
replace(PyObject *self, PyObject *str1,
|
replace(PyObject *self, PyObject *str1,
|
||||||
PyObject *str2, Py_ssize_t maxcount)
|
PyObject *str2, Py_ssize_t maxcount)
|
||||||
|
@ -9943,7 +9971,7 @@ replace(PyObject *self, PyObject *str1,
|
||||||
Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
|
Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
|
||||||
Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
|
Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
|
||||||
int mayshrink;
|
int mayshrink;
|
||||||
Py_UCS4 maxchar, maxchar_str2;
|
Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
|
||||||
|
|
||||||
if (maxcount < 0)
|
if (maxcount < 0)
|
||||||
maxcount = PY_SSIZE_T_MAX;
|
maxcount = PY_SSIZE_T_MAX;
|
||||||
|
@ -9952,15 +9980,16 @@ replace(PyObject *self, PyObject *str1,
|
||||||
|
|
||||||
if (str1 == str2)
|
if (str1 == str2)
|
||||||
goto nothing;
|
goto nothing;
|
||||||
if (skind < kind1)
|
|
||||||
/* substring too wide to be present */
|
|
||||||
goto nothing;
|
|
||||||
|
|
||||||
maxchar = PyUnicode_MAX_CHAR_VALUE(self);
|
maxchar = PyUnicode_MAX_CHAR_VALUE(self);
|
||||||
|
maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
|
||||||
|
if (maxchar < maxchar_str1)
|
||||||
|
/* substring too wide to be present */
|
||||||
|
goto nothing;
|
||||||
maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
|
maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
|
||||||
/* Replacing str1 with str2 may cause a maxchar reduction in the
|
/* Replacing str1 with str2 may cause a maxchar reduction in the
|
||||||
result string. */
|
result string. */
|
||||||
mayshrink = (maxchar_str2 < maxchar);
|
mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
|
||||||
maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
|
maxchar = MAX_MAXCHAR(maxchar, maxchar_str2);
|
||||||
|
|
||||||
if (len1 == len2) {
|
if (len1 == len2) {
|
||||||
|
@ -9970,36 +9999,19 @@ replace(PyObject *self, PyObject *str1,
|
||||||
if (len1 == 1) {
|
if (len1 == 1) {
|
||||||
/* replace characters */
|
/* replace characters */
|
||||||
Py_UCS4 u1, u2;
|
Py_UCS4 u1, u2;
|
||||||
int rkind;
|
Py_ssize_t pos;
|
||||||
Py_ssize_t index, pos;
|
|
||||||
char *src, *rbuf;
|
|
||||||
|
|
||||||
u1 = PyUnicode_READ(kind1, buf1, 0);
|
u1 = PyUnicode_READ(kind1, buf1, 0);
|
||||||
pos = findchar(sbuf, PyUnicode_KIND(self), slen, u1, 1);
|
pos = findchar(sbuf, skind, slen, u1, 1);
|
||||||
if (pos < 0)
|
if (pos < 0)
|
||||||
goto nothing;
|
goto nothing;
|
||||||
u2 = PyUnicode_READ(kind2, buf2, 0);
|
u2 = PyUnicode_READ(kind2, buf2, 0);
|
||||||
u = PyUnicode_New(slen, maxchar);
|
u = PyUnicode_New(slen, maxchar);
|
||||||
if (!u)
|
if (!u)
|
||||||
goto error;
|
goto error;
|
||||||
_PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
|
|
||||||
rkind = PyUnicode_KIND(u);
|
|
||||||
rbuf = PyUnicode_DATA(u);
|
|
||||||
|
|
||||||
PyUnicode_WRITE(rkind, rbuf, pos, u2);
|
_PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
|
||||||
index = 0;
|
replace_1char_inplace(u, pos, u1, u2, maxcount);
|
||||||
src = sbuf;
|
|
||||||
while (--maxcount)
|
|
||||||
{
|
|
||||||
pos++;
|
|
||||||
src += pos * PyUnicode_KIND(self);
|
|
||||||
slen -= pos;
|
|
||||||
index += pos;
|
|
||||||
pos = findchar(src, PyUnicode_KIND(self), slen, u1, 1);
|
|
||||||
if (pos < 0)
|
|
||||||
break;
|
|
||||||
PyUnicode_WRITE(rkind, rbuf, index + pos, u2);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
int rkind = skind;
|
int rkind = skind;
|
||||||
|
|
|
@ -1586,6 +1586,10 @@
|
||||||
RelativePath="..\..\Objects\rangeobject.c"
|
RelativePath="..\..\Objects\rangeobject.c"
|
||||||
>
|
>
|
||||||
</File>
|
</File>
|
||||||
|
<File
|
||||||
|
RelativePath="..\..\Objects\stringlib\replace.h"
|
||||||
|
>
|
||||||
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath="..\..\Objects\setobject.c"
|
RelativePath="..\..\Objects\setobject.c"
|
||||||
>
|
>
|
||||||
|
|
|
@ -475,6 +475,7 @@
|
||||||
<ClInclude Include="..\Objects\stringlib\fastsearch.h" />
|
<ClInclude Include="..\Objects\stringlib\fastsearch.h" />
|
||||||
<ClInclude Include="..\Objects\stringlib\find.h" />
|
<ClInclude Include="..\Objects\stringlib\find.h" />
|
||||||
<ClInclude Include="..\Objects\stringlib\partition.h" />
|
<ClInclude Include="..\Objects\stringlib\partition.h" />
|
||||||
|
<ClInclude Include="..\Objects\stringlib\replace.h" />
|
||||||
<ClInclude Include="..\Objects\stringlib\split.h" />
|
<ClInclude Include="..\Objects\stringlib\split.h" />
|
||||||
<ClInclude Include="..\Objects\unicodetype_db.h" />
|
<ClInclude Include="..\Objects\unicodetype_db.h" />
|
||||||
<ClInclude Include="..\Parser\parser.h" />
|
<ClInclude Include="..\Parser\parser.h" />
|
||||||
|
|
|
@ -378,6 +378,9 @@
|
||||||
<ClInclude Include="..\Objects\stringlib\partition.h">
|
<ClInclude Include="..\Objects\stringlib\partition.h">
|
||||||
<Filter>Objects</Filter>
|
<Filter>Objects</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
|
<ClInclude Include="..\Objects\stringlib\replace.h">
|
||||||
|
<Filter>Objects</Filter>
|
||||||
|
</ClInclude>
|
||||||
<ClInclude Include="..\Objects\stringlib\split.h">
|
<ClInclude Include="..\Objects\stringlib\split.h">
|
||||||
<Filter>Objects</Filter>
|
<Filter>Objects</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
|
|
Loading…
Reference in New Issue