mirror of https://github.com/python/cpython
needforspeed: partition implementation, part two.
feel free to improve the documentation and the docstrings.
This commit is contained in:
parent
19bebf2e2f
commit
06a69dd8ff
|
@ -727,6 +727,14 @@ a prefix; rather, all combinations of its values are stripped:
|
|||
\versionchanged[Support for the \var{chars} argument]{2.2.2}
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}[string]{partition}{sep}
|
||||
Splits the string at the \var{sep}, and return a 3-tuple containing
|
||||
the part before the separator, the separator itself, and the part
|
||||
after the separator. If the separator is not found, return a 3-tuple
|
||||
containing the string itself, followed by two empty strings.
|
||||
\versionadded{2.5}
|
||||
\end{methoddesc}
|
||||
|
||||
\begin{methoddesc}[string]{replace}{old, new\optional{, count}}
|
||||
Return a copy of the string with all occurrences of substring
|
||||
\var{old} replaced by \var{new}. If the optional argument
|
||||
|
|
|
@ -184,6 +184,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
|
||||
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
|
||||
# define PyUnicode_Join PyUnicodeUCS2_Join
|
||||
# define PyUnicode_Partition PyUnicodeUCS2_Partition
|
||||
# define PyUnicode_Replace PyUnicodeUCS2_Replace
|
||||
# define PyUnicode_Resize PyUnicodeUCS2_Resize
|
||||
# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
|
||||
|
@ -259,6 +260,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
|
|||
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
|
||||
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
|
||||
# define PyUnicode_Join PyUnicodeUCS4_Join
|
||||
# define PyUnicode_Partition PyUnicodeUCS4_Partition
|
||||
# define PyUnicode_Replace PyUnicodeUCS4_Replace
|
||||
# define PyUnicode_Resize PyUnicodeUCS4_Resize
|
||||
# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
|
||||
|
@ -1018,6 +1020,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
|
|||
int keepends /* If true, line end markers are included */
|
||||
);
|
||||
|
||||
/* Partition a string using a given separator. */
|
||||
|
||||
PyAPI_FUNC(PyObject*) PyUnicode_Partition(
|
||||
PyObject *s, /* String to partition */
|
||||
PyObject *sep /* String separator */
|
||||
);
|
||||
|
||||
/* Split a string giving a list of Unicode strings.
|
||||
|
||||
If sep is NULL, splitting will be done at all whitespace
|
||||
|
|
|
@ -900,6 +900,21 @@ class MixinStrUnicodeUserStringTest:
|
|||
self.checkequal('A', 'a', 'title')
|
||||
self.checkequal(True, 'a', 'islower')
|
||||
|
||||
def test_partition(self):
|
||||
|
||||
self.checkequal(('this', ' is ', 'the partition method'),
|
||||
'this is the partition method', 'partition', ' is ')
|
||||
|
||||
# from raymond's original specification
|
||||
S = 'http://www.python.org'
|
||||
self.checkequal(('http', '://', 'www.python.org'), S, 'partition', '://')
|
||||
self.checkequal(('http://www.python.org', '', ''), S, 'partition', '?')
|
||||
self.checkequal(('', 'http://', 'www.python.org'), S, 'partition', 'http://')
|
||||
self.checkequal(('http://www.python.', 'org', ''), S, 'partition', 'org')
|
||||
|
||||
self.checkraises(ValueError, S, 'partition', '')
|
||||
self.checkraises(TypeError, S, 'partition', None)
|
||||
|
||||
|
||||
class MixinStrStringUserStringTest:
|
||||
# Additional tests for 8bit strings, i.e. str, UserString and
|
||||
|
|
|
@ -1610,20 +1610,20 @@ string_partition(PyStringObject *self, PyObject *args)
|
|||
{
|
||||
Py_ssize_t len = PyString_GET_SIZE(self), sep_len, pos;
|
||||
const char *str = PyString_AS_STRING(self), *sep;
|
||||
PyObject *sepobj;
|
||||
PyObject *sep_obj;
|
||||
PyObject * out;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O:partition", &sepobj))
|
||||
if (!PyArg_ParseTuple(args, "O:partition", &sep_obj))
|
||||
return NULL;
|
||||
if (PyString_Check(sepobj)) {
|
||||
sep = PyString_AS_STRING(sepobj);
|
||||
sep_len = PyString_GET_SIZE(sepobj);
|
||||
if (PyString_Check(sep_obj)) {
|
||||
sep = PyString_AS_STRING(sep_obj);
|
||||
sep_len = PyString_GET_SIZE(sep_obj);
|
||||
}
|
||||
#ifdef Py_USING_UNICODE_NOTYET
|
||||
else if (PyUnicode_Check(sepobj))
|
||||
return PyUnicode_Partition((PyObject *)self, sepobj);
|
||||
#ifdef Py_USING_UNICODE
|
||||
else if (PyUnicode_Check(sep_obj))
|
||||
return PyUnicode_Partition((PyObject *)self, sep_obj);
|
||||
#endif
|
||||
else if (PyObject_AsCharBuffer(sepobj, &sep, &sep_len))
|
||||
else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
|
||||
return NULL;
|
||||
|
||||
if (sep_len == 0) {
|
||||
|
@ -1644,13 +1644,13 @@ string_partition(PyStringObject *self, PyObject *args)
|
|||
Py_INCREF(nullstring);
|
||||
PyTuple_SET_ITEM(out, 2, (PyObject*) nullstring);
|
||||
} else {
|
||||
Py_INCREF(sepobj);
|
||||
PyObject* obj;
|
||||
PyTuple_SET_ITEM(out, 0, PyString_FromStringAndSize(str, pos));
|
||||
PyTuple_SET_ITEM(out, 1, sepobj);
|
||||
PyTuple_SET_ITEM(out, 2,
|
||||
PyString_FromStringAndSize(str + sep_len + pos,
|
||||
len - sep_len - pos)
|
||||
);
|
||||
Py_INCREF(sep_obj);
|
||||
PyTuple_SET_ITEM(out, 1, sep_obj);
|
||||
pos += sep_len;
|
||||
obj = PyString_FromStringAndSize(str + pos, len - pos);
|
||||
PyTuple_SET_ITEM(out, 2, obj);
|
||||
if (PyErr_Occurred()) {
|
||||
Py_DECREF(out);
|
||||
return NULL;
|
||||
|
|
|
@ -4,6 +4,9 @@ Unicode implementation based on original code by Fredrik Lundh,
|
|||
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
|
||||
Unicode Integration Proposal (see file Misc/unicode.txt).
|
||||
|
||||
Major speed upgrades to the method implementations at the Reykjavik
|
||||
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
|
||||
|
||||
Copyright (c) Corporation for National Research Initiatives.
|
||||
|
||||
--------------------------------------------------------------------
|
||||
|
@ -193,6 +196,7 @@ int unicode_resize(register PyUnicodeObject *unicode,
|
|||
/* Resizing shared object (unicode_empty or single character
|
||||
objects) in-place is not allowed. Use PyUnicode_Resize()
|
||||
instead ! */
|
||||
|
||||
if (unicode == unicode_empty ||
|
||||
(unicode->length == 1 &&
|
||||
unicode->str[0] < 256U &&
|
||||
|
@ -202,8 +206,11 @@ int unicode_resize(register PyUnicodeObject *unicode,
|
|||
return -1;
|
||||
}
|
||||
|
||||
/* We allocate one more byte to make sure the string is
|
||||
Ux0000 terminated -- XXX is this needed ? */
|
||||
/* We allocate one more byte to make sure the string is Ux0000 terminated.
|
||||
The overallocation is also used by fastsearch, which assumes that it's
|
||||
safe to look at str[length] (without makeing any assumptions about what
|
||||
it contains). */
|
||||
|
||||
oldstr = unicode->str;
|
||||
PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
|
||||
if (!unicode->str) {
|
||||
|
@ -3859,8 +3866,6 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
|
|||
|
||||
/* --- Helpers ------------------------------------------------------------ */
|
||||
|
||||
#define USE_FAST /* experimental fast search implementation */
|
||||
|
||||
/* fast search/count implementation, based on a mix between boyer-
|
||||
moore and horspool, with a few more bells and whistles on the top.
|
||||
for some more background, see: http://effbot.org/stringlib */
|
||||
|
@ -3936,10 +3941,8 @@ fastsearch(Py_UNICODE* s, Py_ssize_t n, Py_UNICODE* p, Py_ssize_t m, int mode)
|
|||
/* miss: check if next character is part of pattern */
|
||||
if (!(mask & (1 << (s[i+m] & 0x1F))))
|
||||
i = i + m;
|
||||
else {
|
||||
else
|
||||
i = i + skip;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
/* skip: check if next character is part of pattern */
|
||||
if (!(mask & (1 << (s[i+m] & 0x1F))))
|
||||
|
@ -3973,23 +3976,13 @@ LOCAL(Py_ssize_t) count(PyUnicodeObject *self,
|
|||
if (substring->length == 0)
|
||||
return (end - start + 1);
|
||||
|
||||
#ifdef USE_FAST
|
||||
count = fastsearch(
|
||||
PyUnicode_AS_UNICODE(self) + start, end - start,
|
||||
substring->str, substring->length, FAST_COUNT
|
||||
);
|
||||
|
||||
if (count < 0)
|
||||
count = 0; /* no match */
|
||||
#else
|
||||
end -= substring->length;
|
||||
|
||||
while (start <= end)
|
||||
if (Py_UNICODE_MATCH(self, start, substring)) {
|
||||
count++;
|
||||
start += substring->length;
|
||||
} else
|
||||
start++;
|
||||
#endif
|
||||
|
||||
return count;
|
||||
}
|
||||
|
@ -4040,30 +4033,19 @@ static Py_ssize_t findstring(PyUnicodeObject *self,
|
|||
if (substring->length == 0)
|
||||
return (direction > 0) ? start : end;
|
||||
|
||||
#ifdef USE_FAST
|
||||
if (direction > 0) {
|
||||
Py_ssize_t pos = fastsearch(
|
||||
PyUnicode_AS_UNICODE(self) + start, end - start,
|
||||
substring->str, substring->length, FAST_SEARCH
|
||||
);
|
||||
if (pos < 0)
|
||||
return pos;
|
||||
if (pos >= 0)
|
||||
return pos + start;
|
||||
}
|
||||
#endif
|
||||
|
||||
} else {
|
||||
end -= substring->length;
|
||||
|
||||
if (direction < 0) {
|
||||
for (; end >= start; end--)
|
||||
if (Py_UNICODE_MATCH(self, end, substring))
|
||||
return end;
|
||||
} else {
|
||||
for (; start <= end; start++)
|
||||
if (Py_UNICODE_MATCH(self, start, substring))
|
||||
return start;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -5167,11 +5149,8 @@ int PyUnicode_Contains(PyObject *container,
|
|||
PyObject *element)
|
||||
{
|
||||
PyUnicodeObject *u, *v;
|
||||
int result;
|
||||
Py_ssize_t size;
|
||||
#ifdef USE_FAST
|
||||
Py_ssize_t pos;
|
||||
#endif
|
||||
|
||||
/* Coerce the two arguments */
|
||||
v = (PyUnicodeObject *) PyUnicode_FromObject(element);
|
||||
|
@ -5189,44 +5168,19 @@ int PyUnicode_Contains(PyObject *container,
|
|||
|
||||
size = PyUnicode_GET_SIZE(v);
|
||||
if (!size) {
|
||||
result = 1;
|
||||
pos = 0;
|
||||
goto done;
|
||||
}
|
||||
|
||||
#ifdef USE_FAST
|
||||
pos = fastsearch(
|
||||
PyUnicode_AS_UNICODE(u), PyUnicode_GET_SIZE(u),
|
||||
PyUnicode_AS_UNICODE(v), size, FAST_SEARCH
|
||||
);
|
||||
result = (pos != -1);
|
||||
#else
|
||||
result = 0;
|
||||
|
||||
if (size == 1) {
|
||||
Py_UNICODE chr = PyUnicode_AS_UNICODE(v)[0];
|
||||
Py_UNICODE* ptr = PyUnicode_AS_UNICODE(u);
|
||||
Py_UNICODE* end = ptr + PyUnicode_GET_SIZE(u);
|
||||
for (; ptr < end; ptr++) {
|
||||
if (*ptr == chr) {
|
||||
result = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Py_ssize_t start = 0;
|
||||
Py_ssize_t end = PyUnicode_GET_SIZE(u) - size;
|
||||
for (; start <= end; start++)
|
||||
if (Py_UNICODE_MATCH(u, start, v)) {
|
||||
result = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
done:
|
||||
Py_DECREF(u);
|
||||
Py_DECREF(v);
|
||||
return result;
|
||||
return (pos != -1);
|
||||
}
|
||||
|
||||
/* Concat to string or Unicode object giving a new Unicode object. */
|
||||
|
@ -6335,6 +6289,84 @@ unicode_split(PyUnicodeObject *self, PyObject *args)
|
|||
return PyUnicode_Split((PyObject *)self, substring, maxcount);
|
||||
}
|
||||
|
||||
PyObject *
|
||||
PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
|
||||
{
|
||||
PyObject* str_obj;
|
||||
PyObject* sep_obj;
|
||||
Py_UNICODE *str, *sep;
|
||||
Py_ssize_t len, sep_len, pos;
|
||||
PyObject* out;
|
||||
|
||||
str_obj = PyUnicode_FromObject(str_in);
|
||||
if (!str_obj)
|
||||
return NULL;
|
||||
sep_obj = PyUnicode_FromObject(sep_in);
|
||||
if (!sep_obj)
|
||||
goto error;
|
||||
|
||||
str = PyUnicode_AS_UNICODE(str_obj);
|
||||
len = PyUnicode_GET_SIZE(str_obj);
|
||||
|
||||
sep = PyUnicode_AS_UNICODE(sep_obj);
|
||||
sep_len = PyUnicode_GET_SIZE(sep_obj);
|
||||
|
||||
if (sep_len == 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "empty separator");
|
||||
goto error;
|
||||
}
|
||||
|
||||
out = PyTuple_New(3);
|
||||
if (!out)
|
||||
goto error;
|
||||
|
||||
pos = fastsearch(str, len, sep, sep_len, FAST_SEARCH);
|
||||
if (pos < 0) {
|
||||
Py_INCREF(str_obj);
|
||||
PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
|
||||
Py_INCREF(unicode_empty);
|
||||
PyTuple_SET_ITEM(out, 1, (PyObject*) unicode_empty);
|
||||
Py_INCREF(unicode_empty);
|
||||
PyTuple_SET_ITEM(out, 2, (PyObject*) unicode_empty);
|
||||
} else {
|
||||
PyObject* obj;
|
||||
PyTuple_SET_ITEM(out, 0, PyUnicode_FromUnicode(str, pos));
|
||||
Py_INCREF(sep_obj);
|
||||
PyTuple_SET_ITEM(out, 1, sep_obj);
|
||||
obj = PyUnicode_FromUnicode(str + sep_len + pos, len - sep_len - pos);
|
||||
PyTuple_SET_ITEM(out, 2, obj);
|
||||
if (PyErr_Occurred()) {
|
||||
Py_DECREF(out);
|
||||
goto error;
|
||||
}
|
||||
}
|
||||
|
||||
return out;
|
||||
|
||||
error:
|
||||
Py_XDECREF(sep_obj);
|
||||
Py_DECREF(str_obj);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(partition__doc__,
|
||||
"S.partition(sep) -> (head, sep, tail)\n\
|
||||
\n\
|
||||
Searches for the separator sep in S, and returns the part before it,\n\
|
||||
the separator itself, and the part after it. If the separator is not\n\
|
||||
found, returns S and two empty strings.");
|
||||
|
||||
static PyObject*
|
||||
unicode_partition(PyUnicodeObject *self, PyObject *args)
|
||||
{
|
||||
PyObject *separator;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O:partition", &separator))
|
||||
return NULL;
|
||||
|
||||
return PyUnicode_Partition((PyObject *)self, separator);
|
||||
}
|
||||
|
||||
PyObject *PyUnicode_RSplit(PyObject *s,
|
||||
PyObject *sep,
|
||||
Py_ssize_t maxsplit)
|
||||
|
@ -6588,6 +6620,7 @@ static PyMethodDef unicode_methods[] = {
|
|||
{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
|
||||
{"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
|
||||
{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
|
||||
{"partition", (PyCFunction) unicode_partition, METH_VARARGS, partition__doc__},
|
||||
{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
|
||||
{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
|
||||
{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
|
||||
|
|
Loading…
Reference in New Issue