needforspeed: partition implementation, part two.

feel free to improve the documentation and the docstrings.
This commit is contained in:
Fredrik Lundh 2006-05-26 08:54:28 +00:00
parent 19bebf2e2f
commit 06a69dd8ff
5 changed files with 143 additions and 78 deletions

View File

@ -727,6 +727,14 @@ a prefix; rather, all combinations of its values are stripped:
\versionchanged[Support for the \var{chars} argument]{2.2.2} \versionchanged[Support for the \var{chars} argument]{2.2.2}
\end{methoddesc} \end{methoddesc}
\begin{methoddesc}[string]{partition}{sep}
Split the string at the first occurrence of \var{sep}, and return a 3-tuple containing
the part before the separator, the separator itself, and the part
after the separator. If the separator is not found, return a 3-tuple
containing the string itself, followed by two empty strings.
\versionadded{2.5}
\end{methoddesc}
\begin{methoddesc}[string]{replace}{old, new\optional{, count}} \begin{methoddesc}[string]{replace}{old, new\optional{, count}}
Return a copy of the string with all occurrences of substring Return a copy of the string with all occurrences of substring
\var{old} replaced by \var{new}. If the optional argument \var{old} replaced by \var{new}. If the optional argument

View File

@ -184,6 +184,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax # define PyUnicode_GetMax PyUnicodeUCS2_GetMax
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize # define PyUnicode_GetSize PyUnicodeUCS2_GetSize
# define PyUnicode_Join PyUnicodeUCS2_Join # define PyUnicode_Join PyUnicodeUCS2_Join
# define PyUnicode_Partition PyUnicodeUCS2_Partition
# define PyUnicode_Replace PyUnicodeUCS2_Replace # define PyUnicode_Replace PyUnicodeUCS2_Replace
# define PyUnicode_Resize PyUnicodeUCS2_Resize # define PyUnicode_Resize PyUnicodeUCS2_Resize
# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
@ -259,6 +260,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax # define PyUnicode_GetMax PyUnicodeUCS4_GetMax
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize # define PyUnicode_GetSize PyUnicodeUCS4_GetSize
# define PyUnicode_Join PyUnicodeUCS4_Join # define PyUnicode_Join PyUnicodeUCS4_Join
# define PyUnicode_Partition PyUnicodeUCS4_Partition
# define PyUnicode_Replace PyUnicodeUCS4_Replace # define PyUnicode_Replace PyUnicodeUCS4_Replace
# define PyUnicode_Resize PyUnicodeUCS4_Resize # define PyUnicode_Resize PyUnicodeUCS4_Resize
# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
@ -1018,6 +1020,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
int keepends /* If true, line end markers are included */ int keepends /* If true, line end markers are included */
); );
/* Partition a string using a given separator.

   Both arguments are coerced to Unicode. Returns a new 3-tuple holding
   the part of s before the first occurrence of sep, the separator
   itself, and the part after it. If sep does not occur in s, the tuple
   holds s followed by two empty Unicode strings.

   Returns NULL with an exception set on failure; an empty separator
   raises ValueError. */
PyAPI_FUNC(PyObject*) PyUnicode_Partition(
PyObject *s, /* String to partition */
PyObject *sep /* String separator */
);
/* Split a string giving a list of Unicode strings. /* Split a string giving a list of Unicode strings.
If sep is NULL, splitting will be done at all whitespace If sep is NULL, splitting will be done at all whitespace

View File

@ -900,6 +900,21 @@ class MixinStrUnicodeUserStringTest:
self.checkequal('A', 'a', 'title') self.checkequal('A', 'a', 'title')
self.checkequal(True, 'a', 'islower') self.checkequal(True, 'a', 'islower')
def test_partition(self):
# Separator in the middle: the result is (head, separator, tail).
self.checkequal(('this', ' is ', 'the partition method'),
'this is the partition method', 'partition', ' is ')
# from raymond's original specification
S = 'http://www.python.org'
self.checkequal(('http', '://', 'www.python.org'), S, 'partition', '://')
# Separator absent: the whole string followed by two empty strings.
self.checkequal(('http://www.python.org', '', ''), S, 'partition', '?')
# Separator at the very start, then at the very end of the string.
self.checkequal(('', 'http://', 'www.python.org'), S, 'partition', 'http://')
self.checkequal(('http://www.python.', 'org', ''), S, 'partition', 'org')
# An empty separator is rejected with ValueError; None is a TypeError.
self.checkraises(ValueError, S, 'partition', '')
self.checkraises(TypeError, S, 'partition', None)
class MixinStrStringUserStringTest: class MixinStrStringUserStringTest:
# Additional tests for 8bit strings, i.e. str, UserString and # Additional tests for 8bit strings, i.e. str, UserString and

View File

@ -1610,20 +1610,20 @@ string_partition(PyStringObject *self, PyObject *args)
{ {
Py_ssize_t len = PyString_GET_SIZE(self), sep_len, pos; Py_ssize_t len = PyString_GET_SIZE(self), sep_len, pos;
const char *str = PyString_AS_STRING(self), *sep; const char *str = PyString_AS_STRING(self), *sep;
PyObject *sepobj; PyObject *sep_obj;
PyObject * out; PyObject * out;
if (!PyArg_ParseTuple(args, "O:partition", &sepobj)) if (!PyArg_ParseTuple(args, "O:partition", &sep_obj))
return NULL; return NULL;
if (PyString_Check(sepobj)) { if (PyString_Check(sep_obj)) {
sep = PyString_AS_STRING(sepobj); sep = PyString_AS_STRING(sep_obj);
sep_len = PyString_GET_SIZE(sepobj); sep_len = PyString_GET_SIZE(sep_obj);
} }
#ifdef Py_USING_UNICODE_NOTYET #ifdef Py_USING_UNICODE
else if (PyUnicode_Check(sepobj)) else if (PyUnicode_Check(sep_obj))
return PyUnicode_Partition((PyObject *)self, sepobj); return PyUnicode_Partition((PyObject *)self, sep_obj);
#endif #endif
else if (PyObject_AsCharBuffer(sepobj, &sep, &sep_len)) else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
return NULL; return NULL;
if (sep_len == 0) { if (sep_len == 0) {
@ -1644,13 +1644,13 @@ string_partition(PyStringObject *self, PyObject *args)
Py_INCREF(nullstring); Py_INCREF(nullstring);
PyTuple_SET_ITEM(out, 2, (PyObject*) nullstring); PyTuple_SET_ITEM(out, 2, (PyObject*) nullstring);
} else { } else {
Py_INCREF(sepobj); PyObject* obj;
PyTuple_SET_ITEM(out, 0, PyString_FromStringAndSize(str, pos)); PyTuple_SET_ITEM(out, 0, PyString_FromStringAndSize(str, pos));
PyTuple_SET_ITEM(out, 1, sepobj); Py_INCREF(sep_obj);
PyTuple_SET_ITEM(out, 2, PyTuple_SET_ITEM(out, 1, sep_obj);
PyString_FromStringAndSize(str + sep_len + pos, pos += sep_len;
len - sep_len - pos) obj = PyString_FromStringAndSize(str + pos, len - pos);
); PyTuple_SET_ITEM(out, 2, obj);
if (PyErr_Occurred()) { if (PyErr_Occurred()) {
Py_DECREF(out); Py_DECREF(out);
return NULL; return NULL;

View File

@ -4,6 +4,9 @@ Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt). Unicode Integration Proposal (see file Misc/unicode.txt).
Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
Copyright (c) Corporation for National Research Initiatives. Copyright (c) Corporation for National Research Initiatives.
-------------------------------------------------------------------- --------------------------------------------------------------------
@ -193,6 +196,7 @@ int unicode_resize(register PyUnicodeObject *unicode,
/* Resizing shared object (unicode_empty or single character /* Resizing shared object (unicode_empty or single character
objects) in-place is not allowed. Use PyUnicode_Resize() objects) in-place is not allowed. Use PyUnicode_Resize()
instead ! */ instead ! */
if (unicode == unicode_empty || if (unicode == unicode_empty ||
(unicode->length == 1 && (unicode->length == 1 &&
unicode->str[0] < 256U && unicode->str[0] < 256U &&
@ -202,8 +206,11 @@ int unicode_resize(register PyUnicodeObject *unicode,
return -1; return -1;
} }
/* We allocate one more byte to make sure the string is /* We allocate one more byte to make sure the string is Ux0000 terminated.
Ux0000 terminated -- XXX is this needed ? */ The overallocation is also used by fastsearch, which assumes that it's
safe to look at str[length] (without making any assumptions about what
it contains). */
oldstr = unicode->str; oldstr = unicode->str;
PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
if (!unicode->str) { if (!unicode->str) {
@ -3859,8 +3866,6 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
/* --- Helpers ------------------------------------------------------------ */ /* --- Helpers ------------------------------------------------------------ */
#define USE_FAST /* experimental fast search implementation */
/* fast search/count implementation, based on a mix between boyer- /* fast search/count implementation, based on a mix between boyer-
moore and horspool, with a few more bells and whistles on the top. moore and horspool, with a few more bells and whistles on the top.
for some more background, see: http://effbot.org/stringlib */ for some more background, see: http://effbot.org/stringlib */
@ -3936,10 +3941,8 @@ fastsearch(Py_UNICODE* s, Py_ssize_t n, Py_UNICODE* p, Py_ssize_t m, int mode)
/* miss: check if next character is part of pattern */ /* miss: check if next character is part of pattern */
if (!(mask & (1 << (s[i+m] & 0x1F)))) if (!(mask & (1 << (s[i+m] & 0x1F))))
i = i + m; i = i + m;
else { else
i = i + skip; i = i + skip;
continue;
}
} else { } else {
/* skip: check if next character is part of pattern */ /* skip: check if next character is part of pattern */
if (!(mask & (1 << (s[i+m] & 0x1F)))) if (!(mask & (1 << (s[i+m] & 0x1F))))
@ -3973,23 +3976,13 @@ LOCAL(Py_ssize_t) count(PyUnicodeObject *self,
if (substring->length == 0) if (substring->length == 0)
return (end - start + 1); return (end - start + 1);
#ifdef USE_FAST
count = fastsearch( count = fastsearch(
PyUnicode_AS_UNICODE(self) + start, end - start, PyUnicode_AS_UNICODE(self) + start, end - start,
substring->str, substring->length, FAST_COUNT substring->str, substring->length, FAST_COUNT
); );
if (count < 0) if (count < 0)
count = 0; /* no match */ count = 0; /* no match */
#else
end -= substring->length;
while (start <= end)
if (Py_UNICODE_MATCH(self, start, substring)) {
count++;
start += substring->length;
} else
start++;
#endif
return count; return count;
} }
@ -4040,30 +4033,19 @@ static Py_ssize_t findstring(PyUnicodeObject *self,
if (substring->length == 0) if (substring->length == 0)
return (direction > 0) ? start : end; return (direction > 0) ? start : end;
#ifdef USE_FAST
if (direction > 0) { if (direction > 0) {
Py_ssize_t pos = fastsearch( Py_ssize_t pos = fastsearch(
PyUnicode_AS_UNICODE(self) + start, end - start, PyUnicode_AS_UNICODE(self) + start, end - start,
substring->str, substring->length, FAST_SEARCH substring->str, substring->length, FAST_SEARCH
); );
if (pos < 0) if (pos >= 0)
return pos;
return pos + start; return pos + start;
} } else {
#endif
end -= substring->length; end -= substring->length;
if (direction < 0) {
for (; end >= start; end--) for (; end >= start; end--)
if (Py_UNICODE_MATCH(self, end, substring)) if (Py_UNICODE_MATCH(self, end, substring))
return end; return end;
} else {
for (; start <= end; start++)
if (Py_UNICODE_MATCH(self, start, substring))
return start;
} }
return -1; return -1;
} }
@ -5167,11 +5149,8 @@ int PyUnicode_Contains(PyObject *container,
PyObject *element) PyObject *element)
{ {
PyUnicodeObject *u, *v; PyUnicodeObject *u, *v;
int result;
Py_ssize_t size; Py_ssize_t size;
#ifdef USE_FAST
Py_ssize_t pos; Py_ssize_t pos;
#endif
/* Coerce the two arguments */ /* Coerce the two arguments */
v = (PyUnicodeObject *) PyUnicode_FromObject(element); v = (PyUnicodeObject *) PyUnicode_FromObject(element);
@ -5189,44 +5168,19 @@ int PyUnicode_Contains(PyObject *container,
size = PyUnicode_GET_SIZE(v); size = PyUnicode_GET_SIZE(v);
if (!size) { if (!size) {
result = 1; pos = 0;
goto done; goto done;
} }
#ifdef USE_FAST
pos = fastsearch( pos = fastsearch(
PyUnicode_AS_UNICODE(u), PyUnicode_GET_SIZE(u), PyUnicode_AS_UNICODE(u), PyUnicode_GET_SIZE(u),
PyUnicode_AS_UNICODE(v), size, FAST_SEARCH PyUnicode_AS_UNICODE(v), size, FAST_SEARCH
); );
result = (pos != -1);
#else
result = 0;
if (size == 1) {
Py_UNICODE chr = PyUnicode_AS_UNICODE(v)[0];
Py_UNICODE* ptr = PyUnicode_AS_UNICODE(u);
Py_UNICODE* end = ptr + PyUnicode_GET_SIZE(u);
for (; ptr < end; ptr++) {
if (*ptr == chr) {
result = 1;
break;
}
}
} else {
Py_ssize_t start = 0;
Py_ssize_t end = PyUnicode_GET_SIZE(u) - size;
for (; start <= end; start++)
if (Py_UNICODE_MATCH(u, start, v)) {
result = 1;
break;
}
}
#endif
done: done:
Py_DECREF(u); Py_DECREF(u);
Py_DECREF(v); Py_DECREF(v);
return result; return (pos != -1);
} }
/* Concat to string or Unicode object giving a new Unicode object. */ /* Concat to string or Unicode object giving a new Unicode object. */
@ -6335,6 +6289,84 @@ unicode_split(PyUnicodeObject *self, PyObject *args)
return PyUnicode_Split((PyObject *)self, substring, maxcount); return PyUnicode_Split((PyObject *)self, substring, maxcount);
} }
/* Partition str_in around the first occurrence of sep_in.

   Both arguments are coerced with PyUnicode_FromObject().  On success a
   new 3-tuple is returned:

     - separator found:     (text before sep, sep, text after sep)
     - separator not found: (the whole string, u"", u"")

   On failure NULL is returned with an exception set.  An empty separator
   raises ValueError("empty separator"). */
PyObject *
PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
{
    PyObject *str_obj;
    PyObject *sep_obj = NULL;
    PyObject *out = NULL;
    Py_UNICODE *str, *sep;
    Py_ssize_t len, sep_len, pos;

    /* PyUnicode_FromObject() hands back NEW references; both are owned
       by this function and must be released on every exit path. */
    str_obj = PyUnicode_FromObject(str_in);
    if (!str_obj)
        return NULL;
    sep_obj = PyUnicode_FromObject(sep_in);
    if (!sep_obj)
        goto done;

    str = PyUnicode_AS_UNICODE(str_obj);
    len = PyUnicode_GET_SIZE(str_obj);

    sep = PyUnicode_AS_UNICODE(sep_obj);
    sep_len = PyUnicode_GET_SIZE(sep_obj);

    if (sep_len == 0) {
        PyErr_SetString(PyExc_ValueError, "empty separator");
        goto done;
    }

    out = PyTuple_New(3);
    if (!out)
        goto done;

    pos = fastsearch(str, len, sep, sep_len, FAST_SEARCH);

    if (pos < 0) {
        /* Not found: (str, "", "").  PyTuple_SET_ITEM steals a
           reference, so each stored object gets its own INCREF. */
        Py_INCREF(str_obj);
        PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
        Py_INCREF(unicode_empty);
        PyTuple_SET_ITEM(out, 1, (PyObject*) unicode_empty);
        Py_INCREF(unicode_empty);
        PyTuple_SET_ITEM(out, 2, (PyObject*) unicode_empty);
    } else {
        PyObject *head;
        PyObject *tail;

        /* Build both slices first so that an allocation failure can be
           detected directly instead of probing PyErr_Occurred(). */
        head = PyUnicode_FromUnicode(str, pos);
        tail = PyUnicode_FromUnicode(str + pos + sep_len,
                                     len - pos - sep_len);
        if (!head || !tail) {
            Py_XDECREF(head);
            Py_XDECREF(tail);
            Py_DECREF(out);
            out = NULL;
            goto done;
        }

        PyTuple_SET_ITEM(out, 0, head);
        Py_INCREF(sep_obj);
        PyTuple_SET_ITEM(out, 1, sep_obj);
        PyTuple_SET_ITEM(out, 2, tail);
    }

done:
    /* Drop the references obtained from PyUnicode_FromObject().  The
       tuple (if any) holds its own references to whatever it stores, so
       this is required on success as well as on error; skipping it on
       success leaks one reference to each object per call. */
    Py_XDECREF(sep_obj);
    Py_DECREF(str_obj);
    return out;
}
/* Docstring for the unicode "partition" method; it is attached to the
   method through the unicode_methods table entry for "partition". */
PyDoc_STRVAR(partition__doc__,
"S.partition(sep) -> (head, sep, tail)\n\
\n\
Searches for the separator sep in S, and returns the part before it,\n\
the separator itself, and the part after it. If the separator is not\n\
found, returns S and two empty strings.");
/* Method wrapper for unicode.partition(sep): unpacks the single
   positional argument and delegates to PyUnicode_Partition().
   Returns NULL (exception set) if argument parsing fails. */
static PyObject*
unicode_partition(PyUnicodeObject *self, PyObject *args)
{
    PyObject *sep = NULL;

    if (PyArg_ParseTuple(args, "O:partition", &sep))
        return PyUnicode_Partition((PyObject *)self, sep);

    return NULL;
}
PyObject *PyUnicode_RSplit(PyObject *s, PyObject *PyUnicode_RSplit(PyObject *s,
PyObject *sep, PyObject *sep,
Py_ssize_t maxsplit) Py_ssize_t maxsplit)
@ -6588,6 +6620,7 @@ static PyMethodDef unicode_methods[] = {
{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
{"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
{"partition", (PyCFunction) unicode_partition, METH_VARARGS, partition__doc__},
{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},