needforspeed: partition implementation, part two.

feel free to improve the documentation and the docstrings.
This commit is contained in:
Fredrik Lundh 2006-05-26 08:54:28 +00:00
parent 19bebf2e2f
commit 06a69dd8ff
5 changed files with 143 additions and 78 deletions

View File

@ -727,6 +727,14 @@ a prefix; rather, all combinations of its values are stripped:
\versionchanged[Support for the \var{chars} argument]{2.2.2}
\end{methoddesc}
\begin{methoddesc}[string]{partition}{sep}
Splits the string at the \var{sep}, and return a 3-tuple containing
the part before the separator, the separator itself, and the part
after the separator. If the separator is not found, return a 3-tuple
containing the string itself, followed by two empty strings.
\versionadded{2.5}
\end{methoddesc}
\begin{methoddesc}[string]{replace}{old, new\optional{, count}}
Return a copy of the string with all occurrences of substring
\var{old} replaced by \var{new}. If the optional argument

View File

@ -184,6 +184,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_GetMax PyUnicodeUCS2_GetMax
# define PyUnicode_GetSize PyUnicodeUCS2_GetSize
# define PyUnicode_Join PyUnicodeUCS2_Join
# define PyUnicode_Partition PyUnicodeUCS2_Partition
# define PyUnicode_Replace PyUnicodeUCS2_Replace
# define PyUnicode_Resize PyUnicodeUCS2_Resize
# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
@ -259,6 +260,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_GetMax PyUnicodeUCS4_GetMax
# define PyUnicode_GetSize PyUnicodeUCS4_GetSize
# define PyUnicode_Join PyUnicodeUCS4_Join
# define PyUnicode_Partition PyUnicodeUCS4_Partition
# define PyUnicode_Replace PyUnicodeUCS4_Replace
# define PyUnicode_Resize PyUnicodeUCS4_Resize
# define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding
@ -1018,6 +1020,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
int keepends /* If true, line end markers are included */
);
/* Partition a string using a given separator. */
PyAPI_FUNC(PyObject*) PyUnicode_Partition(
PyObject *s, /* String to partition */
PyObject *sep /* String separator */
);
/* Split a string giving a list of Unicode strings.
If sep is NULL, splitting will be done at all whitespace

View File

@ -900,6 +900,21 @@ class MixinStrUnicodeUserStringTest:
self.checkequal('A', 'a', 'title')
self.checkequal(True, 'a', 'islower')
def test_partition(self):
self.checkequal(('this', ' is ', 'the partition method'),
'this is the partition method', 'partition', ' is ')
# from raymond's original specification
S = 'http://www.python.org'
self.checkequal(('http', '://', 'www.python.org'), S, 'partition', '://')
self.checkequal(('http://www.python.org', '', ''), S, 'partition', '?')
self.checkequal(('', 'http://', 'www.python.org'), S, 'partition', 'http://')
self.checkequal(('http://www.python.', 'org', ''), S, 'partition', 'org')
self.checkraises(ValueError, S, 'partition', '')
self.checkraises(TypeError, S, 'partition', None)
class MixinStrStringUserStringTest:
# Additional tests for 8bit strings, i.e. str, UserString and

View File

@ -1610,20 +1610,20 @@ string_partition(PyStringObject *self, PyObject *args)
{
Py_ssize_t len = PyString_GET_SIZE(self), sep_len, pos;
const char *str = PyString_AS_STRING(self), *sep;
PyObject *sepobj;
PyObject *sep_obj;
PyObject * out;
if (!PyArg_ParseTuple(args, "O:partition", &sepobj))
if (!PyArg_ParseTuple(args, "O:partition", &sep_obj))
return NULL;
if (PyString_Check(sepobj)) {
sep = PyString_AS_STRING(sepobj);
sep_len = PyString_GET_SIZE(sepobj);
if (PyString_Check(sep_obj)) {
sep = PyString_AS_STRING(sep_obj);
sep_len = PyString_GET_SIZE(sep_obj);
}
#ifdef Py_USING_UNICODE_NOTYET
else if (PyUnicode_Check(sepobj))
return PyUnicode_Partition((PyObject *)self, sepobj);
#ifdef Py_USING_UNICODE
else if (PyUnicode_Check(sep_obj))
return PyUnicode_Partition((PyObject *)self, sep_obj);
#endif
else if (PyObject_AsCharBuffer(sepobj, &sep, &sep_len))
else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len))
return NULL;
if (sep_len == 0) {
@ -1644,13 +1644,13 @@ string_partition(PyStringObject *self, PyObject *args)
Py_INCREF(nullstring);
PyTuple_SET_ITEM(out, 2, (PyObject*) nullstring);
} else {
Py_INCREF(sepobj);
PyObject* obj;
PyTuple_SET_ITEM(out, 0, PyString_FromStringAndSize(str, pos));
PyTuple_SET_ITEM(out, 1, sepobj);
PyTuple_SET_ITEM(out, 2,
PyString_FromStringAndSize(str + sep_len + pos,
len - sep_len - pos)
);
Py_INCREF(sep_obj);
PyTuple_SET_ITEM(out, 1, sep_obj);
pos += sep_len;
obj = PyString_FromStringAndSize(str + pos, len - pos);
PyTuple_SET_ITEM(out, 2, obj);
if (PyErr_Occurred()) {
Py_DECREF(out);
return NULL;

View File

@ -4,6 +4,9 @@ Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).
Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
Copyright (c) Corporation for National Research Initiatives.
--------------------------------------------------------------------
@ -193,6 +196,7 @@ int unicode_resize(register PyUnicodeObject *unicode,
/* Resizing shared object (unicode_empty or single character
objects) in-place is not allowed. Use PyUnicode_Resize()
instead ! */
if (unicode == unicode_empty ||
(unicode->length == 1 &&
unicode->str[0] < 256U &&
@ -202,8 +206,11 @@ int unicode_resize(register PyUnicodeObject *unicode,
return -1;
}
/* We allocate one more byte to make sure the string is
Ux0000 terminated -- XXX is this needed ? */
/* We allocate one more byte to make sure the string is Ux0000 terminated.
The overallocation is also used by fastsearch, which assumes that it's
safe to look at str[length] (without makeing any assumptions about what
it contains). */
oldstr = unicode->str;
PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
if (!unicode->str) {
@ -3859,8 +3866,6 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
/* --- Helpers ------------------------------------------------------------ */
#define USE_FAST /* experimental fast search implementation */
/* fast search/count implementation, based on a mix between boyer-
moore and horspool, with a few more bells and whistles on the top.
for some more background, see: http://effbot.org/stringlib */
@ -3936,10 +3941,8 @@ fastsearch(Py_UNICODE* s, Py_ssize_t n, Py_UNICODE* p, Py_ssize_t m, int mode)
/* miss: check if next character is part of pattern */
if (!(mask & (1 << (s[i+m] & 0x1F))))
i = i + m;
else {
else
i = i + skip;
continue;
}
} else {
/* skip: check if next character is part of pattern */
if (!(mask & (1 << (s[i+m] & 0x1F))))
@ -3973,23 +3976,13 @@ LOCAL(Py_ssize_t) count(PyUnicodeObject *self,
if (substring->length == 0)
return (end - start + 1);
#ifdef USE_FAST
count = fastsearch(
PyUnicode_AS_UNICODE(self) + start, end - start,
substring->str, substring->length, FAST_COUNT
);
if (count < 0)
count = 0; /* no match */
#else
end -= substring->length;
while (start <= end)
if (Py_UNICODE_MATCH(self, start, substring)) {
count++;
start += substring->length;
} else
start++;
#endif
return count;
}
@ -4040,30 +4033,19 @@ static Py_ssize_t findstring(PyUnicodeObject *self,
if (substring->length == 0)
return (direction > 0) ? start : end;
#ifdef USE_FAST
if (direction > 0) {
Py_ssize_t pos = fastsearch(
PyUnicode_AS_UNICODE(self) + start, end - start,
substring->str, substring->length, FAST_SEARCH
);
if (pos < 0)
return pos;
return pos + start;
}
#endif
end -= substring->length;
if (direction < 0) {
if (pos >= 0)
return pos + start;
} else {
end -= substring->length;
for (; end >= start; end--)
if (Py_UNICODE_MATCH(self, end, substring))
return end;
} else {
for (; start <= end; start++)
if (Py_UNICODE_MATCH(self, start, substring))
return start;
}
return -1;
}
@ -5167,11 +5149,8 @@ int PyUnicode_Contains(PyObject *container,
PyObject *element)
{
PyUnicodeObject *u, *v;
int result;
Py_ssize_t size;
#ifdef USE_FAST
Py_ssize_t pos;
#endif
/* Coerce the two arguments */
v = (PyUnicodeObject *) PyUnicode_FromObject(element);
@ -5189,44 +5168,19 @@ int PyUnicode_Contains(PyObject *container,
size = PyUnicode_GET_SIZE(v);
if (!size) {
result = 1;
pos = 0;
goto done;
}
#ifdef USE_FAST
pos = fastsearch(
PyUnicode_AS_UNICODE(u), PyUnicode_GET_SIZE(u),
PyUnicode_AS_UNICODE(v), size, FAST_SEARCH
);
result = (pos != -1);
#else
result = 0;
if (size == 1) {
Py_UNICODE chr = PyUnicode_AS_UNICODE(v)[0];
Py_UNICODE* ptr = PyUnicode_AS_UNICODE(u);
Py_UNICODE* end = ptr + PyUnicode_GET_SIZE(u);
for (; ptr < end; ptr++) {
if (*ptr == chr) {
result = 1;
break;
}
}
} else {
Py_ssize_t start = 0;
Py_ssize_t end = PyUnicode_GET_SIZE(u) - size;
for (; start <= end; start++)
if (Py_UNICODE_MATCH(u, start, v)) {
result = 1;
break;
}
}
#endif
done:
Py_DECREF(u);
Py_DECREF(v);
return result;
return (pos != -1);
}
/* Concat to string or Unicode object giving a new Unicode object. */
@ -6335,6 +6289,84 @@ unicode_split(PyUnicodeObject *self, PyObject *args)
return PyUnicode_Split((PyObject *)self, substring, maxcount);
}
PyObject *
PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
{
PyObject* str_obj;
PyObject* sep_obj;
Py_UNICODE *str, *sep;
Py_ssize_t len, sep_len, pos;
PyObject* out;
str_obj = PyUnicode_FromObject(str_in);
if (!str_obj)
return NULL;
sep_obj = PyUnicode_FromObject(sep_in);
if (!sep_obj)
goto error;
str = PyUnicode_AS_UNICODE(str_obj);
len = PyUnicode_GET_SIZE(str_obj);
sep = PyUnicode_AS_UNICODE(sep_obj);
sep_len = PyUnicode_GET_SIZE(sep_obj);
if (sep_len == 0) {
PyErr_SetString(PyExc_ValueError, "empty separator");
goto error;
}
out = PyTuple_New(3);
if (!out)
goto error;
pos = fastsearch(str, len, sep, sep_len, FAST_SEARCH);
if (pos < 0) {
Py_INCREF(str_obj);
PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
Py_INCREF(unicode_empty);
PyTuple_SET_ITEM(out, 1, (PyObject*) unicode_empty);
Py_INCREF(unicode_empty);
PyTuple_SET_ITEM(out, 2, (PyObject*) unicode_empty);
} else {
PyObject* obj;
PyTuple_SET_ITEM(out, 0, PyUnicode_FromUnicode(str, pos));
Py_INCREF(sep_obj);
PyTuple_SET_ITEM(out, 1, sep_obj);
obj = PyUnicode_FromUnicode(str + sep_len + pos, len - sep_len - pos);
PyTuple_SET_ITEM(out, 2, obj);
if (PyErr_Occurred()) {
Py_DECREF(out);
goto error;
}
}
return out;
error:
Py_XDECREF(sep_obj);
Py_DECREF(str_obj);
return NULL;
}
PyDoc_STRVAR(partition__doc__,
"S.partition(sep) -> (head, sep, tail)\n\
\n\
Searches for the separator sep in S, and returns the part before it,\n\
the separator itself, and the part after it. If the separator is not\n\
found, returns S and two empty strings.");
static PyObject*
unicode_partition(PyUnicodeObject *self, PyObject *args)
{
PyObject *separator;
if (!PyArg_ParseTuple(args, "O:partition", &separator))
return NULL;
return PyUnicode_Partition((PyObject *)self, separator);
}
PyObject *PyUnicode_RSplit(PyObject *s,
PyObject *sep,
Py_ssize_t maxsplit)
@ -6588,6 +6620,7 @@ static PyMethodDef unicode_methods[] = {
{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
{"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
{"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
{"partition", (PyCFunction) unicode_partition, METH_VARARGS, partition__doc__},
{"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
{"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
{"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},