From 06a69dd8ffcbac16e7f5c81b457c40ca4ce94c00 Mon Sep 17 00:00:00 2001 From: Fredrik Lundh Date: Fri, 26 May 2006 08:54:28 +0000 Subject: [PATCH] needforspeed: partition implementation, part two. feel free to improve the documentation and the docstrings. --- Doc/lib/libstdtypes.tex | 8 ++ Include/unicodeobject.h | 9 +++ Lib/test/string_tests.py | 15 ++++ Objects/stringobject.c | 30 ++++---- Objects/unicodeobject.c | 159 +++++++++++++++++++++++---------------- 5 files changed, 143 insertions(+), 78 deletions(-) diff --git a/Doc/lib/libstdtypes.tex b/Doc/lib/libstdtypes.tex index 6760e478f03..80d27173a5b 100644 --- a/Doc/lib/libstdtypes.tex +++ b/Doc/lib/libstdtypes.tex @@ -727,6 +727,14 @@ a prefix; rather, all combinations of its values are stripped: \versionchanged[Support for the \var{chars} argument]{2.2.2} \end{methoddesc} +\begin{methoddesc}[string]{partition}{sep} +Split the string at the first occurrence of \var{sep}, and return a 3-tuple containing +the part before the separator, the separator itself, and the part +after the separator. If the separator is not found, return a 3-tuple +containing the string itself, followed by two empty strings. +\versionadded{2.5} +\end{methoddesc} + \begin{methoddesc}[string]{replace}{old, new\optional{, count}} Return a copy of the string with all occurrences of substring \var{old} replaced by \var{new}. 
If the optional argument diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 82a02327590..664578223c9 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -184,6 +184,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_GetMax PyUnicodeUCS2_GetMax # define PyUnicode_GetSize PyUnicodeUCS2_GetSize # define PyUnicode_Join PyUnicodeUCS2_Join +# define PyUnicode_Partition PyUnicodeUCS2_Partition # define PyUnicode_Replace PyUnicodeUCS2_Replace # define PyUnicode_Resize PyUnicodeUCS2_Resize # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding @@ -259,6 +260,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE; # define PyUnicode_GetMax PyUnicodeUCS4_GetMax # define PyUnicode_GetSize PyUnicodeUCS4_GetSize # define PyUnicode_Join PyUnicodeUCS4_Join +# define PyUnicode_Partition PyUnicodeUCS4_Partition # define PyUnicode_Replace PyUnicodeUCS4_Replace # define PyUnicode_Resize PyUnicodeUCS4_Resize # define PyUnicode_SetDefaultEncoding PyUnicodeUCS4_SetDefaultEncoding @@ -1018,6 +1020,13 @@ PyAPI_FUNC(PyObject*) PyUnicode_Splitlines( int keepends /* If true, line end markers are included */ ); +/* Partition a string using a given separator. */ + +PyAPI_FUNC(PyObject*) PyUnicode_Partition( + PyObject *s, /* String to partition */ + PyObject *sep /* String separator */ + ); + /* Split a string giving a list of Unicode strings. 
If sep is NULL, splitting will be done at all whitespace diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index d4fdd8fc0cc..260b2d83d90 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -900,6 +900,21 @@ class MixinStrUnicodeUserStringTest: self.checkequal('A', 'a', 'title') self.checkequal(True, 'a', 'islower') + def test_partition(self): + + self.checkequal(('this', ' is ', 'the partition method'), + 'this is the partition method', 'partition', ' is ') + + # from raymond's original specification + S = 'http://www.python.org' + self.checkequal(('http', '://', 'www.python.org'), S, 'partition', '://') + self.checkequal(('http://www.python.org', '', ''), S, 'partition', '?') + self.checkequal(('', 'http://', 'www.python.org'), S, 'partition', 'http://') + self.checkequal(('http://www.python.', 'org', ''), S, 'partition', 'org') + + self.checkraises(ValueError, S, 'partition', '') + self.checkraises(TypeError, S, 'partition', None) + class MixinStrStringUserStringTest: # Additional tests for 8bit strings, i.e. 
str, UserString and diff --git a/Objects/stringobject.c b/Objects/stringobject.c index 2dfac03a906..0e0af89e5fa 100644 --- a/Objects/stringobject.c +++ b/Objects/stringobject.c @@ -1610,20 +1610,20 @@ string_partition(PyStringObject *self, PyObject *args) { Py_ssize_t len = PyString_GET_SIZE(self), sep_len, pos; const char *str = PyString_AS_STRING(self), *sep; - PyObject *sepobj; + PyObject *sep_obj; PyObject * out; - if (!PyArg_ParseTuple(args, "O:partition", &sepobj)) + if (!PyArg_ParseTuple(args, "O:partition", &sep_obj)) return NULL; - if (PyString_Check(sepobj)) { - sep = PyString_AS_STRING(sepobj); - sep_len = PyString_GET_SIZE(sepobj); + if (PyString_Check(sep_obj)) { + sep = PyString_AS_STRING(sep_obj); + sep_len = PyString_GET_SIZE(sep_obj); } -#ifdef Py_USING_UNICODE_NOTYET - else if (PyUnicode_Check(sepobj)) - return PyUnicode_Partition((PyObject *)self, sepobj); +#ifdef Py_USING_UNICODE + else if (PyUnicode_Check(sep_obj)) + return PyUnicode_Partition((PyObject *)self, sep_obj); #endif - else if (PyObject_AsCharBuffer(sepobj, &sep, &sep_len)) + else if (PyObject_AsCharBuffer(sep_obj, &sep, &sep_len)) return NULL; if (sep_len == 0) { @@ -1644,13 +1644,13 @@ string_partition(PyStringObject *self, PyObject *args) Py_INCREF(nullstring); PyTuple_SET_ITEM(out, 2, (PyObject*) nullstring); } else { - Py_INCREF(sepobj); + PyObject* obj; PyTuple_SET_ITEM(out, 0, PyString_FromStringAndSize(str, pos)); - PyTuple_SET_ITEM(out, 1, sepobj); - PyTuple_SET_ITEM(out, 2, - PyString_FromStringAndSize(str + sep_len + pos, - len - sep_len - pos) - ); + Py_INCREF(sep_obj); + PyTuple_SET_ITEM(out, 1, sep_obj); + pos += sep_len; + obj = PyString_FromStringAndSize(str + pos, len - pos); + PyTuple_SET_ITEM(out, 2, obj); if (PyErr_Occurred()) { Py_DECREF(out); return NULL; diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index aff14f593aa..770224884cc 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4,6 +4,9 @@ Unicode implementation based on 
original code by Fredrik Lundh, modified by Marc-Andre Lemburg according to the Unicode Integration Proposal (see file Misc/unicode.txt). +Major speed upgrades to the method implementations at the Reykjavik +NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke. + Copyright (c) Corporation for National Research Initiatives. -------------------------------------------------------------------- @@ -193,6 +196,7 @@ int unicode_resize(register PyUnicodeObject *unicode, /* Resizing shared object (unicode_empty or single character objects) in-place is not allowed. Use PyUnicode_Resize() instead ! */ + if (unicode == unicode_empty || (unicode->length == 1 && unicode->str[0] < 256U && @@ -202,8 +206,11 @@ int unicode_resize(register PyUnicodeObject *unicode, return -1; } - /* We allocate one more byte to make sure the string is - Ux0000 terminated -- XXX is this needed ? */ + /* We allocate one more character to make sure the string is Ux0000 terminated. + The overallocation is also used by fastsearch, which assumes that it's + safe to look at str[length] (without making any assumptions about what + it contains). */ + oldstr = unicode->str; PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1); if (!unicode->str) { @@ -3859,8 +3866,6 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s, /* --- Helpers ------------------------------------------------------------ */ -#define USE_FAST /* experimental fast search implementation */ - /* fast search/count implementation, based on a mix between boyer- moore and horspool, with a few more bells and whistles on the top. 
for some more background, see: http://effbot.org/stringlib */ @@ -3936,10 +3941,8 @@ fastsearch(Py_UNICODE* s, Py_ssize_t n, Py_UNICODE* p, Py_ssize_t m, int mode) /* miss: check if next character is part of pattern */ if (!(mask & (1 << (s[i+m] & 0x1F)))) i = i + m; - else { + else i = i + skip; - continue; - } } else { /* skip: check if next character is part of pattern */ if (!(mask & (1 << (s[i+m] & 0x1F)))) @@ -3973,23 +3976,13 @@ LOCAL(Py_ssize_t) count(PyUnicodeObject *self, if (substring->length == 0) return (end - start + 1); -#ifdef USE_FAST count = fastsearch( PyUnicode_AS_UNICODE(self) + start, end - start, substring->str, substring->length, FAST_COUNT ); + if (count < 0) count = 0; /* no match */ -#else - end -= substring->length; - - while (start <= end) - if (Py_UNICODE_MATCH(self, start, substring)) { - count++; - start += substring->length; - } else - start++; -#endif return count; } @@ -4040,30 +4033,19 @@ static Py_ssize_t findstring(PyUnicodeObject *self, if (substring->length == 0) return (direction > 0) ? 
start : end; -#ifdef USE_FAST if (direction > 0) { Py_ssize_t pos = fastsearch( PyUnicode_AS_UNICODE(self) + start, end - start, substring->str, substring->length, FAST_SEARCH ); - if (pos < 0) - return pos; - return pos + start; - } -#endif - - end -= substring->length; - - if (direction < 0) { + if (pos >= 0) + return pos + start; + } else { + end -= substring->length; for (; end >= start; end--) if (Py_UNICODE_MATCH(self, end, substring)) return end; - } else { - for (; start <= end; start++) - if (Py_UNICODE_MATCH(self, start, substring)) - return start; } - return -1; } @@ -5167,11 +5149,8 @@ int PyUnicode_Contains(PyObject *container, PyObject *element) { PyUnicodeObject *u, *v; - int result; Py_ssize_t size; -#ifdef USE_FAST Py_ssize_t pos; -#endif /* Coerce the two arguments */ v = (PyUnicodeObject *) PyUnicode_FromObject(element); @@ -5189,44 +5168,19 @@ int PyUnicode_Contains(PyObject *container, size = PyUnicode_GET_SIZE(v); if (!size) { - result = 1; + pos = 0; goto done; } -#ifdef USE_FAST pos = fastsearch( PyUnicode_AS_UNICODE(u), PyUnicode_GET_SIZE(u), PyUnicode_AS_UNICODE(v), size, FAST_SEARCH ); - result = (pos != -1); -#else - result = 0; - - if (size == 1) { - Py_UNICODE chr = PyUnicode_AS_UNICODE(v)[0]; - Py_UNICODE* ptr = PyUnicode_AS_UNICODE(u); - Py_UNICODE* end = ptr + PyUnicode_GET_SIZE(u); - for (; ptr < end; ptr++) { - if (*ptr == chr) { - result = 1; - break; - } - } - } else { - Py_ssize_t start = 0; - Py_ssize_t end = PyUnicode_GET_SIZE(u) - size; - for (; start <= end; start++) - if (Py_UNICODE_MATCH(u, start, v)) { - result = 1; - break; - } - } -#endif done: Py_DECREF(u); Py_DECREF(v); - return result; + return (pos != -1); } /* Concat to string or Unicode object giving a new Unicode object. 
*/ @@ -6335,6 +6289,84 @@ unicode_split(PyUnicodeObject *self, PyObject *args) return PyUnicode_Split((PyObject *)self, substring, maxcount); } +PyObject * +PyUnicode_Partition(PyObject *str_in, PyObject *sep_in) +{ + PyObject* str_obj; + PyObject* sep_obj; + Py_UNICODE *str, *sep; + Py_ssize_t len, sep_len, pos; + PyObject* out; + + str_obj = PyUnicode_FromObject(str_in); + if (!str_obj) + return NULL; + sep_obj = PyUnicode_FromObject(sep_in); + if (!sep_obj) + goto error; + + str = PyUnicode_AS_UNICODE(str_obj); + len = PyUnicode_GET_SIZE(str_obj); + + sep = PyUnicode_AS_UNICODE(sep_obj); + sep_len = PyUnicode_GET_SIZE(sep_obj); + + if (sep_len == 0) { + PyErr_SetString(PyExc_ValueError, "empty separator"); + goto error; + } + + out = PyTuple_New(3); + if (!out) + goto error; + + pos = fastsearch(str, len, sep, sep_len, FAST_SEARCH); + if (pos < 0) { + Py_INCREF(str_obj); + PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj); + Py_INCREF(unicode_empty); + PyTuple_SET_ITEM(out, 1, (PyObject*) unicode_empty); + Py_INCREF(unicode_empty); + PyTuple_SET_ITEM(out, 2, (PyObject*) unicode_empty); + } else { + PyObject* obj; + PyTuple_SET_ITEM(out, 0, PyUnicode_FromUnicode(str, pos)); + Py_INCREF(sep_obj); + PyTuple_SET_ITEM(out, 1, sep_obj); + obj = PyUnicode_FromUnicode(str + sep_len + pos, len - sep_len - pos); + PyTuple_SET_ITEM(out, 2, obj); + if (PyErr_Occurred()) { + Py_DECREF(out); + goto error; + } + } + + return out; + +error: + Py_XDECREF(sep_obj); + Py_DECREF(str_obj); + return NULL; +} + +PyDoc_STRVAR(partition__doc__, +"S.partition(sep) -> (head, sep, tail)\n\ +\n\ +Searches for the separator sep in S, and returns the part before it,\n\ +the separator itself, and the part after it. 
If the separator is not\n\ +found, returns S and two empty strings."); + +static PyObject* +unicode_partition(PyUnicodeObject *self, PyObject *args) +{ + PyObject *separator; + + if (!PyArg_ParseTuple(args, "O:partition", &separator)) + return NULL; + + return PyUnicode_Partition((PyObject *)self, separator); +} + PyObject *PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) @@ -6588,6 +6620,7 @@ static PyMethodDef unicode_methods[] = { {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__}, {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__}, {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__}, + {"partition", (PyCFunction) unicode_partition, METH_VARARGS, partition__doc__}, {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},