Add rsplit method for str and unicode builtin types.

SF feature request #801847.
Original patch is written by Sean Reifschneider.
This commit is contained in:
Hye-Shik Chang 2003-12-15 18:49:53 +00:00
parent dce391cb39
commit 3ae811b57d
7 changed files with 402 additions and 1 deletions

View File

@ -694,6 +694,24 @@ The original string is returned if
\versionchanged[Support for the \var{fillchar} argument]{2.4}
\end{methoddesc}
\begin{methoddesc}[string]{rsplit}{\optional{, sep\optional{, maxsplit}}}
Return a list of the words of the string, scanning the string from
the end working forward. The resulting list of words is in the
same order as \function{split()}. If the optional second argument
\var{sep} is absent or \code{None}, the words are separated by
arbitrary strings of whitespace characters (space, tab, newline,
return, formfeed). If the second argument \var{sep} is present and
not \code{None}, it specifies a string to be used as the word
separator. The returned list will then have one more item than the
number of non-overlapping occurrences of the separator in the string.
The optional third argument \var{maxsplit} defaults to 0. If it
is nonzero, at most \var{maxsplit} number of splits occur, and the
remainder of the string is returned as the first element of the
list (thus, the list will have at most \code{\var{maxsplit}+1}
elements).
\versionadded{2.4}
\end{methoddesc}
\begin{methoddesc}[string]{rstrip}{\optional{chars}}
Return a copy of the string with trailing characters removed. If
\var{chars} is omitted or \code{None}, whitespace characters are

View File

@ -215,6 +215,23 @@ The functions defined in this module are:
elements).
\end{funcdesc}
\begin{funcdesc}{rsplit}{s\optional{, sep\optional{, maxsplit}}}
Return a list of the words of the string \var{s}, scanning \var{s} from
the end working forward. The resulting list of words is in the same
order as \function{split()}. If the optional second argument \var{sep}
is absent or \code{None}, the words are separated by arbitrary strings
of whitespace characters (space, tab, newline, return, formfeed).
If the second argument \var{sep} is present and not \code{None}, it
specifies a string to be used as the word separator. The returned
list will then have one more item than the number of non-overlapping
occurrences of the separator in the string. The optional third argument
\var{maxsplit} defaults to 0. If it is nonzero, at most \var{maxsplit}
number of splits occur, and the remainder of the string is returned
as the first element of the list (thus, the list will have at most
\code{\var{maxsplit}+1} elements).
\versionadded{2.4}
\end{funcdesc}
\begin{funcdesc}{splitfields}{s\optional{, sep\optional{, maxsplit}}}
This function behaves identically to \function{split()}. (In the
past, \function{split()} was only used with one argument, while

View File

@ -185,6 +185,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
# define PyUnicode_Resize PyUnicodeUCS2_Resize
# define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
# define PyUnicode_Split PyUnicodeUCS2_Split
# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
# define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
# define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
# define PyUnicode_Translate PyUnicodeUCS2_Translate
@ -959,6 +960,25 @@ PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
int keepends /* If true, line end markers are included */
);
/* Split a string giving a list of Unicode strings.
If sep is NULL, splitting will be done at all whitespace
substrings. Otherwise, splits occur at the given separator.
At most maxsplit splits will be done. But unlike PyUnicode_Split
PyUnicode_RSplit splits from the end of the string. If negative,
no limit is set.
Separators are not included in the resulting list.
*/
PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
PyObject *s, /* String to split */
PyObject *sep, /* String separator */
int maxsplit /* Maxsplit count */
);
/* Translate a string by applying a character mapping table to it and
return the resulting Unicode object.

View File

@ -121,6 +121,18 @@ def split(s, sep=None, maxsplit=-1):
return s.split(sep, maxsplit)
splitfields = split
# Split a string into a list of space/tab-separated words
def rsplit(s, sep=None, maxsplit=-1):
"""rsplit(s [,sep [,maxsplit]]) -> list of strings
Return a list of the words in the string s, using sep as the
delimiter string, starting at the end of the string and working
to the front. If maxsplit is given, at most maxsplit splits are
done. If sep is not specified or is None, any whitespace string
is a separator.
"""
return s.rsplit(sep, maxsplit)
# Join fields with optional separator
def join(words, sep = ' '):
"""join(list [,sep]) -> string

View File

@ -189,6 +189,26 @@ class CommonTest(unittest.TestCase):
self.checkraises(TypeError, 'hello', 'split', 42, 42, 42)
def test_rsplit(self):
self.checkequal(['this', 'is', 'the', 'rsplit', 'function'],
'this is the rsplit function', 'rsplit')
self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|')
self.checkequal(['a|b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|', 2)
self.checkequal(['a b c', 'd'], 'a b c d', 'rsplit', None, 1)
self.checkequal(['a b', 'c', 'd'], 'a b c d', 'rsplit', None, 2)
self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'rsplit', None, 3)
self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'rsplit', None, 4)
self.checkequal(['a b c d'], 'a b c d', 'rsplit', None, 0)
self.checkequal(['a, b, c', 'd'], 'a, b, c, d', 'rsplit', ', ', 1)
self.checkequal(['a, b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 2)
self.checkequal(['a', 'b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 3)
self.checkequal(['a', 'b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 4)
self.checkequal(['a, b, c, d'], 'a, b, c, d', 'rsplit', ', ', 0)
self.checkequal(['a b', 'c', 'd'], 'a b c d', 'rsplit', None, 2)
self.checkequal(['a\x00b', 'c'], 'a\x00b\x00c', 'rsplit', '\x00', 1)
self.checkequal(['', ''], 'abcd', 'rsplit', 'abcd')
self.checkequal([u'a b', u'c', u'd'], 'a b c d', 'rsplit', u' ', 2)
def test_strip(self):
self.checkequal('hello', ' hello ', 'strip')
self.checkequal('hello ', ' hello ', 'lstrip')

View File

@ -1407,6 +1407,129 @@ string_split(PyStringObject *self, PyObject *args)
return NULL;
}
static PyObject *
rsplit_whitespace(const char *s, int len, int maxsplit)
{
int i, j, err;
PyObject* item;
PyObject *list = PyList_New(0);
if (list == NULL)
return NULL;
for (i = j = len - 1; i >= 0; ) {
while (i >= 0 && isspace(Py_CHARMASK(s[i])))
i--;
j = i;
while (i >= 0 && !isspace(Py_CHARMASK(s[i])))
i--;
if (j > i) {
if (maxsplit-- <= 0)
break;
item = PyString_FromStringAndSize(s+i+1, (int)(j-i));
if (item == NULL)
goto finally;
err = PyList_Insert(list, 0, item);
Py_DECREF(item);
if (err < 0)
goto finally;
while (i >= 0 && isspace(Py_CHARMASK(s[i])))
i--;
j = i;
}
}
if (j >= 0) {
item = PyString_FromStringAndSize(s, (int)(j + 1));
if (item == NULL)
goto finally;
err = PyList_Insert(list, 0, item);
Py_DECREF(item);
if (err < 0)
goto finally;
}
return list;
finally:
Py_DECREF(list);
return NULL;
}
PyDoc_STRVAR(rsplit__doc__,
"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
\n\
Return a list of the words in the string S, using sep as the\n\
delimiter string, starting at the end of the string and working\n\
to the front. If maxsplit is given, at most maxsplit splits are\n\
done. If sep is not specified or is None, any whitespace string\n\
is a separator.");
static PyObject *
string_rsplit(PyStringObject *self, PyObject *args)
{
int len = PyString_GET_SIZE(self), n, i, j, err;
int maxsplit = -1;
const char *s = PyString_AS_STRING(self), *sub;
PyObject *list, *item, *subobj = Py_None;
if (!PyArg_ParseTuple(args, "|Oi:rsplit", &subobj, &maxsplit))
return NULL;
if (maxsplit < 0)
maxsplit = INT_MAX;
if (subobj == Py_None)
return rsplit_whitespace(s, len, maxsplit);
if (PyString_Check(subobj)) {
sub = PyString_AS_STRING(subobj);
n = PyString_GET_SIZE(subobj);
}
#ifdef Py_USING_UNICODE
else if (PyUnicode_Check(subobj))
return PyUnicode_RSplit((PyObject *)self, subobj, maxsplit);
#endif
else if (PyObject_AsCharBuffer(subobj, &sub, &n))
return NULL;
if (n == 0) {
PyErr_SetString(PyExc_ValueError, "empty separator");
return NULL;
}
list = PyList_New(0);
if (list == NULL)
return NULL;
j = len;
i = j - n;
while (i >= 0) {
if (s[i] == sub[0] && memcmp(s+i, sub, n) == 0) {
if (maxsplit-- <= 0)
break;
item = PyString_FromStringAndSize(s+i+n, (int)(j-i-n));
if (item == NULL)
goto fail;
err = PyList_Insert(list, 0, item);
Py_DECREF(item);
if (err < 0)
goto fail;
j = i;
i -= n;
}
else
i--;
}
item = PyString_FromStringAndSize(s, j);
if (item == NULL)
goto fail;
err = PyList_Insert(list, 0, item);
Py_DECREF(item);
if (err < 0)
goto fail;
return list;
fail:
Py_DECREF(list);
return NULL;
}
PyDoc_STRVAR(join__doc__,
"S.join(sequence) -> string\n\
@ -3064,6 +3187,7 @@ string_methods[] = {
string.maketrans(). */
{"join", (PyCFunction)string_join, METH_O, join__doc__},
{"split", (PyCFunction)string_split, METH_VARARGS, split__doc__},
{"rsplit", (PyCFunction)string_rsplit, METH_VARARGS, rsplit__doc__},
{"lower", (PyCFunction)string_lower, METH_NOARGS, lower__doc__},
{"upper", (PyCFunction)string_upper, METH_NOARGS, upper__doc__},
{"islower", (PyCFunction)string_islower, METH_NOARGS, islower__doc__},

View File

@ -4053,7 +4053,7 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
}
#define SPLIT_APPEND(data, left, right) \
str = PyUnicode_FromUnicode(data + left, right - left); \
str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
if (!str) \
goto onError; \
if (PyList_Append(list, str)) { \
@ -4063,6 +4063,17 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
else \
Py_DECREF(str);
#define SPLIT_INSERT(data, left, right) \
str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
if (!str) \
goto onError; \
if (PyList_Insert(list, 0, str)) { \
Py_DECREF(str); \
goto onError; \
} \
else \
Py_DECREF(str);
static
PyObject *split_whitespace(PyUnicodeObject *self,
PyObject *list,
@ -4214,7 +4225,106 @@ PyObject *split_substring(PyUnicodeObject *self,
return NULL;
}
static
PyObject *rsplit_whitespace(PyUnicodeObject *self,
PyObject *list,
int maxcount)
{
register int i;
register int j;
int len = self->length;
PyObject *str;
for (i = j = len - 1; i >= 0; ) {
/* find a token */
while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
i--;
j = i;
while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
i--;
if (j > i) {
if (maxcount-- <= 0)
break;
SPLIT_INSERT(self->str, i + 1, j + 1);
while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
i--;
j = i;
}
}
if (j >= 0) {
SPLIT_INSERT(self->str, 0, j + 1);
}
return list;
onError:
Py_DECREF(list);
return NULL;
}
static
PyObject *rsplit_char(PyUnicodeObject *self,
PyObject *list,
Py_UNICODE ch,
int maxcount)
{
register int i;
register int j;
int len = self->length;
PyObject *str;
for (i = j = len - 1; i >= 0; ) {
if (self->str[i] == ch) {
if (maxcount-- <= 0)
break;
SPLIT_INSERT(self->str, i + 1, j + 1);
j = i = i - 1;
} else
i--;
}
if (j >= 0) {
SPLIT_INSERT(self->str, 0, j + 1);
}
return list;
onError:
Py_DECREF(list);
return NULL;
}
static
PyObject *rsplit_substring(PyUnicodeObject *self,
PyObject *list,
PyUnicodeObject *substring,
int maxcount)
{
register int i;
register int j;
int len = self->length;
int sublen = substring->length;
PyObject *str;
for (i = len - sublen, j = len; i >= 0; ) {
if (Py_UNICODE_MATCH(self, i, substring)) {
if (maxcount-- <= 0)
break;
SPLIT_INSERT(self->str, i + sublen, j);
j = i;
i -= sublen;
} else
i--;
}
if (j >= 0) {
SPLIT_INSERT(self->str, 0, j);
}
return list;
onError:
Py_DECREF(list);
return NULL;
}
#undef SPLIT_APPEND
#undef SPLIT_INSERT
static
PyObject *split(PyUnicodeObject *self,
@ -4245,6 +4355,35 @@ PyObject *split(PyUnicodeObject *self,
return split_substring(self,list,substring,maxcount);
}
static
PyObject *rsplit(PyUnicodeObject *self,
PyUnicodeObject *substring,
int maxcount)
{
PyObject *list;
if (maxcount < 0)
maxcount = INT_MAX;
list = PyList_New(0);
if (!list)
return NULL;
if (substring == NULL)
return rsplit_whitespace(self,list,maxcount);
else if (substring->length == 1)
return rsplit_char(self,list,substring->str[0],maxcount);
else if (substring->length == 0) {
Py_DECREF(list);
PyErr_SetString(PyExc_ValueError, "empty separator");
return NULL;
}
else
return rsplit_substring(self,list,substring,maxcount);
}
static
PyObject *replace(PyUnicodeObject *self,
PyUnicodeObject *str1,
@ -5675,6 +5814,56 @@ unicode_split(PyUnicodeObject *self, PyObject *args)
return PyUnicode_Split((PyObject *)self, substring, maxcount);
}
PyObject *PyUnicode_RSplit(PyObject *s,
PyObject *sep,
int maxsplit)
{
PyObject *result;
s = PyUnicode_FromObject(s);
if (s == NULL)
return NULL;
if (sep != NULL) {
sep = PyUnicode_FromObject(sep);
if (sep == NULL) {
Py_DECREF(s);
return NULL;
}
}
result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
Py_DECREF(s);
Py_XDECREF(sep);
return result;
}
PyDoc_STRVAR(rsplit__doc__,
"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
\n\
Return a list of the words in S, using sep as the\n\
delimiter string, starting at the end of the string and\n\
working to the front. If maxsplit is given, at most maxsplit\n\
splits are done. If sep is not specified, any whitespace string\n\
is a separator.");
static PyObject*
unicode_rsplit(PyUnicodeObject *self, PyObject *args)
{
PyObject *substring = Py_None;
int maxcount = -1;
if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
return NULL;
if (substring == Py_None)
return rsplit(self, NULL, maxcount);
else if (PyUnicode_Check(substring))
return rsplit(self, (PyUnicodeObject *)substring, maxcount);
else
return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
}
PyDoc_STRVAR(splitlines__doc__,
"S.splitlines([keepends]]) -> list of strings\n\
\n\
@ -5870,6 +6059,7 @@ static PyMethodDef unicode_methods[] = {
{"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
{"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
{"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
{"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},