Add rsplit method for str and unicode builtin types.

SF feature request #801847. Original patch is written by Sean Reifschneider.
2003-12-15 18:49:53 +00:00 · 2003-12-15 18:49:53 +00:00 · 3ae811b57d
parent dce391cb39
commit 3ae811b57d
7 changed files with 402 additions and 1 deletions
--- a/Doc/lib/libstdtypes.tex
+++ b/Doc/lib/libstdtypes.tex
@ -694,6 +694,24 @@ The original string is returned if
 \versionchanged[Support for the \var{fillchar} argument]{2.4}
 \end{methoddesc}

+\begin{methoddesc}[string]{rsplit}{\optional{, sep\optional{, maxsplit}}}
+Return a list of the words of the string, scanning the string from
+the end working forward.  The resulting list of words is in the
+same order as \function{split()}.  If the optional second argument
+\var{sep} is absent or \code{None}, the words are separated by
+arbitrary strings of whitespace characters (space, tab, newline,
+return, formfeed).  If the second argument \var{sep} is present and
+not \code{None}, it specifies a string to be used as the word
+separator.  The returned list will then have one more item than the
+number of non-overlapping occurrences of the separator in the string.
+The optional third argument \var{maxsplit} defaults to 0.  If it
+is nonzero, at most \var{maxsplit} number of splits occur, and the
+remainder of the string is returned as the first element of the
+list (thus, the list will have at most \code{\var{maxsplit}+1}
+elements).
+\versionadded{2.4}
+\end{methoddesc}
+
 \begin{methoddesc}[string]{rstrip}{\optional{chars}}
 Return a copy of the string with trailing characters removed.  If
 \var{chars} is omitted or \code{None}, whitespace characters are
--- a/Doc/lib/libstring.tex
+++ b/Doc/lib/libstring.tex
@ -215,6 +215,23 @@ The functions defined in this module are:
  elements).
 \end{funcdesc}

+\begin{funcdesc}{rsplit}{s\optional{, sep\optional{, maxsplit}}}
+  Return a list of the words of the string \var{s}, scanning \var{s} from
+  the end working forward.  The resulting list of words is in the same
+  order as \function{split()}.  If the optional second argument \var{sep}
+  is absent or \code{None}, the words are separated by arbitrary strings
+  of whitespace characters (space, tab, newline, return, formfeed).
+  If the second argument \var{sep} is present and not \code{None}, it
+  specifies a string to be used as the word separator.  The returned
+  list will then have one more item than the number of non-overlapping
+  occurrences of the separator in the string.  The optional third argument
+  \var{maxsplit} defaults to 0.  If it is nonzero, at most \var{maxsplit}
+  number of splits occur, and the remainder of the string is returned
+  as the first element of the list (thus, the list will have at most
+  \code{\var{maxsplit}+1} elements).
+  \versionadded{2.4}
+\end{funcdesc}
+
 \begin{funcdesc}{splitfields}{s\optional{, sep\optional{, maxsplit}}}
  This function behaves identically to \function{split()}.  (In the
  past, \function{split()} was only used with one argument, while
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -185,6 +185,7 @@ typedef PY_UNICODE_TYPE Py_UNICODE;
 # define PyUnicode_Resize PyUnicodeUCS2_Resize
 # define PyUnicode_SetDefaultEncoding PyUnicodeUCS2_SetDefaultEncoding
 # define PyUnicode_Split PyUnicodeUCS2_Split
+# define PyUnicode_RSplit PyUnicodeUCS2_RSplit
 # define PyUnicode_Splitlines PyUnicodeUCS2_Splitlines
 # define PyUnicode_Tailmatch PyUnicodeUCS2_Tailmatch
 # define PyUnicode_Translate PyUnicodeUCS2_Translate
@ -959,6 +960,25 @@ PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
    int keepends		/* If true, line end markers are included */
    );		

+/* Split a string giving a list of Unicode strings.
+
+   If sep is NULL, splitting will be done at all whitespace
+   substrings. Otherwise, splits occur at the given separator.
+
+   At most maxsplit splits will be done. But unlike PyUnicode_Split
+   PyUnicode_RSplit splits from the end of the string. If negative,
+   no limit is set.
+
+   Separators are not included in the resulting list.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
+    PyObject *s,		/* String to split */
+    PyObject *sep,		/* String separator */
+    int maxsplit		/* Maxsplit count */
+    );		
+
 /* Translate a string by applying a character mapping table to it and
   return the resulting Unicode object.

--- a/Lib/string.py
+++ b/Lib/string.py
@ -121,6 +121,18 @@ def split(s, sep=None, maxsplit=-1):
    return s.split(sep, maxsplit)
 splitfields = split

+# Split a string into a list of space/tab-separated words
+def rsplit(s, sep=None, maxsplit=-1):
+    """rsplit(s [,sep [,maxsplit]]) -> list of strings
+
+    Return a list of the words in the string s, using sep as the
+    delimiter string, starting at the end of the string and working
+    to the front.  If maxsplit is given, at most maxsplit splits are
+    done. If sep is not specified or is None, any whitespace string
+    is a separator.
+    """
+    return s.rsplit(sep, maxsplit)
+
 # Join fields with optional separator
 def join(words, sep = ' '):
    """join(list [,sep]) -> string
--- a/Lib/test/string_tests.py
+++ b/Lib/test/string_tests.py
@ -189,6 +189,26 @@ class CommonTest(unittest.TestCase):

        self.checkraises(TypeError, 'hello', 'split', 42, 42, 42)

+    def test_rsplit(self):
+        self.checkequal(['this', 'is', 'the', 'rsplit', 'function'],
+                         'this is the rsplit function', 'rsplit')
+        self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|')
+        self.checkequal(['a|b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|', 2)
+        self.checkequal(['a b c', 'd'], 'a b c d', 'rsplit', None, 1)
+        self.checkequal(['a b', 'c', 'd'], 'a b c d', 'rsplit', None, 2)
+        self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'rsplit', None, 3)
+        self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'rsplit', None, 4)
+        self.checkequal(['a b c d'], 'a b c d', 'rsplit', None, 0)
+        self.checkequal(['a, b, c', 'd'], 'a, b, c, d', 'rsplit', ', ', 1)
+        self.checkequal(['a, b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 2)
+        self.checkequal(['a', 'b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 3)
+        self.checkequal(['a', 'b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 4)
+        self.checkequal(['a, b, c, d'], 'a, b, c, d', 'rsplit', ', ', 0)
+        self.checkequal(['a  b', 'c', 'd'], 'a  b  c  d', 'rsplit', None, 2)
+        self.checkequal(['a\x00b', 'c'], 'a\x00b\x00c', 'rsplit', '\x00', 1)
+        self.checkequal(['', ''], 'abcd', 'rsplit', 'abcd')
+        self.checkequal([u'a b', u'c', u'd'], 'a b c d', 'rsplit', u' ', 2)
+
    def test_strip(self):
        self.checkequal('hello', '   hello   ', 'strip')
        self.checkequal('hello   ', '   hello   ', 'lstrip')
--- a/Objects/stringobject.c
+++ b/Objects/stringobject.c
@ -1407,6 +1407,129 @@ string_split(PyStringObject *self, PyObject *args)
 	return NULL;
 }

+static PyObject *
+rsplit_whitespace(const char *s, int len, int maxsplit)
+{
+	int i, j, err;
+	PyObject* item;
+	PyObject *list = PyList_New(0);
+
+	if (list == NULL)
+		return NULL;
+
+	for (i = j = len - 1; i >= 0; ) {
+		while (i >= 0 && isspace(Py_CHARMASK(s[i])))
+			i--;
+		j = i;
+		while (i >= 0 && !isspace(Py_CHARMASK(s[i])))
+			i--;
+		if (j > i) {
+			if (maxsplit-- <= 0)
+				break;
+			item = PyString_FromStringAndSize(s+i+1, (int)(j-i));
+			if (item == NULL)
+				goto finally;
+			err = PyList_Insert(list, 0, item);
+			Py_DECREF(item);
+			if (err < 0)
+				goto finally;
+			while (i >= 0 && isspace(Py_CHARMASK(s[i])))
+				i--;
+			j = i;
+		}
+	}
+	if (j >= 0) {
+		item = PyString_FromStringAndSize(s, (int)(j + 1));
+		if (item == NULL)
+			goto finally;
+		err = PyList_Insert(list, 0, item);
+		Py_DECREF(item);
+		if (err < 0)
+			goto finally;
+	}
+	return list;
+  finally:
+	Py_DECREF(list);
+	return NULL;
+}
+
+
+PyDoc_STRVAR(rsplit__doc__,
+"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
+\n\
+Return a list of the words in the string S, using sep as the\n\
+delimiter string, starting at the end of the string and working\n\
+to the front.  If maxsplit is given, at most maxsplit splits are\n\
+done. If sep is not specified or is None, any whitespace string\n\
+is a separator.");
+
+static PyObject *
+string_rsplit(PyStringObject *self, PyObject *args)
+{
+	int len = PyString_GET_SIZE(self), n, i, j, err;
+	int maxsplit = -1;
+	const char *s = PyString_AS_STRING(self), *sub;
+	PyObject *list, *item, *subobj = Py_None;
+
+	if (!PyArg_ParseTuple(args, "|Oi:rsplit", &subobj, &maxsplit))
+		return NULL;
+	if (maxsplit < 0)
+		maxsplit = INT_MAX;
+	if (subobj == Py_None)
+		return rsplit_whitespace(s, len, maxsplit);
+	if (PyString_Check(subobj)) {
+		sub = PyString_AS_STRING(subobj);
+		n = PyString_GET_SIZE(subobj);
+	}
+#ifdef Py_USING_UNICODE
+	else if (PyUnicode_Check(subobj))
+		return PyUnicode_RSplit((PyObject *)self, subobj, maxsplit);
+#endif
+	else if (PyObject_AsCharBuffer(subobj, &sub, &n))
+		return NULL;
+	if (n == 0) {
+		PyErr_SetString(PyExc_ValueError, "empty separator");
+		return NULL;
+	}
+
+	list = PyList_New(0);
+	if (list == NULL)
+		return NULL;
+
+	j = len;
+	i = j - n;
+	while (i >= 0) {
+		if (s[i] == sub[0] && memcmp(s+i, sub, n) == 0) {
+			if (maxsplit-- <= 0)
+				break;
+			item = PyString_FromStringAndSize(s+i+n, (int)(j-i-n));
+			if (item == NULL)
+				goto fail;
+			err = PyList_Insert(list, 0, item);
+			Py_DECREF(item);
+			if (err < 0)
+				goto fail;
+			j = i;
+			i -= n;
+		}
+		else
+			i--;
+	}
+	item = PyString_FromStringAndSize(s, j);
+	if (item == NULL)
+		goto fail;
+	err = PyList_Insert(list, 0, item);
+	Py_DECREF(item);
+	if (err < 0)
+		goto fail;
+
+	return list;
+
+ fail:
+	Py_DECREF(list);
+	return NULL;
+}
+

 PyDoc_STRVAR(join__doc__,
 "S.join(sequence) -> string\n\
@ -3064,6 +3187,7 @@ string_methods[] = {
 	   string.maketrans(). */
 	{"join", (PyCFunction)string_join, METH_O, join__doc__},
 	{"split", (PyCFunction)string_split, METH_VARARGS, split__doc__},
+	{"rsplit", (PyCFunction)string_rsplit, METH_VARARGS, rsplit__doc__},
 	{"lower", (PyCFunction)string_lower, METH_NOARGS, lower__doc__},
 	{"upper", (PyCFunction)string_upper, METH_NOARGS, upper__doc__},
 	{"islower", (PyCFunction)string_islower, METH_NOARGS, islower__doc__},
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -4053,7 +4053,7 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
 }

 #define SPLIT_APPEND(data, left, right)					\
-	str = PyUnicode_FromUnicode(data + left, right - left);		\
+	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\
 	if (!str)							\
 	    goto onError;						\
 	if (PyList_Append(list, str)) {					\
@ -4063,6 +4063,17 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
        else								\
            Py_DECREF(str);

+#define SPLIT_INSERT(data, left, right)					\
+	str = PyUnicode_FromUnicode((data) + (left), (right) - (left));	\
+	if (!str)							\
+	    goto onError;						\
+	if (PyList_Insert(list, 0, str)) {				\
+	    Py_DECREF(str);						\
+	    goto onError;						\
+	}								\
+        else								\
+            Py_DECREF(str);
+
 static
 PyObject *split_whitespace(PyUnicodeObject *self,
 			   PyObject *list,
@ -4214,7 +4225,106 @@ PyObject *split_substring(PyUnicodeObject *self,
    return NULL;
 }

+static
+PyObject *rsplit_whitespace(PyUnicodeObject *self,
+			    PyObject *list,
+			    int maxcount)
+{
+    register int i;
+    register int j;
+    int len = self->length;
+    PyObject *str;
+
+    for (i = j = len - 1; i >= 0; ) {
+	/* find a token */
+	while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
+	    i--;
+	j = i;
+	while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
+	    i--;
+	if (j > i) {
+	    if (maxcount-- <= 0)
+		break;
+	    SPLIT_INSERT(self->str, i + 1, j + 1);
+	    while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
+		i--;
+	    j = i;
+	}
+    }
+    if (j >= 0) {
+	SPLIT_INSERT(self->str, 0, j + 1);
+    }
+    return list;
+
+ onError:
+    Py_DECREF(list);
+    return NULL;
+}
+
+static 
+PyObject *rsplit_char(PyUnicodeObject *self,
+		      PyObject *list,
+		      Py_UNICODE ch,
+		      int maxcount)
+{
+    register int i;
+    register int j;
+    int len = self->length;
+    PyObject *str;
+
+    for (i = j = len - 1; i >= 0; ) {
+	if (self->str[i] == ch) {
+	    if (maxcount-- <= 0)
+		break;
+	    SPLIT_INSERT(self->str, i + 1, j + 1);
+	    j = i = i - 1;
+	} else
+	    i--;
+    }
+    if (j >= 0) {
+	SPLIT_INSERT(self->str, 0, j + 1);
+    }
+    return list;
+
+ onError:
+    Py_DECREF(list);
+    return NULL;
+}
+
+static 
+PyObject *rsplit_substring(PyUnicodeObject *self,
+			   PyObject *list,
+			   PyUnicodeObject *substring,
+			   int maxcount)
+{
+    register int i;
+    register int j;
+    int len = self->length;
+    int sublen = substring->length;
+    PyObject *str;
+
+    for (i = len - sublen, j = len; i >= 0; ) {
+	if (Py_UNICODE_MATCH(self, i, substring)) {
+	    if (maxcount-- <= 0)
+		break;
+	    SPLIT_INSERT(self->str, i + sublen, j);
+	    j = i;
+	    i -= sublen;
+	} else
+	    i--;
+    }
+    if (j >= 0) {
+	SPLIT_INSERT(self->str, 0, j);
+    }
+    return list;
+
+ onError:
+    Py_DECREF(list);
+    return NULL;
+}
+
 #undef SPLIT_APPEND
+#undef SPLIT_INSERT

 static
 PyObject *split(PyUnicodeObject *self,
@ -4245,6 +4355,35 @@ PyObject *split(PyUnicodeObject *self,
 	return split_substring(self,list,substring,maxcount);
 }

+static
+PyObject *rsplit(PyUnicodeObject *self,
+		 PyUnicodeObject *substring,
+		 int maxcount)
+{
+    PyObject *list;
+
+    if (maxcount < 0)
+        maxcount = INT_MAX;
+
+    list = PyList_New(0);
+    if (!list)
+        return NULL;
+
+    if (substring == NULL)
+	return rsplit_whitespace(self,list,maxcount);
+
+    else if (substring->length == 1)
+	return rsplit_char(self,list,substring->str[0],maxcount);
+
+    else if (substring->length == 0) {
+	Py_DECREF(list);
+	PyErr_SetString(PyExc_ValueError, "empty separator");
+	return NULL;
+    }
+    else
+	return rsplit_substring(self,list,substring,maxcount);
+}
+
 static
 PyObject *replace(PyUnicodeObject *self,
 		  PyUnicodeObject *str1,
@ -5675,6 +5814,56 @@ unicode_split(PyUnicodeObject *self, PyObject *args)
 	return PyUnicode_Split((PyObject *)self, substring, maxcount);
 }

+PyObject *PyUnicode_RSplit(PyObject *s,
+			   PyObject *sep,
+			   int maxsplit)
+{
+    PyObject *result;
+    
+    s = PyUnicode_FromObject(s);
+    if (s == NULL)
+	return NULL;
+    if (sep != NULL) {
+	sep = PyUnicode_FromObject(sep);
+	if (sep == NULL) {
+	    Py_DECREF(s);
+	    return NULL;
+	}
+    }
+
+    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
+
+    Py_DECREF(s);
+    Py_XDECREF(sep);
+    return result;
+}
+
+PyDoc_STRVAR(rsplit__doc__,
+"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
+\n\
+Return a list of the words in S, using sep as the\n\
+delimiter string, starting at the end of the string and\n\
+working to the front.  If maxsplit is given, at most maxsplit\n\
+splits are done. If sep is not specified, any whitespace string\n\
+is a separator.");
+
+static PyObject*
+unicode_rsplit(PyUnicodeObject *self, PyObject *args)
+{
+    PyObject *substring = Py_None;
+    int maxcount = -1;
+
+    if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
+        return NULL;
+
+    if (substring == Py_None)
+	return rsplit(self, NULL, maxcount);
+    else if (PyUnicode_Check(substring))
+	return rsplit(self, (PyUnicodeObject *)substring, maxcount);
+    else
+	return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
+}
+
 PyDoc_STRVAR(splitlines__doc__,
 "S.splitlines([keepends]]) -> list of strings\n\
 \n\
@ -5870,6 +6059,7 @@ static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
+    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},