diff --git a/Doc/lib/libstring.tex b/Doc/lib/libstring.tex index 5f71ec81029..b3699010169 100644 --- a/Doc/lib/libstring.tex +++ b/Doc/lib/libstring.tex @@ -235,17 +235,28 @@ The functions defined in this module are: \function{joinfields()} was only used with two arguments.) \end{funcdesc} -\begin{funcdesc}{lstrip}{s} - Return a copy of \var{s} but without leading whitespace characters. +\begin{funcdesc}{lstrip}{s\optional{, chars}} +Return a copy of the string with leading characters removed. If +\var{chars} is omitted or \code{None}, whitespace characters are +removed. If given and not \code{None}, \var{chars} must be a string; +the characters in the string will be stripped from the beginning of +the string this method is called on. \end{funcdesc} -\begin{funcdesc}{rstrip}{s} - Return a copy of \var{s} but without trailing whitespace - characters. +\begin{funcdesc}{rstrip}{s\optional{, chars}} +Return a copy of the string with trailing characters removed. If +\var{chars} is omitted or \code{None}, whitespace characters are +removed. If given and not \code{None}, \var{chars} must be a string; +the characters in the string will be stripped from the end of the +string this method is called on. \end{funcdesc} -\begin{funcdesc}{strip}{s} - Return a copy of \var{s} without leading or trailing whitespace. +\begin{funcdesc}{strip}{s\optional{, chars}} +Return a copy of the string with leading and trailing characters +removed. If \var{chars} is omitted or \code{None}, whitespace +characters are removed. If given and not \code{None}, \var{chars} +must be a string; the characters in the string will be stripped from +both ends of the string this method is called on. 
\end{funcdesc} \begin{funcdesc}{swapcase}{s} diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 368a2121c45..103649deb35 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1040,6 +1040,13 @@ extern DL_IMPORT(int) PyUnicode_Contains( PyObject *element /* Element string */ ); +/* Externally visible for str.strip(unicode) */ +extern DL_IMPORT(PyObject *) _PyUnicode_XStrip( + PyUnicodeObject *self, + int striptype, + PyObject *sepobj + ); + /* === Characters Type APIs =============================================== */ /* These should not be used directly. Use the Py_UNICODE_IS* and diff --git a/Lib/test/string_tests.py b/Lib/test/string_tests.py index 180072c2fca..4185b9b1514 100644 --- a/Lib/test/string_tests.py +++ b/Lib/test/string_tests.py @@ -169,12 +169,18 @@ def run_method_tests(test): test('rstrip', ' hello ', ' hello', None) test('strip', 'hello', 'hello', None) - # strip/lstrip/rstrip with real arg + # strip/lstrip/rstrip with str arg test('strip', 'xyzzyhelloxyzzy', 'hello', 'xyz') test('lstrip', 'xyzzyhelloxyzzy', 'helloxyzzy', 'xyz') test('rstrip', 'xyzzyhelloxyzzy', 'xyzzyhello', 'xyz') test('strip', 'hello', 'hello', 'xyz') + # strip/lstrip/rstrip with unicode arg + test('strip', 'xyzzyhelloxyzzy', u'hello', u'xyz') + test('lstrip', 'xyzzyhelloxyzzy', u'helloxyzzy', u'xyz') + test('rstrip', 'xyzzyhelloxyzzy', u'xyzzyhello', u'xyz') + test('strip', 'hello', u'hello', u'xyz') + test('swapcase', 'HeLLo cOmpUteRs', 'hEllO CoMPuTErS') test('translate', 'xyzabcdef', 'xyzxyz', transtable, 'def') diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py index 8e8ddf9e2c5..5d739397710 100644 --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -169,6 +169,24 @@ test('lstrip', u' hello ', u'hello ') test('rstrip', u' hello ', u' hello') test('strip', u'hello', u'hello') +# strip/lstrip/rstrip with None arg +test('strip', u' hello ', u'hello', None) +test('lstrip', u' hello ', u'hello ', None) +test('rstrip', 
u' hello ', u' hello', None) +test('strip', u'hello', u'hello', None) + +# strip/lstrip/rstrip with unicode arg +test('strip', u'xyzzyhelloxyzzy', u'hello', u'xyz') +test('lstrip', u'xyzzyhelloxyzzy', u'helloxyzzy', u'xyz') +test('rstrip', u'xyzzyhelloxyzzy', u'xyzzyhello', u'xyz') +test('strip', u'hello', u'hello', u'xyz') + +# strip/lstrip/rstrip with str arg +test('strip', u'xyzzyhelloxyzzy', u'hello', 'xyz') +test('lstrip', u'xyzzyhelloxyzzy', u'helloxyzzy', 'xyz') +test('rstrip', u'xyzzyhelloxyzzy', u'xyzzyhello', 'xyz') +test('strip', u'hello', u'hello', 'xyz') + test('swapcase', u'HeLLo cOmpUteRs', u'hEllO CoMPuTErS') if 0: diff --git a/Objects/stringobject.c b/Objects/stringobject.c index 6a0eece665e..d3c9e4bdf75 100644 --- a/Objects/stringobject.c +++ b/Objects/stringobject.c @@ -1005,7 +1005,9 @@ static PyBufferProcs string_as_buffer = { #define BOTHSTRIP 2 /* Arrays indexed by above */ -static const char *stripname[] = {"lstrip", "rstrip", "strip"}; +static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; + +#define STRIPNAME(i) (stripformat[i]+3) static PyObject * @@ -1449,15 +1451,26 @@ do_argstrip(PyStringObject *self, int striptype, PyObject *args) { PyObject *sep = NULL; - if (!PyArg_ParseTuple(args, "|O:[lr]strip", &sep)) + if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) return NULL; if (sep != NULL && sep != Py_None) { - /* XXX What about Unicode? 
*/ - if (!PyString_Check(sep)) { + if (PyString_Check(sep)) + return do_xstrip(self, striptype, sep); + else if (PyUnicode_Check(sep)) { + PyObject *uniself = PyUnicode_FromObject((PyObject *)self); + PyObject *res; + if (uniself==NULL) + return NULL; + res = _PyUnicode_XStrip((PyUnicodeObject *)uniself, + striptype, sep); + Py_DECREF(uniself); + return res; + } + else { PyErr_Format(PyExc_TypeError, - "%s arg must be None or string", - stripname[striptype]); + "%s arg must be None, str or unicode", + STRIPNAME(striptype)); return NULL; } return do_xstrip(self, striptype, sep); @@ -1468,11 +1481,12 @@ do_argstrip(PyStringObject *self, int striptype, PyObject *args) static char strip__doc__[] = -"S.strip([sep]) -> string\n\ +"S.strip([sep]) -> string or unicode\n\ \n\ Return a copy of the string S with leading and trailing\n\ whitespace removed.\n\ -If sep is given and not None, remove characters in sep instead."; +If sep is given and not None, remove characters in sep instead.\n\ +If sep is unicode, S will be converted to unicode before stripping"; static PyObject * string_strip(PyStringObject *self, PyObject *args) @@ -1485,10 +1499,11 @@ string_strip(PyStringObject *self, PyObject *args) static char lstrip__doc__[] = -"S.lstrip([sep]) -> string\n\ +"S.lstrip([sep]) -> string or unicode\n\ \n\ Return a copy of the string S with leading whitespace removed.\n\ -If sep is given and not None, remove characters in sep instead."; +If sep is given and not None, remove characters in sep instead.\n\ +If sep is unicode, S will be converted to unicode before stripping"; static PyObject * string_lstrip(PyStringObject *self, PyObject *args) @@ -1501,10 +1516,11 @@ string_lstrip(PyStringObject *self, PyObject *args) static char rstrip__doc__[] = -"S.rstrip([sep]) -> string\n\ +"S.rstrip([sep]) -> string or unicode\n\ \n\ Return a copy of the string S with trailing whitespace removed.\n\ -If sep is given and not None, remove characters in sep instead."; +If sep is given and not 
None, remove characters in sep instead.\n\ +If sep is unicode, S will be converted to unicode before stripping"; static PyObject * string_rstrip(PyStringObject *self, PyObject *args) diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 8dbca6de1ee..2fe96681a05 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3503,35 +3503,6 @@ PyObject *split(PyUnicodeObject *self, return split_substring(self,list,substring,maxcount); } -static -PyObject *strip(PyUnicodeObject *self, - int left, - int right) -{ - Py_UNICODE *p = self->str; - int start = 0; - int end = self->length; - - if (left) - while (start < end && Py_UNICODE_ISSPACE(p[start])) - start++; - - if (right) - while (end > start && Py_UNICODE_ISSPACE(p[end-1])) - end--; - - if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) { - /* couldn't strip anything off, return original string */ - Py_INCREF(self); - return (PyObject*) self; - } - - return (PyObject*) PyUnicode_FromUnicode( - self->str + start, - end - start - ); -} - static PyObject *replace(PyUnicodeObject *self, PyUnicodeObject *str1, @@ -4464,17 +4435,173 @@ unicode_lower(PyUnicodeObject *self) return fixup(self, fixlower); } -static char lstrip__doc__[] = -"S.lstrip() -> unicode\n\ -\n\ -Return a copy of the string S with leading whitespace removed."; +#define LEFTSTRIP 0 +#define RIGHTSTRIP 1 +#define BOTHSTRIP 2 + +/* Arrays indexed by above */ +static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; + +#define STRIPNAME(i) (stripformat[i]+3) + +static const Py_UNICODE * +unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n) +{ + int i; + for (i = 0; i<n; ++i) + if (s[i]==c) + return s+i; + return NULL; +} + +/* externally visible for str.strip(unicode) */ +PyObject * +_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) +{ + Py_UNICODE *s = PyUnicode_AS_UNICODE(self); + int len = PyUnicode_GET_SIZE(self); + Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); + int seplen = PyUnicode_GET_SIZE(sepobj); + int i, j; + + i = 0; + if (striptype != RIGHTSTRIP) { + while (i < len && unicode_memchr(sep, s[i], seplen)) { + i++; + } + } + + j = len; + if (striptype != LEFTSTRIP) { + do { + j--; + } while (j >= i && unicode_memchr(sep, s[j], seplen)); + j++; + } + + if (i == 0 && j == len && PyUnicode_CheckExact(self)) { + Py_INCREF(self); + return (PyObject*)self; + } + else + return PyUnicode_FromUnicode(s+i, j-i); +} + static PyObject * -unicode_lstrip(PyUnicodeObject *self) +do_strip(PyUnicodeObject *self, int striptype) { - 
return strip(self, 1, 0); + Py_UNICODE *s = PyUnicode_AS_UNICODE(self); + int len = PyUnicode_GET_SIZE(self), i, j; + + i = 0; + if (striptype != RIGHTSTRIP) { + while (i < len && Py_UNICODE_ISSPACE(s[i])) { + i++; + } + } + + j = len; + if (striptype != LEFTSTRIP) { + do { + j--; + } while (j >= i && Py_UNICODE_ISSPACE(s[j])); + j++; + } + + if (i == 0 && j == len && PyUnicode_CheckExact(self)) { + Py_INCREF(self); + return (PyObject*)self; + } + else + return PyUnicode_FromUnicode(s+i, j-i); } + +static PyObject * +do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) +{ + PyObject *sep = NULL; + + if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) + return NULL; + + if (sep != NULL && sep != Py_None) { + if (PyUnicode_Check(sep)) + return _PyUnicode_XStrip(self, striptype, sep); + else if (PyString_Check(sep)) { + PyObject *res; + sep = PyUnicode_FromObject(sep); + if (sep==NULL) + return NULL; + res = _PyUnicode_XStrip(self, striptype, sep); + Py_DECREF(sep); + return res; + } + else { + PyErr_Format(PyExc_TypeError, + "%s arg must be None, unicode or str", + STRIPNAME(striptype)); + return NULL; + } + } + + return do_strip(self, striptype); +} + + +static char strip__doc__[] = +"S.strip([sep]) -> unicode\n\ +\n\ +Return a copy of the string S with leading and trailing\n\ +whitespace removed.\n\ +If sep is given and not None, remove characters in sep instead.\n\ +If sep is a str, it will be converted to unicode before stripping"; + +static PyObject * +unicode_strip(PyUnicodeObject *self, PyObject *args) +{ + if (PyTuple_GET_SIZE(args) == 0) + return do_strip(self, BOTHSTRIP); /* Common case */ + else + return do_argstrip(self, BOTHSTRIP, args); +} + + +static char lstrip__doc__[] = +"S.lstrip([sep]) -> unicode\n\ +\n\ +Return a copy of the string S with leading whitespace removed.\n\ +If sep is given and not None, remove characters in sep instead.\n\ +If sep is a str, it will be converted to unicode before stripping"; + +static 
PyObject * +unicode_lstrip(PyUnicodeObject *self, PyObject *args) +{ + if (PyTuple_GET_SIZE(args) == 0) + return do_strip(self, LEFTSTRIP); /* Common case */ + else + return do_argstrip(self, LEFTSTRIP, args); +} + + +static char rstrip__doc__[] = +"S.rstrip([sep]) -> unicode\n\ +\n\ +Return a copy of the string S with trailing whitespace removed.\n\ +If sep is given and not None, remove characters in sep instead.\n\ +If sep is a str, it will be converted to unicode before stripping"; + +static PyObject * +unicode_rstrip(PyUnicodeObject *self, PyObject *args) +{ + if (PyTuple_GET_SIZE(args) == 0) + return do_strip(self, RIGHTSTRIP); /* Common case */ + else + return do_argstrip(self, RIGHTSTRIP, args); +} + + static PyObject* unicode_repeat(PyUnicodeObject *str, int len) { @@ -4677,17 +4804,6 @@ unicode_rjust(PyUnicodeObject *self, PyObject *args) return (PyObject*) pad(self, width - self->length, 0, ' '); } -static char rstrip__doc__[] = -"S.rstrip() -> unicode\n\ -\n\ -Return a copy of the string S with trailing whitespace removed."; - -static PyObject * -unicode_rstrip(PyUnicodeObject *self) -{ - return strip(self, 0, 1); -} - static PyObject* unicode_slice(PyUnicodeObject *self, int start, int end) { @@ -4783,17 +4899,6 @@ PyObject *unicode_str(PyUnicodeObject *self) return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL); } -static char strip__doc__[] = -"S.strip() -> unicode\n\ -\n\ -Return a copy of S with leading and trailing whitespace removed."; - -static PyObject * -unicode_strip(PyUnicodeObject *self) -{ - return strip(self, 1, 1); -} - static char swapcase__doc__[] = "S.swapcase() -> unicode\n\ \n\ @@ -4966,14 +5071,14 @@ static PyMethodDef unicode_methods[] = { {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, - {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__}, + {"lstrip", 
(PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */ {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, - {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__}, + {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, - {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__}, + {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},