Bug # 1125 (my code).

Support bytes.split() and bytes.strip() -- these split/strip using ASCII
whitespace (tab, space, CR, LF, FF, VT) like their str counterparts.
Also for rsplit(), lstrip() and rstrip().
And change all these functions to accept arbitrary buffer-API-supporting
arguments.
With unit tests.
This commit is contained in:
Guido van Rossum 2007-09-10 16:53:45 +00:00
parent 954c31bcc7
commit 8f95067915
2 changed files with 233 additions and 59 deletions

View File

@ -617,16 +617,46 @@ class BytesTest(unittest.TestCase):
self.assertEqual(b.split(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
self.assertEqual(b.split(b'ss'), [b'mi', b'i', b'ippi'])
self.assertEqual(b.split(b'w'), [b])
# require an arg (no magic whitespace split)
self.assertRaises(TypeError, b.split)
def test_split_whitespace(self):
for b in (b' arf barf ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',
b'arf\fbarf', b'arf\vbarf'):
self.assertEqual(b.split(), [b'arf', b'barf'])
self.assertEqual(b.split(None), [b'arf', b'barf'])
self.assertEqual(b.split(None, 2), [b'arf', b'barf'])
self.assertEqual(b' a bb c '.split(None, 0), [b'a bb c '])
self.assertEqual(b' a bb c '.split(None, 1), [b'a', b'bb c '])
self.assertEqual(b' a bb c '.split(None, 2), [b'a', b'bb', b'c '])
self.assertEqual(b' a bb c '.split(None, 3), [b'a', b'bb', b'c'])
def test_split_buffer(self):
self.assertEqual(b'a b'.split(buffer(b' ')), [b'a', b'b'])
def test_split_string_error(self):
self.assertRaises(TypeError, b'a b'.split, ' ')
def test_rsplit(self):
b = b'mississippi'
self.assertEqual(b.rsplit(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
self.assertEqual(b.rsplit(b'ss'), [b'mi', b'i', b'ippi'])
self.assertEqual(b.rsplit(b'w'), [b])
# require an arg (no magic whitespace split)
self.assertRaises(TypeError, b.rsplit)
def test_rsplit_whitespace(self):
for b in (b' arf barf ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',
b'arf\fbarf', b'arf\vbarf'):
self.assertEqual(b.rsplit(), [b'arf', b'barf'])
self.assertEqual(b.rsplit(None), [b'arf', b'barf'])
self.assertEqual(b.rsplit(None, 2), [b'arf', b'barf'])
self.assertEqual(b' a bb c '.rsplit(None, 0), [b' a bb c'])
self.assertEqual(b' a bb c '.rsplit(None, 1), [b' a bb', b'c'])
self.assertEqual(b' a bb c '.rsplit(None,2), [b' a', b'bb', b'c'])
self.assertEqual(b' a bb c '.rsplit(None, 3), [b'a', b'bb', b'c'])
def test_rplit_buffer(self):
self.assertEqual(b'a b'.rsplit(buffer(b' ')), [b'a', b'b'])
def test_rplit_string_error(self):
self.assertRaises(TypeError, b'a b'.rsplit, ' ')
def test_partition(self):
b = b'mississippi'
@ -670,6 +700,22 @@ class BytesTest(unittest.TestCase):
self.assertEqual(b.rstrip(b'im'), b'mississipp')
self.assertEqual(b.rstrip(b'pim'), b'mississ')
def test_strip_whitespace(self):
b = b' \t\n\r\f\vabc \t\n\r\f\v'
self.assertEqual(b.strip(), b'abc')
self.assertEqual(b.lstrip(), b'abc \t\n\r\f\v')
self.assertEqual(b.rstrip(), b' \t\n\r\f\vabc')
def test_strip_buffer(self):
self.assertEqual(b'abc'.strip(buffer(b'ac')), b'b')
self.assertEqual(b'abc'.lstrip(buffer(b'ac')), b'bc')
self.assertEqual(b'abc'.rstrip(buffer(b'ac')), b'ab')
def test_strip_string_error(self):
self.assertRaises(TypeError, b'abc'.strip, 'b')
self.assertRaises(TypeError, b'abc'.lstrip, 'b')
self.assertRaises(TypeError, b'abc'.rstrip, 'b')
def test_ord(self):
b = b'\0A\x7f\x80\xff'
self.assertEqual([ord(b[i:i+1]) for i in range(len(b))],

View File

@ -2104,7 +2104,7 @@ bytes_replace(PyBytesObject *self, PyObject *args)
Py_LOCAL_INLINE(PyObject *)
split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
register Py_ssize_t i, j, count=0;
register Py_ssize_t i, j, count = 0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
@ -2113,7 +2113,7 @@ split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
i = j = 0;
while ((j < len) && (maxcount-- > 0)) {
for(; j<len; j++) {
for(; j < len; j++) {
/* I found that using memchr makes no difference */
if (s[j] == ch) {
SPLIT_ADD(s, i, j);
@ -2133,46 +2133,91 @@ split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
return NULL;
}
#define ISSPACE(c) (isspace(Py_CHARMASK(c)) && ((c) & 0x80) == 0)
Py_LOCAL_INLINE(PyObject *)
split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
{
register Py_ssize_t i, j, count = 0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
if (list == NULL)
return NULL;
for (i = j = 0; i < len; ) {
/* find a token */
while (i < len && ISSPACE(s[i]))
i++;
j = i;
while (i < len && !ISSPACE(s[i]))
i++;
if (j < i) {
if (maxcount-- <= 0)
break;
SPLIT_ADD(s, j, i);
while (i < len && ISSPACE(s[i]))
i++;
j = i;
}
}
if (j < len) {
SPLIT_ADD(s, j, len);
}
FIX_PREALLOC_SIZE(list);
return list;
onError:
Py_DECREF(list);
return NULL;
}
PyDoc_STRVAR(split__doc__,
"B.split(sep [,maxsplit]) -> list of bytes\n\
"B.split([sep [, maxsplit]]) -> list of bytes\n\
\n\
Return a list of the bytes in the string B, using sep as the\n\
delimiter. If maxsplit is given, at most maxsplit\n\
splits are done.");
Return a list of the bytes in the string B, using sep as the delimiter.\n\
If sep is not given, B is split on ASCII whitespace charcters\n\
(space, tab, return, newline, formfeed, vertical tab).\n\
If maxsplit is given, at most maxsplit splits are done.");
static PyObject *
bytes_split(PyBytesObject *self, PyObject *args)
{
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
Py_ssize_t maxsplit = -1, count=0;
Py_ssize_t maxsplit = -1, count = 0;
const char *s = PyBytes_AS_STRING(self), *sub;
PyObject *list, *str, *subobj;
PyObject *list, *str, *subobj = Py_None;
PyBuffer vsub;
#ifdef USE_FAST
Py_ssize_t pos;
#endif
if (!PyArg_ParseTuple(args, "O|n:split", &subobj, &maxsplit))
if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
return NULL;
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
if (PyBytes_Check(subobj)) {
sub = PyBytes_AS_STRING(subobj);
n = PyBytes_GET_SIZE(subobj);
}
/* XXX -> use the modern buffer interface */
else if (PyObject_AsCharBuffer(subobj, &sub, &n))
if (subobj == Py_None)
return split_whitespace(s, len, maxsplit);
if (_getbuffer(subobj, &vsub) < 0)
return NULL;
sub = vsub.buf;
n = vsub.len;
if (n == 0) {
PyErr_SetString(PyExc_ValueError, "empty separator");
PyObject_ReleaseBuffer(subobj, &vsub);
return NULL;
}
else if (n == 1)
if (n == 1)
return split_char(s, len, sub[0], maxsplit);
list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL)
if (list == NULL) {
PyObject_ReleaseBuffer(subobj, &vsub);
return NULL;
}
#ifdef USE_FAST
i = j = 0;
@ -2198,10 +2243,12 @@ bytes_split(PyBytesObject *self, PyObject *args)
#endif
SPLIT_ADD(s, i, len);
FIX_PREALLOC_SIZE(list);
PyObject_ReleaseBuffer(subobj, &vsub);
return list;
onError:
Py_DECREF(list);
PyObject_ReleaseBuffer(subobj, &vsub);
return NULL;
}
@ -2293,44 +2340,90 @@ rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
return NULL;
}
Py_LOCAL_INLINE(PyObject *)
rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
{
register Py_ssize_t i, j, count = 0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
if (list == NULL)
return NULL;
for (i = j = len - 1; i >= 0; ) {
/* find a token */
while (i >= 0 && Py_UNICODE_ISSPACE(s[i]))
i--;
j = i;
while (i >= 0 && !Py_UNICODE_ISSPACE(s[i]))
i--;
if (j > i) {
if (maxcount-- <= 0)
break;
SPLIT_ADD(s, i + 1, j + 1);
while (i >= 0 && Py_UNICODE_ISSPACE(s[i]))
i--;
j = i;
}
}
if (j >= 0) {
SPLIT_ADD(s, 0, j + 1);
}
FIX_PREALLOC_SIZE(list);
if (PyList_Reverse(list) < 0)
goto onError;
return list;
onError:
Py_DECREF(list);
return NULL;
}
PyDoc_STRVAR(rsplit__doc__,
"B.rsplit(sep [,maxsplit]) -> list of bytes\n\
\n\
Return a list of the sections in the byte B, using sep as the\n\
delimiter, starting at the end of the bytes and working\n\
to the front. If maxsplit is given, at most maxsplit splits are\n\
done.");
Return a list of the sections in the byte B, using sep as the delimiter,\n\
starting at the end of the bytes and working to the front.\n\
If sep is not given, B is split on ASCII whitespace characters\n\
(space, tab, return, newline, formfeed, vertical tab).\n\
If maxsplit is given, at most maxsplit splits are done.");
static PyObject *
bytes_rsplit(PyBytesObject *self, PyObject *args)
{
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
Py_ssize_t maxsplit = -1, count=0;
Py_ssize_t maxsplit = -1, count = 0;
const char *s = PyBytes_AS_STRING(self), *sub;
PyObject *list, *str, *subobj;
PyObject *list, *str, *subobj = Py_None;
PyBuffer vsub;
if (!PyArg_ParseTuple(args, "O|n:rsplit", &subobj, &maxsplit))
if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
return NULL;
if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX;
if (PyBytes_Check(subobj)) {
sub = PyBytes_AS_STRING(subobj);
n = PyBytes_GET_SIZE(subobj);
}
/* XXX -> Use the modern buffer interface */
else if (PyObject_AsCharBuffer(subobj, &sub, &n))
if (subobj == Py_None)
return rsplit_whitespace(s, len, maxsplit);
if (_getbuffer(subobj, &vsub) < 0)
return NULL;
sub = vsub.buf;
n = vsub.len;
if (n == 0) {
PyErr_SetString(PyExc_ValueError, "empty separator");
PyObject_ReleaseBuffer(subobj, &vsub);
return NULL;
}
else if (n == 1)
return rsplit_char(s, len, sub[0], maxsplit);
list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL)
if (list == NULL) {
PyObject_ReleaseBuffer(subobj, &vsub);
return NULL;
}
j = len;
i = j - n;
@ -2349,10 +2442,12 @@ bytes_rsplit(PyBytesObject *self, PyObject *args)
FIX_PREALLOC_SIZE(list);
if (PyList_Reverse(list) < 0)
goto onError;
PyObject_ReleaseBuffer(subobj, &vsub);
return list;
onError:
Py_DECREF(list);
PyObject_ReleaseBuffer(subobj, &vsub);
return NULL;
}
@ -2542,71 +2637,104 @@ rstrip_helper(unsigned char *myptr, Py_ssize_t mysize,
}
PyDoc_STRVAR(strip__doc__,
"B.strip(bytes) -> bytes\n\
"B.strip([bytes]) -> bytes\n\
\n\
Strip leading and trailing bytes contained in the argument.");
Strip leading and trailing bytes contained in the argument.\n\
If the argument is omitted, strip ASCII whitespace.");
static PyObject *
bytes_strip(PyBytesObject *self, PyObject *arg)
bytes_strip(PyBytesObject *self, PyObject *args)
{
Py_ssize_t left, right, mysize, argsize;
void *myptr, *argptr;
if (arg == NULL || !PyBytes_Check(arg)) {
PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument");
PyObject *arg = Py_None;
PyBuffer varg;
if (!PyArg_ParseTuple(args, "|O:strip", &arg))
return NULL;
if (arg == Py_None) {
argptr = "\t\n\r\f\v ";
argsize = 6;
}
else {
if (_getbuffer(arg, &varg) < 0)
return NULL;
argptr = varg.buf;
argsize = varg.len;
}
myptr = self->ob_bytes;
mysize = Py_Size(self);
argptr = ((PyBytesObject *)arg)->ob_bytes;
argsize = Py_Size(arg);
left = lstrip_helper(myptr, mysize, argptr, argsize);
if (left == mysize)
right = left;
else
right = rstrip_helper(myptr, mysize, argptr, argsize);
if (arg != Py_None)
PyObject_ReleaseBuffer(arg, &varg);
return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
}
PyDoc_STRVAR(lstrip__doc__,
"B.lstrip(bytes) -> bytes\n\
"B.lstrip([bytes]) -> bytes\n\
\n\
Strip leading bytes contained in the argument.");
Strip leading bytes contained in the argument.\n\
If the argument is omitted, strip leading ASCII whitespace.");
static PyObject *
bytes_lstrip(PyBytesObject *self, PyObject *arg)
bytes_lstrip(PyBytesObject *self, PyObject *args)
{
Py_ssize_t left, right, mysize, argsize;
void *myptr, *argptr;
if (arg == NULL || !PyBytes_Check(arg)) {
PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument");
PyObject *arg = Py_None;
PyBuffer varg;
if (!PyArg_ParseTuple(args, "|O:lstrip", &arg))
return NULL;
if (arg == Py_None) {
argptr = "\t\n\r\f\v ";
argsize = 6;
}
else {
if (_getbuffer(arg, &varg) < 0)
return NULL;
argptr = varg.buf;
argsize = varg.len;
}
myptr = self->ob_bytes;
mysize = Py_Size(self);
argptr = ((PyBytesObject *)arg)->ob_bytes;
argsize = Py_Size(arg);
left = lstrip_helper(myptr, mysize, argptr, argsize);
right = mysize;
if (arg != Py_None)
PyObject_ReleaseBuffer(arg, &varg);
return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
}
PyDoc_STRVAR(rstrip__doc__,
"B.rstrip(bytes) -> bytes\n\
"B.rstrip([bytes]) -> bytes\n\
\n\
Strip trailing bytes contained in the argument.");
Strip trailing bytes contained in the argument.\n\
If the argument is omitted, strip trailing ASCII whitespace.");
static PyObject *
bytes_rstrip(PyBytesObject *self, PyObject *arg)
bytes_rstrip(PyBytesObject *self, PyObject *args)
{
Py_ssize_t left, right, mysize, argsize;
void *myptr, *argptr;
if (arg == NULL || !PyBytes_Check(arg)) {
PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument");
PyObject *arg = Py_None;
PyBuffer varg;
if (!PyArg_ParseTuple(args, "|O:rstrip", &arg))
return NULL;
if (arg == Py_None) {
argptr = "\t\n\r\f\v ";
argsize = 6;
}
else {
if (_getbuffer(arg, &varg) < 0)
return NULL;
argptr = varg.buf;
argsize = varg.len;
}
myptr = self->ob_bytes;
mysize = Py_Size(self);
argptr = ((PyBytesObject *)arg)->ob_bytes;
argsize = Py_Size(arg);
left = 0;
right = rstrip_helper(myptr, mysize, argptr, argsize);
if (arg != Py_None)
PyObject_ReleaseBuffer(arg, &varg);
return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
}
@ -2839,9 +2967,9 @@ bytes_methods[] = {
{"reverse", (PyCFunction)bytes_reverse, METH_NOARGS, reverse__doc__},
{"pop", (PyCFunction)bytes_pop, METH_VARARGS, pop__doc__},
{"remove", (PyCFunction)bytes_remove, METH_O, remove__doc__},
{"strip", (PyCFunction)bytes_strip, METH_O, strip__doc__},
{"lstrip", (PyCFunction)bytes_lstrip, METH_O, lstrip__doc__},
{"rstrip", (PyCFunction)bytes_rstrip, METH_O, rstrip__doc__},
{"strip", (PyCFunction)bytes_strip, METH_VARARGS, strip__doc__},
{"lstrip", (PyCFunction)bytes_lstrip, METH_VARARGS, lstrip__doc__},
{"rstrip", (PyCFunction)bytes_rstrip, METH_VARARGS, rstrip__doc__},
{"decode", (PyCFunction)bytes_decode, METH_VARARGS, decode_doc},
{"__alloc__", (PyCFunction)bytes_alloc, METH_NOARGS, alloc_doc},
{"fromhex", (PyCFunction)bytes_fromhex, METH_VARARGS|METH_CLASS,