Bug # 1125 (my code).

Support bytes.split() and bytes.strip() -- these split/strip using ASCII
whitespace (tab, space, CR, LF, FF, VT) like their str counterparts.
Also for rsplit(), lstrip() and rstrip().
And change all these functions to accept arbitrary buffer-API-supporting
arguments.
With unit tests.
This commit is contained in:
Guido van Rossum 2007-09-10 16:53:45 +00:00
parent 954c31bcc7
commit 8f95067915
2 changed files with 233 additions and 59 deletions

View File

@ -617,16 +617,46 @@ class BytesTest(unittest.TestCase):
self.assertEqual(b.split(b'i'), [b'm', b'ss', b'ss', b'pp', b'']) self.assertEqual(b.split(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
self.assertEqual(b.split(b'ss'), [b'mi', b'i', b'ippi']) self.assertEqual(b.split(b'ss'), [b'mi', b'i', b'ippi'])
self.assertEqual(b.split(b'w'), [b]) self.assertEqual(b.split(b'w'), [b])
# require an arg (no magic whitespace split)
self.assertRaises(TypeError, b.split) def test_split_whitespace(self):
for b in (b' arf barf ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',
b'arf\fbarf', b'arf\vbarf'):
self.assertEqual(b.split(), [b'arf', b'barf'])
self.assertEqual(b.split(None), [b'arf', b'barf'])
self.assertEqual(b.split(None, 2), [b'arf', b'barf'])
self.assertEqual(b' a bb c '.split(None, 0), [b'a bb c '])
self.assertEqual(b' a bb c '.split(None, 1), [b'a', b'bb c '])
self.assertEqual(b' a bb c '.split(None, 2), [b'a', b'bb', b'c '])
self.assertEqual(b' a bb c '.split(None, 3), [b'a', b'bb', b'c'])
def test_split_buffer(self):
    # The separator may be any object supporting the buffer API,
    # not only a bytes instance.
    separator = buffer(b' ')
    expected = [b'a', b'b']
    self.assertEqual(b'a b'.split(separator), expected)
def test_split_string_error(self):
self.assertRaises(TypeError, b'a b'.split, ' ')
def test_rsplit(self): def test_rsplit(self):
b = b'mississippi' b = b'mississippi'
self.assertEqual(b.rsplit(b'i'), [b'm', b'ss', b'ss', b'pp', b'']) self.assertEqual(b.rsplit(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
self.assertEqual(b.rsplit(b'ss'), [b'mi', b'i', b'ippi']) self.assertEqual(b.rsplit(b'ss'), [b'mi', b'i', b'ippi'])
self.assertEqual(b.rsplit(b'w'), [b]) self.assertEqual(b.rsplit(b'w'), [b])
# require an arg (no magic whitespace split)
self.assertRaises(TypeError, b.rsplit) def test_rsplit_whitespace(self):
for b in (b' arf barf ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',
b'arf\fbarf', b'arf\vbarf'):
self.assertEqual(b.rsplit(), [b'arf', b'barf'])
self.assertEqual(b.rsplit(None), [b'arf', b'barf'])
self.assertEqual(b.rsplit(None, 2), [b'arf', b'barf'])
self.assertEqual(b' a bb c '.rsplit(None, 0), [b' a bb c'])
self.assertEqual(b' a bb c '.rsplit(None, 1), [b' a bb', b'c'])
self.assertEqual(b' a bb c '.rsplit(None,2), [b' a', b'bb', b'c'])
self.assertEqual(b' a bb c '.rsplit(None, 3), [b'a', b'bb', b'c'])
def test_rplit_buffer(self):
    # rsplit() accepts any buffer-API object as the separator.
    separator = buffer(b' ')
    expected = [b'a', b'b']
    self.assertEqual(b'a b'.rsplit(separator), expected)
def test_rplit_string_error(self):
self.assertRaises(TypeError, b'a b'.rsplit, ' ')
def test_partition(self): def test_partition(self):
b = b'mississippi' b = b'mississippi'
@ -670,6 +700,22 @@ class BytesTest(unittest.TestCase):
self.assertEqual(b.rstrip(b'im'), b'mississipp') self.assertEqual(b.rstrip(b'im'), b'mississipp')
self.assertEqual(b.rstrip(b'pim'), b'mississ') self.assertEqual(b.rstrip(b'pim'), b'mississ')
def test_strip_whitespace(self):
b = b' \t\n\r\f\vabc \t\n\r\f\v'
self.assertEqual(b.strip(), b'abc')
self.assertEqual(b.lstrip(), b'abc \t\n\r\f\v')
self.assertEqual(b.rstrip(), b' \t\n\r\f\vabc')
def test_strip_buffer(self):
    # Each strip variant accepts a buffer-API object as the set of
    # bytes to remove.
    cases = (
        ('strip', b'b'),
        ('lstrip', b'bc'),
        ('rstrip', b'ab'),
    )
    for method_name, expected in cases:
        strip = getattr(b'abc', method_name)
        self.assertEqual(strip(buffer(b'ac')), expected)
def test_strip_string_error(self):
self.assertRaises(TypeError, b'abc'.strip, 'b')
self.assertRaises(TypeError, b'abc'.lstrip, 'b')
self.assertRaises(TypeError, b'abc'.rstrip, 'b')
def test_ord(self): def test_ord(self):
b = b'\0A\x7f\x80\xff' b = b'\0A\x7f\x80\xff'
self.assertEqual([ord(b[i:i+1]) for i in range(len(b))], self.assertEqual([ord(b[i:i+1]) for i in range(len(b))],

View File

@ -2104,7 +2104,7 @@ bytes_replace(PyBytesObject *self, PyObject *args)
Py_LOCAL_INLINE(PyObject *) Py_LOCAL_INLINE(PyObject *)
split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount) split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{ {
register Py_ssize_t i, j, count=0; register Py_ssize_t i, j, count = 0;
PyObject *str; PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
@ -2113,7 +2113,7 @@ split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
i = j = 0; i = j = 0;
while ((j < len) && (maxcount-- > 0)) { while ((j < len) && (maxcount-- > 0)) {
for(; j<len; j++) { for(; j < len; j++) {
/* I found that using memchr makes no difference */ /* I found that using memchr makes no difference */
if (s[j] == ch) { if (s[j] == ch) {
SPLIT_ADD(s, i, j); SPLIT_ADD(s, i, j);
@ -2133,46 +2133,91 @@ split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
return NULL; return NULL;
} }
/* Whitespace test used for splitting bytes: accepts c only when isspace()
   accepts it AND its high bit is clear ((c) & 0x80) == 0), i.e. only byte
   values below 0x80 can ever count as whitespace. */
#define ISSPACE(c) (isspace(Py_CHARMASK(c)) && ((c) & 0x80) == 0)
/* Split s[0:len] on runs of ISSPACE bytes, scanning left to right.
 *
 * s        - start of the byte data to split
 * len      - number of bytes in s
 * maxcount - maximum number of splits to perform
 *
 * Returns a new list, or NULL if the list could not be created or the
 * onError path is taken.  Once maxcount tokens have been emitted, the
 * remaining data starting at the next token (trailing whitespace
 * included) is added as one final item.
 */
Py_LOCAL_INLINE(PyObject *)
split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
{
/* NOTE(review): count and str are never referenced directly below;
   presumably the SPLIT_ADD / FIX_PREALLOC_SIZE macros use them (and jump
   to onError on failure) -- confirm against the macro definitions. */
register Py_ssize_t i, j, count = 0;
PyObject *str;
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
if (list == NULL)
return NULL;
/* Invariant: j marks the start of the data not yet added to the list. */
for (i = j = 0; i < len; ) {
/* find a token */
/* skip whitespace preceding the token */
while (i < len && ISSPACE(s[i]))
i++;
j = i;
/* advance i to one past the end of the token */
while (i < len && !ISSPACE(s[i]))
i++;
if (j < i) {
/* Split budget used up: leave j at the start of this token so the
   code after the loop adds everything from here to the end. */
if (maxcount-- <= 0)
break;
SPLIT_ADD(s, j, i);
/* skip whitespace after the token so j == len when nothing but
   whitespace remains (no trailing item is then added) */
while (i < len && ISSPACE(s[i]))
i++;
j = i;
}
}
/* add the unsplit remainder, if any */
if (j < len) {
SPLIT_ADD(s, j, len);
}
FIX_PREALLOC_SIZE(list);
return list;
/* failure path: release the partially built list */
onError:
Py_DECREF(list);
return NULL;
}
PyDoc_STRVAR(split__doc__, PyDoc_STRVAR(split__doc__,
"B.split(sep [,maxsplit]) -> list of bytes\n\ "B.split([sep [, maxsplit]]) -> list of bytes\n\
\n\ \n\
Return a list of the bytes in the string B, using sep as the\n\ Return a list of the bytes in the string B, using sep as the delimiter.\n\
delimiter. If maxsplit is given, at most maxsplit\n\ If sep is not given, B is split on ASCII whitespace characters\n\
splits are done."); (space, tab, return, newline, formfeed, vertical tab).\n\
If maxsplit is given, at most maxsplit splits are done.");
static PyObject * static PyObject *
bytes_split(PyBytesObject *self, PyObject *args) bytes_split(PyBytesObject *self, PyObject *args)
{ {
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j; Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
Py_ssize_t maxsplit = -1, count=0; Py_ssize_t maxsplit = -1, count = 0;
const char *s = PyBytes_AS_STRING(self), *sub; const char *s = PyBytes_AS_STRING(self), *sub;
PyObject *list, *str, *subobj; PyObject *list, *str, *subobj = Py_None;
PyBuffer vsub;
#ifdef USE_FAST #ifdef USE_FAST
Py_ssize_t pos; Py_ssize_t pos;
#endif #endif
if (!PyArg_ParseTuple(args, "O|n:split", &subobj, &maxsplit)) if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
return NULL; return NULL;
if (maxsplit < 0) if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX; maxsplit = PY_SSIZE_T_MAX;
if (PyBytes_Check(subobj)) {
sub = PyBytes_AS_STRING(subobj); if (subobj == Py_None)
n = PyBytes_GET_SIZE(subobj); return split_whitespace(s, len, maxsplit);
}
/* XXX -> use the modern buffer interface */ if (_getbuffer(subobj, &vsub) < 0)
else if (PyObject_AsCharBuffer(subobj, &sub, &n))
return NULL; return NULL;
sub = vsub.buf;
n = vsub.len;
if (n == 0) { if (n == 0) {
PyErr_SetString(PyExc_ValueError, "empty separator"); PyErr_SetString(PyExc_ValueError, "empty separator");
PyObject_ReleaseBuffer(subobj, &vsub);
return NULL; return NULL;
} }
else if (n == 1) if (n == 1)
return split_char(s, len, sub[0], maxsplit); return split_char(s, len, sub[0], maxsplit);
list = PyList_New(PREALLOC_SIZE(maxsplit)); list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL) if (list == NULL) {
PyObject_ReleaseBuffer(subobj, &vsub);
return NULL; return NULL;
}
#ifdef USE_FAST #ifdef USE_FAST
i = j = 0; i = j = 0;
@ -2198,10 +2243,12 @@ bytes_split(PyBytesObject *self, PyObject *args)
#endif #endif
SPLIT_ADD(s, i, len); SPLIT_ADD(s, i, len);
FIX_PREALLOC_SIZE(list); FIX_PREALLOC_SIZE(list);
PyObject_ReleaseBuffer(subobj, &vsub);
return list; return list;
onError: onError:
Py_DECREF(list); Py_DECREF(list);
PyObject_ReleaseBuffer(subobj, &vsub);
return NULL; return NULL;
} }
@ -2293,44 +2340,90 @@ rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
return NULL; return NULL;
} }
/* Split s[0:len] on runs of ASCII whitespace, scanning right to left.
 *
 * s        - start of the byte data to split
 * len      - number of bytes in s
 * maxcount - maximum number of splits to perform
 *
 * Returns a new list in left-to-right order (the list is built back to
 * front and reversed before returning), or NULL on failure.  Once
 * maxcount tokens have been emitted, the remaining data up to and
 * including the next token (leading whitespace included) is added as one
 * final item.
 *
 * Whitespace is tested with ISSPACE, the same ASCII-only predicate that
 * split_whitespace() uses, so split() and rsplit() agree on what a
 * separator is.  Py_UNICODE_ISSPACE classifies Unicode code points and
 * would also accept byte values that are not ASCII whitespace
 * (e.g. 0x1C-0x1F).
 */
Py_LOCAL_INLINE(PyObject *)
rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
{
    /* NOTE(review): count and str are not referenced directly below;
       presumably the SPLIT_ADD / FIX_PREALLOC_SIZE macros use them (and
       jump to onError on failure) -- confirm against the macros. */
    register Py_ssize_t i, j, count = 0;
    PyObject *str;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

    if (list == NULL)
        return NULL;

    /* Invariant: j is the index of the last byte not yet added. */
    for (i = j = len - 1; i >= 0; ) {
        /* find a token: skip whitespace that follows it ... */
        while (i >= 0 && ISSPACE(s[i]))
            i--;
        j = i;
        /* ... then move i to just before the token's first byte */
        while (i >= 0 && !ISSPACE(s[i]))
            i--;
        if (j > i) {
            /* Split budget used up: leave j at the end of this token so
               the code after the loop adds s[0:j+1] as one item. */
            if (maxcount-- <= 0)
                break;
            SPLIT_ADD(s, i + 1, j + 1);
            /* skip whitespace before the token so j == -1 when only
               whitespace remains (no leading item is then added) */
            while (i >= 0 && ISSPACE(s[i]))
                i--;
            j = i;
        }
    }
    /* add the unsplit remainder, if any */
    if (j >= 0) {
        SPLIT_ADD(s, 0, j + 1);
    }
    FIX_PREALLOC_SIZE(list);
    /* items were collected right to left; restore natural order */
    if (PyList_Reverse(list) < 0)
        goto onError;

    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
PyDoc_STRVAR(rsplit__doc__, PyDoc_STRVAR(rsplit__doc__,
"B.rsplit(sep [,maxsplit]) -> list of bytes\n\ "B.rsplit(sep [,maxsplit]) -> list of bytes\n\
\n\ \n\
Return a list of the sections in the byte B, using sep as the\n\ Return a list of the sections in the byte B, using sep as the delimiter,\n\
delimiter, starting at the end of the bytes and working\n\ starting at the end of the bytes and working to the front.\n\
to the front. If maxsplit is given, at most maxsplit splits are\n\ If sep is not given, B is split on ASCII whitespace characters\n\
done."); (space, tab, return, newline, formfeed, vertical tab).\n\
If maxsplit is given, at most maxsplit splits are done.");
static PyObject * static PyObject *
bytes_rsplit(PyBytesObject *self, PyObject *args) bytes_rsplit(PyBytesObject *self, PyObject *args)
{ {
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j; Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
Py_ssize_t maxsplit = -1, count=0; Py_ssize_t maxsplit = -1, count = 0;
const char *s = PyBytes_AS_STRING(self), *sub; const char *s = PyBytes_AS_STRING(self), *sub;
PyObject *list, *str, *subobj; PyObject *list, *str, *subobj = Py_None;
PyBuffer vsub;
if (!PyArg_ParseTuple(args, "O|n:rsplit", &subobj, &maxsplit)) if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
return NULL; return NULL;
if (maxsplit < 0) if (maxsplit < 0)
maxsplit = PY_SSIZE_T_MAX; maxsplit = PY_SSIZE_T_MAX;
if (PyBytes_Check(subobj)) {
sub = PyBytes_AS_STRING(subobj); if (subobj == Py_None)
n = PyBytes_GET_SIZE(subobj); return rsplit_whitespace(s, len, maxsplit);
}
/* XXX -> Use the modern buffer interface */ if (_getbuffer(subobj, &vsub) < 0)
else if (PyObject_AsCharBuffer(subobj, &sub, &n))
return NULL; return NULL;
sub = vsub.buf;
n = vsub.len;
if (n == 0) { if (n == 0) {
PyErr_SetString(PyExc_ValueError, "empty separator"); PyErr_SetString(PyExc_ValueError, "empty separator");
PyObject_ReleaseBuffer(subobj, &vsub);
return NULL; return NULL;
} }
else if (n == 1) else if (n == 1)
return rsplit_char(s, len, sub[0], maxsplit); return rsplit_char(s, len, sub[0], maxsplit);
list = PyList_New(PREALLOC_SIZE(maxsplit)); list = PyList_New(PREALLOC_SIZE(maxsplit));
if (list == NULL) if (list == NULL) {
PyObject_ReleaseBuffer(subobj, &vsub);
return NULL; return NULL;
}
j = len; j = len;
i = j - n; i = j - n;
@ -2349,10 +2442,12 @@ bytes_rsplit(PyBytesObject *self, PyObject *args)
FIX_PREALLOC_SIZE(list); FIX_PREALLOC_SIZE(list);
if (PyList_Reverse(list) < 0) if (PyList_Reverse(list) < 0)
goto onError; goto onError;
PyObject_ReleaseBuffer(subobj, &vsub);
return list; return list;
onError: onError:
Py_DECREF(list); Py_DECREF(list);
PyObject_ReleaseBuffer(subobj, &vsub);
return NULL; return NULL;
} }
@ -2542,71 +2637,104 @@ rstrip_helper(unsigned char *myptr, Py_ssize_t mysize,
} }
PyDoc_STRVAR(strip__doc__, PyDoc_STRVAR(strip__doc__,
"B.strip(bytes) -> bytes\n\ "B.strip([bytes]) -> bytes\n\
\n\ \n\
Strip leading and trailing bytes contained in the argument."); Strip leading and trailing bytes contained in the argument.\n\
If the argument is omitted, strip ASCII whitespace.");
static PyObject * static PyObject *
bytes_strip(PyBytesObject *self, PyObject *arg) bytes_strip(PyBytesObject *self, PyObject *args)
{ {
Py_ssize_t left, right, mysize, argsize; Py_ssize_t left, right, mysize, argsize;
void *myptr, *argptr; void *myptr, *argptr;
if (arg == NULL || !PyBytes_Check(arg)) { PyObject *arg = Py_None;
PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument"); PyBuffer varg;
if (!PyArg_ParseTuple(args, "|O:strip", &arg))
return NULL; return NULL;
if (arg == Py_None) {
argptr = "\t\n\r\f\v ";
argsize = 6;
}
else {
if (_getbuffer(arg, &varg) < 0)
return NULL;
argptr = varg.buf;
argsize = varg.len;
} }
myptr = self->ob_bytes; myptr = self->ob_bytes;
mysize = Py_Size(self); mysize = Py_Size(self);
argptr = ((PyBytesObject *)arg)->ob_bytes;
argsize = Py_Size(arg);
left = lstrip_helper(myptr, mysize, argptr, argsize); left = lstrip_helper(myptr, mysize, argptr, argsize);
if (left == mysize) if (left == mysize)
right = left; right = left;
else else
right = rstrip_helper(myptr, mysize, argptr, argsize); right = rstrip_helper(myptr, mysize, argptr, argsize);
if (arg != Py_None)
PyObject_ReleaseBuffer(arg, &varg);
return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left); return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
} }
PyDoc_STRVAR(lstrip__doc__, PyDoc_STRVAR(lstrip__doc__,
"B.lstrip(bytes) -> bytes\n\ "B.lstrip([bytes]) -> bytes\n\
\n\ \n\
Strip leading bytes contained in the argument."); Strip leading bytes contained in the argument.\n\
If the argument is omitted, strip leading ASCII whitespace.");
static PyObject * static PyObject *
bytes_lstrip(PyBytesObject *self, PyObject *arg) bytes_lstrip(PyBytesObject *self, PyObject *args)
{ {
Py_ssize_t left, right, mysize, argsize; Py_ssize_t left, right, mysize, argsize;
void *myptr, *argptr; void *myptr, *argptr;
if (arg == NULL || !PyBytes_Check(arg)) { PyObject *arg = Py_None;
PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument"); PyBuffer varg;
if (!PyArg_ParseTuple(args, "|O:lstrip", &arg))
return NULL; return NULL;
if (arg == Py_None) {
argptr = "\t\n\r\f\v ";
argsize = 6;
}
else {
if (_getbuffer(arg, &varg) < 0)
return NULL;
argptr = varg.buf;
argsize = varg.len;
} }
myptr = self->ob_bytes; myptr = self->ob_bytes;
mysize = Py_Size(self); mysize = Py_Size(self);
argptr = ((PyBytesObject *)arg)->ob_bytes;
argsize = Py_Size(arg);
left = lstrip_helper(myptr, mysize, argptr, argsize); left = lstrip_helper(myptr, mysize, argptr, argsize);
right = mysize; right = mysize;
if (arg != Py_None)
PyObject_ReleaseBuffer(arg, &varg);
return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left); return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
} }
PyDoc_STRVAR(rstrip__doc__, PyDoc_STRVAR(rstrip__doc__,
"B.rstrip(bytes) -> bytes\n\ "B.rstrip([bytes]) -> bytes\n\
\n\ \n\
Strip trailing bytes contained in the argument."); Strip trailing bytes contained in the argument.\n\
If the argument is omitted, strip trailing ASCII whitespace.");
static PyObject * static PyObject *
bytes_rstrip(PyBytesObject *self, PyObject *arg) bytes_rstrip(PyBytesObject *self, PyObject *args)
{ {
Py_ssize_t left, right, mysize, argsize; Py_ssize_t left, right, mysize, argsize;
void *myptr, *argptr; void *myptr, *argptr;
if (arg == NULL || !PyBytes_Check(arg)) { PyObject *arg = Py_None;
PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument"); PyBuffer varg;
if (!PyArg_ParseTuple(args, "|O:rstrip", &arg))
return NULL; return NULL;
if (arg == Py_None) {
argptr = "\t\n\r\f\v ";
argsize = 6;
}
else {
if (_getbuffer(arg, &varg) < 0)
return NULL;
argptr = varg.buf;
argsize = varg.len;
} }
myptr = self->ob_bytes; myptr = self->ob_bytes;
mysize = Py_Size(self); mysize = Py_Size(self);
argptr = ((PyBytesObject *)arg)->ob_bytes;
argsize = Py_Size(arg);
left = 0; left = 0;
right = rstrip_helper(myptr, mysize, argptr, argsize); right = rstrip_helper(myptr, mysize, argptr, argsize);
if (arg != Py_None)
PyObject_ReleaseBuffer(arg, &varg);
return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left); return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
} }
@ -2839,9 +2967,9 @@ bytes_methods[] = {
{"reverse", (PyCFunction)bytes_reverse, METH_NOARGS, reverse__doc__}, {"reverse", (PyCFunction)bytes_reverse, METH_NOARGS, reverse__doc__},
{"pop", (PyCFunction)bytes_pop, METH_VARARGS, pop__doc__}, {"pop", (PyCFunction)bytes_pop, METH_VARARGS, pop__doc__},
{"remove", (PyCFunction)bytes_remove, METH_O, remove__doc__}, {"remove", (PyCFunction)bytes_remove, METH_O, remove__doc__},
{"strip", (PyCFunction)bytes_strip, METH_O, strip__doc__}, {"strip", (PyCFunction)bytes_strip, METH_VARARGS, strip__doc__},
{"lstrip", (PyCFunction)bytes_lstrip, METH_O, lstrip__doc__}, {"lstrip", (PyCFunction)bytes_lstrip, METH_VARARGS, lstrip__doc__},
{"rstrip", (PyCFunction)bytes_rstrip, METH_O, rstrip__doc__}, {"rstrip", (PyCFunction)bytes_rstrip, METH_VARARGS, rstrip__doc__},
{"decode", (PyCFunction)bytes_decode, METH_VARARGS, decode_doc}, {"decode", (PyCFunction)bytes_decode, METH_VARARGS, decode_doc},
{"__alloc__", (PyCFunction)bytes_alloc, METH_NOARGS, alloc_doc}, {"__alloc__", (PyCFunction)bytes_alloc, METH_NOARGS, alloc_doc},
{"fromhex", (PyCFunction)bytes_fromhex, METH_VARARGS|METH_CLASS, {"fromhex", (PyCFunction)bytes_fromhex, METH_VARARGS|METH_CLASS,