Bug # 1125 (my code).
Support bytes.split() and bytes.strip() -- these split/strip using ASCII whitespace (tab, space, CR, LF, FF, VT) like their str counterparts. Also for rsplit(), lstrip() and rstrip(). And change all these functions to accept arbitrary buffer-API-supporting arguments. With unit tests.
This commit is contained in:
parent
954c31bcc7
commit
8f95067915
|
@ -617,16 +617,46 @@ class BytesTest(unittest.TestCase):
|
|||
self.assertEqual(b.split(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
|
||||
self.assertEqual(b.split(b'ss'), [b'mi', b'i', b'ippi'])
|
||||
self.assertEqual(b.split(b'w'), [b])
|
||||
# require an arg (no magic whitespace split)
|
||||
self.assertRaises(TypeError, b.split)
|
||||
|
||||
def test_split_whitespace(self):
|
||||
for b in (b' arf barf ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',
|
||||
b'arf\fbarf', b'arf\vbarf'):
|
||||
self.assertEqual(b.split(), [b'arf', b'barf'])
|
||||
self.assertEqual(b.split(None), [b'arf', b'barf'])
|
||||
self.assertEqual(b.split(None, 2), [b'arf', b'barf'])
|
||||
self.assertEqual(b' a bb c '.split(None, 0), [b'a bb c '])
|
||||
self.assertEqual(b' a bb c '.split(None, 1), [b'a', b'bb c '])
|
||||
self.assertEqual(b' a bb c '.split(None, 2), [b'a', b'bb', b'c '])
|
||||
self.assertEqual(b' a bb c '.split(None, 3), [b'a', b'bb', b'c'])
|
||||
|
||||
def test_split_buffer(self):
|
||||
self.assertEqual(b'a b'.split(buffer(b' ')), [b'a', b'b'])
|
||||
|
||||
def test_split_string_error(self):
|
||||
self.assertRaises(TypeError, b'a b'.split, ' ')
|
||||
|
||||
def test_rsplit(self):
|
||||
b = b'mississippi'
|
||||
self.assertEqual(b.rsplit(b'i'), [b'm', b'ss', b'ss', b'pp', b''])
|
||||
self.assertEqual(b.rsplit(b'ss'), [b'mi', b'i', b'ippi'])
|
||||
self.assertEqual(b.rsplit(b'w'), [b])
|
||||
# require an arg (no magic whitespace split)
|
||||
self.assertRaises(TypeError, b.rsplit)
|
||||
|
||||
def test_rsplit_whitespace(self):
|
||||
for b in (b' arf barf ', b'arf\tbarf', b'arf\nbarf', b'arf\rbarf',
|
||||
b'arf\fbarf', b'arf\vbarf'):
|
||||
self.assertEqual(b.rsplit(), [b'arf', b'barf'])
|
||||
self.assertEqual(b.rsplit(None), [b'arf', b'barf'])
|
||||
self.assertEqual(b.rsplit(None, 2), [b'arf', b'barf'])
|
||||
self.assertEqual(b' a bb c '.rsplit(None, 0), [b' a bb c'])
|
||||
self.assertEqual(b' a bb c '.rsplit(None, 1), [b' a bb', b'c'])
|
||||
self.assertEqual(b' a bb c '.rsplit(None,2), [b' a', b'bb', b'c'])
|
||||
self.assertEqual(b' a bb c '.rsplit(None, 3), [b'a', b'bb', b'c'])
|
||||
|
||||
def test_rplit_buffer(self):
|
||||
self.assertEqual(b'a b'.rsplit(buffer(b' ')), [b'a', b'b'])
|
||||
|
||||
def test_rplit_string_error(self):
|
||||
self.assertRaises(TypeError, b'a b'.rsplit, ' ')
|
||||
|
||||
def test_partition(self):
|
||||
b = b'mississippi'
|
||||
|
@ -670,6 +700,22 @@ class BytesTest(unittest.TestCase):
|
|||
self.assertEqual(b.rstrip(b'im'), b'mississipp')
|
||||
self.assertEqual(b.rstrip(b'pim'), b'mississ')
|
||||
|
||||
def test_strip_whitespace(self):
|
||||
b = b' \t\n\r\f\vabc \t\n\r\f\v'
|
||||
self.assertEqual(b.strip(), b'abc')
|
||||
self.assertEqual(b.lstrip(), b'abc \t\n\r\f\v')
|
||||
self.assertEqual(b.rstrip(), b' \t\n\r\f\vabc')
|
||||
|
||||
def test_strip_buffer(self):
|
||||
self.assertEqual(b'abc'.strip(buffer(b'ac')), b'b')
|
||||
self.assertEqual(b'abc'.lstrip(buffer(b'ac')), b'bc')
|
||||
self.assertEqual(b'abc'.rstrip(buffer(b'ac')), b'ab')
|
||||
|
||||
def test_strip_string_error(self):
|
||||
self.assertRaises(TypeError, b'abc'.strip, 'b')
|
||||
self.assertRaises(TypeError, b'abc'.lstrip, 'b')
|
||||
self.assertRaises(TypeError, b'abc'.rstrip, 'b')
|
||||
|
||||
def test_ord(self):
|
||||
b = b'\0A\x7f\x80\xff'
|
||||
self.assertEqual([ord(b[i:i+1]) for i in range(len(b))],
|
||||
|
|
|
@ -2104,7 +2104,7 @@ bytes_replace(PyBytesObject *self, PyObject *args)
|
|||
Py_LOCAL_INLINE(PyObject *)
|
||||
split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
|
||||
{
|
||||
register Py_ssize_t i, j, count=0;
|
||||
register Py_ssize_t i, j, count = 0;
|
||||
PyObject *str;
|
||||
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
|
||||
|
||||
|
@ -2113,7 +2113,7 @@ split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
|
|||
|
||||
i = j = 0;
|
||||
while ((j < len) && (maxcount-- > 0)) {
|
||||
for(; j<len; j++) {
|
||||
for(; j < len; j++) {
|
||||
/* I found that using memchr makes no difference */
|
||||
if (s[j] == ch) {
|
||||
SPLIT_ADD(s, i, j);
|
||||
|
@ -2133,46 +2133,91 @@ split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
#define ISSPACE(c) (isspace(Py_CHARMASK(c)) && ((c) & 0x80) == 0)
|
||||
|
||||
/* Split the byte range s[0:len] into tokens separated by runs of ASCII
 * whitespace, working left to right.
 *
 *   s        - start of the data to split
 *   len      - number of bytes available at s
 *   maxcount - maximum number of splits to perform; once it is used up the
 *              unsplit remainder is appended as one final item
 *
 * Returns a new list, or NULL on failure.
 *
 * NOTE(review): SPLIT_ADD, PREALLOC_SIZE and FIX_PREALLOC_SIZE are defined
 * elsewhere in this file; they presumably rely on the locals `str`, `list`
 * and `count` and on the `onError` label, so those names are kept as-is.
 */
Py_LOCAL_INLINE(PyObject *)
split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
{
    register Py_ssize_t pos = 0, start = 0, count = 0;
    PyObject *str;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

    if (list == NULL)
        return NULL;

    while (pos < len) {
        /* Skip the whitespace in front of the next token. */
        while (pos < len && ISSPACE(s[pos]))
            pos++;
        start = pos;

        /* Advance to the end of the token. */
        while (pos < len && !ISSPACE(s[pos]))
            pos++;

        /* Empty token: only whitespace was left, so pos == len and the
           loop condition ends the scan. */
        if (start >= pos)
            continue;

        /* Out of splits: leave `start` on this token so that the rest of
           the input is emitted unsplit below. */
        if (maxcount-- <= 0)
            break;

        SPLIT_ADD(s, start, pos);

        /* Swallow the separator run so that, if the input ends here,
           no empty trailing item is produced. */
        while (pos < len && ISSPACE(s[pos]))
            pos++;
        start = pos;
    }

    /* Whatever was not consumed becomes the final item. */
    if (start < len) {
        SPLIT_ADD(s, start, len);
    }

    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
PyDoc_STRVAR(split__doc__,
|
||||
"B.split(sep [,maxsplit]) -> list of bytes\n\
|
||||
"B.split([sep [, maxsplit]]) -> list of bytes\n\
|
||||
\n\
|
||||
Return a list of the bytes in the string B, using sep as the\n\
|
||||
delimiter. If maxsplit is given, at most maxsplit\n\
|
||||
splits are done.");
|
||||
Return a list of the bytes in the string B, using sep as the delimiter.\n\
|
||||
If sep is not given, B is split on ASCII whitespace characters\n\
|
||||
(space, tab, return, newline, formfeed, vertical tab).\n\
|
||||
If maxsplit is given, at most maxsplit splits are done.");
|
||||
|
||||
static PyObject *
|
||||
bytes_split(PyBytesObject *self, PyObject *args)
|
||||
{
|
||||
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
|
||||
Py_ssize_t maxsplit = -1, count=0;
|
||||
Py_ssize_t maxsplit = -1, count = 0;
|
||||
const char *s = PyBytes_AS_STRING(self), *sub;
|
||||
PyObject *list, *str, *subobj;
|
||||
PyObject *list, *str, *subobj = Py_None;
|
||||
PyBuffer vsub;
|
||||
#ifdef USE_FAST
|
||||
Py_ssize_t pos;
|
||||
#endif
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O|n:split", &subobj, &maxsplit))
|
||||
if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
|
||||
return NULL;
|
||||
if (maxsplit < 0)
|
||||
maxsplit = PY_SSIZE_T_MAX;
|
||||
if (PyBytes_Check(subobj)) {
|
||||
sub = PyBytes_AS_STRING(subobj);
|
||||
n = PyBytes_GET_SIZE(subobj);
|
||||
}
|
||||
/* XXX -> use the modern buffer interface */
|
||||
else if (PyObject_AsCharBuffer(subobj, &sub, &n))
|
||||
|
||||
if (subobj == Py_None)
|
||||
return split_whitespace(s, len, maxsplit);
|
||||
|
||||
if (_getbuffer(subobj, &vsub) < 0)
|
||||
return NULL;
|
||||
sub = vsub.buf;
|
||||
n = vsub.len;
|
||||
|
||||
if (n == 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "empty separator");
|
||||
PyObject_ReleaseBuffer(subobj, &vsub);
|
||||
return NULL;
|
||||
}
|
||||
else if (n == 1)
|
||||
if (n == 1)
|
||||
return split_char(s, len, sub[0], maxsplit);
|
||||
|
||||
list = PyList_New(PREALLOC_SIZE(maxsplit));
|
||||
if (list == NULL)
|
||||
if (list == NULL) {
|
||||
PyObject_ReleaseBuffer(subobj, &vsub);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef USE_FAST
|
||||
i = j = 0;
|
||||
|
@ -2198,10 +2243,12 @@ bytes_split(PyBytesObject *self, PyObject *args)
|
|||
#endif
|
||||
SPLIT_ADD(s, i, len);
|
||||
FIX_PREALLOC_SIZE(list);
|
||||
PyObject_ReleaseBuffer(subobj, &vsub);
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
PyObject_ReleaseBuffer(subobj, &vsub);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -2293,44 +2340,90 @@ rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
/* Split the byte range s[0:len] into tokens separated by runs of ASCII
 * whitespace, working right to left.
 *
 *   s        - start of the data to split
 *   len      - number of bytes available at s
 *   maxcount - maximum number of splits to perform; once it is used up the
 *              unsplit left-hand remainder is appended as one final item
 *
 * Items are collected from the end of the input towards the front and the
 * list is reversed before returning, so the caller sees them in input order.
 *
 * Returns a new list, or NULL on failure.
 *
 * Whitespace is tested with the ISSPACE macro defined earlier in this file
 * (isspace() restricted to bytes below 0x80), exactly as split_whitespace
 * does.  A Unicode predicate must not be used here: the input is raw bytes,
 * and split() and rsplit() have to agree on which bytes are separators.
 *
 * NOTE(review): SPLIT_ADD, PREALLOC_SIZE and FIX_PREALLOC_SIZE are defined
 * elsewhere in this file; they presumably rely on the locals `str`, `list`
 * and `count` and on the `onError` label.
 */
Py_LOCAL_INLINE(PyObject *)
rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
{
    register Py_ssize_t i, j, count = 0;
    PyObject *str;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

    if (list == NULL)
        return NULL;

    for (i = j = len - 1; i >= 0; ) {
        /* find a token: skip trailing whitespace, then walk back over it */
        while (i >= 0 && ISSPACE(s[i]))
            i--;
        j = i;
        while (i >= 0 && !ISSPACE(s[i]))
            i--;
        if (j > i) {
            /* Out of splits: leave j on the last byte of this token so the
               whole left-hand remainder is emitted unsplit below. */
            if (maxcount-- <= 0)
                break;
            SPLIT_ADD(s, i + 1, j + 1);
            /* Swallow the separator run so that, if the input starts with
               whitespace, no empty leading item is produced. */
            while (i >= 0 && ISSPACE(s[i]))
                i--;
            j = i;
        }
    }
    /* Whatever was not consumed becomes the final (left-most) item. */
    if (j >= 0) {
        SPLIT_ADD(s, 0, j + 1);
    }
    FIX_PREALLOC_SIZE(list);
    /* Items were appended right-to-left; restore input order. */
    if (PyList_Reverse(list) < 0)
        goto onError;

    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
PyDoc_STRVAR(rsplit__doc__,
|
||||
"B.rsplit([sep [, maxsplit]]) -> list of bytes\n\
|
||||
\n\
|
||||
Return a list of the sections in the byte B, using sep as the\n\
|
||||
delimiter, starting at the end of the bytes and working\n\
|
||||
to the front. If maxsplit is given, at most maxsplit splits are\n\
|
||||
done.");
|
||||
Return a list of the sections in the byte B, using sep as the delimiter,\n\
|
||||
starting at the end of the bytes and working to the front.\n\
|
||||
If sep is not given, B is split on ASCII whitespace characters\n\
|
||||
(space, tab, return, newline, formfeed, vertical tab).\n\
|
||||
If maxsplit is given, at most maxsplit splits are done.");
|
||||
|
||||
static PyObject *
|
||||
bytes_rsplit(PyBytesObject *self, PyObject *args)
|
||||
{
|
||||
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
|
||||
Py_ssize_t maxsplit = -1, count=0;
|
||||
Py_ssize_t maxsplit = -1, count = 0;
|
||||
const char *s = PyBytes_AS_STRING(self), *sub;
|
||||
PyObject *list, *str, *subobj;
|
||||
PyObject *list, *str, *subobj = Py_None;
|
||||
PyBuffer vsub;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "O|n:rsplit", &subobj, &maxsplit))
|
||||
if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
|
||||
return NULL;
|
||||
if (maxsplit < 0)
|
||||
maxsplit = PY_SSIZE_T_MAX;
|
||||
if (PyBytes_Check(subobj)) {
|
||||
sub = PyBytes_AS_STRING(subobj);
|
||||
n = PyBytes_GET_SIZE(subobj);
|
||||
}
|
||||
/* XXX -> Use the modern buffer interface */
|
||||
else if (PyObject_AsCharBuffer(subobj, &sub, &n))
|
||||
|
||||
if (subobj == Py_None)
|
||||
return rsplit_whitespace(s, len, maxsplit);
|
||||
|
||||
if (_getbuffer(subobj, &vsub) < 0)
|
||||
return NULL;
|
||||
sub = vsub.buf;
|
||||
n = vsub.len;
|
||||
|
||||
if (n == 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "empty separator");
|
||||
PyObject_ReleaseBuffer(subobj, &vsub);
|
||||
return NULL;
|
||||
}
|
||||
else if (n == 1)
|
||||
return rsplit_char(s, len, sub[0], maxsplit);
|
||||
|
||||
list = PyList_New(PREALLOC_SIZE(maxsplit));
|
||||
if (list == NULL)
|
||||
if (list == NULL) {
|
||||
PyObject_ReleaseBuffer(subobj, &vsub);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
j = len;
|
||||
i = j - n;
|
||||
|
@ -2349,10 +2442,12 @@ bytes_rsplit(PyBytesObject *self, PyObject *args)
|
|||
FIX_PREALLOC_SIZE(list);
|
||||
if (PyList_Reverse(list) < 0)
|
||||
goto onError;
|
||||
PyObject_ReleaseBuffer(subobj, &vsub);
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
PyObject_ReleaseBuffer(subobj, &vsub);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -2542,71 +2637,104 @@ rstrip_helper(unsigned char *myptr, Py_ssize_t mysize,
|
|||
}
|
||||
|
||||
PyDoc_STRVAR(strip__doc__,
|
||||
"B.strip(bytes) -> bytes\n\
|
||||
"B.strip([bytes]) -> bytes\n\
|
||||
\n\
|
||||
Strip leading and trailing bytes contained in the argument.");
|
||||
Strip leading and trailing bytes contained in the argument.\n\
|
||||
If the argument is omitted, strip ASCII whitespace.");
|
||||
static PyObject *
|
||||
bytes_strip(PyBytesObject *self, PyObject *arg)
|
||||
bytes_strip(PyBytesObject *self, PyObject *args)
|
||||
{
|
||||
Py_ssize_t left, right, mysize, argsize;
|
||||
void *myptr, *argptr;
|
||||
if (arg == NULL || !PyBytes_Check(arg)) {
|
||||
PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument");
|
||||
PyObject *arg = Py_None;
|
||||
PyBuffer varg;
|
||||
if (!PyArg_ParseTuple(args, "|O:strip", &arg))
|
||||
return NULL;
|
||||
if (arg == Py_None) {
|
||||
argptr = "\t\n\r\f\v ";
|
||||
argsize = 6;
|
||||
}
|
||||
else {
|
||||
if (_getbuffer(arg, &varg) < 0)
|
||||
return NULL;
|
||||
argptr = varg.buf;
|
||||
argsize = varg.len;
|
||||
}
|
||||
myptr = self->ob_bytes;
|
||||
mysize = Py_Size(self);
|
||||
argptr = ((PyBytesObject *)arg)->ob_bytes;
|
||||
argsize = Py_Size(arg);
|
||||
left = lstrip_helper(myptr, mysize, argptr, argsize);
|
||||
if (left == mysize)
|
||||
right = left;
|
||||
else
|
||||
right = rstrip_helper(myptr, mysize, argptr, argsize);
|
||||
if (arg != Py_None)
|
||||
PyObject_ReleaseBuffer(arg, &varg);
|
||||
return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(lstrip__doc__,
|
||||
"B.lstrip(bytes) -> bytes\n\
|
||||
"B.lstrip([bytes]) -> bytes\n\
|
||||
\n\
|
||||
Strip leading bytes contained in the argument.");
|
||||
Strip leading bytes contained in the argument.\n\
|
||||
If the argument is omitted, strip leading ASCII whitespace.");
|
||||
static PyObject *
|
||||
bytes_lstrip(PyBytesObject *self, PyObject *arg)
|
||||
bytes_lstrip(PyBytesObject *self, PyObject *args)
|
||||
{
|
||||
Py_ssize_t left, right, mysize, argsize;
|
||||
void *myptr, *argptr;
|
||||
if (arg == NULL || !PyBytes_Check(arg)) {
|
||||
PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument");
|
||||
PyObject *arg = Py_None;
|
||||
PyBuffer varg;
|
||||
if (!PyArg_ParseTuple(args, "|O:lstrip", &arg))
|
||||
return NULL;
|
||||
if (arg == Py_None) {
|
||||
argptr = "\t\n\r\f\v ";
|
||||
argsize = 6;
|
||||
}
|
||||
else {
|
||||
if (_getbuffer(arg, &varg) < 0)
|
||||
return NULL;
|
||||
argptr = varg.buf;
|
||||
argsize = varg.len;
|
||||
}
|
||||
myptr = self->ob_bytes;
|
||||
mysize = Py_Size(self);
|
||||
argptr = ((PyBytesObject *)arg)->ob_bytes;
|
||||
argsize = Py_Size(arg);
|
||||
left = lstrip_helper(myptr, mysize, argptr, argsize);
|
||||
right = mysize;
|
||||
if (arg != Py_None)
|
||||
PyObject_ReleaseBuffer(arg, &varg);
|
||||
return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(rstrip__doc__,
|
||||
"B.rstrip(bytes) -> bytes\n\
|
||||
"B.rstrip([bytes]) -> bytes\n\
|
||||
\n\
|
||||
Strip trailing bytes contained in the argument.");
|
||||
Strip trailing bytes contained in the argument.\n\
|
||||
If the argument is omitted, strip trailing ASCII whitespace.");
|
||||
static PyObject *
|
||||
bytes_rstrip(PyBytesObject *self, PyObject *arg)
|
||||
bytes_rstrip(PyBytesObject *self, PyObject *args)
|
||||
{
|
||||
Py_ssize_t left, right, mysize, argsize;
|
||||
void *myptr, *argptr;
|
||||
if (arg == NULL || !PyBytes_Check(arg)) {
|
||||
PyErr_SetString(PyExc_TypeError, "strip() requires a bytes argument");
|
||||
PyObject *arg = Py_None;
|
||||
PyBuffer varg;
|
||||
if (!PyArg_ParseTuple(args, "|O:rstrip", &arg))
|
||||
return NULL;
|
||||
if (arg == Py_None) {
|
||||
argptr = "\t\n\r\f\v ";
|
||||
argsize = 6;
|
||||
}
|
||||
else {
|
||||
if (_getbuffer(arg, &varg) < 0)
|
||||
return NULL;
|
||||
argptr = varg.buf;
|
||||
argsize = varg.len;
|
||||
}
|
||||
myptr = self->ob_bytes;
|
||||
mysize = Py_Size(self);
|
||||
argptr = ((PyBytesObject *)arg)->ob_bytes;
|
||||
argsize = Py_Size(arg);
|
||||
left = 0;
|
||||
right = rstrip_helper(myptr, mysize, argptr, argsize);
|
||||
if (arg != Py_None)
|
||||
PyObject_ReleaseBuffer(arg, &varg);
|
||||
return PyBytes_FromStringAndSize(self->ob_bytes + left, right - left);
|
||||
}
|
||||
|
||||
|
@ -2839,9 +2967,9 @@ bytes_methods[] = {
|
|||
{"reverse", (PyCFunction)bytes_reverse, METH_NOARGS, reverse__doc__},
|
||||
{"pop", (PyCFunction)bytes_pop, METH_VARARGS, pop__doc__},
|
||||
{"remove", (PyCFunction)bytes_remove, METH_O, remove__doc__},
|
||||
{"strip", (PyCFunction)bytes_strip, METH_O, strip__doc__},
|
||||
{"lstrip", (PyCFunction)bytes_lstrip, METH_O, lstrip__doc__},
|
||||
{"rstrip", (PyCFunction)bytes_rstrip, METH_O, rstrip__doc__},
|
||||
{"strip", (PyCFunction)bytes_strip, METH_VARARGS, strip__doc__},
|
||||
{"lstrip", (PyCFunction)bytes_lstrip, METH_VARARGS, lstrip__doc__},
|
||||
{"rstrip", (PyCFunction)bytes_rstrip, METH_VARARGS, rstrip__doc__},
|
||||
{"decode", (PyCFunction)bytes_decode, METH_VARARGS, decode_doc},
|
||||
{"__alloc__", (PyCFunction)bytes_alloc, METH_NOARGS, alloc_doc},
|
||||
{"fromhex", (PyCFunction)bytes_fromhex, METH_VARARGS|METH_CLASS,
|
||||
|
|
Loading…
Reference in New Issue