[SF #866875] Add a specialized routine for one character

separators on str.split() and str.rsplit().
This commit is contained in:
Hye-Shik Chang 2004-01-05 00:29:51 +00:00
parent cb2117a83c
commit 75c00efcc7
2 changed files with 150 additions and 59 deletions

View File

@ -175,41 +175,82 @@ class CommonTest(unittest.TestCase):
def test_split(self): def test_split(self):
self.checkequal(['this', 'is', 'the', 'split', 'function'], self.checkequal(['this', 'is', 'the', 'split', 'function'],
'this is the split function', 'split') 'this is the split function', 'split')
self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'split', '|')
self.checkequal(['a', 'b', 'c|d'], 'a|b|c|d', 'split', '|', 2) # by whitespace
self.checkequal(['a', 'b', 'c', 'd'], 'a b c d ', 'split')
self.checkequal(['a', 'b c d'], 'a b c d', 'split', None, 1) self.checkequal(['a', 'b c d'], 'a b c d', 'split', None, 1)
self.checkequal(['a', 'b', 'c d'], 'a b c d', 'split', None, 2) self.checkequal(['a', 'b', 'c d'], 'a b c d', 'split', None, 2)
self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'split', None, 3) self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'split', None, 3)
self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'split', None, 4) self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'split', None, 4)
self.checkequal(['a b c d'], 'a b c d', 'split', None, 0) self.checkequal(['a b c d'], 'a b c d', 'split', None, 0)
self.checkequal(['a', 'b', 'c d'], 'a b c d', 'split', None, 2) self.checkequal(['a', 'b', 'c d'], 'a b c d', 'split', None, 2)
self.checkequal(['a', 'b', 'c', 'd'], 'a b c d ', 'split')
# by a char
self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'split', '|')
self.checkequal(['a', 'b|c|d'], 'a|b|c|d', 'split', '|', 1)
self.checkequal(['a', 'b', 'c|d'], 'a|b|c|d', 'split', '|', 2)
self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'split', '|', 3)
self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'split', '|', 4)
self.checkequal(['a|b|c|d'], 'a|b|c|d', 'split', '|', 0)
self.checkequal(['a', '', 'b||c||d'], 'a||b||c||d', 'split', '|', 2)
self.checkequal(['endcase ', ''], 'endcase |', 'split', '|')
self.checkequal(['a', '', 'b\x00c\x00d'], 'a\x00\x00b\x00c\x00d', 'split', '\x00', 2)
# by string
self.checkequal(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//') self.checkequal(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//')
self.checkequal(['a', 'b//c//d'], 'a//b//c//d', 'split', '//', 1)
self.checkequal(['a', 'b', 'c//d'], 'a//b//c//d', 'split', '//', 2)
self.checkequal(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//', 3)
self.checkequal(['a', 'b', 'c', 'd'], 'a//b//c//d', 'split', '//', 4)
self.checkequal(['a//b//c//d'], 'a//b//c//d', 'split', '//', 0)
self.checkequal(['a', '', 'b////c////d'], 'a////b////c////d', 'split', '//', 2)
self.checkequal(['endcase ', ''], 'endcase test', 'split', 'test') self.checkequal(['endcase ', ''], 'endcase test', 'split', 'test')
# mixed use of str and unicode
self.checkequal([u'a', u'b', u'c d'], 'a b c d', 'split', u' ', 2)
# argument type
self.checkraises(TypeError, 'hello', 'split', 42, 42, 42) self.checkraises(TypeError, 'hello', 'split', 42, 42, 42)
def test_rsplit(self): def test_rsplit(self):
self.checkequal(['this', 'is', 'the', 'rsplit', 'function'], self.checkequal(['this', 'is', 'the', 'rsplit', 'function'],
'this is the rsplit function', 'rsplit') 'this is the rsplit function', 'rsplit')
self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|')
self.checkequal(['a|b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|', 2) # by whitespace
self.checkequal(['a', 'b', 'c', 'd'], 'a b c d ', 'rsplit')
self.checkequal(['a b c', 'd'], 'a b c d', 'rsplit', None, 1) self.checkequal(['a b c', 'd'], 'a b c d', 'rsplit', None, 1)
self.checkequal(['a b', 'c', 'd'], 'a b c d', 'rsplit', None, 2) self.checkequal(['a b', 'c', 'd'], 'a b c d', 'rsplit', None, 2)
self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'rsplit', None, 3) self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'rsplit', None, 3)
self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'rsplit', None, 4) self.checkequal(['a', 'b', 'c', 'd'], 'a b c d', 'rsplit', None, 4)
self.checkequal(['a b c d'], 'a b c d', 'rsplit', None, 0) self.checkequal(['a b c d'], 'a b c d', 'rsplit', None, 0)
self.checkequal(['a, b, c', 'd'], 'a, b, c, d', 'rsplit', ', ', 1)
self.checkequal(['a, b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 2)
self.checkequal(['a', 'b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 3)
self.checkequal(['a', 'b', 'c', 'd'], 'a, b, c, d', 'rsplit', ', ', 4)
self.checkequal(['a, b, c, d'], 'a, b, c, d', 'rsplit', ', ', 0)
self.checkequal(['a b', 'c', 'd'], 'a b c d', 'rsplit', None, 2) self.checkequal(['a b', 'c', 'd'], 'a b c d', 'rsplit', None, 2)
self.checkequal(['a\x00b', 'c'], 'a\x00b\x00c', 'rsplit', '\x00', 1)
self.checkequal(['', ''], 'abcd', 'rsplit', 'abcd') # by a char
self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|')
self.checkequal(['a|b|c', 'd'], 'a|b|c|d', 'rsplit', '|', 1)
self.checkequal(['a|b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|', 2)
self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|', 3)
self.checkequal(['a', 'b', 'c', 'd'], 'a|b|c|d', 'rsplit', '|', 4)
self.checkequal(['a|b|c|d'], 'a|b|c|d', 'rsplit', '|', 0)
self.checkequal(['a||b||c', '', 'd'], 'a||b||c||d', 'rsplit', '|', 2)
self.checkequal(['', ' begincase'], '| begincase', 'rsplit', '|')
self.checkequal(['a\x00\x00b', 'c', 'd'], 'a\x00\x00b\x00c\x00d', 'rsplit', '\x00', 2)
# by string
self.checkequal(['a', 'b', 'c', 'd'], 'a//b//c//d', 'rsplit', '//')
self.checkequal(['a//b//c', 'd'], 'a//b//c//d', 'rsplit', '//', 1)
self.checkequal(['a//b', 'c', 'd'], 'a//b//c//d', 'rsplit', '//', 2)
self.checkequal(['a', 'b', 'c', 'd'], 'a//b//c//d', 'rsplit', '//', 3)
self.checkequal(['a', 'b', 'c', 'd'], 'a//b//c//d', 'rsplit', '//', 4)
self.checkequal(['a//b//c//d'], 'a//b//c//d', 'rsplit', '//', 0)
self.checkequal(['a////b////c', '', 'd'], 'a////b////c////d', 'rsplit', '//', 2)
self.checkequal(['', ' begincase'], 'test begincase', 'rsplit', 'test')
# mixed use of str and unicode
self.checkequal([u'a b', u'c', u'd'], 'a b c d', 'rsplit', u' ', 2) self.checkequal([u'a b', u'c', u'd'], 'a b c d', 'rsplit', u' ', 2)
self.checkequal(['', ' endcase'], '| endcase', 'rsplit', '|')
self.checkequal(['', ' endcase'], 'test endcase', 'rsplit', 'test') # argument type
self.checkraises(TypeError, 'hello', 'rsplit', 42, 42, 42)
def test_strip(self): def test_strip(self):
self.checkequal('hello', ' hello ', 'strip') self.checkequal('hello', ' hello ', 'strip')

View File

@ -1282,12 +1282,35 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
#define STRIPNAME(i) (stripformat[i]+3) #define STRIPNAME(i) (stripformat[i]+3)
/* Append the substring data[left:right] to `list` as a new string object.
 *
 * The expanding function must provide:
 *   - a local `PyObject *str` (scratch variable for the new string),
 *   - a local `PyObject *list` (the list being built),
 *   - a label `onError` that cleans up and returns.
 *
 * On any failure control jumps to `onError`.  The macro always releases its
 * own reference to the temporary string after the append attempt.
 *
 * Wrapped in do { } while (0) so that it expands to exactly one statement
 * and is safe under an unbraced if/else; invoke it with a trailing `;`.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyString_FromStringAndSize((data) + (left),               \
                                         (right) - (left));             \
        if (str == NULL)                                                \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
/* Insert the substring data[left:right] at the front (index 0) of `list`
 * as a new string object.
 *
 * The expanding function must provide:
 *   - a local `PyObject *str` (scratch variable for the new string),
 *   - a local `PyObject *list` (the list being built),
 *   - a label `onError` that cleans up and returns.
 *
 * On any failure control jumps to `onError`.  The macro always releases its
 * own reference to the temporary string after the insert attempt.
 *
 * Wrapped in do { } while (0) so that it expands to exactly one statement
 * and is safe under an unbraced if/else; invoke it with a trailing `;`.
 */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyString_FromStringAndSize((data) + (left),               \
                                         (right) - (left));             \
        if (str == NULL)                                                \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
static PyObject * static PyObject *
split_whitespace(const char *s, int len, int maxsplit) split_whitespace(const char *s, int len, int maxsplit)
{ {
int i, j, err; int i, j;
PyObject* item; PyObject *str;
PyObject *list = PyList_New(0); PyObject *list = PyList_New(0);
if (list == NULL) if (list == NULL)
@ -1302,33 +1325,49 @@ split_whitespace(const char *s, int len, int maxsplit)
if (j < i) { if (j < i) {
if (maxsplit-- <= 0) if (maxsplit-- <= 0)
break; break;
item = PyString_FromStringAndSize(s+j, (int)(i-j)); SPLIT_APPEND(s, j, i);
if (item == NULL)
goto finally;
err = PyList_Append(list, item);
Py_DECREF(item);
if (err < 0)
goto finally;
while (i < len && isspace(Py_CHARMASK(s[i]))) while (i < len && isspace(Py_CHARMASK(s[i])))
i++; i++;
j = i; j = i;
} }
} }
if (j < len) { if (j < len) {
item = PyString_FromStringAndSize(s+j, (int)(len - j)); SPLIT_APPEND(s, j, len);
if (item == NULL)
goto finally;
err = PyList_Append(list, item);
Py_DECREF(item);
if (err < 0)
goto finally;
} }
return list; return list;
finally: onError:
Py_DECREF(list); Py_DECREF(list);
return NULL; return NULL;
} }
/* Split the buffer s[0:len] on the single-byte separator `ch`.
 *
 * Scans left to right.  Each separator found closes the current piece, which
 * is appended to the result; adjacent separators therefore yield empty
 * strings.  At most `maxcount` separators are consumed; once the budget is
 * exhausted the rest of the buffer becomes the final piece.
 *
 * Returns a new list of string objects, or NULL if list creation or any
 * append fails (the partially built list is released).
 */
static PyObject *
split_char(const char *s, int len, char ch, int maxcount)
{
    int pos;            /* scan position */
    int start = 0;      /* first byte of the piece currently being built */
    PyObject *str;      /* scratch slot required by SPLIT_APPEND */
    PyObject *list = PyList_New(0);

    if (list == NULL)
        return NULL;

    for (pos = 0; pos < len; pos++) {
        if (s[pos] != ch)
            continue;
        /* Found a separator: stop splitting once the budget is used up. */
        if (maxcount-- <= 0)
            break;
        SPLIT_APPEND(s, start, pos);
        start = pos + 1;
    }

    /* Trailing piece (possibly empty, e.g. when the input ends with ch). */
    if (start <= len) {
        SPLIT_APPEND(s, start, len);
    }
    return list;

onError:
    Py_DECREF(list);
    return NULL;
}
PyDoc_STRVAR(split__doc__, PyDoc_STRVAR(split__doc__,
"S.split([sep [,maxsplit]]) -> list of strings\n\ "S.split([sep [,maxsplit]]) -> list of strings\n\
@ -1362,10 +1401,13 @@ string_split(PyStringObject *self, PyObject *args)
#endif #endif
else if (PyObject_AsCharBuffer(subobj, &sub, &n)) else if (PyObject_AsCharBuffer(subobj, &sub, &n))
return NULL; return NULL;
if (n == 0) { if (n == 0) {
PyErr_SetString(PyExc_ValueError, "empty separator"); PyErr_SetString(PyExc_ValueError, "empty separator");
return NULL; return NULL;
} }
else if (n == 1)
return split_char(s, len, sub[0], maxsplit);
list = PyList_New(0); list = PyList_New(0);
if (list == NULL) if (list == NULL)
@ -1406,8 +1448,8 @@ string_split(PyStringObject *self, PyObject *args)
static PyObject * static PyObject *
rsplit_whitespace(const char *s, int len, int maxsplit) rsplit_whitespace(const char *s, int len, int maxsplit)
{ {
int i, j, err; int i, j;
PyObject* item; PyObject *str;
PyObject *list = PyList_New(0); PyObject *list = PyList_New(0);
if (list == NULL) if (list == NULL)
@ -1422,33 +1464,49 @@ rsplit_whitespace(const char *s, int len, int maxsplit)
if (j > i) { if (j > i) {
if (maxsplit-- <= 0) if (maxsplit-- <= 0)
break; break;
item = PyString_FromStringAndSize(s+i+1, (int)(j-i)); SPLIT_INSERT(s, i + 1, j + 1);
if (item == NULL)
goto finally;
err = PyList_Insert(list, 0, item);
Py_DECREF(item);
if (err < 0)
goto finally;
while (i >= 0 && isspace(Py_CHARMASK(s[i]))) while (i >= 0 && isspace(Py_CHARMASK(s[i])))
i--; i--;
j = i; j = i;
} }
} }
if (j >= 0) { if (j >= 0) {
item = PyString_FromStringAndSize(s, (int)(j + 1)); SPLIT_INSERT(s, 0, j + 1);
if (item == NULL)
goto finally;
err = PyList_Insert(list, 0, item);
Py_DECREF(item);
if (err < 0)
goto finally;
} }
return list; return list;
finally: onError:
Py_DECREF(list); Py_DECREF(list);
return NULL; return NULL;
} }
/* Split the buffer s[0:len] on the single-byte separator `ch`, scanning
 * from the right.
 *
 * Each separator found closes the piece to its right; adjacent separators
 * therefore yield empty strings.  At most `maxcount` separators are
 * consumed; once the budget is exhausted the remaining prefix of the buffer
 * becomes the first piece of the result.
 *
 * Pieces are discovered right-to-left.  They are appended in that order and
 * the list is reversed once at the end, instead of inserting every piece at
 * index 0, which would shift the whole list on each insertion.
 *
 * Returns a new list of string objects in left-to-right order, or NULL if
 * list creation, an append, or the final reversal fails (the partially
 * built list is released).
 */
static PyObject *
rsplit_char(const char *s, int len, char ch, int maxcount)
{
    int i, j;           /* i: scan position; j: last byte of current piece */
    PyObject *str;      /* scratch slot required by SPLIT_APPEND */
    PyObject *list = PyList_New(0);

    if (list == NULL)
        return NULL;

    for (i = j = len - 1; i >= 0; ) {
        if (s[i] == ch) {
            /* Found a separator: stop once the budget is used up. */
            if (maxcount-- <= 0)
                break;
            SPLIT_APPEND(s, i + 1, j + 1);
            j = i = i - 1;
        } else
            i--;
    }

    /* Leading piece (possibly empty, e.g. when the input starts with ch). */
    if (j >= -1) {
        SPLIT_APPEND(s, 0, j + 1);
    }

    /* Pieces were collected right-to-left; restore left-to-right order. */
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

onError:
    Py_DECREF(list);
    return NULL;
}
PyDoc_STRVAR(rsplit__doc__, PyDoc_STRVAR(rsplit__doc__,
"S.rsplit([sep [,maxsplit]]) -> list of strings\n\ "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
@ -1483,10 +1541,13 @@ string_rsplit(PyStringObject *self, PyObject *args)
#endif #endif
else if (PyObject_AsCharBuffer(subobj, &sub, &n)) else if (PyObject_AsCharBuffer(subobj, &sub, &n))
return NULL; return NULL;
if (n == 0) { if (n == 0) {
PyErr_SetString(PyExc_ValueError, "empty separator"); PyErr_SetString(PyExc_ValueError, "empty separator");
return NULL; return NULL;
} }
else if (n == 1)
return rsplit_char(s, len, sub[0], maxsplit);
list = PyList_New(0); list = PyList_New(0);
if (list == NULL) if (list == NULL)
@ -3104,17 +3165,6 @@ Return a list of the lines in S, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\ Line breaks are not included in the resulting list unless keepends\n\
is given and true."); is given and true.");
#define SPLIT_APPEND(data, left, right) \
str = PyString_FromStringAndSize(data + left, right - left); \
if (!str) \
goto onError; \
if (PyList_Append(list, str)) { \
Py_DECREF(str); \
goto onError; \
} \
else \
Py_DECREF(str);
static PyObject* static PyObject*
string_splitlines(PyStringObject *self, PyObject *args) string_splitlines(PyStringObject *self, PyObject *args)
{ {