395 lines
11 KiB
C
395 lines
11 KiB
C
/* stringlib: split implementation */
|
|
|
|
#ifndef STRINGLIB_SPLIT_H
|
|
#define STRINGLIB_SPLIT_H
|
|
|
|
#ifndef STRINGLIB_FASTSEARCH_H
|
|
#error must include "stringlib/fastsearch.h" before including this module
|
|
#endif
|
|
|
|
/* Overallocate the initial list to reduce the number of reallocs for small
|
|
split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
|
|
resizes, to sizes 4, 8, then 16. Most observed string splits are for human
|
|
text (roughly 11 words per line) and field delimited data (usually 1-10
|
|
fields). For large strings the split algorithms are bandwidth limited
|
|
so increasing the preallocation likely will not improve things.*/
|
|
|
|
#define MAX_PREALLOC 12
|
|
|
|
/* 5 splits gives 6 elements */
|
|
#define PREALLOC_SIZE(maxsplit) \
|
|
(maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
|
|
|
|
#define SPLIT_APPEND(data, left, right) \
|
|
sub = STRINGLIB_NEW((data) + (left), \
|
|
(right) - (left)); \
|
|
if (sub == NULL) \
|
|
goto onError; \
|
|
if (PyList_Append(list, sub)) { \
|
|
Py_DECREF(sub); \
|
|
goto onError; \
|
|
} \
|
|
else \
|
|
Py_DECREF(sub);
|
|
|
|
#define SPLIT_ADD(data, left, right) { \
|
|
sub = STRINGLIB_NEW((data) + (left), \
|
|
(right) - (left)); \
|
|
if (sub == NULL) \
|
|
goto onError; \
|
|
if (count < MAX_PREALLOC) { \
|
|
PyList_SET_ITEM(list, count, sub); \
|
|
} else { \
|
|
if (PyList_Append(list, sub)) { \
|
|
Py_DECREF(sub); \
|
|
goto onError; \
|
|
} \
|
|
else \
|
|
Py_DECREF(sub); \
|
|
} \
|
|
count++; }
|
|
|
|
|
|
/* Always force the list to the expected size. */
|
|
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
|
|
|
|
Py_LOCAL_INLINE(PyObject *)
|
|
stringlib_split_whitespace(PyObject* str_obj,
|
|
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
|
Py_ssize_t maxcount)
|
|
{
|
|
Py_ssize_t i, j, count=0;
|
|
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
|
|
PyObject *sub;
|
|
|
|
if (list == NULL)
|
|
return NULL;
|
|
|
|
i = j = 0;
|
|
while (maxcount-- > 0) {
|
|
while (i < str_len && STRINGLIB_ISSPACE(str[i]))
|
|
i++;
|
|
if (i == str_len) break;
|
|
j = i; i++;
|
|
while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
|
|
i++;
|
|
#ifndef STRINGLIB_MUTABLE
|
|
if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
|
|
/* No whitespace in str_obj, so just use it as list[0] */
|
|
Py_INCREF(str_obj);
|
|
PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
|
|
count++;
|
|
break;
|
|
}
|
|
#endif
|
|
SPLIT_ADD(str, j, i);
|
|
}
|
|
|
|
if (i < str_len) {
|
|
/* Only occurs when maxcount was reached */
|
|
/* Skip any remaining whitespace and copy to end of string */
|
|
while (i < str_len && STRINGLIB_ISSPACE(str[i]))
|
|
i++;
|
|
if (i != str_len)
|
|
SPLIT_ADD(str, i, str_len);
|
|
}
|
|
FIX_PREALLOC_SIZE(list);
|
|
return list;
|
|
|
|
onError:
|
|
Py_DECREF(list);
|
|
return NULL;
|
|
}
|
|
|
|
Py_LOCAL_INLINE(PyObject *)
|
|
stringlib_split_char(PyObject* str_obj,
|
|
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
|
const STRINGLIB_CHAR ch,
|
|
Py_ssize_t maxcount)
|
|
{
|
|
Py_ssize_t i, j, count=0;
|
|
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
|
|
PyObject *sub;
|
|
|
|
if (list == NULL)
|
|
return NULL;
|
|
|
|
i = j = 0;
|
|
while ((j < str_len) && (maxcount-- > 0)) {
|
|
for(; j < str_len; j++) {
|
|
/* I found that using memchr makes no difference */
|
|
if (str[j] == ch) {
|
|
SPLIT_ADD(str, i, j);
|
|
i = j = j + 1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
#ifndef STRINGLIB_MUTABLE
|
|
if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
|
|
/* ch not in str_obj, so just use str_obj as list[0] */
|
|
Py_INCREF(str_obj);
|
|
PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
|
|
count++;
|
|
} else
|
|
#endif
|
|
if (i <= str_len) {
|
|
SPLIT_ADD(str, i, str_len);
|
|
}
|
|
FIX_PREALLOC_SIZE(list);
|
|
return list;
|
|
|
|
onError:
|
|
Py_DECREF(list);
|
|
return NULL;
|
|
}
|
|
|
|
Py_LOCAL_INLINE(PyObject *)
|
|
stringlib_split(PyObject* str_obj,
|
|
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
|
const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
|
|
Py_ssize_t maxcount)
|
|
{
|
|
Py_ssize_t i, j, pos, count=0;
|
|
PyObject *list, *sub;
|
|
|
|
if (sep_len == 0) {
|
|
PyErr_SetString(PyExc_ValueError, "empty separator");
|
|
return NULL;
|
|
}
|
|
else if (sep_len == 1)
|
|
return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);
|
|
|
|
list = PyList_New(PREALLOC_SIZE(maxcount));
|
|
if (list == NULL)
|
|
return NULL;
|
|
|
|
i = j = 0;
|
|
while (maxcount-- > 0) {
|
|
pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
|
|
if (pos < 0)
|
|
break;
|
|
j = i + pos;
|
|
SPLIT_ADD(str, i, j);
|
|
i = j + sep_len;
|
|
}
|
|
#ifndef STRINGLIB_MUTABLE
|
|
if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
|
|
/* No match in str_obj, so just use it as list[0] */
|
|
Py_INCREF(str_obj);
|
|
PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
|
|
count++;
|
|
} else
|
|
#endif
|
|
{
|
|
SPLIT_ADD(str, i, str_len);
|
|
}
|
|
FIX_PREALLOC_SIZE(list);
|
|
return list;
|
|
|
|
onError:
|
|
Py_DECREF(list);
|
|
return NULL;
|
|
}
|
|
|
|
Py_LOCAL_INLINE(PyObject *)
|
|
stringlib_rsplit_whitespace(PyObject* str_obj,
|
|
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
|
Py_ssize_t maxcount)
|
|
{
|
|
Py_ssize_t i, j, count=0;
|
|
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
|
|
PyObject *sub;
|
|
|
|
if (list == NULL)
|
|
return NULL;
|
|
|
|
i = j = str_len - 1;
|
|
while (maxcount-- > 0) {
|
|
while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
|
|
i--;
|
|
if (i < 0) break;
|
|
j = i; i--;
|
|
while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
|
|
i--;
|
|
#ifndef STRINGLIB_MUTABLE
|
|
if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
|
|
/* No whitespace in str_obj, so just use it as list[0] */
|
|
Py_INCREF(str_obj);
|
|
PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
|
|
count++;
|
|
break;
|
|
}
|
|
#endif
|
|
SPLIT_ADD(str, i + 1, j + 1);
|
|
}
|
|
|
|
if (i >= 0) {
|
|
/* Only occurs when maxcount was reached */
|
|
/* Skip any remaining whitespace and copy to beginning of string */
|
|
while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
|
|
i--;
|
|
if (i >= 0)
|
|
SPLIT_ADD(str, 0, i + 1);
|
|
}
|
|
FIX_PREALLOC_SIZE(list);
|
|
if (PyList_Reverse(list) < 0)
|
|
goto onError;
|
|
return list;
|
|
|
|
onError:
|
|
Py_DECREF(list);
|
|
return NULL;
|
|
}
|
|
|
|
Py_LOCAL_INLINE(PyObject *)
|
|
stringlib_rsplit_char(PyObject* str_obj,
|
|
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
|
const STRINGLIB_CHAR ch,
|
|
Py_ssize_t maxcount)
|
|
{
|
|
Py_ssize_t i, j, count=0;
|
|
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
|
|
PyObject *sub;
|
|
|
|
if (list == NULL)
|
|
return NULL;
|
|
|
|
i = j = str_len - 1;
|
|
while ((i >= 0) && (maxcount-- > 0)) {
|
|
for(; i >= 0; i--) {
|
|
if (str[i] == ch) {
|
|
SPLIT_ADD(str, i + 1, j + 1);
|
|
j = i = i - 1;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
#ifndef STRINGLIB_MUTABLE
|
|
if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
|
|
/* ch not in str_obj, so just use str_obj as list[0] */
|
|
Py_INCREF(str_obj);
|
|
PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
|
|
count++;
|
|
} else
|
|
#endif
|
|
if (j >= -1) {
|
|
SPLIT_ADD(str, 0, j + 1);
|
|
}
|
|
FIX_PREALLOC_SIZE(list);
|
|
if (PyList_Reverse(list) < 0)
|
|
goto onError;
|
|
return list;
|
|
|
|
onError:
|
|
Py_DECREF(list);
|
|
return NULL;
|
|
}
|
|
|
|
Py_LOCAL_INLINE(PyObject *)
|
|
stringlib_rsplit(PyObject* str_obj,
|
|
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
|
const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
|
|
Py_ssize_t maxcount)
|
|
{
|
|
Py_ssize_t j, pos, count=0;
|
|
PyObject *list, *sub;
|
|
|
|
if (sep_len == 0) {
|
|
PyErr_SetString(PyExc_ValueError, "empty separator");
|
|
return NULL;
|
|
}
|
|
else if (sep_len == 1)
|
|
return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);
|
|
|
|
list = PyList_New(PREALLOC_SIZE(maxcount));
|
|
if (list == NULL)
|
|
return NULL;
|
|
|
|
j = str_len;
|
|
while (maxcount-- > 0) {
|
|
pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH);
|
|
if (pos < 0)
|
|
break;
|
|
SPLIT_ADD(str, pos + sep_len, j);
|
|
j = pos;
|
|
}
|
|
#ifndef STRINGLIB_MUTABLE
|
|
if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
|
|
/* No match in str_obj, so just use it as list[0] */
|
|
Py_INCREF(str_obj);
|
|
PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
|
|
count++;
|
|
} else
|
|
#endif
|
|
{
|
|
SPLIT_ADD(str, 0, j);
|
|
}
|
|
FIX_PREALLOC_SIZE(list);
|
|
if (PyList_Reverse(list) < 0)
|
|
goto onError;
|
|
return list;
|
|
|
|
onError:
|
|
Py_DECREF(list);
|
|
return NULL;
|
|
}
|
|
|
|
Py_LOCAL_INLINE(PyObject *)
|
|
stringlib_splitlines(PyObject* str_obj,
|
|
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
|
int keepends)
|
|
{
|
|
/* This does not use the preallocated list because splitlines is
|
|
usually run with hundreds of newlines. The overhead of
|
|
switching between PyList_SET_ITEM and append causes about a
|
|
2-3% slowdown for that common case. A smarter implementation
|
|
could move the if check out, so the SET_ITEMs are done first
|
|
and the appends only done when the prealloc buffer is full.
|
|
That's too much work for little gain.*/
|
|
|
|
register Py_ssize_t i;
|
|
register Py_ssize_t j;
|
|
PyObject *list = PyList_New(0);
|
|
PyObject *sub;
|
|
|
|
if (list == NULL)
|
|
return NULL;
|
|
|
|
for (i = j = 0; i < str_len; ) {
|
|
Py_ssize_t eol;
|
|
|
|
/* Find a line and append it */
|
|
while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))
|
|
i++;
|
|
|
|
/* Skip the line break reading CRLF as one line break */
|
|
eol = i;
|
|
if (i < str_len) {
|
|
if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')
|
|
i += 2;
|
|
else
|
|
i++;
|
|
if (keepends)
|
|
eol = i;
|
|
}
|
|
#ifndef STRINGLIB_MUTABLE
|
|
if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
|
|
/* No linebreak in str_obj, so just use it as list[0] */
|
|
if (PyList_Append(list, str_obj))
|
|
goto onError;
|
|
break;
|
|
}
|
|
#endif
|
|
SPLIT_APPEND(str, j, eol);
|
|
j = i;
|
|
}
|
|
return list;
|
|
|
|
onError:
|
|
Py_DECREF(list);
|
|
return NULL;
|
|
}
|
|
|
|
#endif
|