mirror of https://github.com/python/cpython
Merged revisions 77461 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/trunk ........ r77461 | antoine.pitrou | 2010-01-13 08:55:48 +0100 (mer., 13 janv. 2010) | 5 lines Issue #7622: Improve the split(), rsplit(), splitlines() and replace() methods of bytes, bytearray and unicode objects by using a common implementation based on stringlib's fast search. Patch by Florent Xicluna. ........
This commit is contained in:
parent
5efea0430c
commit
f2c5484f9e
|
@ -582,6 +582,7 @@ BYTESTR_DEPS = \
|
|||
$(srcdir)/Objects/stringlib/fastsearch.h \
|
||||
$(srcdir)/Objects/stringlib/find.h \
|
||||
$(srcdir)/Objects/stringlib/partition.h \
|
||||
$(srcdir)/Objects/stringlib/split.h \
|
||||
$(srcdir)/Objects/stringlib/stringdefs.h \
|
||||
$(srcdir)/Objects/stringlib/string_format.h \
|
||||
$(srcdir)/Objects/stringlib/transmogrify.h \
|
||||
|
|
|
@ -12,6 +12,10 @@ What's New in Python 3.2 Alpha 1?
|
|||
Core and Builtins
|
||||
-----------------
|
||||
|
||||
- Issue #7622: Improve the split(), rsplit(), splitlines() and replace()
|
||||
methods of bytes, bytearray and unicode objects by using a common
|
||||
implementation based on stringlib's fast search. Patch by Florent Xicluna.
|
||||
|
||||
- Issue #7632: Fix a crash in dtoa.c that occurred in debug builds
|
||||
when parsing certain long numeric strings corresponding to subnormal
|
||||
values. Also fix a number of bugs in dtoa.c that could lead to
|
||||
|
|
|
@ -1039,14 +1039,16 @@ bytearray_dealloc(PyByteArrayObject *self)
|
|||
#define STRINGLIB_STR PyByteArray_AS_STRING
|
||||
#define STRINGLIB_NEW PyByteArray_FromStringAndSize
|
||||
#define STRINGLIB_EMPTY nullbytes
|
||||
#define STRINGLIB_ISSPACE Py_ISSPACE
|
||||
#define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r'))
|
||||
#define STRINGLIB_CHECK_EXACT PyByteArray_CheckExact
|
||||
#define STRINGLIB_MUTABLE 1
|
||||
#define FROM_BYTEARRAY 1
|
||||
|
||||
#include "stringlib/fastsearch.h"
|
||||
#include "stringlib/count.h"
|
||||
#include "stringlib/find.h"
|
||||
#include "stringlib/partition.h"
|
||||
#include "stringlib/split.h"
|
||||
#include "stringlib/ctype.h"
|
||||
#include "stringlib/transmogrify.h"
|
||||
|
||||
|
@ -1054,21 +1056,20 @@ bytearray_dealloc(PyByteArrayObject *self)
|
|||
/* The following Py_LOCAL_INLINE and Py_LOCAL functions
|
||||
were copied from the old char* style string object. */
|
||||
|
||||
Py_LOCAL_INLINE(void)
|
||||
_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
|
||||
{
|
||||
if (*end > len)
|
||||
*end = len;
|
||||
else if (*end < 0)
|
||||
*end += len;
|
||||
if (*end < 0)
|
||||
*end = 0;
|
||||
if (*start < 0)
|
||||
*start += len;
|
||||
if (*start < 0)
|
||||
*start = 0;
|
||||
}
|
||||
|
||||
/* helper macro to fixup start/end slice values
 *
 * `start` and `end` must be modifiable lvalues; both are updated in place.
 *   - end > len      -> end = len
 *   - end < 0        -> end += len, then clamped to 0 if still negative
 *   - start < 0      -> start += len, then clamped to 0 if still negative
 * `start` is not clamped from above.  `len` may be evaluated more than once.
 *
 * The body is wrapped in do { } while (0) so that the expansion is exactly
 * one statement: `if (x) ADJUST_INDICES(a, b, n); else ...` now parses the
 * way it reads.  Arguments are parenthesised so expressions can be passed.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
|
||||
|
||||
Py_LOCAL_INLINE(Py_ssize_t)
|
||||
bytearray_find_internal(PyByteArrayObject *self, PyObject *args, int dir)
|
||||
|
@ -1136,10 +1137,10 @@ bytearray_count(PyByteArrayObject *self, PyObject *args)
|
|||
if (_getbuffer(sub_obj, &vsub) < 0)
|
||||
return NULL;
|
||||
|
||||
_adjust_indices(&start, &end, PyByteArray_GET_SIZE(self));
|
||||
ADJUST_INDICES(start, end, PyByteArray_GET_SIZE(self));
|
||||
|
||||
count_obj = PyLong_FromSsize_t(
|
||||
stringlib_count(str + start, end - start, vsub.buf, vsub.len)
|
||||
stringlib_count(str + start, end - start, vsub.buf, vsub.len, PY_SSIZE_T_MAX)
|
||||
);
|
||||
PyBuffer_Release(&vsub);
|
||||
return count_obj;
|
||||
|
@ -1247,7 +1248,7 @@ _bytearray_tailmatch(PyByteArrayObject *self, PyObject *substr, Py_ssize_t start
|
|||
if (_getbuffer(substr, &vsubstr) < 0)
|
||||
return -1;
|
||||
|
||||
_adjust_indices(&start, &end, len);
|
||||
ADJUST_INDICES(start, end, len);
|
||||
|
||||
if (direction < 0) {
|
||||
/* startswith */
|
||||
|
@ -1459,20 +1460,11 @@ bytearray_maketrans(PyObject *null, PyObject *args)
|
|||
}
|
||||
|
||||
|
||||
/* Search direction selectors. */
#define FORWARD 1
#define REVERSE (-1)

/* find and count characters and substrings */

/* Pointer to the first byte equal to `c` within the first `target_len`
   bytes of `target`, or NULL when there is none (thin memchr wrapper). */
#define findchar(target, target_len, c)                         \
  ((char *)memchr((const void *)(target), (c), (target_len)))

/* Non-zero when the `length` bytes at target+offset equal the `length`
   bytes at pattern.  The first and last bytes are compared before memcmp
   as a cheap rejection test.  memcmp is given the whole pattern rather
   than the `length-2` interior bytes, so a one-byte pattern no longer
   turns into a memcmp() with a size of (size_t)-1.
   Don't call if length < 1.  Arguments may be evaluated more than once. */
#define Py_STRING_MATCH(target, offset, pattern, length)            \
  ((target)[(offset)] == (pattern)[0] &&                            \
   (target)[(offset)+(length)-1] == (pattern)[(length)-1] &&        \
   !memcmp((target)+(offset), (pattern), (length)))
|
||||
|
||||
|
||||
/* Bytes ops must return a string, create a copy */
|
||||
Py_LOCAL(PyByteArrayObject *)
|
||||
|
@ -1500,93 +1492,6 @@ countchar(const char *target, Py_ssize_t target_len, char c, Py_ssize_t maxcount
|
|||
return count;
|
||||
}
|
||||
|
||||
Py_LOCAL(Py_ssize_t)
|
||||
findstring(const char *target, Py_ssize_t target_len,
|
||||
const char *pattern, Py_ssize_t pattern_len,
|
||||
Py_ssize_t start,
|
||||
Py_ssize_t end,
|
||||
int direction)
|
||||
{
|
||||
if (start < 0) {
|
||||
start += target_len;
|
||||
if (start < 0)
|
||||
start = 0;
|
||||
}
|
||||
if (end > target_len) {
|
||||
end = target_len;
|
||||
} else if (end < 0) {
|
||||
end += target_len;
|
||||
if (end < 0)
|
||||
end = 0;
|
||||
}
|
||||
|
||||
/* zero-length substrings always match at the first attempt */
|
||||
if (pattern_len == 0)
|
||||
return (direction > 0) ? start : end;
|
||||
|
||||
end -= pattern_len;
|
||||
|
||||
if (direction < 0) {
|
||||
for (; end >= start; end--)
|
||||
if (Py_STRING_MATCH(target, end, pattern, pattern_len))
|
||||
return end;
|
||||
} else {
|
||||
for (; start <= end; start++)
|
||||
if (Py_STRING_MATCH(target, start, pattern, pattern_len))
|
||||
return start;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
Py_LOCAL_INLINE(Py_ssize_t)
|
||||
countstring(const char *target, Py_ssize_t target_len,
|
||||
const char *pattern, Py_ssize_t pattern_len,
|
||||
Py_ssize_t start,
|
||||
Py_ssize_t end,
|
||||
int direction, Py_ssize_t maxcount)
|
||||
{
|
||||
Py_ssize_t count=0;
|
||||
|
||||
if (start < 0) {
|
||||
start += target_len;
|
||||
if (start < 0)
|
||||
start = 0;
|
||||
}
|
||||
if (end > target_len) {
|
||||
end = target_len;
|
||||
} else if (end < 0) {
|
||||
end += target_len;
|
||||
if (end < 0)
|
||||
end = 0;
|
||||
}
|
||||
|
||||
/* zero-length substrings match everywhere */
|
||||
if (pattern_len == 0 || maxcount == 0) {
|
||||
if (target_len+1 < maxcount)
|
||||
return target_len+1;
|
||||
return maxcount;
|
||||
}
|
||||
|
||||
end -= pattern_len;
|
||||
if (direction < 0) {
|
||||
for (; (end >= start); end--)
|
||||
if (Py_STRING_MATCH(target, end, pattern, pattern_len)) {
|
||||
count++;
|
||||
if (--maxcount <= 0) break;
|
||||
end -= pattern_len-1;
|
||||
}
|
||||
} else {
|
||||
for (; (start <= end); start++)
|
||||
if (Py_STRING_MATCH(target, start, pattern, pattern_len)) {
|
||||
count++;
|
||||
if (--maxcount <= 0)
|
||||
break;
|
||||
start += pattern_len-1;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
/* Algorithms for different cases of string replacement */
|
||||
|
||||
|
@ -1708,10 +1613,9 @@ replace_delete_substring(PyByteArrayObject *self,
|
|||
self_len = PyByteArray_GET_SIZE(self);
|
||||
self_s = PyByteArray_AS_STRING(self);
|
||||
|
||||
count = countstring(self_s, self_len,
|
||||
from_s, from_len,
|
||||
0, self_len, 1,
|
||||
maxcount);
|
||||
count = stringlib_count(self_s, self_len,
|
||||
from_s, from_len,
|
||||
maxcount);
|
||||
|
||||
if (count == 0) {
|
||||
/* no matches */
|
||||
|
@ -1730,9 +1634,9 @@ replace_delete_substring(PyByteArrayObject *self,
|
|||
start = self_s;
|
||||
end = self_s + self_len;
|
||||
while (count-- > 0) {
|
||||
offset = findstring(start, end-start,
|
||||
from_s, from_len,
|
||||
0, end-start, FORWARD);
|
||||
offset = stringlib_find(start, end-start,
|
||||
from_s, from_len,
|
||||
0);
|
||||
if (offset == -1)
|
||||
break;
|
||||
next = start + offset;
|
||||
|
@ -1808,9 +1712,9 @@ replace_substring_in_place(PyByteArrayObject *self,
|
|||
self_s = PyByteArray_AS_STRING(self);
|
||||
self_len = PyByteArray_GET_SIZE(self);
|
||||
|
||||
offset = findstring(self_s, self_len,
|
||||
from_s, from_len,
|
||||
0, self_len, FORWARD);
|
||||
offset = stringlib_find(self_s, self_len,
|
||||
from_s, from_len,
|
||||
0);
|
||||
if (offset == -1) {
|
||||
/* No matches; return the original bytes */
|
||||
return return_self(self);
|
||||
|
@ -1830,9 +1734,9 @@ replace_substring_in_place(PyByteArrayObject *self,
|
|||
end = result_s + self_len;
|
||||
|
||||
while ( --maxcount > 0) {
|
||||
offset = findstring(start, end-start,
|
||||
from_s, from_len,
|
||||
0, end-start, FORWARD);
|
||||
offset = stringlib_find(start, end-start,
|
||||
from_s, from_len,
|
||||
0);
|
||||
if (offset==-1)
|
||||
break;
|
||||
Py_MEMCPY(start+offset, to_s, from_len);
|
||||
|
@ -1925,9 +1829,10 @@ replace_substring(PyByteArrayObject *self,
|
|||
self_s = PyByteArray_AS_STRING(self);
|
||||
self_len = PyByteArray_GET_SIZE(self);
|
||||
|
||||
count = countstring(self_s, self_len,
|
||||
from_s, from_len,
|
||||
0, self_len, FORWARD, maxcount);
|
||||
count = stringlib_count(self_s, self_len,
|
||||
from_s, from_len,
|
||||
maxcount);
|
||||
|
||||
if (count == 0) {
|
||||
/* no matches, return unchanged */
|
||||
return return_self(self);
|
||||
|
@ -1954,9 +1859,9 @@ replace_substring(PyByteArrayObject *self,
|
|||
start = self_s;
|
||||
end = self_s + self_len;
|
||||
while (count-- > 0) {
|
||||
offset = findstring(start, end-start,
|
||||
from_s, from_len,
|
||||
0, end-start, FORWARD);
|
||||
offset = stringlib_find(start, end-start,
|
||||
from_s, from_len,
|
||||
0);
|
||||
if (offset == -1)
|
||||
break;
|
||||
next = start+offset;
|
||||
|
@ -2085,123 +1990,6 @@ bytearray_replace(PyByteArrayObject *self, PyObject *args)
|
|||
return res;
|
||||
}
|
||||
|
||||
|
||||
/* Overallocate the initial list to reduce the number of reallocs for small
   split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
   text (roughly 11 words per line) and field delimited data (usually 1-10
   fields).  For large strings the split algorithms are bandwidth limited
   so increasing the preallocation likely will not improve things.*/

#define MAX_PREALLOC 12

/* 5 splits gives 6 elements.
   The argument and the whole expansion are parenthesised so the macro can
   be given, and embedded in, arbitrary expressions. */
#define PREALLOC_SIZE(maxsplit) \
    ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit)+1)

/* The two macros below expect the enclosing function to provide:
     PyObject *str;     scratch variable for the new piece
     PyObject *list;    the result list
     onError:           a label that cleans up and returns
   SPLIT_ADD additionally expects `Py_ssize_t count`, the number of pieces
   stored so far.  Both are wrapped in do { } while (0) so that each use is
   exactly one statement, also under an unbraced if/else. */

/* Append a new bytearray holding data[left:right] to `list`. */
#define SPLIT_APPEND(data, left, right)                         \
    do {                                                        \
        str = PyByteArray_FromStringAndSize((data) + (left),    \
                                            (right) - (left));  \
        if (str == NULL)                                        \
            goto onError;                                       \
        if (PyList_Append(list, str)) {                         \
            Py_DECREF(str);                                     \
            goto onError;                                       \
        }                                                       \
        Py_DECREF(str);                                         \
    } while (0)

/* Store a new bytearray holding data[left:right] as piece number `count`:
   directly into a preallocated slot while count < MAX_PREALLOC, through
   PyList_Append() afterwards.  Increments `count`. */
#define SPLIT_ADD(data, left, right)                            \
    do {                                                        \
        str = PyByteArray_FromStringAndSize((data) + (left),    \
                                            (right) - (left));  \
        if (str == NULL)                                        \
            goto onError;                                       \
        if (count < MAX_PREALLOC) {                             \
            PyList_SET_ITEM(list, count, str);                  \
        } else {                                                \
            if (PyList_Append(list, str)) {                     \
                Py_DECREF(str);                                 \
                goto onError;                                   \
            }                                                   \
            Py_DECREF(str);                                     \
        }                                                       \
        count++;                                                \
    } while (0)

/* Always force the list to the expected size. */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
|
||||
|
||||
|
||||
/* Split the `len` bytes at `s` on the single byte `ch`, doing at most
   `maxcount` splits.  Returns a new list whose items are created by
   SPLIT_ADD, or NULL on failure.  Empty pieces between adjacent separators
   are kept. */
Py_LOCAL_INLINE(PyObject *)
split_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
    /* `count`, `str`, `list` and the onError label are used by SPLIT_ADD */
    register Py_ssize_t piece_start, pos, count = 0;
    PyObject *str;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

    if (list == NULL)
        return NULL;

    piece_start = 0;
    pos = 0;
    while (pos < len && maxcount-- > 0) {
        /* advance to the next separator, if any */
        while (pos < len && s[pos] != ch)
            pos++;
        if (pos < len) {
            SPLIT_ADD(s, piece_start, pos);
            pos++;
            piece_start = pos;
        }
    }
    /* whatever follows the last separator taken (possibly empty) */
    if (piece_start <= len) {
        SPLIT_ADD(s, piece_start, len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
|
||||
/* Split the `len` bytes at `s` on runs of Py_ISSPACE bytes, doing at most
   `maxcount` splits.  Leading and trailing whitespace produce no empty
   pieces.  Once the split budget is used up, the rest of the input (from
   the start of the next word) becomes the final piece.  Returns a new list
   whose items are created by SPLIT_ADD, or NULL on failure. */
Py_LOCAL_INLINE(PyObject *)
split_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
{
    /* `count`, `str`, `list` and the onError label are used by SPLIT_ADD */
    register Py_ssize_t pos, word, count = 0;
    PyObject *str;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

    if (list == NULL)
        return NULL;

    pos = 0;
    word = 0;
    while (pos < len) {
        /* find a token */
        while (pos < len && Py_ISSPACE(s[pos]))
            pos++;
        word = pos;
        while (pos < len && !Py_ISSPACE(s[pos]))
            pos++;
        if (word == pos)
            break;              /* only whitespace was left: pos == len */
        if (maxcount-- <= 0)
            break;              /* budget spent; `word` marks the remainder */
        SPLIT_ADD(s, word, pos);
        while (pos < len && Py_ISSPACE(s[pos]))
            pos++;
        word = pos;
    }
    if (word < len) {
        SPLIT_ADD(s, word, len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
PyDoc_STRVAR(split__doc__,
|
||||
"B.split([sep[, maxsplit]]) -> list of bytearrays\n\
|
||||
\n\
|
||||
|
@ -2213,10 +2001,10 @@ If maxsplit is given, at most maxsplit splits are done.");
|
|||
static PyObject *
|
||||
bytearray_split(PyByteArrayObject *self, PyObject *args)
|
||||
{
|
||||
Py_ssize_t len = PyByteArray_GET_SIZE(self), n, i, j, pos;
|
||||
Py_ssize_t maxsplit = -1, count = 0;
|
||||
Py_ssize_t len = PyByteArray_GET_SIZE(self), n;
|
||||
Py_ssize_t maxsplit = -1;
|
||||
const char *s = PyByteArray_AS_STRING(self), *sub;
|
||||
PyObject *list, *str, *subobj = Py_None;
|
||||
PyObject *list, *subobj = Py_None;
|
||||
Py_buffer vsub;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
|
||||
|
@ -2225,73 +2013,18 @@ bytearray_split(PyByteArrayObject *self, PyObject *args)
|
|||
maxsplit = PY_SSIZE_T_MAX;
|
||||
|
||||
if (subobj == Py_None)
|
||||
return split_whitespace(s, len, maxsplit);
|
||||
return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit);
|
||||
|
||||
if (_getbuffer(subobj, &vsub) < 0)
|
||||
return NULL;
|
||||
sub = vsub.buf;
|
||||
n = vsub.len;
|
||||
|
||||
if (n == 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "empty separator");
|
||||
PyBuffer_Release(&vsub);
|
||||
return NULL;
|
||||
}
|
||||
if (n == 1) {
|
||||
list = split_char(s, len, sub[0], maxsplit);
|
||||
PyBuffer_Release(&vsub);
|
||||
return list;
|
||||
}
|
||||
|
||||
list = PyList_New(PREALLOC_SIZE(maxsplit));
|
||||
if (list == NULL) {
|
||||
PyBuffer_Release(&vsub);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
i = j = 0;
|
||||
while (maxsplit-- > 0) {
|
||||
pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
|
||||
if (pos < 0)
|
||||
break;
|
||||
j = i+pos;
|
||||
SPLIT_ADD(s, i, j);
|
||||
i = j + n;
|
||||
}
|
||||
SPLIT_ADD(s, i, len);
|
||||
FIX_PREALLOC_SIZE(list);
|
||||
list = stringlib_split(
|
||||
(PyObject*) self, s, len, sub, n, maxsplit
|
||||
);
|
||||
PyBuffer_Release(&vsub);
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
PyBuffer_Release(&vsub);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* stringlib's partition shares nullbytes in some cases.
|
||||
undo this, we don't want the nullbytes to be shared. */
|
||||
static PyObject *
|
||||
make_nullbytes_unique(PyObject *result)
|
||||
{
|
||||
if (result != NULL) {
|
||||
int i;
|
||||
assert(PyTuple_Check(result));
|
||||
assert(PyTuple_GET_SIZE(result) == 3);
|
||||
for (i = 0; i < 3; i++) {
|
||||
if (PyTuple_GET_ITEM(result, i) == (PyObject *)nullbytes) {
|
||||
PyObject *new = PyByteArray_FromStringAndSize(NULL, 0);
|
||||
if (new == NULL) {
|
||||
Py_DECREF(result);
|
||||
result = NULL;
|
||||
break;
|
||||
}
|
||||
Py_DECREF(nullbytes);
|
||||
PyTuple_SET_ITEM(result, i, new);
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(partition__doc__,
|
||||
|
@ -2318,7 +2051,7 @@ bytearray_partition(PyByteArrayObject *self, PyObject *sep_obj)
|
|||
);
|
||||
|
||||
Py_DECREF(bytesep);
|
||||
return make_nullbytes_unique(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(rpartition__doc__,
|
||||
|
@ -2346,81 +2079,7 @@ bytearray_rpartition(PyByteArrayObject *self, PyObject *sep_obj)
|
|||
);
|
||||
|
||||
Py_DECREF(bytesep);
|
||||
return make_nullbytes_unique(result);
|
||||
}
|
||||
|
||||
/* Split the `len` bytes at `s` on the single byte `ch`, scanning from the
   right and doing at most `maxcount` splits.  Pieces are collected
   right-to-left and the list is reversed before it is returned.  Returns a
   new list whose items are created by SPLIT_ADD, or NULL on failure. */
Py_LOCAL_INLINE(PyObject *)
rsplit_char(const char *s, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
    /* `count`, `str`, `list` and the onError label are used by SPLIT_ADD */
    register Py_ssize_t pos, piece_last, count=0;
    PyObject *str;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

    if (list == NULL)
        return NULL;

    /* piece_last is the index of the last byte of the piece being built */
    pos = len - 1;
    piece_last = len - 1;
    while (pos >= 0 && maxcount-- > 0) {
        /* walk left to the previous separator, if any */
        while (pos >= 0 && s[pos] != ch)
            pos--;
        if (pos >= 0) {
            SPLIT_ADD(s, pos + 1, piece_last + 1);
            pos--;
            piece_last = pos;
        }
    }
    /* whatever precedes the last separator taken (possibly empty) */
    if (piece_last >= -1) {
        SPLIT_ADD(s, 0, piece_last + 1);
    }
    FIX_PREALLOC_SIZE(list);
    if (PyList_Reverse(list) < 0)
        goto onError;

    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
Py_LOCAL_INLINE(PyObject *)
|
||||
rsplit_whitespace(const char *s, Py_ssize_t len, Py_ssize_t maxcount)
|
||||
{
|
||||
register Py_ssize_t i, j, count = 0;
|
||||
PyObject *str;
|
||||
PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
|
||||
|
||||
if (list == NULL)
|
||||
return NULL;
|
||||
|
||||
for (i = j = len - 1; i >= 0; ) {
|
||||
/* find a token */
|
||||
while (i >= 0 && Py_ISSPACE(s[i]))
|
||||
i--;
|
||||
j = i;
|
||||
while (i >= 0 && !Py_ISSPACE(s[i]))
|
||||
i--;
|
||||
if (j > i) {
|
||||
if (maxcount-- <= 0)
|
||||
break;
|
||||
SPLIT_ADD(s, i + 1, j + 1);
|
||||
while (i >= 0 && Py_ISSPACE(s[i]))
|
||||
i--;
|
||||
j = i;
|
||||
}
|
||||
}
|
||||
if (j >= 0) {
|
||||
SPLIT_ADD(s, 0, j + 1);
|
||||
}
|
||||
FIX_PREALLOC_SIZE(list);
|
||||
if (PyList_Reverse(list) < 0)
|
||||
goto onError;
|
||||
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
return NULL;
|
||||
return result;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(rsplit__doc__,
|
||||
|
@ -2435,10 +2094,10 @@ If maxsplit is given, at most maxsplit splits are done.");
|
|||
static PyObject *
|
||||
bytearray_rsplit(PyByteArrayObject *self, PyObject *args)
|
||||
{
|
||||
Py_ssize_t len = PyByteArray_GET_SIZE(self), n, j, pos;
|
||||
Py_ssize_t maxsplit = -1, count = 0;
|
||||
Py_ssize_t len = PyByteArray_GET_SIZE(self), n;
|
||||
Py_ssize_t maxsplit = -1;
|
||||
const char *s = PyByteArray_AS_STRING(self), *sub;
|
||||
PyObject *list, *str, *subobj = Py_None;
|
||||
PyObject *list, *subobj = Py_None;
|
||||
Py_buffer vsub;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
|
||||
|
@ -2447,50 +2106,18 @@ bytearray_rsplit(PyByteArrayObject *self, PyObject *args)
|
|||
maxsplit = PY_SSIZE_T_MAX;
|
||||
|
||||
if (subobj == Py_None)
|
||||
return rsplit_whitespace(s, len, maxsplit);
|
||||
return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit);
|
||||
|
||||
if (_getbuffer(subobj, &vsub) < 0)
|
||||
return NULL;
|
||||
sub = vsub.buf;
|
||||
n = vsub.len;
|
||||
|
||||
if (n == 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "empty separator");
|
||||
PyBuffer_Release(&vsub);
|
||||
return NULL;
|
||||
}
|
||||
else if (n == 1) {
|
||||
list = rsplit_char(s, len, sub[0], maxsplit);
|
||||
PyBuffer_Release(&vsub);
|
||||
return list;
|
||||
}
|
||||
|
||||
list = PyList_New(PREALLOC_SIZE(maxsplit));
|
||||
if (list == NULL) {
|
||||
PyBuffer_Release(&vsub);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
j = len;
|
||||
|
||||
while (maxsplit-- > 0) {
|
||||
pos = fastsearch(s, j, sub, n, FAST_RSEARCH);
|
||||
if (pos < 0)
|
||||
break;
|
||||
SPLIT_ADD(s, pos + n, j);
|
||||
j = pos;
|
||||
}
|
||||
SPLIT_ADD(s, 0, j);
|
||||
FIX_PREALLOC_SIZE(list);
|
||||
if (PyList_Reverse(list) < 0)
|
||||
goto onError;
|
||||
list = stringlib_rsplit(
|
||||
(PyObject*) self, s, len, sub, n, maxsplit
|
||||
);
|
||||
PyBuffer_Release(&vsub);
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
PyBuffer_Release(&vsub);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(reverse__doc__,
|
||||
|
@ -2956,6 +2583,27 @@ bytearray_join(PyByteArrayObject *self, PyObject *it)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(splitlines__doc__,
|
||||
"B.splitlines([keepends]) -> list of lines\n\
|
||||
\n\
|
||||
Return a list of the lines in B, breaking at line boundaries.\n\
|
||||
Line breaks are not included in the resulting list unless keepends\n\
|
||||
is given and true.");
|
||||
|
||||
/* Method implementation for B.splitlines([keepends]).
   Parses one optional int argument (`keepends`, default 0) and hands the
   object's buffer and size to stringlib_splitlines().  Returns whatever
   that call returns, or NULL if argument parsing fails. */
static PyObject*
bytearray_splitlines(PyObject *self, PyObject *args)
{
    int keepends = 0;
    char *data;
    Py_ssize_t size;

    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
        return NULL;

    data = PyByteArray_AS_STRING(self);
    size = PyByteArray_GET_SIZE(self);
    return stringlib_splitlines((PyObject*) self, data, size, keepends);
}
|
||||
|
||||
PyDoc_STRVAR(fromhex_doc,
|
||||
"bytearray.fromhex(string) -> bytearray (static method)\n\
|
||||
\n\
|
||||
|
@ -3134,7 +2782,7 @@ bytearray_methods[] = {
|
|||
{"rsplit", (PyCFunction)bytearray_rsplit, METH_VARARGS, rsplit__doc__},
|
||||
{"rstrip", (PyCFunction)bytearray_rstrip, METH_VARARGS, rstrip__doc__},
|
||||
{"split", (PyCFunction)bytearray_split, METH_VARARGS, split__doc__},
|
||||
{"splitlines", (PyCFunction)stringlib_splitlines, METH_VARARGS,
|
||||
{"splitlines", (PyCFunction)bytearray_splitlines, METH_VARARGS,
|
||||
splitlines__doc__},
|
||||
{"startswith", (PyCFunction)bytearray_startswith, METH_VARARGS ,
|
||||
startswith__doc__},
|
||||
|
|
|
@ -56,7 +56,7 @@ static PyBytesObject *nullstring;
|
|||
If `str' is NULL then PyBytes_FromStringAndSize() will allocate `size+1'
|
||||
bytes (setting the last byte to the null terminating character) and you can
|
||||
fill in the data yourself. If `str' is non-NULL then the resulting
|
||||
PyString object must be treated as immutable and you must not fill in nor
|
||||
PyBytes object must be treated as immutable and you must not fill in nor
|
||||
alter the data yourself, since the strings may be shared.
|
||||
|
||||
The PyObject member `op->ob_size', which denotes the number of "extra
|
||||
|
@ -568,9 +568,9 @@ PyBytes_AsStringAndSize(register PyObject *obj,
|
|||
#include "stringlib/count.h"
|
||||
#include "stringlib/find.h"
|
||||
#include "stringlib/partition.h"
|
||||
#include "stringlib/split.h"
|
||||
#include "stringlib/ctype.h"
|
||||
|
||||
#define STRINGLIB_MUTABLE 0
|
||||
#include "stringlib/transmogrify.h"
|
||||
|
||||
PyObject *
|
||||
|
@ -1000,133 +1000,6 @@ static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
|
|||
|
||||
#define STRIPNAME(i) (stripformat[i]+3)
|
||||
|
||||
|
||||
/* Non-zero when the `length` bytes at target+offset equal the `length`
   bytes at pattern.  The first and last bytes are compared before memcmp
   as a cheap rejection test.  memcmp is given the whole pattern rather
   than the `length-2` interior bytes, so a one-byte pattern no longer
   turns into a memcmp() with a size of (size_t)-1.
   Don't call if length < 1.  Arguments may be evaluated more than once. */
#define Py_STRING_MATCH(target, offset, pattern, length)            \
  ((target)[(offset)] == (pattern)[0] &&                            \
   (target)[(offset)+(length)-1] == (pattern)[(length)-1] &&        \
   !memcmp((target)+(offset), (pattern), (length)))


/* Overallocate the initial list to reduce the number of reallocs for small
   split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
   text (roughly 11 words per line) and field delimited data (usually 1-10
   fields).  For large strings the split algorithms are bandwidth limited
   so increasing the preallocation likely will not improve things.*/

#define MAX_PREALLOC 12

/* 5 splits gives 6 elements.
   The argument and the whole expansion are parenthesised so the macro can
   be given, and embedded in, arbitrary expressions. */
#define PREALLOC_SIZE(maxsplit) \
    ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit)+1)

/* Store a new bytes object holding data[left:right] as piece number
   `count`: directly into a preallocated slot while count < MAX_PREALLOC,
   through PyList_Append() afterwards.  Increments `count`.
   Expects the enclosing function to provide `PyObject *str`,
   `PyObject *list`, `Py_ssize_t count` and an `onError` label.
   Wrapped in do { } while (0) so that each use is exactly one statement,
   also under an unbraced if/else. */
#define SPLIT_ADD(data, left, right)                            \
    do {                                                        \
        str = PyBytes_FromStringAndSize((data) + (left),        \
                                        (right) - (left));      \
        if (str == NULL)                                        \
            goto onError;                                       \
        if (count < MAX_PREALLOC) {                             \
            PyList_SET_ITEM(list, count, str);                  \
        } else {                                                \
            if (PyList_Append(list, str)) {                     \
                Py_DECREF(str);                                 \
                goto onError;                                   \
            }                                                   \
            Py_DECREF(str);                                     \
        }                                                       \
        count++;                                                \
    } while (0)

/* Always force the list to the expected size. */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count

/* Cursor helpers: move index `i` over a run of (non-)whitespace in `s`.
   The forward forms stop at `len`; the reverse forms stop once i < 0.
   `i` must be a modifiable lvalue. */
#define SKIP_SPACE(s, i, len)    \
    do { while ((i) < (len) && ISSPACE((s)[(i)])) (i)++; } while (0)
#define SKIP_NONSPACE(s, i, len) \
    do { while ((i) < (len) && !ISSPACE((s)[(i)])) (i)++; } while (0)
#define RSKIP_SPACE(s, i)        \
    do { while ((i) >= 0 && ISSPACE((s)[(i)])) (i)--; } while (0)
#define RSKIP_NONSPACE(s, i)     \
    do { while ((i) >= 0 && !ISSPACE((s)[(i)])) (i)--; } while (0)
|
||||
|
||||
/* Split the first `len` bytes of `self` on runs of ISSPACE bytes, doing at
   most `maxsplit` splits.  Returns a new list, or NULL on failure.

   When the very first word found starts at offset 0 and runs to `len`
   (no whitespace at all) and `self` is exactly a bytes object, `self`
   itself is stored as the only item (with a new reference) instead of a
   copy. */
Py_LOCAL_INLINE(PyObject *)
split_whitespace(PyBytesObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
{
    const char *s = PyBytes_AS_STRING(self);
    /* `count`, `str`, `list` and the onError label are used by SPLIT_ADD */
    Py_ssize_t pos, word, count=0;
    PyObject *str;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));

    if (list == NULL)
        return NULL;

    pos = 0;
    word = 0;

    for (; maxsplit > 0; maxsplit--) {
        /* skip whitespace in front of the next word */
        while (pos < len && ISSPACE(s[pos]))
            pos++;
        if (pos == len)
            break;
        /* s[pos] is the first byte of a word; find where it ends */
        word = pos;
        pos++;
        while (pos < len && !ISSPACE(s[pos]))
            pos++;
        if (word == 0 && pos == len && PyBytes_CheckExact(self)) {
            /* No whitespace in self, so just use it as list[0] */
            Py_INCREF(self);
            PyList_SET_ITEM(list, 0, (PyObject *)self);
            count++;
            break;
        }
        SPLIT_ADD(s, word, pos);
    }

    if (pos < len) {
        /* Only occurs when maxsplit was reached */
        /* Skip any remaining whitespace and copy to end of string */
        while (pos < len && ISSPACE(s[pos]))
            pos++;
        if (pos != len) {
            SPLIT_ADD(s, pos, len);
        }
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Split the first `len` bytes of `self` on the single byte `ch`, doing at
   most `maxcount` splits.  Returns a new list, or NULL on failure.  Empty
   pieces between adjacent separators are kept.

   When nothing was split off and `self` is exactly a bytes object, `self`
   itself is stored as the only item (with a new reference) instead of a
   copy. */
Py_LOCAL_INLINE(PyObject *)
split_char(PyBytesObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
    const char *s = PyBytes_AS_STRING(self);
    /* `count`, `str`, `list` and the onError label are used by SPLIT_ADD */
    register Py_ssize_t piece_start, pos, count=0;
    PyObject *str;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

    if (list == NULL)
        return NULL;

    piece_start = 0;
    pos = 0;
    while (pos < len && maxcount-- > 0) {
        /* advance to the next separator, if any */
        while (pos < len && s[pos] != ch)
            pos++;
        if (pos < len) {
            SPLIT_ADD(s, piece_start, pos);
            pos++;
            piece_start = pos;
        }
    }
    if (piece_start == 0 && count == 0 && PyBytes_CheckExact(self)) {
        /* ch not in self, so just use self as list[0] */
        Py_INCREF(self);
        PyList_SET_ITEM(list, 0, (PyObject *)self);
        count++;
    }
    else if (piece_start <= len) {
        /* whatever follows the last separator taken (possibly empty) */
        SPLIT_ADD(s, piece_start, len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
PyDoc_STRVAR(split__doc__,
|
||||
"B.split([sep[, maxsplit]]) -> list of bytes\n\
|
||||
\n\
|
||||
|
@ -1138,74 +1011,26 @@ If maxsplit is given, at most maxsplit splits are done.");
|
|||
static PyObject *
|
||||
bytes_split(PyBytesObject *self, PyObject *args)
|
||||
{
|
||||
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
|
||||
Py_ssize_t maxsplit = -1, count=0;
|
||||
Py_ssize_t len = PyBytes_GET_SIZE(self), n;
|
||||
Py_ssize_t maxsplit = -1;
|
||||
const char *s = PyBytes_AS_STRING(self), *sub;
|
||||
Py_buffer vsub;
|
||||
PyObject *list, *str, *subobj = Py_None;
|
||||
#ifdef USE_FAST
|
||||
Py_ssize_t pos;
|
||||
#endif
|
||||
PyObject *list, *subobj = Py_None;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "|On:split", &subobj, &maxsplit))
|
||||
return NULL;
|
||||
if (maxsplit < 0)
|
||||
maxsplit = PY_SSIZE_T_MAX;
|
||||
if (subobj == Py_None)
|
||||
return split_whitespace(self, len, maxsplit);
|
||||
return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit);
|
||||
if (_getbuffer(subobj, &vsub) < 0)
|
||||
return NULL;
|
||||
sub = vsub.buf;
|
||||
n = vsub.len;
|
||||
|
||||
if (n == 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "empty separator");
|
||||
PyBuffer_Release(&vsub);
|
||||
return NULL;
|
||||
}
|
||||
else if (n == 1) {
|
||||
list = split_char(self, len, sub[0], maxsplit);
|
||||
PyBuffer_Release(&vsub);
|
||||
return list;
|
||||
}
|
||||
|
||||
list = PyList_New(PREALLOC_SIZE(maxsplit));
|
||||
if (list == NULL) {
|
||||
PyBuffer_Release(&vsub);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef USE_FAST
|
||||
i = j = 0;
|
||||
while (maxsplit-- > 0) {
|
||||
pos = fastsearch(s+i, len-i, sub, n, FAST_SEARCH);
|
||||
if (pos < 0)
|
||||
break;
|
||||
j = i+pos;
|
||||
SPLIT_ADD(s, i, j);
|
||||
i = j + n;
|
||||
}
|
||||
#else
|
||||
i = j = 0;
|
||||
while ((j+n <= len) && (maxsplit-- > 0)) {
|
||||
for (; j+n <= len; j++) {
|
||||
if (Py_STRING_MATCH(s, j, sub, n)) {
|
||||
SPLIT_ADD(s, i, j);
|
||||
i = j = j + n;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
SPLIT_ADD(s, i, len);
|
||||
FIX_PREALLOC_SIZE(list);
|
||||
list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit);
|
||||
PyBuffer_Release(&vsub);
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
PyBuffer_Release(&vsub);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(partition__doc__,
|
||||
|
@ -1263,90 +1088,6 @@ bytes_rpartition(PyBytesObject *self, PyObject *sep_obj)
|
|||
);
|
||||
}
|
||||
|
||||
/* Split the first `len` bytes of `self` on runs of ISSPACE bytes, scanning
   from the right and doing at most `maxsplit` splits.  Pieces are collected
   right-to-left and the list is reversed before it is returned.  Returns a
   new list, or NULL on failure.

   When the very first word found spans the whole input (it ends at len-1
   and the scan runs off the front) and `self` is exactly a bytes object,
   `self` itself is stored as the only item (with a new reference) instead
   of a copy. */
Py_LOCAL_INLINE(PyObject *)
rsplit_whitespace(PyBytesObject *self, Py_ssize_t len, Py_ssize_t maxsplit)
{
    const char *s = PyBytes_AS_STRING(self);
    /* `count`, `str`, `list` and the onError label are used by SPLIT_ADD */
    Py_ssize_t pos, word_last, count=0;
    PyObject *str;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxsplit));

    if (list == NULL)
        return NULL;

    pos = len-1;
    word_last = len-1;

    for (; maxsplit > 0; maxsplit--) {
        /* skip whitespace behind the next word */
        while (pos >= 0 && ISSPACE(s[pos]))
            pos--;
        if (pos < 0)
            break;
        /* s[pos] is the last byte of a word; find where it begins */
        word_last = pos;
        pos--;
        while (pos >= 0 && !ISSPACE(s[pos]))
            pos--;
        if (word_last == len-1 && pos < 0 && PyBytes_CheckExact(self)) {
            /* No whitespace in self, so just use it as list[0] */
            Py_INCREF(self);
            PyList_SET_ITEM(list, 0, (PyObject *)self);
            count++;
            break;
        }
        SPLIT_ADD(s, pos + 1, word_last + 1);
    }
    if (pos >= 0) {
        /* Only occurs when maxsplit was reached.  Skip any remaining
           whitespace and copy to beginning of string. */
        while (pos >= 0 && ISSPACE(s[pos]))
            pos--;
        if (pos >= 0) {
            SPLIT_ADD(s, 0, pos + 1);
        }
    }
    FIX_PREALLOC_SIZE(list);
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Split the first `len` bytes of `self` on the single byte `ch`, scanning
   from the right and doing at most `maxcount` splits.  Pieces are collected
   right-to-left and the list is reversed before it is returned.  Returns a
   new list, or NULL on failure.

   When the scan ran off the front without splitting anything and `self`
   is exactly a bytes object, `self` itself is stored as the only item
   (with a new reference) instead of a copy. */
Py_LOCAL_INLINE(PyObject *)
rsplit_char(PyBytesObject *self, Py_ssize_t len, char ch, Py_ssize_t maxcount)
{
    const char *s = PyBytes_AS_STRING(self);
    /* `count`, `str`, `list` and the onError label are used by SPLIT_ADD */
    register Py_ssize_t pos, piece_last, count=0;
    PyObject *str;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));

    if (list == NULL)
        return NULL;

    /* piece_last is the index of the last byte of the piece being built */
    pos = len - 1;
    piece_last = len - 1;
    while (pos >= 0 && maxcount-- > 0) {
        /* walk left to the previous separator, if any */
        while (pos >= 0 && s[pos] != ch)
            pos--;
        if (pos >= 0) {
            SPLIT_ADD(s, pos + 1, piece_last + 1);
            pos--;
            piece_last = pos;
        }
    }
    if (pos < 0 && count == 0 && PyBytes_CheckExact(self)) {
        /* ch not in self, so just use self as list[0] */
        Py_INCREF(self);
        PyList_SET_ITEM(list, 0, (PyObject *)self);
        count++;
    }
    else if (piece_last >= -1) {
        /* whatever precedes the last separator taken (possibly empty) */
        SPLIT_ADD(s, 0, piece_last + 1);
    }
    FIX_PREALLOC_SIZE(list);
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
PyDoc_STRVAR(rsplit__doc__,
|
||||
"B.rsplit([sep[, maxsplit]]) -> list of bytes\n\
|
||||
\n\
|
||||
|
@ -1360,71 +1101,28 @@ If maxsplit is given, at most maxsplit splits are done.");
|
|||
static PyObject *
|
||||
bytes_rsplit(PyBytesObject *self, PyObject *args)
|
||||
{
|
||||
Py_ssize_t len = PyBytes_GET_SIZE(self), n, i, j;
|
||||
Py_ssize_t maxsplit = -1, count=0;
|
||||
const char *s, *sub;
|
||||
Py_ssize_t len = PyBytes_GET_SIZE(self), n;
|
||||
Py_ssize_t maxsplit = -1;
|
||||
const char *s = PyBytes_AS_STRING(self), *sub;
|
||||
Py_buffer vsub;
|
||||
PyObject *list, *str, *subobj = Py_None;
|
||||
PyObject *list, *subobj = Py_None;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "|On:rsplit", &subobj, &maxsplit))
|
||||
return NULL;
|
||||
if (maxsplit < 0)
|
||||
maxsplit = PY_SSIZE_T_MAX;
|
||||
if (subobj == Py_None)
|
||||
return rsplit_whitespace(self, len, maxsplit);
|
||||
return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit);
|
||||
if (_getbuffer(subobj, &vsub) < 0)
|
||||
return NULL;
|
||||
sub = vsub.buf;
|
||||
n = vsub.len;
|
||||
|
||||
if (n == 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "empty separator");
|
||||
PyBuffer_Release(&vsub);
|
||||
return NULL;
|
||||
}
|
||||
else if (n == 1) {
|
||||
list = rsplit_char(self, len, sub[0], maxsplit);
|
||||
PyBuffer_Release(&vsub);
|
||||
return list;
|
||||
}
|
||||
|
||||
list = PyList_New(PREALLOC_SIZE(maxsplit));
|
||||
if (list == NULL) {
|
||||
PyBuffer_Release(&vsub);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
j = len;
|
||||
i = j - n;
|
||||
|
||||
s = PyBytes_AS_STRING(self);
|
||||
while ( (i >= 0) && (maxsplit-- > 0) ) {
|
||||
for (; i>=0; i--) {
|
||||
if (Py_STRING_MATCH(s, i, sub, n)) {
|
||||
SPLIT_ADD(s, i + n, j);
|
||||
j = i;
|
||||
i -= n;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
SPLIT_ADD(s, 0, j);
|
||||
FIX_PREALLOC_SIZE(list);
|
||||
if (PyList_Reverse(list) < 0)
|
||||
goto onError;
|
||||
list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit);
|
||||
PyBuffer_Release(&vsub);
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
PyBuffer_Release(&vsub);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#undef SPLIT_ADD
|
||||
#undef MAX_PREALLOC
|
||||
#undef PREALLOC_SIZE
|
||||
|
||||
|
||||
PyDoc_STRVAR(join__doc__,
|
||||
"B.join(iterable_of_bytes) -> bytes\n\
|
||||
|
@ -1531,20 +1229,20 @@ _PyBytes_Join(PyObject *sep, PyObject *x)
|
|||
return bytes_join(sep, x);
|
||||
}
|
||||
|
||||
Py_LOCAL_INLINE(void)
|
||||
bytes_adjust_indices(Py_ssize_t *start, Py_ssize_t *end, Py_ssize_t len)
|
||||
{
|
||||
if (*end > len)
|
||||
*end = len;
|
||||
else if (*end < 0)
|
||||
*end += len;
|
||||
if (*end < 0)
|
||||
*end = 0;
|
||||
if (*start < 0)
|
||||
*start += len;
|
||||
if (*start < 0)
|
||||
*start = 0;
|
||||
}
|
||||
/* Helper macro to fix up start/end slice values in place.
   - end greater than len is capped at len;
   - a negative end has len added and is floored at 0;
   - a negative start has len added and is floored at 0.
   `start` and `end` must be assignable lvalues; every argument is expanded
   (and therefore evaluated) more than once.  The expansion is a bare pair of
   if-statements, not a single statement. */
|
||||
#define ADJUST_INDICES(start, end, len) \
|
||||
if (end > len) \
|
||||
end = len; \
|
||||
else if (end < 0) { \
|
||||
end += len; \
|
||||
if (end < 0) \
|
||||
end = 0; \
|
||||
} \
|
||||
if (start < 0) { \
|
||||
start += len; \
|
||||
if (start < 0) \
|
||||
start = 0; \
|
||||
}
|
||||
|
||||
Py_LOCAL_INLINE(Py_ssize_t)
|
||||
bytes_find_internal(PyBytesObject *self, PyObject *args, int dir)
|
||||
|
@ -1591,7 +1289,7 @@ bytes_find_internal(PyBytesObject *self, PyObject *args, int dir)
|
|||
PyDoc_STRVAR(find__doc__,
|
||||
"B.find(sub[, start[, end]]) -> int\n\
|
||||
\n\
|
||||
Return the lowest index in S where substring sub is found,\n\
|
||||
Return the lowest index in B where substring sub is found,\n\
|
||||
such that sub is contained within s[start:end]. Optional\n\
|
||||
arguments start and end are interpreted as in slice notation.\n\
|
||||
\n\
|
||||
|
@ -1801,7 +1499,7 @@ PyDoc_STRVAR(count__doc__,
|
|||
"B.count(sub[, start[, end]]) -> int\n\
|
||||
\n\
|
||||
Return the number of non-overlapping occurrences of substring sub in\n\
|
||||
string S[start:end]. Optional arguments start and end are interpreted\n\
|
||||
string B[start:end]. Optional arguments start and end are interpreted\n\
|
||||
as in slice notation.");
|
||||
|
||||
static PyObject *
|
||||
|
@ -1823,10 +1521,10 @@ bytes_count(PyBytesObject *self, PyObject *args)
|
|||
else if (PyObject_AsCharBuffer(sub_obj, &sub, &sub_len))
|
||||
return NULL;
|
||||
|
||||
bytes_adjust_indices(&start, &end, PyBytes_GET_SIZE(self));
|
||||
ADJUST_INDICES(start, end, PyBytes_GET_SIZE(self));
|
||||
|
||||
return PyLong_FromSsize_t(
|
||||
stringlib_count(str + start, end - start, sub, sub_len)
|
||||
stringlib_count(str + start, end - start, sub, sub_len, PY_SSIZE_T_MAX)
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1943,9 +1641,6 @@ bytes_maketrans(PyObject *null, PyObject *args)
|
|||
return _Py_bytes_maketrans(args);
|
||||
}
|
||||
|
||||
#define FORWARD 1
|
||||
#define REVERSE -1
|
||||
|
||||
/* find and count characters and substrings */
|
||||
|
||||
#define findchar(target, target_len, c) \
|
||||
|
@ -1981,94 +1676,6 @@ countchar(const char *target, int target_len, char c, Py_ssize_t maxcount)
|
|||
return count;
|
||||
}
|
||||
|
||||
Py_LOCAL(Py_ssize_t)
|
||||
findstring(const char *target, Py_ssize_t target_len,
|
||||
const char *pattern, Py_ssize_t pattern_len,
|
||||
Py_ssize_t start,
|
||||
Py_ssize_t end,
|
||||
int direction)
|
||||
{
|
||||
if (start < 0) {
|
||||
start += target_len;
|
||||
if (start < 0)
|
||||
start = 0;
|
||||
}
|
||||
if (end > target_len) {
|
||||
end = target_len;
|
||||
} else if (end < 0) {
|
||||
end += target_len;
|
||||
if (end < 0)
|
||||
end = 0;
|
||||
}
|
||||
|
||||
/* zero-length substrings always match at the first attempt */
|
||||
if (pattern_len == 0)
|
||||
return (direction > 0) ? start : end;
|
||||
|
||||
end -= pattern_len;
|
||||
|
||||
if (direction < 0) {
|
||||
for (; end >= start; end--)
|
||||
if (Py_STRING_MATCH(target, end, pattern, pattern_len))
|
||||
return end;
|
||||
} else {
|
||||
for (; start <= end; start++)
|
||||
if (Py_STRING_MATCH(target, start,pattern,pattern_len))
|
||||
return start;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
Py_LOCAL_INLINE(Py_ssize_t)
|
||||
countstring(const char *target, Py_ssize_t target_len,
|
||||
const char *pattern, Py_ssize_t pattern_len,
|
||||
Py_ssize_t start,
|
||||
Py_ssize_t end,
|
||||
int direction, Py_ssize_t maxcount)
|
||||
{
|
||||
Py_ssize_t count=0;
|
||||
|
||||
if (start < 0) {
|
||||
start += target_len;
|
||||
if (start < 0)
|
||||
start = 0;
|
||||
}
|
||||
if (end > target_len) {
|
||||
end = target_len;
|
||||
} else if (end < 0) {
|
||||
end += target_len;
|
||||
if (end < 0)
|
||||
end = 0;
|
||||
}
|
||||
|
||||
/* zero-length substrings match everywhere */
|
||||
if (pattern_len == 0 || maxcount == 0) {
|
||||
if (target_len+1 < maxcount)
|
||||
return target_len+1;
|
||||
return maxcount;
|
||||
}
|
||||
|
||||
end -= pattern_len;
|
||||
if (direction < 0) {
|
||||
for (; (end >= start); end--)
|
||||
if (Py_STRING_MATCH(target, end,pattern,pattern_len)) {
|
||||
count++;
|
||||
if (--maxcount <= 0) break;
|
||||
end -= pattern_len-1;
|
||||
}
|
||||
} else {
|
||||
for (; (start <= end); start++)
|
||||
if (Py_STRING_MATCH(target, start,
|
||||
pattern, pattern_len)) {
|
||||
count++;
|
||||
if (--maxcount <= 0)
|
||||
break;
|
||||
start += pattern_len-1;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
|
||||
/* Algorithms for different cases of string replacement */
|
||||
|
||||
|
@ -2189,10 +1796,9 @@ replace_delete_substring(PyBytesObject *self,
|
|||
self_len = PyBytes_GET_SIZE(self);
|
||||
self_s = PyBytes_AS_STRING(self);
|
||||
|
||||
count = countstring(self_s, self_len,
|
||||
from_s, from_len,
|
||||
0, self_len, 1,
|
||||
maxcount);
|
||||
count = stringlib_count(self_s, self_len,
|
||||
from_s, from_len,
|
||||
maxcount);
|
||||
|
||||
if (count == 0) {
|
||||
/* no matches */
|
||||
|
@ -2211,9 +1817,9 @@ replace_delete_substring(PyBytesObject *self,
|
|||
start = self_s;
|
||||
end = self_s + self_len;
|
||||
while (count-- > 0) {
|
||||
offset = findstring(start, end-start,
|
||||
from_s, from_len,
|
||||
0, end-start, FORWARD);
|
||||
offset = stringlib_find(start, end-start,
|
||||
from_s, from_len,
|
||||
0);
|
||||
if (offset == -1)
|
||||
break;
|
||||
next = start + offset;
|
||||
|
@ -2289,9 +1895,9 @@ replace_substring_in_place(PyBytesObject *self,
|
|||
self_s = PyBytes_AS_STRING(self);
|
||||
self_len = PyBytes_GET_SIZE(self);
|
||||
|
||||
offset = findstring(self_s, self_len,
|
||||
from_s, from_len,
|
||||
0, self_len, FORWARD);
|
||||
offset = stringlib_find(self_s, self_len,
|
||||
from_s, from_len,
|
||||
0);
|
||||
if (offset == -1) {
|
||||
/* No matches; return the original string */
|
||||
return return_self(self);
|
||||
|
@ -2311,9 +1917,9 @@ replace_substring_in_place(PyBytesObject *self,
|
|||
end = result_s + self_len;
|
||||
|
||||
while ( --maxcount > 0) {
|
||||
offset = findstring(start, end-start,
|
||||
from_s, from_len,
|
||||
0, end-start, FORWARD);
|
||||
offset = stringlib_find(start, end-start,
|
||||
from_s, from_len,
|
||||
0);
|
||||
if (offset==-1)
|
||||
break;
|
||||
Py_MEMCPY(start+offset, to_s, from_len);
|
||||
|
@ -2407,9 +2013,10 @@ replace_substring(PyBytesObject *self,
|
|||
self_s = PyBytes_AS_STRING(self);
|
||||
self_len = PyBytes_GET_SIZE(self);
|
||||
|
||||
count = countstring(self_s, self_len,
|
||||
from_s, from_len,
|
||||
0, self_len, FORWARD, maxcount);
|
||||
count = stringlib_count(self_s, self_len,
|
||||
from_s, from_len,
|
||||
maxcount);
|
||||
|
||||
if (count == 0) {
|
||||
/* no matches, return unchanged */
|
||||
return return_self(self);
|
||||
|
@ -2438,9 +2045,9 @@ replace_substring(PyBytesObject *self,
|
|||
start = self_s;
|
||||
end = self_s + self_len;
|
||||
while (count-- > 0) {
|
||||
offset = findstring(start, end-start,
|
||||
from_s, from_len,
|
||||
0, end-start, FORWARD);
|
||||
offset = stringlib_find(start, end-start,
|
||||
from_s, from_len,
|
||||
0);
|
||||
if (offset == -1)
|
||||
break;
|
||||
next = start+offset;
|
||||
|
@ -2598,7 +2205,7 @@ _bytes_tailmatch(PyBytesObject *self, PyObject *substr, Py_ssize_t start,
|
|||
return -1;
|
||||
str = PyBytes_AS_STRING(self);
|
||||
|
||||
bytes_adjust_indices(&start, &end, len);
|
||||
ADJUST_INDICES(start, end, len);
|
||||
|
||||
if (direction < 0) {
|
||||
/* startswith */
|
||||
|
@ -2703,7 +2310,7 @@ bytes_endswith(PyBytesObject *self, PyObject *args)
|
|||
PyDoc_STRVAR(decode__doc__,
|
||||
"B.decode([encoding[, errors]]) -> str\n\
|
||||
\n\
|
||||
Decode S using the codec registered for encoding. encoding defaults\n\
|
||||
Decode B using the codec registered for encoding. encoding defaults\n\
|
||||
to the default encoding. errors may be given to set a different error\n\
|
||||
handling scheme. Default is 'strict' meaning that encoding errors raise\n\
|
||||
a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
|
||||
|
@ -2725,6 +2332,28 @@ bytes_decode(PyObject *self, PyObject *args, PyObject *kwargs)
|
|||
}
|
||||
|
||||
|
||||
PyDoc_STRVAR(splitlines__doc__,
|
||||
"B.splitlines([keepends]) -> list of lines\n\
|
||||
\n\
|
||||
Return a list of the lines in B, breaking at line boundaries.\n\
|
||||
Line breaks are not included in the resulting list unless keepends\n\
|
||||
is given and true.");
|
||||
|
||||
static PyObject*
|
||||
bytes_splitlines(PyObject *self, PyObject *args)
|
||||
{
|
||||
int keepends = 0;
|
||||
|
||||
if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
|
||||
return NULL;
|
||||
|
||||
return stringlib_splitlines(
|
||||
(PyObject*) self, PyBytes_AS_STRING(self),
|
||||
PyBytes_GET_SIZE(self), keepends
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
PyDoc_STRVAR(fromhex_doc,
|
||||
"bytes.fromhex(string) -> bytes\n\
|
||||
\n\
|
||||
|
@ -2857,7 +2486,7 @@ bytes_methods[] = {
|
|||
{"rsplit", (PyCFunction)bytes_rsplit, METH_VARARGS, rsplit__doc__},
|
||||
{"rstrip", (PyCFunction)bytes_rstrip, METH_VARARGS, rstrip__doc__},
|
||||
{"split", (PyCFunction)bytes_split, METH_VARARGS, split__doc__},
|
||||
{"splitlines", (PyCFunction)stringlib_splitlines, METH_VARARGS,
|
||||
{"splitlines", (PyCFunction)bytes_splitlines, METH_VARARGS,
|
||||
splitlines__doc__},
|
||||
{"startswith", (PyCFunction)bytes_startswith, METH_VARARGS,
|
||||
startswith__doc__},
|
||||
|
@ -3239,7 +2868,7 @@ _PyBytes_Resize(PyObject **pv, Py_ssize_t newsize)
|
|||
/* _PyBytes_FormatLong emulates the format codes d, u, o, x and X, and
|
||||
* the F_ALT flag, for Python's long (unbounded) ints. It's not used for
|
||||
* Python's regular ints.
|
||||
* Return value: a new PyString*, or NULL if error.
|
||||
* Return value: a new PyBytes*, or NULL if error.
|
||||
* . *pbuf is set to point into it,
|
||||
* *plen set to the # of chars following that.
|
||||
* Caller must decref it when done using pbuf.
|
||||
|
|
|
@ -9,28 +9,22 @@
|
|||
|
||||
Py_LOCAL_INLINE(Py_ssize_t)
|
||||
stringlib_count(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
||||
const STRINGLIB_CHAR* sub, Py_ssize_t sub_len)
|
||||
const STRINGLIB_CHAR* sub, Py_ssize_t sub_len,
|
||||
Py_ssize_t maxcount)
|
||||
{
|
||||
Py_ssize_t count;
|
||||
|
||||
if (str_len < 0)
|
||||
return 0; /* start > len(str) */
|
||||
if (sub_len == 0)
|
||||
return str_len + 1;
|
||||
return (str_len < maxcount) ? str_len + 1 : maxcount;
|
||||
|
||||
count = fastsearch(str, str_len, sub, sub_len, FAST_COUNT);
|
||||
count = fastsearch(str, str_len, sub, sub_len, maxcount, FAST_COUNT);
|
||||
|
||||
if (count < 0)
|
||||
count = 0; /* no match */
|
||||
return 0; /* no match */
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
Local variables:
|
||||
c-basic-offset: 4
|
||||
indent-tabs-mode: nil
|
||||
End:
|
||||
*/
|
||||
|
|
|
@ -107,4 +107,3 @@ stringlib_swapcase(PyObject *self)
|
|||
STRINGLIB_LEN(self));
|
||||
return newobj;
|
||||
}
|
||||
|
||||
|
|
|
@ -18,10 +18,13 @@
|
|||
#define FAST_SEARCH 1
|
||||
#define FAST_RSEARCH 2
|
||||
|
||||
/* Tiny bloom filter over the low bits of a character, kept in a long-sized
   mask.  BLOOM_ADD sets the bit selected by `ch`; BLOOM tests it (non-zero
   means "possibly present").  The shifted constant is 1UL rather than 1:
   the shift count ranges up to LONG_BIT - 1, and shifting a plain int that
   far is undefined behaviour wherever long is wider than int. */
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (LONG_BIT - 1)))))
#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (LONG_BIT - 1)))))
|
||||
|
||||
Py_LOCAL_INLINE(Py_ssize_t)
|
||||
fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
|
||||
const STRINGLIB_CHAR* p, Py_ssize_t m,
|
||||
int mode)
|
||||
Py_ssize_t maxcount, int mode)
|
||||
{
|
||||
long mask;
|
||||
Py_ssize_t skip, count = 0;
|
||||
|
@ -29,7 +32,7 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
|
|||
|
||||
w = n - m;
|
||||
|
||||
if (w < 0)
|
||||
if (w < 0 || (mode == FAST_COUNT && maxcount == 0))
|
||||
return -1;
|
||||
|
||||
/* look for special cases */
|
||||
|
@ -39,8 +42,11 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
|
|||
/* use special case for 1-character strings */
|
||||
if (mode == FAST_COUNT) {
|
||||
for (i = 0; i < n; i++)
|
||||
if (s[i] == p[0])
|
||||
if (s[i] == p[0]) {
|
||||
count++;
|
||||
if (count == maxcount)
|
||||
return maxcount;
|
||||
}
|
||||
return count;
|
||||
} else if (mode == FAST_SEARCH) {
|
||||
for (i = 0; i < n; i++)
|
||||
|
@ -56,19 +62,20 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
|
|||
|
||||
mlast = m - 1;
|
||||
skip = mlast - 1;
|
||||
mask = 0;
|
||||
|
||||
if (mode != FAST_RSEARCH) {
|
||||
|
||||
/* create compressed boyer-moore delta 1 table */
|
||||
|
||||
/* process pattern[:-1] */
|
||||
for (mask = i = 0; i < mlast; i++) {
|
||||
mask |= (1 << (p[i] & 0x1F));
|
||||
for (i = 0; i < mlast; i++) {
|
||||
BLOOM_ADD(mask, p[i]);
|
||||
if (p[i] == p[mlast])
|
||||
skip = mlast - i - 1;
|
||||
}
|
||||
/* process pattern[-1] outside the loop */
|
||||
mask |= (1 << (p[mlast] & 0x1F));
|
||||
BLOOM_ADD(mask, p[mlast]);
|
||||
|
||||
for (i = 0; i <= w; i++) {
|
||||
/* note: using mlast in the skip path slows things down on x86 */
|
||||
|
@ -82,17 +89,19 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
|
|||
if (mode != FAST_COUNT)
|
||||
return i;
|
||||
count++;
|
||||
if (count == maxcount)
|
||||
return maxcount;
|
||||
i = i + mlast;
|
||||
continue;
|
||||
}
|
||||
/* miss: check if next character is part of pattern */
|
||||
if (!(mask & (1 << (s[i+m] & 0x1F))))
|
||||
if (!BLOOM(mask, s[i+m]))
|
||||
i = i + m;
|
||||
else
|
||||
i = i + skip;
|
||||
} else {
|
||||
/* skip: check if next character is part of pattern */
|
||||
if (!(mask & (1 << (s[i+m] & 0x1F))))
|
||||
if (!BLOOM(mask, s[i+m]))
|
||||
i = i + m;
|
||||
}
|
||||
}
|
||||
|
@ -101,10 +110,10 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
|
|||
/* create compressed boyer-moore delta 1 table */
|
||||
|
||||
/* process pattern[0] outside the loop */
|
||||
mask = (1 << (p[0] & 0x1F));
|
||||
BLOOM_ADD(mask, p[0]);
|
||||
/* process pattern[:0:-1] */
|
||||
for (i = mlast; i > 0; i--) {
|
||||
mask |= (1 << (p[i] & 0x1F));
|
||||
BLOOM_ADD(mask, p[i]);
|
||||
if (p[i] == p[0])
|
||||
skip = i - 1;
|
||||
}
|
||||
|
@ -119,13 +128,13 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
|
|||
/* got a match! */
|
||||
return i;
|
||||
/* miss: check if previous character is part of pattern */
|
||||
if (!(mask & (1 << (s[i-1] & 0x1F))))
|
||||
if (!BLOOM(mask, s[i-1]))
|
||||
i = i - m;
|
||||
else
|
||||
i = i - skip;
|
||||
} else {
|
||||
/* skip: check if previous character is part of pattern */
|
||||
if (!(mask & (1 << (s[i-1] & 0x1F))))
|
||||
if (!BLOOM(mask, s[i-1]))
|
||||
i = i - m;
|
||||
}
|
||||
}
|
||||
|
@ -137,10 +146,3 @@ fastsearch(const STRINGLIB_CHAR* s, Py_ssize_t n,
|
|||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
Local variables:
|
||||
c-basic-offset: 4
|
||||
indent-tabs-mode: nil
|
||||
End:
|
||||
*/
|
||||
|
|
|
@ -19,7 +19,7 @@ stringlib_find(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
|||
if (sub_len == 0)
|
||||
return offset;
|
||||
|
||||
pos = fastsearch(str, str_len, sub, sub_len, FAST_SEARCH);
|
||||
pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_SEARCH);
|
||||
|
||||
if (pos >= 0)
|
||||
pos += offset;
|
||||
|
@ -39,7 +39,7 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
|||
if (sub_len == 0)
|
||||
return str_len + offset;
|
||||
|
||||
pos = fastsearch(str, str_len, sub, sub_len, FAST_RSEARCH);
|
||||
pos = fastsearch(str, str_len, sub, sub_len, -1, FAST_RSEARCH);
|
||||
|
||||
if (pos >= 0)
|
||||
pos += offset;
|
||||
|
@ -47,22 +47,27 @@ stringlib_rfind(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
|||
return pos;
|
||||
}
|
||||
|
||||
/* Helper macro to fix up start/end slice values in place.
   - end greater than len is capped at len;
   - a negative end has len added and is floored at 0;
   - a negative start has len added and is floored at 0.
   `start` and `end` must be assignable lvalues; every argument is expanded
   (and therefore evaluated) more than once.  The expansion is a bare pair of
   if-statements, not a single statement. */
|
||||
#define ADJUST_INDICES(start, end, len) \
|
||||
if (end > len) \
|
||||
end = len; \
|
||||
else if (end < 0) { \
|
||||
end += len; \
|
||||
if (end < 0) \
|
||||
end = 0; \
|
||||
} \
|
||||
if (start < 0) { \
|
||||
start += len; \
|
||||
if (start < 0) \
|
||||
start = 0; \
|
||||
}
|
||||
|
||||
Py_LOCAL_INLINE(Py_ssize_t)
|
||||
stringlib_find_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
||||
const STRINGLIB_CHAR* sub, Py_ssize_t sub_len,
|
||||
Py_ssize_t start, Py_ssize_t end)
|
||||
{
|
||||
if (start < 0)
|
||||
start += str_len;
|
||||
if (start < 0)
|
||||
start = 0;
|
||||
if (end > str_len)
|
||||
end = str_len;
|
||||
if (end < 0)
|
||||
end += str_len;
|
||||
if (end < 0)
|
||||
end = 0;
|
||||
|
||||
ADJUST_INDICES(start, end, str_len);
|
||||
return stringlib_find(str + start, end - start, sub, sub_len, start);
|
||||
}
|
||||
|
||||
|
@ -71,17 +76,7 @@ stringlib_rfind_slice(const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
|||
const STRINGLIB_CHAR* sub, Py_ssize_t sub_len,
|
||||
Py_ssize_t start, Py_ssize_t end)
|
||||
{
|
||||
if (start < 0)
|
||||
start += str_len;
|
||||
if (start < 0)
|
||||
start = 0;
|
||||
if (end > str_len)
|
||||
end = str_len;
|
||||
if (end < 0)
|
||||
end += str_len;
|
||||
if (end < 0)
|
||||
end = 0;
|
||||
|
||||
ADJUST_INDICES(start, end, str_len);
|
||||
return stringlib_rfind(str + start, end - start, sub, sub_len, start);
|
||||
}
|
||||
|
||||
|
@ -96,9 +91,9 @@ stringlib_contains_obj(PyObject* str, PyObject* sub)
|
|||
) != -1;
|
||||
}
|
||||
|
||||
#endif /* STRINGLIB_STR */
|
||||
#endif /* STRINGLIB_WANT_CONTAINS_OBJ */
|
||||
|
||||
#ifdef FROM_UNICODE
|
||||
#if STRINGLIB_IS_UNICODE
|
||||
|
||||
/*
|
||||
This function is a helper for the "find" family (find, rfind, index,
|
||||
|
@ -146,13 +141,6 @@ _ParseTupleFinds (PyObject *args, PyObject **substring,
|
|||
return 1;
|
||||
}
|
||||
|
||||
#endif /* FROM_UNICODE */
|
||||
#endif /* STRINGLIB_IS_UNICODE */
|
||||
|
||||
#endif /* STRINGLIB_FIND_H */
|
||||
|
||||
/*
|
||||
Local variables:
|
||||
c-basic-offset: 4
|
||||
indent-tabs-mode: nil
|
||||
End:
|
||||
*/
|
||||
|
|
|
@ -8,33 +8,39 @@
|
|||
#endif
|
||||
|
||||
Py_LOCAL_INLINE(PyObject*)
|
||||
stringlib_partition(
|
||||
PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
||||
PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len
|
||||
)
|
||||
stringlib_partition(PyObject* str_obj,
|
||||
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
||||
PyObject* sep_obj,
|
||||
const STRINGLIB_CHAR* sep, Py_ssize_t sep_len)
|
||||
{
|
||||
PyObject* out;
|
||||
Py_ssize_t pos;
|
||||
|
||||
if (sep_len == 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "empty separator");
|
||||
return NULL;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
out = PyTuple_New(3);
|
||||
if (!out)
|
||||
return NULL;
|
||||
return NULL;
|
||||
|
||||
pos = fastsearch(str, str_len, sep, sep_len, FAST_SEARCH);
|
||||
pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_SEARCH);
|
||||
|
||||
if (pos < 0) {
|
||||
Py_INCREF(str_obj);
|
||||
PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
|
||||
Py_INCREF(STRINGLIB_EMPTY);
|
||||
PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
|
||||
Py_INCREF(STRINGLIB_EMPTY);
|
||||
PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY);
|
||||
return out;
|
||||
#if STRINGLIB_MUTABLE
|
||||
PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, str_len));
|
||||
PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0));
|
||||
PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(NULL, 0));
|
||||
#else
|
||||
Py_INCREF(str_obj);
|
||||
PyTuple_SET_ITEM(out, 0, (PyObject*) str_obj);
|
||||
Py_INCREF(STRINGLIB_EMPTY);
|
||||
PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
|
||||
Py_INCREF(STRINGLIB_EMPTY);
|
||||
PyTuple_SET_ITEM(out, 2, (PyObject*) STRINGLIB_EMPTY);
|
||||
#endif
|
||||
return out;
|
||||
}
|
||||
|
||||
PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos));
|
||||
|
@ -44,41 +50,47 @@ stringlib_partition(
|
|||
PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos));
|
||||
|
||||
if (PyErr_Occurred()) {
|
||||
Py_DECREF(out);
|
||||
return NULL;
|
||||
Py_DECREF(out);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
Py_LOCAL_INLINE(PyObject*)
|
||||
stringlib_rpartition(
|
||||
PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
||||
PyObject* sep_obj, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len
|
||||
)
|
||||
stringlib_rpartition(PyObject* str_obj,
|
||||
const STRINGLIB_CHAR* str, Py_ssize_t str_len,
|
||||
PyObject* sep_obj,
|
||||
const STRINGLIB_CHAR* sep, Py_ssize_t sep_len)
|
||||
{
|
||||
PyObject* out;
|
||||
Py_ssize_t pos;
|
||||
|
||||
if (sep_len == 0) {
|
||||
PyErr_SetString(PyExc_ValueError, "empty separator");
|
||||
return NULL;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
out = PyTuple_New(3);
|
||||
if (!out)
|
||||
return NULL;
|
||||
return NULL;
|
||||
|
||||
pos = fastsearch(str, str_len, sep, sep_len, FAST_RSEARCH);
|
||||
pos = fastsearch(str, str_len, sep, sep_len, -1, FAST_RSEARCH);
|
||||
|
||||
if (pos < 0) {
|
||||
Py_INCREF(STRINGLIB_EMPTY);
|
||||
PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY);
|
||||
Py_INCREF(STRINGLIB_EMPTY);
|
||||
PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
|
||||
Py_INCREF(str_obj);
|
||||
PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj);
|
||||
return out;
|
||||
#if STRINGLIB_MUTABLE
|
||||
PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(NULL, 0));
|
||||
PyTuple_SET_ITEM(out, 1, STRINGLIB_NEW(NULL, 0));
|
||||
PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str, str_len));
|
||||
#else
|
||||
Py_INCREF(STRINGLIB_EMPTY);
|
||||
PyTuple_SET_ITEM(out, 0, (PyObject*) STRINGLIB_EMPTY);
|
||||
Py_INCREF(STRINGLIB_EMPTY);
|
||||
PyTuple_SET_ITEM(out, 1, (PyObject*) STRINGLIB_EMPTY);
|
||||
Py_INCREF(str_obj);
|
||||
PyTuple_SET_ITEM(out, 2, (PyObject*) str_obj);
|
||||
#endif
|
||||
return out;
|
||||
}
|
||||
|
||||
PyTuple_SET_ITEM(out, 0, STRINGLIB_NEW(str, pos));
|
||||
|
@ -88,18 +100,11 @@ stringlib_rpartition(
|
|||
PyTuple_SET_ITEM(out, 2, STRINGLIB_NEW(str + pos, str_len - pos));
|
||||
|
||||
if (PyErr_Occurred()) {
|
||||
Py_DECREF(out);
|
||||
return NULL;
|
||||
Py_DECREF(out);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
Local variables:
|
||||
c-basic-offset: 4
|
||||
indent-tabs-mode: nil
|
||||
End:
|
||||
*/
|
||||
|
|
|
@ -0,0 +1,788 @@
|
|||
/* stringlib: split implementation */
|
||||
|
||||
#ifndef STRINGLIB_SPLIT_H
|
||||
#define STRINGLIB_SPLIT_H
|
||||
|
||||
#ifndef STRINGLIB_FASTSEARCH_H
|
||||
#error must include "stringlib/fastsearch.h" before including this module
|
||||
#endif
|
||||
|
||||
/* Overallocate the initial list to reduce the number of reallocs for small
|
||||
split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three
|
||||
resizes, to sizes 4, 8, then 16. Most observed string splits are for human
|
||||
text (roughly 11 words per line) and field delimited data (usually 1-10
|
||||
fields). For large strings the split algorithms are bandwidth limited
|
||||
so increasing the preallocation likely will not improve things.*/
|
||||
|
||||
/* Upper bound on the number of list slots allocated up front. */
#define MAX_PREALLOC 12
|
||||
|
||||
/* Initial list size for a split limited to `maxsplit` splits:
   maxsplit+1 slots (5 splits gives 6 elements), capped at MAX_PREALLOC.
   The argument is expanded unparenthesised, so pass a plain identifier. */
|
||||
#define PREALLOC_SIZE(maxsplit) \
|
||||
(maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)
|
||||
|
||||
/* Build a new object from data[left:right] with STRINGLIB_NEW and append it
   to `list` with PyList_Append, dropping the local reference afterwards.
   Relies on the enclosing function providing the locals `sub` and `list`
   and an `onError` label, which is jumped to when creation or the append
   fails.  Expands to several statements that are NOT wrapped in braces. */
#define SPLIT_APPEND(data, left, right) \
|
||||
sub = STRINGLIB_NEW((data) + (left), \
|
||||
(right) - (left)); \
|
||||
if (sub == NULL) \
|
||||
goto onError; \
|
||||
if (PyList_Append(list, sub)) { \
|
||||
Py_DECREF(sub); \
|
||||
goto onError; \
|
||||
} \
|
||||
else \
|
||||
Py_DECREF(sub);
|
||||
|
||||
/* Build a new object from data[left:right] with STRINGLIB_NEW and add it as
   element number `count` of `list`, then increment `count`.
   While count < MAX_PREALLOC the object is stored straight into the
   preallocated slot with PyList_SET_ITEM (no DECREF on that path: the slot
   keeps the reference); beyond that it is appended with PyList_Append and
   the local reference is dropped.
   Relies on the enclosing function providing the locals `sub`, `list`,
   `count` and an `onError` label, which is jumped to on failure. */
#define SPLIT_ADD(data, left, right) { \
|
||||
sub = STRINGLIB_NEW((data) + (left), \
|
||||
(right) - (left)); \
|
||||
if (sub == NULL) \
|
||||
goto onError; \
|
||||
if (count < MAX_PREALLOC) { \
|
||||
PyList_SET_ITEM(list, count, sub); \
|
||||
} else { \
|
||||
if (PyList_Append(list, sub)) { \
|
||||
Py_DECREF(sub); \
|
||||
goto onError; \
|
||||
} \
|
||||
else \
|
||||
Py_DECREF(sub); \
|
||||
} \
|
||||
count++; }
|
||||
|
||||
|
||||
/* Always force the list to the expected size: overwrite the list's size
   field with the local `count` of elements actually stored. */
|
||||
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
|
||||
|
||||
/* Split str[0:str_len] on runs of whitespace (as decided by
   STRINGLIB_ISSPACE), performing at most `maxcount` splits.

   str_obj  -- the object that owns the buffer; for immutable types it is
               reused as the single result element when it contains no
               whitespace at all and is an exact instance.
   str      -- pointer to the first character.
   str_len  -- number of characters.
   maxcount -- maximum number of splits (callers pass a non-negative value).

   Returns a new list, or NULL on failure.

   The shortcut is guarded with `#if !STRINGLIB_MUTABLE` rather than
   `#ifndef`: the macro is used as a 0/1 value elsewhere in stringlib, and
   `#ifndef` would disable the shortcut for a type that defines it as 0. */
Py_LOCAL_INLINE(PyObject *)
stringlib_split_whitespace(PyObject* str_obj,
                           const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                           Py_ssize_t maxcount)
{
    Py_ssize_t i, j, count=0;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    PyObject *sub;

    if (list == NULL)
        return NULL;

    i = j = 0;
    while (maxcount-- > 0) {
        /* skip leading whitespace */
        while (i < str_len && STRINGLIB_ISSPACE(str[i]))
            i++;
        if (i == str_len) break;
        /* j marks the start of the word, i runs to its end */
        j = i; i++;
        while (i < str_len && !STRINGLIB_ISSPACE(str[i]))
            i++;
#if !STRINGLIB_MUTABLE
        if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {
            /* No whitespace in str_obj, so just use it as list[0] */
            Py_INCREF(str_obj);
            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
            count++;
            break;
        }
#endif
        SPLIT_ADD(str, j, i);
    }

    if (i < str_len) {
        /* Only occurs when maxcount was reached */
        /* Skip any remaining whitespace and copy to end of string */
        while (i < str_len && STRINGLIB_ISSPACE(str[i]))
            i++;
        if (i != str_len)
            SPLIT_ADD(str, i, str_len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Split str[0:str_len] on every occurrence of the single character `ch`,
   performing at most `maxcount` splits.

   str_obj  -- the object that owns the buffer; for immutable types it is
               reused as the single result element when no split happened
               and it is an exact instance.
   maxcount -- maximum number of splits (callers pass a non-negative value).

   Returns a new list, or NULL on failure.

   The shortcut is guarded with `#if !STRINGLIB_MUTABLE` rather than
   `#ifndef`: the macro is used as a 0/1 value elsewhere in stringlib, and
   `#ifndef` would disable the shortcut for a type that defines it as 0. */
Py_LOCAL_INLINE(PyObject *)
stringlib_split_char(PyObject* str_obj,
                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                     const STRINGLIB_CHAR ch,
                     Py_ssize_t maxcount)
{
    Py_ssize_t i, j, count=0;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    PyObject *sub;

    if (list == NULL)
        return NULL;

    /* i is the start of the current piece, j the scan position */
    i = j = 0;
    while ((j < str_len) && (maxcount-- > 0)) {
        for(; j < str_len; j++) {
            /* I found that using memchr makes no difference */
            if (str[j] == ch) {
                SPLIT_ADD(str, i, j);
                i = j = j + 1;
                break;
            }
        }
    }
#if !STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* ch not in str_obj, so just use str_obj as list[0] */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    if (i <= str_len) {
        /* trailing piece (possibly empty) after the last separator */
        SPLIT_ADD(str, i, str_len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Split str[0:str_len] on the separator sep[0:sep_len], performing at most
   `maxcount` splits.  Matches are located with fastsearch(FAST_SEARCH).

   An empty separator raises ValueError("empty separator") and returns NULL;
   a one-character separator is delegated to stringlib_split_char().

   str_obj  -- the object that owns the buffer; for immutable types it is
               reused as the single result element when no split happened
               and it is an exact instance.
   maxcount -- maximum number of splits (callers pass a non-negative value).

   Returns a new list, or NULL on failure.

   The shortcut is guarded with `#if !STRINGLIB_MUTABLE` rather than
   `#ifndef`: the macro is used as a 0/1 value elsewhere in stringlib, and
   `#ifndef` would disable the shortcut for a type that defines it as 0. */
Py_LOCAL_INLINE(PyObject *)
stringlib_split(PyObject* str_obj,
                const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
                Py_ssize_t maxcount)
{
    Py_ssize_t i, j, pos, count=0;
    PyObject *list, *sub;

    if (sep_len == 0) {
        PyErr_SetString(PyExc_ValueError, "empty separator");
        return NULL;
    }
    else if (sep_len == 1)
        return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);

    list = PyList_New(PREALLOC_SIZE(maxcount));
    if (list == NULL)
        return NULL;

    /* i is the start of the current piece; pos is relative to str+i */
    i = j = 0;
    while (maxcount-- > 0) {
        pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);
        if (pos < 0)
            break;
        j = i + pos;
        SPLIT_ADD(str, i, j);
        i = j + sep_len;
    }
#if !STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* No match in str_obj, so just use it as list[0] */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    {
        /* trailing piece (possibly empty) after the last separator */
        SPLIT_ADD(str, i, str_len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Split str[0:str_len] on runs of whitespace (as decided by
   STRINGLIB_ISSPACE), scanning from the right and performing at most
   `maxcount` splits.  Pieces are collected right-to-left and the list is
   reversed before it is returned.

   str_obj  -- the object that owns the buffer; for immutable types it is
               reused as the single result element when it contains no
               whitespace at all and is an exact instance.
   maxcount -- maximum number of splits (callers pass a non-negative value).

   Returns a new list, or NULL on failure.

   The shortcut is guarded with `#if !STRINGLIB_MUTABLE` rather than
   `#ifndef`: the macro is used as a 0/1 value elsewhere in stringlib, and
   `#ifndef` would disable the shortcut for a type that defines it as 0. */
Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit_whitespace(PyObject* str_obj,
                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                            Py_ssize_t maxcount)
{
    Py_ssize_t i, j, count=0;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    PyObject *sub;

    if (list == NULL)
        return NULL;

    i = j = str_len - 1;
    while (maxcount-- > 0) {
        /* skip trailing whitespace */
        while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
            i--;
        if (i < 0) break;
        /* j marks the last character of the word, i runs to its start */
        j = i; i--;
        while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))
            i--;
#if !STRINGLIB_MUTABLE
        if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
            /* No whitespace in str_obj, so just use it as list[0] */
            Py_INCREF(str_obj);
            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
            count++;
            break;
        }
#endif
        SPLIT_ADD(str, i + 1, j + 1);
    }

    if (i >= 0) {
        /* Only occurs when maxcount was reached */
        /* Skip any remaining whitespace and copy to beginning of string */
        while (i >= 0 && STRINGLIB_ISSPACE(str[i]))
            i--;
        if (i >= 0)
            SPLIT_ADD(str, 0, i + 1);
    }
    FIX_PREALLOC_SIZE(list);
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Split str[0:str_len] on every occurrence of the single character `ch`,
   scanning from the right and performing at most `maxcount` splits.
   Pieces are collected right-to-left and the list is reversed before it is
   returned.

   str_obj  -- the object that owns the buffer; for immutable types it is
               reused as the single result element when no split happened
               and it is an exact instance.
   maxcount -- maximum number of splits (callers pass a non-negative value).

   Returns a new list, or NULL on failure.

   The shortcut is guarded with `#if !STRINGLIB_MUTABLE` rather than
   `#ifndef`: the macro is used as a 0/1 value elsewhere in stringlib, and
   `#ifndef` would disable the shortcut for a type that defines it as 0. */
Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit_char(PyObject* str_obj,
                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                      const STRINGLIB_CHAR ch,
                      Py_ssize_t maxcount)
{
    Py_ssize_t i, j, count=0;
    PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));
    PyObject *sub;

    if (list == NULL)
        return NULL;

    /* j is the last character of the current piece, i the scan position */
    i = j = str_len - 1;
    while ((i >= 0) && (maxcount-- > 0)) {
        for(; i >= 0; i--) {
            if (str[i] == ch) {
                SPLIT_ADD(str, i + 1, j + 1);
                j = i = i - 1;
                break;
            }
        }
    }
#if !STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* ch not in str_obj, so just use str_obj as list[0] */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    if (j >= -1) {
        /* leading piece (possibly empty) before the leftmost separator */
        SPLIT_ADD(str, 0, j + 1);
    }
    FIX_PREALLOC_SIZE(list);
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Split str[0:str_len] at occurrences of sep[0:sep_len], searching from
   the right with fastsearch() and making at most maxcount splits.
   An empty separator sets ValueError and returns NULL; a one-character
   separator is handed to stringlib_rsplit_char().
   Returns a new list, or NULL on failure. */
Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit(PyObject* str_obj,
                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
                 Py_ssize_t maxcount)
{
    Py_ssize_t seg_end, hit, count = 0;
    PyObject *list, *sub;

    switch (sep_len) {
    case 0:
        PyErr_SetString(PyExc_ValueError, "empty separator");
        return NULL;
    case 1:
        return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);
    default:
        break;
    }

    list = PyList_New(PREALLOC_SIZE(maxcount));
    if (list == NULL)
        return NULL;

    /* Each search is limited to the part left of the previous match. */
    seg_end = str_len;
    for (; maxcount > 0; maxcount--) {
        hit = fastsearch(str, seg_end, sep, sep_len, -1, FAST_RSEARCH);
        if (hit < 0)
            break;
        SPLIT_ADD(str, hit + sep_len, seg_end);
        seg_end = hit;
    }
#ifndef STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* Separator never matched: share str_obj as the only item. */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    {
        SPLIT_ADD(str, 0, seg_end);
    }
    FIX_PREALLOC_SIZE(list);
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Break str[0:str_len] into lines at STRINGLIB_ISLINEBREAK characters,
   treating "\r\n" as a single line break.  The line break is kept in each
   item only when keepends is true.
   Returns a new list, or NULL if an allocation fails. */
Py_LOCAL_INLINE(PyObject *)
stringlib_splitlines(PyObject* str_obj,
                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                     int keepends)
{
    /* The list is grown with PyList_Append rather than preallocated:
       splitlines is usually run over hundreds of newlines, and switching
       between PyList_SET_ITEM and append costs about 2-3% in that common
       case.  Filling a preallocated part first and appending afterwards
       would be smarter, but is too much work for little gain. */
    Py_ssize_t pos = 0, line_start = 0;
    PyObject *sub;
    PyObject *list;

    list = PyList_New(0);
    if (list == NULL)
        return NULL;

    while (pos < str_len) {
        Py_ssize_t line_end;

        /* Advance to the next line break (or the end of the data). */
        for (; pos < str_len; pos++) {
            if (STRINGLIB_ISLINEBREAK(str[pos]))
                break;
        }

        line_end = pos;
        if (pos < str_len) {
            /* Consume the line break; CRLF counts as one. */
            if (str[pos] == '\r' && pos + 1 < str_len && str[pos+1] == '\n')
                pos += 2;
            else
                pos += 1;
            if (keepends)
                line_end = pos;
        }
#ifndef STRINGLIB_MUTABLE
        if (line_start == 0 && line_end == str_len &&
            STRINGLIB_CHECK_EXACT(str_obj)) {
            /* The first line is the entire string: append str_obj itself
               instead of a copy. */
            if (PyList_Append(list, str_obj))
                goto onError;
            break;
        }
#endif
        SPLIT_APPEND(str, line_start, line_end);
        line_start = pos;
    }
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
#endif
|
||||
/* stringlib: split implementation */
|
||||
|
||||
#ifndef STRINGLIB_SPLIT_H
|
||||
#define STRINGLIB_SPLIT_H
|
||||
|
||||
#ifndef STRINGLIB_FASTSEARCH_H
|
||||
#error must include "stringlib/fastsearch.h" before including this module
|
||||
#endif
|
||||
|
||||
/* Overallocate the initial list to reduce the number of reallocs for small
   split sizes.  Eg, "A A A A A A A A A A".split() (10 elements) has three
   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human
   text (roughly 11 words per line) and field delimited data (usually 1-10
   fields).  For large strings the split algorithms are bandwidth limited
   so increasing the preallocation likely will not improve things.*/

#define MAX_PREALLOC 12

/* Initial list size for a split limited to maxsplit splits, capped at
   MAX_PREALLOC.  5 splits gives 6 elements.  The argument is parenthesized
   so that an expression can be passed safely. */
#define PREALLOC_SIZE(maxsplit) \
    ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit)+1)

/* Append a new object built from data[left:right] to `list`.
   Relies on `sub`, `list` and an `onError` label in the invoking function;
   jumps to onError if the object cannot be created or appended.
   Wrapped in do/while(0) so it behaves as one statement; invoke it with a
   trailing semicolon. */
#define SPLIT_APPEND(data, left, right)         \
    do {                                        \
        sub = STRINGLIB_NEW((data) + (left),    \
                            (right) - (left));  \
        if (sub == NULL)                        \
            goto onError;                       \
        if (PyList_Append(list, sub)) {         \
            Py_DECREF(sub);                     \
            goto onError;                       \
        }                                       \
        Py_DECREF(sub);                         \
    } while (0)

/* Store a new object built from data[left:right] as item number `count`
   of `list`, then increment `count`.  The first MAX_PREALLOC items are
   written straight into the preallocated slots; later ones are appended.
   Relies on `sub`, `list`, `count` and an `onError` label in the invoking
   function.  Wrapped in do/while(0) so it behaves as one statement; invoke
   it with a trailing semicolon. */
#define SPLIT_ADD(data, left, right)            \
    do {                                        \
        sub = STRINGLIB_NEW((data) + (left),    \
                            (right) - (left));  \
        if (sub == NULL)                        \
            goto onError;                       \
        if (count < MAX_PREALLOC) {             \
            PyList_SET_ITEM(list, count, sub);  \
        } else {                                \
            if (PyList_Append(list, sub)) {     \
                Py_DECREF(sub);                 \
                goto onError;                   \
            }                                   \
            Py_DECREF(sub);                     \
        }                                       \
        count++;                                \
    } while (0)


/* Always force the list to the expected size. */
#define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count
|
||||
|
||||
/* Split str[0:str_len] on runs of STRINGLIB_ISSPACE characters, scanning
   left to right and making at most maxcount splits.  Whatever follows the
   last split, minus its leading whitespace, becomes the final item.
   Returns a new list, or NULL if an allocation fails. */
Py_LOCAL_INLINE(PyObject *)
stringlib_split_whitespace(PyObject* str_obj,
                           const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                           Py_ssize_t maxcount)
{
    Py_ssize_t pos = 0, word_start = 0, count = 0;
    PyObject *sub;
    PyObject *list;

    list = PyList_New(PREALLOC_SIZE(maxcount));
    if (list == NULL)
        return NULL;

    for (; maxcount > 0; maxcount--) {
        /* Step over the whitespace in front of the next word. */
        for (; pos < str_len; pos++) {
            if (!STRINGLIB_ISSPACE(str[pos]))
                break;
        }
        if (pos == str_len)
            break;

        /* str[pos] starts a word; advance to one past its end. */
        word_start = pos;
        for (pos++; pos < str_len; pos++) {
            if (STRINGLIB_ISSPACE(str[pos]))
                break;
        }
#ifndef STRINGLIB_MUTABLE
        if (word_start == 0 && pos == str_len &&
            STRINGLIB_CHECK_EXACT(str_obj)) {
            /* The single word is the entire string: share str_obj
               instead of copying it. */
            Py_INCREF(str_obj);
            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
            count++;
            break;
        }
#endif
        SPLIT_ADD(str, word_start, pos);
    }

    /* pos < str_len here only when maxcount ran out: drop leading
       whitespace and keep the rest of the string as one item. */
    for (; pos < str_len; pos++) {
        if (!STRINGLIB_ISSPACE(str[pos])) {
            SPLIT_ADD(str, pos, str_len);
            break;
        }
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Split str[0:str_len] at occurrences of the single character ch, scanning
   left to right and making at most maxcount splits.
   Returns a new list, or NULL if an allocation fails. */
Py_LOCAL_INLINE(PyObject *)
stringlib_split_char(PyObject* str_obj,
                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                     const STRINGLIB_CHAR ch,
                     Py_ssize_t maxcount)
{
    /* seg_start is the start of the segment still to be emitted. */
    Py_ssize_t scan, seg_start = 0, count = 0;
    PyObject *sub;
    PyObject *list;

    list = PyList_New(PREALLOC_SIZE(maxcount));
    if (list == NULL)
        return NULL;

    for (scan = 0; scan < str_len && maxcount > 0; scan++) {
        if (str[scan] != ch)
            continue;
        SPLIT_ADD(str, seg_start, scan);
        seg_start = scan + 1;
        maxcount--;
    }
#ifndef STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* Nothing was split off: share str_obj as the only item. */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    if (seg_start <= str_len) {
        SPLIT_ADD(str, seg_start, str_len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Split str[0:str_len] at occurrences of sep[0:sep_len], searching left to
   right with fastsearch() and making at most maxcount splits.
   An empty separator sets ValueError and returns NULL; a one-character
   separator is handed to stringlib_split_char().
   Returns a new list, or NULL on failure. */
Py_LOCAL_INLINE(PyObject *)
stringlib_split(PyObject* str_obj,
                const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
                Py_ssize_t maxcount)
{
    Py_ssize_t seg_start = 0, hit, count = 0;
    PyObject *list, *sub;

    switch (sep_len) {
    case 0:
        PyErr_SetString(PyExc_ValueError, "empty separator");
        return NULL;
    case 1:
        return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);
    default:
        break;
    }

    list = PyList_New(PREALLOC_SIZE(maxcount));
    if (list == NULL)
        return NULL;

    for (; maxcount > 0; maxcount--) {
        /* fastsearch() reports the match relative to str + seg_start. */
        hit = fastsearch(str + seg_start, str_len - seg_start,
                         sep, sep_len, -1, FAST_SEARCH);
        if (hit < 0)
            break;
        hit += seg_start;
        SPLIT_ADD(str, seg_start, hit);
        seg_start = hit + sep_len;
    }
#ifndef STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* Separator never matched: share str_obj as the only item. */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    {
        SPLIT_ADD(str, seg_start, str_len);
    }
    FIX_PREALLOC_SIZE(list);
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Split str[0:str_len] on runs of STRINGLIB_ISSPACE characters, scanning
   from the right and making at most maxcount splits.  Items are collected
   right-to-left and the list is reversed before it is returned.
   Returns a new list, or NULL if an allocation or the reversal fails. */
Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit_whitespace(PyObject* str_obj,
                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                            Py_ssize_t maxcount)
{
    Py_ssize_t pos, word_last, count = 0;
    PyObject *sub;
    PyObject *list;

    list = PyList_New(PREALLOC_SIZE(maxcount));
    if (list == NULL)
        return NULL;

    pos = word_last = str_len - 1;
    for (; maxcount > 0; maxcount--) {
        /* Step back over the whitespace behind the next word. */
        for (; pos >= 0; pos--) {
            if (!STRINGLIB_ISSPACE(str[pos]))
                break;
        }
        if (pos < 0)
            break;

        /* str[pos] ends a word; move to just before its first character. */
        word_last = pos;
        for (pos--; pos >= 0; pos--) {
            if (STRINGLIB_ISSPACE(str[pos]))
                break;
        }
#ifndef STRINGLIB_MUTABLE
        if (word_last == str_len - 1 && pos < 0 &&
            STRINGLIB_CHECK_EXACT(str_obj)) {
            /* The single word is the entire string: share str_obj
               instead of copying it. */
            Py_INCREF(str_obj);
            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
            count++;
            break;
        }
#endif
        SPLIT_ADD(str, pos + 1, word_last + 1);
    }

    /* pos >= 0 here only when maxcount ran out: drop trailing whitespace
       and keep everything up to the start of the string as one item. */
    for (; pos >= 0; pos--) {
        if (!STRINGLIB_ISSPACE(str[pos])) {
            SPLIT_ADD(str, 0, pos + 1);
            break;
        }
    }
    FIX_PREALLOC_SIZE(list);
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Split str[0:str_len] at occurrences of the single character ch, scanning
   from the right and making at most maxcount splits.  Items are collected
   right-to-left and the list is reversed before it is returned.
   Returns a new list, or NULL if an allocation or the reversal fails. */
Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit_char(PyObject* str_obj,
                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                      const STRINGLIB_CHAR ch,
                      Py_ssize_t maxcount)
{
    /* seg_end is the exclusive end of the segment still to be emitted. */
    Py_ssize_t scan, seg_end = str_len, count = 0;
    PyObject *sub;
    PyObject *list;

    list = PyList_New(PREALLOC_SIZE(maxcount));
    if (list == NULL)
        return NULL;

    for (scan = str_len - 1; scan >= 0 && maxcount > 0; scan--) {
        if (str[scan] != ch)
            continue;
        SPLIT_ADD(str, scan + 1, seg_end);
        seg_end = scan;
        maxcount--;
    }
#ifndef STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* Nothing was split off: share str_obj as the only item. */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    if (seg_end >= 0) {
        SPLIT_ADD(str, 0, seg_end);
    }
    FIX_PREALLOC_SIZE(list);
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Split str[0:str_len] at occurrences of sep[0:sep_len], searching from
   the right with fastsearch() and making at most maxcount splits.
   An empty separator sets ValueError and returns NULL; a one-character
   separator is handed to stringlib_rsplit_char().
   Returns a new list, or NULL on failure. */
Py_LOCAL_INLINE(PyObject *)
stringlib_rsplit(PyObject* str_obj,
                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,
                 Py_ssize_t maxcount)
{
    Py_ssize_t seg_end, hit, count = 0;
    PyObject *list, *sub;

    switch (sep_len) {
    case 0:
        PyErr_SetString(PyExc_ValueError, "empty separator");
        return NULL;
    case 1:
        return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);
    default:
        break;
    }

    list = PyList_New(PREALLOC_SIZE(maxcount));
    if (list == NULL)
        return NULL;

    /* Each search is limited to the part left of the previous match. */
    seg_end = str_len;
    for (; maxcount > 0; maxcount--) {
        hit = fastsearch(str, seg_end, sep, sep_len, -1, FAST_RSEARCH);
        if (hit < 0)
            break;
        SPLIT_ADD(str, hit + sep_len, seg_end);
        seg_end = hit;
    }
#ifndef STRINGLIB_MUTABLE
    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {
        /* Separator never matched: share str_obj as the only item. */
        Py_INCREF(str_obj);
        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);
        count++;
    } else
#endif
    {
        SPLIT_ADD(str, 0, seg_end);
    }
    FIX_PREALLOC_SIZE(list);
    if (PyList_Reverse(list) < 0)
        goto onError;
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
/* Break str[0:str_len] into lines at STRINGLIB_ISLINEBREAK characters,
   treating "\r\n" as a single line break.  The line break is kept in each
   item only when keepends is true.
   Returns a new list, or NULL if an allocation fails. */
Py_LOCAL_INLINE(PyObject *)
stringlib_splitlines(PyObject* str_obj,
                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,
                     int keepends)
{
    /* The list is grown with PyList_Append rather than preallocated:
       splitlines is usually run over hundreds of newlines, and switching
       between PyList_SET_ITEM and append costs about 2-3% in that common
       case.  Filling a preallocated part first and appending afterwards
       would be smarter, but is too much work for little gain. */
    Py_ssize_t pos = 0, line_start = 0;
    PyObject *sub;
    PyObject *list;

    list = PyList_New(0);
    if (list == NULL)
        return NULL;

    while (pos < str_len) {
        Py_ssize_t line_end;

        /* Advance to the next line break (or the end of the data). */
        for (; pos < str_len; pos++) {
            if (STRINGLIB_ISLINEBREAK(str[pos]))
                break;
        }

        line_end = pos;
        if (pos < str_len) {
            /* Consume the line break; CRLF counts as one. */
            if (str[pos] == '\r' && pos + 1 < str_len && str[pos+1] == '\n')
                pos += 2;
            else
                pos += 1;
            if (keepends)
                line_end = pos;
        }
#ifndef STRINGLIB_MUTABLE
        if (line_start == 0 && line_end == str_len &&
            STRINGLIB_CHECK_EXACT(str_obj)) {
            /* The first line is the entire string: append str_obj itself
               instead of a copy. */
            if (PyList_Append(list, str_obj))
                goto onError;
            break;
        }
#endif
        SPLIT_APPEND(str, line_start, line_end);
        line_start = pos;
    }
    return list;

  onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
#endif
|
|
@ -11,6 +11,8 @@
|
|||
#define STRINGLIB_TYPE_NAME "string"
|
||||
#define STRINGLIB_PARSE_CODE "S"
|
||||
#define STRINGLIB_EMPTY nullstring
|
||||
#define STRINGLIB_ISSPACE Py_ISSPACE
|
||||
#define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r'))
|
||||
#define STRINGLIB_ISDECIMAL(x) ((x >= '0') && (x <= '9'))
|
||||
#define STRINGLIB_TODECIMAL(x) (STRINGLIB_ISDECIMAL(x) ? (x - '0') : -1)
|
||||
#define STRINGLIB_TOUPPER Py_TOUPPER
|
||||
|
|
|
@ -1,13 +1,6 @@
|
|||
/* NOTE: this API is -ONLY- for use with single byte character strings. */
|
||||
/* Do not use it with Unicode. */
|
||||
|
||||
#include "bytes_methods.h"
|
||||
|
||||
#ifndef STRINGLIB_MUTABLE
|
||||
#warning "STRINGLIB_MUTABLE not defined before #include, assuming 0"
|
||||
#define STRINGLIB_MUTABLE 0
|
||||
#endif
|
||||
|
||||
/* the more complicated methods. parts of these should be pulled out into the
|
||||
shared code in bytes_methods.c to cut down on duplicate code bloat. */
|
||||
|
||||
|
@ -269,87 +262,3 @@ stringlib_zfill(PyObject *self, PyObject *args)
|
|||
|
||||
return (PyObject*) s;
|
||||
}
|
||||
|
||||
|
||||
#define _STRINGLIB_SPLIT_APPEND(data, left, right) \
|
||||
str = STRINGLIB_NEW((data) + (left), \
|
||||
(right) - (left)); \
|
||||
if (str == NULL) \
|
||||
goto onError; \
|
||||
if (PyList_Append(list, str)) { \
|
||||
Py_DECREF(str); \
|
||||
goto onError; \
|
||||
} \
|
||||
else \
|
||||
Py_DECREF(str);
|
||||
|
||||
PyDoc_STRVAR(splitlines__doc__,
|
||||
"B.splitlines([keepends]) -> list of lines\n\
|
||||
\n\
|
||||
Return a list of the lines in B, breaking at line boundaries.\n\
|
||||
Line breaks are not included in the resulting list unless keepends\n\
|
||||
is given and true.");
|
||||
|
||||
/* B.splitlines([keepends]) method implementation.
   Parses the optional integer keepends from args, then breaks the data of
   self (STRINGLIB_STR / STRINGLIB_LEN) at '\n', '\r' and "\r\n".  The line
   break is kept in each item only when keepends is true.
   Returns a new list, or NULL on argument or allocation failure. */
static PyObject*
stringlib_splitlines(PyObject *self, PyObject *args)
{
    register Py_ssize_t i;
    register Py_ssize_t j;
    Py_ssize_t len;
    int keepends = 0;
    PyObject *list;
    PyObject *str;      /* scratch object used by _STRINGLIB_SPLIT_APPEND */
    char *data;

    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
        return NULL;

    data = STRINGLIB_STR(self);
    len = STRINGLIB_LEN(self);

    /* This does not use the preallocated list because splitlines is
       usually run with hundreds of newlines.  The overhead of
       switching between PyList_SET_ITEM and append causes about a
       2-3% slowdown for that common case.  A smarter implementation
       could move the if check out, so the SET_ITEMs are done first
       and the appends only done when the prealloc buffer is full.
       That's too much work for little gain.*/

    list = PyList_New(0);
    if (!list)
        return NULL;    /* nothing to release yet */

    for (i = j = 0; i < len; ) {
        Py_ssize_t eol;

        /* Find a line and append it */
        while (i < len && data[i] != '\n' && data[i] != '\r')
            i++;

        /* Skip the line break reading CRLF as one line break */
        eol = i;
        if (i < len) {
            if (data[i] == '\r' && i + 1 < len &&
                data[i+1] == '\n')
                i += 2;
            else
                i++;
            if (keepends)
                eol = i;
        }
        _STRINGLIB_SPLIT_APPEND(data, j, eol);
        j = i;
    }
    /* Every iteration ends with j == i and the loop stops at i == len, so
       no unprocessed tail can remain here; the former "if (j < len)"
       append was unreachable and has been removed. */

    return list;

 onError:
    Py_DECREF(list);
    return NULL;
}
|
||||
|
||||
#undef _STRINGLIB_SPLIT_APPEND
|
||||
|
||||
|
|
|
@ -11,6 +11,8 @@
|
|||
#define STRINGLIB_TYPE_NAME "unicode"
|
||||
#define STRINGLIB_PARSE_CODE "U"
|
||||
#define STRINGLIB_EMPTY unicode_empty
|
||||
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
|
||||
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
|
||||
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
|
||||
#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL
|
||||
#define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER
|
||||
|
|
|
@ -210,7 +210,8 @@ PyUnicode_GetMax(void)
|
|||
|
||||
static BLOOM_MASK bloom_linebreak;
|
||||
|
||||
#define BLOOM(mask, ch) ((mask & (1 << ((ch) & 0x1F))))
|
||||
#define BLOOM_ADD(mask, ch) ((mask |= (1 << ((ch) & (LONG_BIT - 1)))))
|
||||
#define BLOOM(mask, ch) ((mask & (1 << ((ch) & (LONG_BIT - 1)))))
|
||||
|
||||
#define BLOOM_LINEBREAK(ch) \
|
||||
((ch) < 128U ? ascii_linebreak[(ch)] : \
|
||||
|
@ -225,7 +226,7 @@ Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
|
|||
|
||||
mask = 0;
|
||||
for (i = 0; i < len; i++)
|
||||
mask |= (1 << (ptr[i] & 0x1F));
|
||||
BLOOM_ADD(mask, ptr[i]);
|
||||
|
||||
return mask;
|
||||
}
|
||||
|
@ -5873,28 +5874,30 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
|
|||
|
||||
#include "stringlib/unicodedefs.h"
|
||||
#include "stringlib/fastsearch.h"
|
||||
|
||||
#include "stringlib/count.h"
|
||||
/* Include _ParseTupleFinds from find.h */
|
||||
#define FROM_UNICODE
|
||||
#include "stringlib/find.h"
|
||||
#include "stringlib/partition.h"
|
||||
#include "stringlib/split.h"
|
||||
|
||||
#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
|
||||
#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
|
||||
#include "stringlib/localeutil.h"
|
||||
|
||||
/* helper macro to fixup start/end slice values */
|
||||
#define FIX_START_END(obj) \
|
||||
if (start < 0) \
|
||||
start += (obj)->length; \
|
||||
if (start < 0) \
|
||||
start = 0; \
|
||||
if (end > (obj)->length) \
|
||||
end = (obj)->length; \
|
||||
if (end < 0) \
|
||||
end += (obj)->length; \
|
||||
if (end < 0) \
|
||||
end = 0;
|
||||
#define ADJUST_INDICES(start, end, len) \
|
||||
if (end > len) \
|
||||
end = len; \
|
||||
else if (end < 0) { \
|
||||
end += len; \
|
||||
if (end < 0) \
|
||||
end = 0; \
|
||||
} \
|
||||
if (start < 0) { \
|
||||
start += len; \
|
||||
if (start < 0) \
|
||||
start = 0; \
|
||||
}
|
||||
|
||||
Py_ssize_t PyUnicode_Count(PyObject *str,
|
||||
PyObject *substr,
|
||||
|
@ -5914,10 +5917,10 @@ Py_ssize_t PyUnicode_Count(PyObject *str,
|
|||
return -1;
|
||||
}
|
||||
|
||||
FIX_START_END(str_obj);
|
||||
|
||||
ADJUST_INDICES(start, end, str_obj->length);
|
||||
result = stringlib_count(
|
||||
str_obj->str + start, end - start, sub_obj->str, sub_obj->length
|
||||
str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
|
||||
PY_SSIZE_T_MAX
|
||||
);
|
||||
|
||||
Py_DECREF(sub_obj);
|
||||
|
@ -5972,8 +5975,7 @@ int tailmatch(PyUnicodeObject *self,
|
|||
if (substring->length == 0)
|
||||
return 1;
|
||||
|
||||
FIX_START_END(self);
|
||||
|
||||
ADJUST_INDICES(start, end, self->length);
|
||||
end -= substring->length;
|
||||
if (end < start)
|
||||
return 0;
|
||||
|
@ -6314,305 +6316,40 @@ PyUnicodeObject *pad(PyUnicodeObject *self,
|
|||
return u;
|
||||
}
|
||||
|
||||
#define SPLIT_APPEND(data, left, right) \
|
||||
str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
|
||||
if (!str) \
|
||||
goto onError; \
|
||||
if (PyList_Append(list, str)) { \
|
||||
Py_DECREF(str); \
|
||||
goto onError; \
|
||||
} \
|
||||
else \
|
||||
Py_DECREF(str);
|
||||
|
||||
static
|
||||
PyObject *split_whitespace(PyUnicodeObject *self,
|
||||
PyObject *list,
|
||||
Py_ssize_t maxcount)
|
||||
PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
|
||||
{
|
||||
register Py_ssize_t i;
|
||||
register Py_ssize_t j;
|
||||
Py_ssize_t len = self->length;
|
||||
PyObject *str;
|
||||
register const Py_UNICODE *buf = self->str;
|
||||
|
||||
for (i = j = 0; i < len; ) {
|
||||
/* find a token */
|
||||
while (i < len && Py_UNICODE_ISSPACE(buf[i]))
|
||||
i++;
|
||||
j = i;
|
||||
while (i < len && !Py_UNICODE_ISSPACE(buf[i]))
|
||||
i++;
|
||||
if (j < i) {
|
||||
if (maxcount-- <= 0)
|
||||
break;
|
||||
SPLIT_APPEND(buf, j, i);
|
||||
while (i < len && Py_UNICODE_ISSPACE(buf[i]))
|
||||
i++;
|
||||
j = i;
|
||||
}
|
||||
}
|
||||
if (j < len) {
|
||||
SPLIT_APPEND(buf, j, len);
|
||||
}
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
PyObject *PyUnicode_Splitlines(PyObject *string,
|
||||
int keepends)
|
||||
{
|
||||
register Py_ssize_t i;
|
||||
register Py_ssize_t j;
|
||||
Py_ssize_t len;
|
||||
PyObject *list;
|
||||
PyObject *str;
|
||||
Py_UNICODE *data;
|
||||
|
||||
string = PyUnicode_FromObject(string);
|
||||
if (string == NULL)
|
||||
return NULL;
|
||||
data = PyUnicode_AS_UNICODE(string);
|
||||
len = PyUnicode_GET_SIZE(string);
|
||||
|
||||
list = PyList_New(0);
|
||||
if (!list)
|
||||
goto onError;
|
||||
|
||||
for (i = j = 0; i < len; ) {
|
||||
Py_ssize_t eol;
|
||||
|
||||
/* Find a line and append it */
|
||||
while (i < len && !BLOOM_LINEBREAK(data[i]))
|
||||
i++;
|
||||
|
||||
/* Skip the line break reading CRLF as one line break */
|
||||
eol = i;
|
||||
if (i < len) {
|
||||
if (data[i] == '\r' && i + 1 < len &&
|
||||
data[i+1] == '\n')
|
||||
i += 2;
|
||||
else
|
||||
i++;
|
||||
if (keepends)
|
||||
eol = i;
|
||||
}
|
||||
SPLIT_APPEND(data, j, eol);
|
||||
j = i;
|
||||
}
|
||||
if (j < len) {
|
||||
SPLIT_APPEND(data, j, len);
|
||||
}
|
||||
list = stringlib_splitlines(
|
||||
(PyObject*) string, PyUnicode_AS_UNICODE(string),
|
||||
PyUnicode_GET_SIZE(string), keepends);
|
||||
|
||||
Py_DECREF(string);
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_XDECREF(list);
|
||||
Py_DECREF(string);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static
|
||||
PyObject *split_char(PyUnicodeObject *self,
|
||||
PyObject *list,
|
||||
Py_UNICODE ch,
|
||||
Py_ssize_t maxcount)
|
||||
{
|
||||
register Py_ssize_t i;
|
||||
register Py_ssize_t j;
|
||||
Py_ssize_t len = self->length;
|
||||
PyObject *str;
|
||||
register const Py_UNICODE *buf = self->str;
|
||||
|
||||
for (i = j = 0; i < len; ) {
|
||||
if (buf[i] == ch) {
|
||||
if (maxcount-- <= 0)
|
||||
break;
|
||||
SPLIT_APPEND(buf, j, i);
|
||||
i = j = i + 1;
|
||||
} else
|
||||
i++;
|
||||
}
|
||||
if (j <= len) {
|
||||
SPLIT_APPEND(buf, j, len);
|
||||
}
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static
|
||||
PyObject *split_substring(PyUnicodeObject *self,
|
||||
PyObject *list,
|
||||
PyUnicodeObject *substring,
|
||||
Py_ssize_t maxcount)
|
||||
{
|
||||
register Py_ssize_t i;
|
||||
register Py_ssize_t j;
|
||||
Py_ssize_t len = self->length;
|
||||
Py_ssize_t sublen = substring->length;
|
||||
PyObject *str;
|
||||
|
||||
for (i = j = 0; i <= len - sublen; ) {
|
||||
if (Py_UNICODE_MATCH(self, i, substring)) {
|
||||
if (maxcount-- <= 0)
|
||||
break;
|
||||
SPLIT_APPEND(self->str, j, i);
|
||||
i = j = i + sublen;
|
||||
} else
|
||||
i++;
|
||||
}
|
||||
if (j <= len) {
|
||||
SPLIT_APPEND(self->str, j, len);
|
||||
}
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static
|
||||
PyObject *rsplit_whitespace(PyUnicodeObject *self,
|
||||
PyObject *list,
|
||||
Py_ssize_t maxcount)
|
||||
{
|
||||
register Py_ssize_t i;
|
||||
register Py_ssize_t j;
|
||||
Py_ssize_t len = self->length;
|
||||
PyObject *str;
|
||||
register const Py_UNICODE *buf = self->str;
|
||||
|
||||
for (i = j = len - 1; i >= 0; ) {
|
||||
/* find a token */
|
||||
while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
|
||||
i--;
|
||||
j = i;
|
||||
while (i >= 0 && !Py_UNICODE_ISSPACE(buf[i]))
|
||||
i--;
|
||||
if (j > i) {
|
||||
if (maxcount-- <= 0)
|
||||
break;
|
||||
SPLIT_APPEND(buf, i + 1, j + 1);
|
||||
while (i >= 0 && Py_UNICODE_ISSPACE(buf[i]))
|
||||
i--;
|
||||
j = i;
|
||||
}
|
||||
}
|
||||
if (j >= 0) {
|
||||
SPLIT_APPEND(buf, 0, j + 1);
|
||||
}
|
||||
if (PyList_Reverse(list) < 0)
|
||||
goto onError;
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static
|
||||
PyObject *rsplit_char(PyUnicodeObject *self,
|
||||
PyObject *list,
|
||||
Py_UNICODE ch,
|
||||
Py_ssize_t maxcount)
|
||||
{
|
||||
register Py_ssize_t i;
|
||||
register Py_ssize_t j;
|
||||
Py_ssize_t len = self->length;
|
||||
PyObject *str;
|
||||
register const Py_UNICODE *buf = self->str;
|
||||
|
||||
for (i = j = len - 1; i >= 0; ) {
|
||||
if (buf[i] == ch) {
|
||||
if (maxcount-- <= 0)
|
||||
break;
|
||||
SPLIT_APPEND(buf, i + 1, j + 1);
|
||||
j = i = i - 1;
|
||||
} else
|
||||
i--;
|
||||
}
|
||||
if (j >= -1) {
|
||||
SPLIT_APPEND(buf, 0, j + 1);
|
||||
}
|
||||
if (PyList_Reverse(list) < 0)
|
||||
goto onError;
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static
|
||||
PyObject *rsplit_substring(PyUnicodeObject *self,
|
||||
PyObject *list,
|
||||
PyUnicodeObject *substring,
|
||||
Py_ssize_t maxcount)
|
||||
{
|
||||
register Py_ssize_t i;
|
||||
register Py_ssize_t j;
|
||||
Py_ssize_t len = self->length;
|
||||
Py_ssize_t sublen = substring->length;
|
||||
PyObject *str;
|
||||
|
||||
for (i = len - sublen, j = len; i >= 0; ) {
|
||||
if (Py_UNICODE_MATCH(self, i, substring)) {
|
||||
if (maxcount-- <= 0)
|
||||
break;
|
||||
SPLIT_APPEND(self->str, i + sublen, j);
|
||||
j = i;
|
||||
i -= sublen;
|
||||
} else
|
||||
i--;
|
||||
}
|
||||
if (j >= 0) {
|
||||
SPLIT_APPEND(self->str, 0, j);
|
||||
}
|
||||
if (PyList_Reverse(list) < 0)
|
||||
goto onError;
|
||||
return list;
|
||||
|
||||
onError:
|
||||
Py_DECREF(list);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#undef SPLIT_APPEND
|
||||
|
||||
static
|
||||
PyObject *split(PyUnicodeObject *self,
|
||||
PyUnicodeObject *substring,
|
||||
Py_ssize_t maxcount)
|
||||
{
|
||||
PyObject *list;
|
||||
|
||||
if (maxcount < 0)
|
||||
maxcount = PY_SSIZE_T_MAX;
|
||||
|
||||
list = PyList_New(0);
|
||||
if (!list)
|
||||
return NULL;
|
||||
|
||||
if (substring == NULL)
|
||||
return split_whitespace(self,list,maxcount);
|
||||
return stringlib_split_whitespace(
|
||||
(PyObject*) self, self->str, self->length, maxcount
|
||||
);
|
||||
|
||||
else if (substring->length == 1)
|
||||
return split_char(self,list,substring->str[0],maxcount);
|
||||
|
||||
else if (substring->length == 0) {
|
||||
Py_DECREF(list);
|
||||
PyErr_SetString(PyExc_ValueError, "empty separator");
|
||||
return NULL;
|
||||
}
|
||||
else
|
||||
return split_substring(self,list,substring,maxcount);
|
||||
return stringlib_split(
|
||||
(PyObject*) self, self->str, self->length,
|
||||
substring->str, substring->length,
|
||||
maxcount
|
||||
);
|
||||
}
|
||||
|
||||
static
|
||||
|
@ -6620,28 +6357,19 @@ PyObject *rsplit(PyUnicodeObject *self,
|
|||
PyUnicodeObject *substring,
|
||||
Py_ssize_t maxcount)
|
||||
{
|
||||
PyObject *list;
|
||||
|
||||
if (maxcount < 0)
|
||||
maxcount = PY_SSIZE_T_MAX;
|
||||
|
||||
list = PyList_New(0);
|
||||
if (!list)
|
||||
return NULL;
|
||||
|
||||
if (substring == NULL)
|
||||
return rsplit_whitespace(self,list,maxcount);
|
||||
return stringlib_rsplit_whitespace(
|
||||
(PyObject*) self, self->str, self->length, maxcount
|
||||
);
|
||||
|
||||
else if (substring->length == 1)
|
||||
return rsplit_char(self,list,substring->str[0],maxcount);
|
||||
|
||||
else if (substring->length == 0) {
|
||||
Py_DECREF(list);
|
||||
PyErr_SetString(PyExc_ValueError, "empty separator");
|
||||
return NULL;
|
||||
}
|
||||
else
|
||||
return rsplit_substring(self,list,substring,maxcount);
|
||||
return stringlib_rsplit(
|
||||
(PyObject*) self, self->str, self->length,
|
||||
substring->str, substring->length,
|
||||
maxcount
|
||||
);
|
||||
}
|
||||
|
||||
static
|
||||
|
@ -6654,9 +6382,13 @@ PyObject *replace(PyUnicodeObject *self,
|
|||
|
||||
if (maxcount < 0)
|
||||
maxcount = PY_SSIZE_T_MAX;
|
||||
else if (maxcount == 0 || self->length == 0)
|
||||
goto nothing;
|
||||
|
||||
if (str1->length == str2->length) {
|
||||
/* same length */
|
||||
if (str1->length == 0)
|
||||
goto nothing;
|
||||
Py_ssize_t i;
|
||||
if (str1->length == 1) {
|
||||
/* replace characters */
|
||||
|
@ -6676,8 +6408,8 @@ PyObject *replace(PyUnicodeObject *self,
|
|||
u->str[i] = u2;
|
||||
}
|
||||
} else {
|
||||
i = fastsearch(
|
||||
self->str, self->length, str1->str, str1->length, FAST_SEARCH
|
||||
i = stringlib_find(
|
||||
self->str, self->length, str1->str, str1->length, 0
|
||||
);
|
||||
if (i < 0)
|
||||
goto nothing;
|
||||
|
@ -6685,14 +6417,20 @@ PyObject *replace(PyUnicodeObject *self,
|
|||
if (!u)
|
||||
return NULL;
|
||||
Py_UNICODE_COPY(u->str, self->str, self->length);
|
||||
while (i <= self->length - str1->length)
|
||||
if (Py_UNICODE_MATCH(self, i, str1)) {
|
||||
if (--maxcount < 0)
|
||||
break;
|
||||
Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
|
||||
i += str1->length;
|
||||
} else
|
||||
i++;
|
||||
|
||||
/* change everything in-place, starting with this one */
|
||||
Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
|
||||
i += str1->length;
|
||||
|
||||
while ( --maxcount > 0) {
|
||||
i = stringlib_find(self->str+i, self->length-i,
|
||||
str1->str, str1->length,
|
||||
i);
|
||||
if (i == -1)
|
||||
break;
|
||||
Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
|
||||
i += str1->length;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
|
@ -6701,9 +6439,8 @@ PyObject *replace(PyUnicodeObject *self,
|
|||
Py_UNICODE *p;
|
||||
|
||||
/* replace strings */
|
||||
n = stringlib_count(self->str, self->length, str1->str, str1->length);
|
||||
if (n > maxcount)
|
||||
n = maxcount;
|
||||
n = stringlib_count(self->str, self->length, str1->str, str1->length,
|
||||
maxcount);
|
||||
if (n == 0)
|
||||
goto nothing;
|
||||
/* new_size = self->length + n * (str2->length - str1->length)); */
|
||||
|
@ -6733,15 +6470,12 @@ PyObject *replace(PyUnicodeObject *self,
|
|||
if (str1->length > 0) {
|
||||
while (n-- > 0) {
|
||||
/* look for next match */
|
||||
j = i;
|
||||
while (j <= e) {
|
||||
if (Py_UNICODE_MATCH(self, j, str1))
|
||||
break;
|
||||
j++;
|
||||
}
|
||||
if (j > i) {
|
||||
if (j > e)
|
||||
break;
|
||||
j = stringlib_find(self->str+i, self->length-i,
|
||||
str1->str, str1->length,
|
||||
i);
|
||||
if (j == -1)
|
||||
break;
|
||||
else if (j > i) {
|
||||
/* copy unchanged part [i:j] */
|
||||
Py_UNICODE_COPY(p, self->str+i, j-i);
|
||||
p += j - i;
|
||||
|
@ -7192,11 +6926,11 @@ unicode_count(PyUnicodeObject *self, PyObject *args)
|
|||
if (substring == NULL)
|
||||
return NULL;
|
||||
|
||||
FIX_START_END(self);
|
||||
|
||||
ADJUST_INDICES(start, end, self->length);
|
||||
result = PyLong_FromSsize_t(
|
||||
stringlib_count(self->str + start, end - start,
|
||||
substring->str, substring->length)
|
||||
substring->str, substring->length,
|
||||
PY_SSIZE_T_MAX)
|
||||
);
|
||||
|
||||
Py_DECREF(substring);
|
||||
|
@ -10066,11 +9800,3 @@ Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
|
|||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
Local variables:
|
||||
c-basic-offset: 4
|
||||
indent-tabs-mode: nil
|
||||
End:
|
||||
*/
|
||||
|
|
|
@ -1490,6 +1490,10 @@
|
|||
RelativePath="..\..\Objects\sliceobject.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath="..\..\Objects\stringlib\split.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath="..\..\Objects\structseq.c"
|
||||
>
|
||||
|
|
|
@ -1495,6 +1495,10 @@
|
|||
RelativePath="..\Objects\sliceobject.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath="..\Objects\stringlib\split.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath="..\Objects\structseq.c"
|
||||
>
|
||||
|
|
Loading…
Reference in New Issue