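Issue #18468: The re.split, re.findall, and re.sub functions and the group()
and groups() methods of match objects now always produce exact str or bytes
objects, even when the input is a str or bytes subclass or another bytes-like
object such as bytearray or memoryview.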
This commit is contained in:
parent
355dda8d17
commit
25324971fb
|
@ -17,8 +17,26 @@ from weakref import proxy
|
||||||
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
|
class S(str):
|
||||||
|
def __getitem__(self, index):
|
||||||
|
return S(super().__getitem__(index))
|
||||||
|
|
||||||
|
class B(bytes):
|
||||||
|
def __getitem__(self, index):
|
||||||
|
return B(super().__getitem__(index))
|
||||||
|
|
||||||
class ReTests(unittest.TestCase):
|
class ReTests(unittest.TestCase):
|
||||||
|
|
||||||
|
def assertTypedEqual(self, actual, expect, msg=None):
|
||||||
|
self.assertEqual(actual, expect, msg)
|
||||||
|
def recurse(actual, expect):
|
||||||
|
if isinstance(expect, (tuple, list)):
|
||||||
|
for x, y in zip(actual, expect):
|
||||||
|
recurse(x, y)
|
||||||
|
else:
|
||||||
|
self.assertIs(type(actual), type(expect), msg)
|
||||||
|
recurse(actual, expect)
|
||||||
|
|
||||||
def test_keep_buffer(self):
|
def test_keep_buffer(self):
|
||||||
# See bug 14212
|
# See bug 14212
|
||||||
b = bytearray(b'x')
|
b = bytearray(b'x')
|
||||||
|
@ -53,6 +71,13 @@ class ReTests(unittest.TestCase):
|
||||||
return str(int_value + 1)
|
return str(int_value + 1)
|
||||||
|
|
||||||
def test_basic_re_sub(self):
|
def test_basic_re_sub(self):
|
||||||
|
self.assertTypedEqual(re.sub('y', 'a', 'xyz'), 'xaz')
|
||||||
|
self.assertTypedEqual(re.sub('y', S('a'), S('xyz')), 'xaz')
|
||||||
|
self.assertTypedEqual(re.sub(b'y', b'a', b'xyz'), b'xaz')
|
||||||
|
self.assertTypedEqual(re.sub(b'y', B(b'a'), B(b'xyz')), b'xaz')
|
||||||
|
self.assertTypedEqual(re.sub(b'y', bytearray(b'a'), bytearray(b'xyz')), b'xaz')
|
||||||
|
self.assertTypedEqual(re.sub(b'y', memoryview(b'a'), memoryview(b'xyz')), b'xaz')
|
||||||
|
|
||||||
self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
|
self.assertEqual(re.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
|
||||||
self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
|
self.assertEqual(re.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
|
||||||
'9.3 -3 24x100y')
|
'9.3 -3 24x100y')
|
||||||
|
@ -210,10 +235,22 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
|
self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
|
||||||
|
|
||||||
def test_re_split(self):
|
def test_re_split(self):
|
||||||
self.assertEqual(re.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
|
for string in ":a:b::c", S(":a:b::c"):
|
||||||
self.assertEqual(re.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
|
self.assertTypedEqual(re.split(":", string),
|
||||||
self.assertEqual(re.split("(:*)", ":a:b::c"),
|
['', 'a', 'b', '', 'c'])
|
||||||
['', ':', 'a', ':', 'b', '::', 'c'])
|
self.assertTypedEqual(re.split(":*", string),
|
||||||
|
['', 'a', 'b', 'c'])
|
||||||
|
self.assertTypedEqual(re.split("(:*)", string),
|
||||||
|
['', ':', 'a', ':', 'b', '::', 'c'])
|
||||||
|
for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
|
||||||
|
memoryview(b":a:b::c")):
|
||||||
|
self.assertTypedEqual(re.split(b":", string),
|
||||||
|
[b'', b'a', b'b', b'', b'c'])
|
||||||
|
self.assertTypedEqual(re.split(b":*", string),
|
||||||
|
[b'', b'a', b'b', b'c'])
|
||||||
|
self.assertTypedEqual(re.split(b"(:*)", string),
|
||||||
|
[b'', b':', b'a', b':', b'b', b'::', b'c'])
|
||||||
|
|
||||||
self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
|
self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
|
||||||
self.assertEqual(re.split("(:)*", ":a:b::c"),
|
self.assertEqual(re.split("(:)*", ":a:b::c"),
|
||||||
['', ':', 'a', ':', 'b', ':', 'c'])
|
['', ':', 'a', ':', 'b', ':', 'c'])
|
||||||
|
@ -235,22 +272,39 @@ class ReTests(unittest.TestCase):
|
||||||
|
|
||||||
def test_re_findall(self):
|
def test_re_findall(self):
|
||||||
self.assertEqual(re.findall(":+", "abc"), [])
|
self.assertEqual(re.findall(":+", "abc"), [])
|
||||||
self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
|
for string in "a:b::c:::d", S("a:b::c:::d"):
|
||||||
self.assertEqual(re.findall("(:+)", "a:b::c:::d"), [":", "::", ":::"])
|
self.assertTypedEqual(re.findall(":+", string),
|
||||||
self.assertEqual(re.findall("(:)(:*)", "a:b::c:::d"), [(":", ""),
|
[":", "::", ":::"])
|
||||||
(":", ":"),
|
self.assertTypedEqual(re.findall("(:+)", string),
|
||||||
(":", "::")])
|
[":", "::", ":::"])
|
||||||
|
self.assertTypedEqual(re.findall("(:)(:*)", string),
|
||||||
|
[(":", ""), (":", ":"), (":", "::")])
|
||||||
|
for string in (b"a:b::c:::d", B(b"a:b::c:::d"), bytearray(b"a:b::c:::d"),
|
||||||
|
memoryview(b"a:b::c:::d")):
|
||||||
|
self.assertTypedEqual(re.findall(b":+", string),
|
||||||
|
[b":", b"::", b":::"])
|
||||||
|
self.assertTypedEqual(re.findall(b"(:+)", string),
|
||||||
|
[b":", b"::", b":::"])
|
||||||
|
self.assertTypedEqual(re.findall(b"(:)(:*)", string),
|
||||||
|
[(b":", b""), (b":", b":"), (b":", b"::")])
|
||||||
|
|
||||||
def test_bug_117612(self):
|
def test_bug_117612(self):
|
||||||
self.assertEqual(re.findall(r"(a|(b))", "aba"),
|
self.assertEqual(re.findall(r"(a|(b))", "aba"),
|
||||||
[("a", ""),("b", "b"),("a", "")])
|
[("a", ""),("b", "b"),("a", "")])
|
||||||
|
|
||||||
def test_re_match(self):
|
def test_re_match(self):
|
||||||
self.assertEqual(re.match('a', 'a').groups(), ())
|
for string in 'a', S('a'):
|
||||||
self.assertEqual(re.match('(a)', 'a').groups(), ('a',))
|
self.assertEqual(re.match('a', string).groups(), ())
|
||||||
self.assertEqual(re.match(r'(a)', 'a').group(0), 'a')
|
self.assertEqual(re.match('(a)', string).groups(), ('a',))
|
||||||
self.assertEqual(re.match(r'(a)', 'a').group(1), 'a')
|
self.assertEqual(re.match('(a)', string).group(0), 'a')
|
||||||
self.assertEqual(re.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
|
self.assertEqual(re.match('(a)', string).group(1), 'a')
|
||||||
|
self.assertEqual(re.match('(a)', string).group(1, 1), ('a', 'a'))
|
||||||
|
for string in b'a', B(b'a'), bytearray(b'a'), memoryview(b'a'):
|
||||||
|
self.assertEqual(re.match(b'a', string).groups(), ())
|
||||||
|
self.assertEqual(re.match(b'(a)', string).groups(), (b'a',))
|
||||||
|
self.assertEqual(re.match(b'(a)', string).group(0), b'a')
|
||||||
|
self.assertEqual(re.match(b'(a)', string).group(1), b'a')
|
||||||
|
self.assertEqual(re.match(b'(a)', string).group(1, 1), (b'a', b'a'))
|
||||||
|
|
||||||
pat = re.compile('((a)|(b))(c)?')
|
pat = re.compile('((a)|(b))(c)?')
|
||||||
self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
|
self.assertEqual(pat.match('a').groups(), ('a', 'a', None, None))
|
||||||
|
|
|
@ -42,6 +42,10 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #18468: The re.split, re.findall, and re.sub functions and the group()
|
||||||
|
and groups() methods of match objects now always return a string or a bytes
|
||||||
|
object.
|
||||||
|
|
||||||
- Issue #18725: The textwrap module now supports truncating multiline text.
|
- Issue #18725: The textwrap module now supports truncating multiline text.
|
||||||
|
|
||||||
- Issue #18776: atexit callbacks now display their full traceback when they
|
- Issue #18776: atexit callbacks now display their full traceback when they
|
||||||
|
|
110
Modules/_sre.c
110
Modules/_sre.c
|
@ -1811,6 +1811,24 @@ state_fini(SRE_STATE* state)
|
||||||
#define STATE_OFFSET(state, member)\
|
#define STATE_OFFSET(state, member)\
|
||||||
(((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
|
(((char*)(member) - (char*)(state)->beginning) / (state)->charsize)
|
||||||
|
|
||||||
|
LOCAL(PyObject*)
|
||||||
|
getslice(int logical_charsize, const void *ptr,
|
||||||
|
PyObject* string, Py_ssize_t start, Py_ssize_t end)
|
||||||
|
{
|
||||||
|
if (logical_charsize == 1) {
|
||||||
|
if (PyBytes_CheckExact(string) &&
|
||||||
|
start == 0 && end == PyBytes_GET_SIZE(string)) {
|
||||||
|
Py_INCREF(string);
|
||||||
|
return string;
|
||||||
|
}
|
||||||
|
return PyBytes_FromStringAndSize(
|
||||||
|
(const char *)ptr + start, end - start);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
return PyUnicode_Substring(string, start, end);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
LOCAL(PyObject*)
|
LOCAL(PyObject*)
|
||||||
state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
|
state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
|
||||||
{
|
{
|
||||||
|
@ -1831,7 +1849,7 @@ state_getslice(SRE_STATE* state, Py_ssize_t index, PyObject* string, int empty)
|
||||||
j = STATE_OFFSET(state, state->mark[index+1]);
|
j = STATE_OFFSET(state, state->mark[index+1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
return PySequence_GetSlice(string, i, j);
|
return getslice(state->logical_charsize, state->beginning, string, i, j);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
@ -1992,45 +2010,6 @@ deepcopy(PyObject** object, PyObject* memo)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static PyObject*
|
|
||||||
join_list(PyObject* list, PyObject* string)
|
|
||||||
{
|
|
||||||
/* join list elements */
|
|
||||||
|
|
||||||
PyObject* joiner;
|
|
||||||
PyObject* function;
|
|
||||||
PyObject* args;
|
|
||||||
PyObject* result;
|
|
||||||
|
|
||||||
joiner = PySequence_GetSlice(string, 0, 0);
|
|
||||||
if (!joiner)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
if (PyList_GET_SIZE(list) == 0) {
|
|
||||||
Py_DECREF(list);
|
|
||||||
return joiner;
|
|
||||||
}
|
|
||||||
|
|
||||||
function = PyObject_GetAttrString(joiner, "join");
|
|
||||||
if (!function) {
|
|
||||||
Py_DECREF(joiner);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
args = PyTuple_New(1);
|
|
||||||
if (!args) {
|
|
||||||
Py_DECREF(function);
|
|
||||||
Py_DECREF(joiner);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
PyTuple_SET_ITEM(args, 0, list);
|
|
||||||
result = PyObject_CallObject(function, args);
|
|
||||||
Py_DECREF(args); /* also removes list */
|
|
||||||
Py_DECREF(function);
|
|
||||||
Py_DECREF(joiner);
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
static PyObject*
|
static PyObject*
|
||||||
pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
|
pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
{
|
{
|
||||||
|
@ -2086,7 +2065,8 @@ pattern_findall(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
case 0:
|
case 0:
|
||||||
b = STATE_OFFSET(&state, state.start);
|
b = STATE_OFFSET(&state, state.start);
|
||||||
e = STATE_OFFSET(&state, state.ptr);
|
e = STATE_OFFSET(&state, state.ptr);
|
||||||
item = PySequence_GetSlice(string, b, e);
|
item = getslice(state.logical_charsize, state.beginning,
|
||||||
|
string, b, e);
|
||||||
if (!item)
|
if (!item)
|
||||||
goto error;
|
goto error;
|
||||||
break;
|
break;
|
||||||
|
@ -2216,7 +2196,7 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* get segment before this match */
|
/* get segment before this match */
|
||||||
item = PySequence_GetSlice(
|
item = getslice(state.logical_charsize, state.beginning,
|
||||||
string, STATE_OFFSET(&state, last),
|
string, STATE_OFFSET(&state, last),
|
||||||
STATE_OFFSET(&state, state.start)
|
STATE_OFFSET(&state, state.start)
|
||||||
);
|
);
|
||||||
|
@ -2245,7 +2225,7 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
|
||||||
}
|
}
|
||||||
|
|
||||||
/* get segment following last match (even if empty) */
|
/* get segment following last match (even if empty) */
|
||||||
item = PySequence_GetSlice(
|
item = getslice(state.logical_charsize, state.beginning,
|
||||||
string, STATE_OFFSET(&state, last), state.endpos
|
string, STATE_OFFSET(&state, last), state.endpos
|
||||||
);
|
);
|
||||||
if (!item)
|
if (!item)
|
||||||
|
@ -2271,6 +2251,7 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
|
||||||
{
|
{
|
||||||
SRE_STATE state;
|
SRE_STATE state;
|
||||||
PyObject* list;
|
PyObject* list;
|
||||||
|
PyObject* joiner;
|
||||||
PyObject* item;
|
PyObject* item;
|
||||||
PyObject* filter;
|
PyObject* filter;
|
||||||
PyObject* args;
|
PyObject* args;
|
||||||
|
@ -2360,7 +2341,8 @@ pattern_subx(PatternObject* self, PyObject* ptemplate, PyObject* string,
|
||||||
|
|
||||||
if (i < b) {
|
if (i < b) {
|
||||||
/* get segment before this match */
|
/* get segment before this match */
|
||||||
item = PySequence_GetSlice(string, i, b);
|
item = getslice(state.logical_charsize, state.beginning,
|
||||||
|
string, i, b);
|
||||||
if (!item)
|
if (!item)
|
||||||
goto error;
|
goto error;
|
||||||
status = PyList_Append(list, item);
|
status = PyList_Append(list, item);
|
||||||
|
@ -2415,7 +2397,8 @@ next:
|
||||||
|
|
||||||
/* get segment following last match */
|
/* get segment following last match */
|
||||||
if (i < state.endpos) {
|
if (i < state.endpos) {
|
||||||
item = PySequence_GetSlice(string, i, state.endpos);
|
item = getslice(state.logical_charsize, state.beginning,
|
||||||
|
string, i, state.endpos);
|
||||||
if (!item)
|
if (!item)
|
||||||
goto error;
|
goto error;
|
||||||
status = PyList_Append(list, item);
|
status = PyList_Append(list, item);
|
||||||
|
@ -2429,10 +2412,24 @@ next:
|
||||||
Py_DECREF(filter);
|
Py_DECREF(filter);
|
||||||
|
|
||||||
/* convert list to single string (also removes list) */
|
/* convert list to single string (also removes list) */
|
||||||
item = join_list(list, string);
|
joiner = getslice(state.logical_charsize, state.beginning, string, 0, 0);
|
||||||
|
if (!joiner) {
|
||||||
if (!item)
|
Py_DECREF(list);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
}
|
||||||
|
if (PyList_GET_SIZE(list) == 0) {
|
||||||
|
Py_DECREF(list);
|
||||||
|
item = joiner;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (state.logical_charsize == 1)
|
||||||
|
item = _PyBytes_Join(joiner, list);
|
||||||
|
else
|
||||||
|
item = PyUnicode_Join(joiner, list);
|
||||||
|
Py_DECREF(joiner);
|
||||||
|
if (!item)
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
if (subn)
|
if (subn)
|
||||||
return Py_BuildValue("Nn", item, n);
|
return Py_BuildValue("Nn", item, n);
|
||||||
|
@ -3189,6 +3186,12 @@ match_dealloc(MatchObject* self)
|
||||||
static PyObject*
|
static PyObject*
|
||||||
match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
|
match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
|
||||||
{
|
{
|
||||||
|
Py_ssize_t length;
|
||||||
|
int logical_charsize, charsize;
|
||||||
|
Py_buffer view;
|
||||||
|
PyObject *result;
|
||||||
|
void* ptr;
|
||||||
|
|
||||||
if (index < 0 || index >= self->groups) {
|
if (index < 0 || index >= self->groups) {
|
||||||
/* raise IndexError if we were given a bad group number */
|
/* raise IndexError if we were given a bad group number */
|
||||||
PyErr_SetString(
|
PyErr_SetString(
|
||||||
|
@ -3206,9 +3209,14 @@ match_getslice_by_index(MatchObject* self, Py_ssize_t index, PyObject* def)
|
||||||
return def;
|
return def;
|
||||||
}
|
}
|
||||||
|
|
||||||
return PySequence_GetSlice(
|
ptr = getstring(self->string, &length, &logical_charsize, &charsize, &view);
|
||||||
self->string, self->mark[index], self->mark[index+1]
|
if (ptr == NULL)
|
||||||
);
|
return NULL;
|
||||||
|
result = getslice(logical_charsize, ptr,
|
||||||
|
self->string, self->mark[index], self->mark[index+1]);
|
||||||
|
if (logical_charsize == 1 && view.buf != NULL)
|
||||||
|
PyBuffer_Release(&view);
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static Py_ssize_t
|
static Py_ssize_t
|
||||||
|
|
Loading…
Reference in New Issue