diff --git a/Lib/test/test_re.py b/Lib/test/test_re.py index 360ba7285c7..b945203633d 100644 --- a/Lib/test/test_re.py +++ b/Lib/test/test_re.py @@ -968,6 +968,11 @@ class ReTests(unittest.TestCase): self.assertEqual(r, s) self.assertEqual(n, size + 1) + def test_bug_16688(self): + # Issue 16688: Backreferences make case-insensitive regex fail on + # non-ASCII strings. + self.assertEqual(re.findall(r"(?i)(a)\1", "aa \u0100"), ['a']) + self.assertEqual(re.match(r"(?s).{1,3}", "\u0100\u0100").span(), (0, 2)) def run_re_tests(): from test.re_tests import tests, SUCCEED, FAIL, SYNTAX_ERROR diff --git a/Misc/ACKS b/Misc/ACKS index af9aa062503..c947a45590e 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -70,6 +70,7 @@ Anton Barkovsky Nick Barnes Quentin Barnes David Barnett +Matthew Barnett Richard Barran Cesar Eduardo Barros Des Barry diff --git a/Misc/NEWS b/Misc/NEWS index 2a22bb56e02..c2bfa00148e 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -124,6 +124,9 @@ Core and Builtins Library ------- +- Issue #16688: Fix backreferences making case-insensitive regexes fail on + non-ASCII strings. Patch by Matthew Barnett. + - Issue #16485: Fix file descriptor not being closed if file header patching fails on closing of aifc file. diff --git a/Modules/_sre.c b/Modules/_sre.c index de3539658d2..aa56529f90f 100644 --- a/Modules/_sre.c +++ b/Modules/_sre.c @@ -492,7 +492,7 @@ SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, Py_ssize_t maxcount) Py_ssize_t i; /* adjust end */ - if (maxcount < end - ptr && maxcount != 65535) + if (maxcount < (end - ptr) / state->charsize && maxcount != 65535) end = ptr + maxcount*state->charsize; switch (pattern[0]) { @@ -583,7 +583,7 @@ SRE_INFO(SRE_STATE* state, SRE_CODE* pattern) Py_ssize_t i; /* check minimal length */ - if (pattern[3] && (end - ptr) < pattern[3]) + if (pattern[3] && (end - ptr)/state->charsize < pattern[3]) return 0; /* check known prefix */ @@ -801,7 +801,7 @@ entrance: /* <1=skip> <2=flags> <3=min> ... 
*/ if (ctx->pattern[3] && (end - ctx->ptr)/state->charsize < ctx->pattern[3]) { TRACE(("reject (got %d chars, need %d)\n", - (end - ctx->ptr), ctx->pattern[3])); + (end - ctx->ptr)/state->charsize, ctx->pattern[3])); RETURN_FAILURE; } ctx->pattern += ctx->pattern[1] + 1; @@ -1329,9 +1329,10 @@ entrance: RETURN_FAILURE; while (p < e) { if (ctx->ptr >= end || - state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != state->lower(*p)) + state->lower(SRE_CHARGET(state, ctx->ptr, 0)) != + state->lower(SRE_CHARGET(state, p, 0))) RETURN_FAILURE; - p++; + p += state->charsize; ctx->ptr += state->charsize; } }