Issue #6561: '\d' regular expression should not match characters of
category [No]; only those of category [Nd]. (Backport of r74237 from py3k.)
This commit is contained in:
parent
8d31f5413c
commit
fe67bd9168
|
@ -332,7 +332,8 @@ the second character. For example, ``\$`` matches the character ``'$'``.
|
||||||
``\d``
|
``\d``
|
||||||
When the :const:`UNICODE` flag is not specified, matches any decimal digit; this
|
When the :const:`UNICODE` flag is not specified, matches any decimal digit; this
|
||||||
is equivalent to the set ``[0-9]``. With :const:`UNICODE`, it will match
|
is equivalent to the set ``[0-9]``. With :const:`UNICODE`, it will match
|
||||||
whatever is classified as a digit in the Unicode character properties database.
|
whatever is classified as a decimal digit in the Unicode character properties
|
||||||
|
database.
|
||||||
|
|
||||||
``\D``
|
``\D``
|
||||||
When the :const:`UNICODE` flag is not specified, matches any non-digit
|
When the :const:`UNICODE` flag is not specified, matches any non-digit
|
||||||
|
|
|
@ -636,6 +636,27 @@ class ReTests(unittest.TestCase):
|
||||||
self.assertEqual(iter.next().span(), (4, 4))
|
self.assertEqual(iter.next().span(), (4, 4))
|
||||||
self.assertRaises(StopIteration, iter.next)
|
self.assertRaises(StopIteration, iter.next)
|
||||||
|
|
||||||
|
def test_bug_6561(self):
|
||||||
|
# '\d' should match characters in Unicode category 'Nd'
|
||||||
|
# (Number, Decimal Digit), but not those in 'Nl' (Number,
|
||||||
|
# Letter) or 'No' (Number, Other).
|
||||||
|
decimal_digits = [
|
||||||
|
u'\u0037', # '\N{DIGIT SEVEN}', category 'Nd'
|
||||||
|
u'\u0e58', # '\N{THAI DIGIT SIX}', category 'Nd'
|
||||||
|
u'\uff10', # '\N{FULLWIDTH DIGIT ZERO}', category 'Nd'
|
||||||
|
]
|
||||||
|
for x in decimal_digits:
|
||||||
|
self.assertEqual(re.match('^\d$', x, re.UNICODE).group(0), x)
|
||||||
|
|
||||||
|
not_decimal_digits = [
|
||||||
|
u'\u2165', # '\N{ROMAN NUMERAL SIX}', category 'Nl'
|
||||||
|
u'\u3039', # '\N{HANGZHOU NUMERAL TWENTY}', category 'Nl'
|
||||||
|
u'\u2082', # '\N{SUBSCRIPT TWO}', category 'No'
|
||||||
|
u'\u32b4', # '\N{CIRCLED NUMBER THIRTY NINE}', category 'No'
|
||||||
|
]
|
||||||
|
for x in not_decimal_digits:
|
||||||
|
self.assertIsNone(re.match('^\d$', x, re.UNICODE))
|
||||||
|
|
||||||
def test_empty_array(self):
|
def test_empty_array(self):
|
||||||
# SF buf 1647541
|
# SF buf 1647541
|
||||||
import array
|
import array
|
||||||
|
|
|
@ -1205,6 +1205,10 @@ C-API
|
||||||
Extension Modules
|
Extension Modules
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #6561: '\d' in a regex now matches only characters with
|
||||||
|
Unicode category 'Nd' (Number, Decimal Digit). Previously it also
|
||||||
|
matched characters with category 'No'.
|
||||||
|
|
||||||
- Issue #1523: Remove deprecated overflow wrapping for struct.pack
|
- Issue #1523: Remove deprecated overflow wrapping for struct.pack
|
||||||
with an integer format code ('bBhHiIlLqQ'). Packing an out-of-range
|
with an integer format code ('bBhHiIlLqQ'). Packing an out-of-range
|
||||||
integer now consistently raises struct.error.
|
integer now consistently raises struct.error.
|
||||||
|
|
|
@ -172,7 +172,7 @@ static unsigned int sre_lower_locale(unsigned int ch)
|
||||||
|
|
||||||
#if defined(HAVE_UNICODE)
|
#if defined(HAVE_UNICODE)
|
||||||
|
|
||||||
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
|
#define SRE_UNI_IS_DIGIT(ch) Py_UNICODE_ISDECIMAL((Py_UNICODE)(ch))
|
||||||
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
|
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
|
||||||
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
|
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
|
||||||
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
|
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
|
||||||
|
|
Loading…
Reference in New Issue