bpo-47152: Convert the re module into a package (GH-32177)

The sre_* modules are now deprecated.
This commit is contained in:
Serhiy Storchaka 2022-04-02 11:35:13 +03:00 committed by GitHub
parent 4ed8a9a589
commit 1be3260a90
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 2235 additions and 2182 deletions

View File

@ -96,14 +96,14 @@ Sample output (may vary depending on the architecture)::
Loaded modules:
_types:
copyreg: _inverted_registry,_slotnames,__all__
sre_compile: isstring,_sre,_optimize_unicode
re._compiler: isstring,_sre,_optimize_unicode
_sre:
sre_constants: REPEAT_ONE,makedict,AT_END_LINE
re._constants: REPEAT_ONE,makedict,AT_END_LINE
sys:
re: __module__,finditer,_expand
itertools:
__main__: re,itertools,baconhameggs
sre_parse: _PATTERNENDERS,SRE_FLAG_UNICODE
re._parser: _PATTERNENDERS,SRE_FLAG_UNICODE
array:
types: __module__,IntType,TypeType
---------------------------------------------------

View File

@ -73,12 +73,12 @@ the following::
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 0.002 0.002 {built-in method builtins.exec}
1 0.000 0.000 0.001 0.001 <string>:1(<module>)
1 0.000 0.000 0.001 0.001 re.py:250(compile)
1 0.000 0.000 0.001 0.001 re.py:289(_compile)
1 0.000 0.000 0.000 0.000 sre_compile.py:759(compile)
1 0.000 0.000 0.000 0.000 sre_parse.py:937(parse)
1 0.000 0.000 0.000 0.000 sre_compile.py:598(_code)
1 0.000 0.000 0.000 0.000 sre_parse.py:435(_parse_sub)
1 0.000 0.000 0.001 0.001 __init__.py:250(compile)
1 0.000 0.000 0.001 0.001 __init__.py:289(_compile)
1 0.000 0.000 0.000 0.000 _compiler.py:759(compile)
1 0.000 0.000 0.000 0.000 _parser.py:937(parse)
1 0.000 0.000 0.000 0.000 _compiler.py:598(_code)
1 0.000 0.000 0.000 0.000 _parser.py:435(_parse_sub)
The first line indicates that 214 calls were monitored. Of those calls, 207
were :dfn:`primitive`, meaning that the call was not induced via recursion. The

View File

@ -532,6 +532,10 @@ Deprecated
be able to parse Python 3.10 or newer. See the :pep:`617` (New PEG parser for
CPython). (Contributed by Victor Stinner in :issue:`40360`.)
* Undocumented modules ``sre_compile``, ``sre_constants`` and ``sre_parse``
are now deprecated.
(Contributed by Serhiy Storchaka in :issue:`47152`.)
* :class:`webbrowser.MacOSX` is deprecated and will be removed in Python 3.13.
It is untested and undocumented and also not used by webbrowser itself.
(Contributed by Dong-hee Na in :issue:`42255`.)

View File

@ -122,8 +122,7 @@ This module also defines an exception 'error'.
"""
import enum
import sre_compile
import sre_parse
from . import _compiler, _parser
import functools
try:
import _locale
@ -146,21 +145,21 @@ __version__ = "2.2.1"
@enum._simple_enum(enum.IntFlag, boundary=enum.KEEP)
class RegexFlag:
NOFLAG = 0
ASCII = A = sre_compile.SRE_FLAG_ASCII # assume ascii "locale"
IGNORECASE = I = sre_compile.SRE_FLAG_IGNORECASE # ignore case
LOCALE = L = sre_compile.SRE_FLAG_LOCALE # assume current 8-bit locale
UNICODE = U = sre_compile.SRE_FLAG_UNICODE # assume unicode "locale"
MULTILINE = M = sre_compile.SRE_FLAG_MULTILINE # make anchors look for newline
DOTALL = S = sre_compile.SRE_FLAG_DOTALL # make dot match newline
VERBOSE = X = sre_compile.SRE_FLAG_VERBOSE # ignore whitespace and comments
ASCII = A = _compiler.SRE_FLAG_ASCII # assume ascii "locale"
IGNORECASE = I = _compiler.SRE_FLAG_IGNORECASE # ignore case
LOCALE = L = _compiler.SRE_FLAG_LOCALE # assume current 8-bit locale
UNICODE = U = _compiler.SRE_FLAG_UNICODE # assume unicode "locale"
MULTILINE = M = _compiler.SRE_FLAG_MULTILINE # make anchors look for newline
DOTALL = S = _compiler.SRE_FLAG_DOTALL # make dot match newline
VERBOSE = X = _compiler.SRE_FLAG_VERBOSE # ignore whitespace and comments
# sre extensions (experimental, don't rely on these)
TEMPLATE = T = sre_compile.SRE_FLAG_TEMPLATE # disable backtracking
DEBUG = sre_compile.SRE_FLAG_DEBUG # dump pattern after compilation
TEMPLATE = T = _compiler.SRE_FLAG_TEMPLATE # disable backtracking
DEBUG = _compiler.SRE_FLAG_DEBUG # dump pattern after compilation
__str__ = object.__str__
_numeric_repr_ = hex
# sre exception
error = sre_compile.error
error = _compiler.error
# --------------------------------------------------------------------
# public interface
@ -257,8 +256,8 @@ def escape(pattern):
pattern = str(pattern, 'latin1')
return pattern.translate(_special_chars_map).encode('latin1')
Pattern = type(sre_compile.compile('', 0))
Match = type(sre_compile.compile('', 0).match(''))
Pattern = type(_compiler.compile('', 0))
Match = type(_compiler.compile('', 0).match(''))
# --------------------------------------------------------------------
# internals
@ -279,9 +278,9 @@ def _compile(pattern, flags):
raise ValueError(
"cannot process flags argument with a compiled pattern")
return pattern
if not sre_compile.isstring(pattern):
if not _compiler.isstring(pattern):
raise TypeError("first argument must be string or compiled pattern")
p = sre_compile.compile(pattern, flags)
p = _compiler.compile(pattern, flags)
if not (flags & DEBUG):
if len(_cache) >= _MAXCACHE:
# Drop the oldest item
@ -295,12 +294,12 @@ def _compile(pattern, flags):
@functools.lru_cache(_MAXCACHE)
def _compile_repl(repl, pattern):
# internal: compile replacement pattern
return sre_parse.parse_template(repl, pattern)
return _parser.parse_template(repl, pattern)
def _expand(pattern, match, template):
# internal: Match.expand implementation hook
template = sre_parse.parse_template(template, pattern)
return sre_parse.expand_template(template, match)
template = _parser.parse_template(template, pattern)
return _parser.expand_template(template, match)
def _subx(pattern, template):
# internal: Pattern.sub/subn implementation helper
@ -309,7 +308,7 @@ def _subx(pattern, template):
# literal replacement
return template[1][0]
def filter(match, template=template):
return sre_parse.expand_template(template, match)
return _parser.expand_template(template, match)
return filter
# register myself for pickling
@ -326,22 +325,22 @@ copyreg.pickle(Pattern, _pickle, _compile)
class Scanner:
def __init__(self, lexicon, flags=0):
from sre_constants import BRANCH, SUBPATTERN
from ._constants import BRANCH, SUBPATTERN
if isinstance(flags, RegexFlag):
flags = flags.value
self.lexicon = lexicon
# combine phrases into a compound pattern
p = []
s = sre_parse.State()
s = _parser.State()
s.flags = flags
for phrase, action in lexicon:
gid = s.opengroup()
p.append(sre_parse.SubPattern(s, [
(SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
p.append(_parser.SubPattern(s, [
(SUBPATTERN, (gid, 0, 0, _parser.parse(phrase, flags))),
]))
s.closegroup(gid, p[-1])
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
self.scanner = sre_compile.compile(p)
p = _parser.SubPattern(s, [(BRANCH, (None, p))])
self.scanner = _compiler.compile(p)
def scan(self, string):
result = []
append = result.append

800
Lib/re/_compiler.py Normal file
View File

@ -0,0 +1,800 @@
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the __init__.py file for information on usage and redistribution.
#
"""Internal support module for sre"""
import _sre
from . import _parser
from ._constants import *
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
_REPEATING_CODES = {
MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE),
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
}
# Sets of lowercase characters which have the same uppercase.
_equivalences = (
# LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
(0x69, 0x131), # iı
# LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
(0x73, 0x17f), # sſ
# MICRO SIGN, GREEK SMALL LETTER MU
(0xb5, 0x3bc), # µμ
# COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
(0x345, 0x3b9, 0x1fbe), # \u0345ι
# GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
(0x390, 0x1fd3), # ΐΐ
# GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
(0x3b0, 0x1fe3), # ΰΰ
# GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
(0x3b2, 0x3d0), # βϐ
# GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
(0x3b5, 0x3f5), # εϵ
# GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
(0x3b8, 0x3d1), # θϑ
# GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
(0x3ba, 0x3f0), # κϰ
# GREEK SMALL LETTER PI, GREEK PI SYMBOL
(0x3c0, 0x3d6), # πϖ
# GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
(0x3c1, 0x3f1), # ρϱ
# GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
(0x3c2, 0x3c3), # ςσ
# GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
(0x3c6, 0x3d5), # φϕ
# LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
(0x1e61, 0x1e9b), # ṡẛ
# LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
(0xfb05, 0xfb06), # ſtst
)
# Maps the lowercase code to lowercase codes which have the same uppercase.
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
for t in _equivalences for i in t}
def _combine_flags(flags, add_flags, del_flags,
TYPE_FLAGS=_parser.TYPE_FLAGS):
if add_flags & TYPE_FLAGS:
flags &= ~TYPE_FLAGS
return (flags | add_flags) & ~del_flags
def _compile(code, pattern, flags):
# internal: compile a (sub)pattern
emit = code.append
_len = len
LITERAL_CODES = _LITERAL_CODES
REPEATING_CODES = _REPEATING_CODES
SUCCESS_CODES = _SUCCESS_CODES
ASSERT_CODES = _ASSERT_CODES
iscased = None
tolower = None
fixes = None
if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
if flags & SRE_FLAG_UNICODE:
iscased = _sre.unicode_iscased
tolower = _sre.unicode_tolower
fixes = _ignorecase_fixes
else:
iscased = _sre.ascii_iscased
tolower = _sre.ascii_tolower
for op, av in pattern:
if op in LITERAL_CODES:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
emit(av)
elif flags & SRE_FLAG_LOCALE:
emit(OP_LOCALE_IGNORE[op])
emit(av)
elif not iscased(av):
emit(op)
emit(av)
else:
lo = tolower(av)
if not fixes: # ascii
emit(OP_IGNORE[op])
emit(lo)
elif lo not in fixes:
emit(OP_UNICODE_IGNORE[op])
emit(lo)
else:
emit(IN_UNI_IGNORE)
skip = _len(code); emit(0)
if op is NOT_LITERAL:
emit(NEGATE)
for k in (lo,) + fixes[lo]:
emit(LITERAL)
emit(k)
emit(FAILURE)
code[skip] = _len(code) - skip
elif op is IN:
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE)
elif not hascased:
emit(IN)
elif not fixes: # ascii
emit(IN_IGNORE)
else:
emit(IN_UNI_IGNORE)
skip = _len(code); emit(0)
_compile_charset(charset, flags, code)
code[skip] = _len(code) - skip
elif op is ANY:
if flags & SRE_FLAG_DOTALL:
emit(ANY_ALL)
else:
emit(ANY)
elif op in REPEATING_CODES:
if flags & SRE_FLAG_TEMPLATE:
raise error("internal: unsupported template operator %r" % (op,))
if _simple(av[2]):
emit(REPEATING_CODES[op][2])
skip = _len(code); emit(0)
emit(av[0])
emit(av[1])
_compile(code, av[2], flags)
emit(SUCCESS)
code[skip] = _len(code) - skip
else:
emit(REPEATING_CODES[op][0])
skip = _len(code); emit(0)
emit(av[0])
emit(av[1])
_compile(code, av[2], flags)
code[skip] = _len(code) - skip
emit(REPEATING_CODES[op][1])
elif op is SUBPATTERN:
group, add_flags, del_flags, p = av
if group:
emit(MARK)
emit((group-1)*2)
# _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
_compile(code, p, _combine_flags(flags, add_flags, del_flags))
if group:
emit(MARK)
emit((group-1)*2+1)
elif op is ATOMIC_GROUP:
# Atomic Groups are handled by starting with an Atomic
# Group op code, then putting in the atomic group pattern
# and finally a success op code to tell any repeat
# operations within the Atomic Group to stop eating and
# pop their stack if they reach it
emit(ATOMIC_GROUP)
skip = _len(code); emit(0)
_compile(code, av, flags)
emit(SUCCESS)
code[skip] = _len(code) - skip
elif op in SUCCESS_CODES:
emit(op)
elif op in ASSERT_CODES:
emit(op)
skip = _len(code); emit(0)
if av[0] >= 0:
emit(0) # look ahead
else:
lo, hi = av[1].getwidth()
if lo != hi:
raise error("look-behind requires fixed-width pattern")
emit(lo) # look behind
_compile(code, av[1], flags)
emit(SUCCESS)
code[skip] = _len(code) - skip
elif op is CALL:
emit(op)
skip = _len(code); emit(0)
_compile(code, av, flags)
emit(SUCCESS)
code[skip] = _len(code) - skip
elif op is AT:
emit(op)
if flags & SRE_FLAG_MULTILINE:
av = AT_MULTILINE.get(av, av)
if flags & SRE_FLAG_LOCALE:
av = AT_LOCALE.get(av, av)
elif flags & SRE_FLAG_UNICODE:
av = AT_UNICODE.get(av, av)
emit(av)
elif op is BRANCH:
emit(op)
tail = []
tailappend = tail.append
for av in av[1]:
skip = _len(code); emit(0)
# _compile_info(code, av, flags)
_compile(code, av, flags)
emit(JUMP)
tailappend(_len(code)); emit(0)
code[skip] = _len(code) - skip
emit(FAILURE) # end of branch
for tail in tail:
code[tail] = _len(code) - tail
elif op is CATEGORY:
emit(op)
if flags & SRE_FLAG_LOCALE:
av = CH_LOCALE[av]
elif flags & SRE_FLAG_UNICODE:
av = CH_UNICODE[av]
emit(av)
elif op is GROUPREF:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
elif flags & SRE_FLAG_LOCALE:
emit(GROUPREF_LOC_IGNORE)
elif not fixes: # ascii
emit(GROUPREF_IGNORE)
else:
emit(GROUPREF_UNI_IGNORE)
emit(av-1)
elif op is GROUPREF_EXISTS:
emit(op)
emit(av[0]-1)
skipyes = _len(code); emit(0)
_compile(code, av[1], flags)
if av[2]:
emit(JUMP)
skipno = _len(code); emit(0)
code[skipyes] = _len(code) - skipyes + 1
_compile(code, av[2], flags)
code[skipno] = _len(code) - skipno
else:
code[skipyes] = _len(code) - skipyes + 1
else:
raise error("internal: unsupported operand type %r" % (op,))
def _compile_charset(charset, flags, code):
# compile charset subprogram
emit = code.append
for op, av in charset:
emit(op)
if op is NEGATE:
pass
elif op is LITERAL:
emit(av)
elif op is RANGE or op is RANGE_UNI_IGNORE:
emit(av[0])
emit(av[1])
elif op is CHARSET:
code.extend(av)
elif op is BIGCHARSET:
code.extend(av)
elif op is CATEGORY:
if flags & SRE_FLAG_LOCALE:
emit(CH_LOCALE[av])
elif flags & SRE_FLAG_UNICODE:
emit(CH_UNICODE[av])
else:
emit(av)
else:
raise error("internal: unsupported set operator %r" % (op,))
emit(FAILURE)
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# internal: optimize character set
out = []
tail = []
charmap = bytearray(256)
hascased = False
for op, av in charset:
while True:
try:
if op is LITERAL:
if fixup:
lo = fixup(av)
charmap[lo] = 1
if fixes and lo in fixes:
for k in fixes[lo]:
charmap[k] = 1
if not hascased and iscased(av):
hascased = True
else:
charmap[av] = 1
elif op is RANGE:
r = range(av[0], av[1]+1)
if fixup:
if fixes:
for i in map(fixup, r):
charmap[i] = 1
if i in fixes:
for k in fixes[i]:
charmap[k] = 1
else:
for i in map(fixup, r):
charmap[i] = 1
if not hascased:
hascased = any(map(iscased, r))
else:
for i in r:
charmap[i] = 1
elif op is NEGATE:
out.append((op, av))
else:
tail.append((op, av))
except IndexError:
if len(charmap) == 256:
# character set contains non-UCS1 character codes
charmap += b'\0' * 0xff00
continue
# Character set contains non-BMP character codes.
if fixup:
hascased = True
# There are only two ranges of cased non-BMP characters:
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
# and for both ranges RANGE_UNI_IGNORE works.
if op is RANGE:
op = RANGE_UNI_IGNORE
tail.append((op, av))
break
# compress character map
runs = []
q = 0
while True:
p = charmap.find(1, q)
if p < 0:
break
if len(runs) >= 2:
runs = None
break
q = charmap.find(0, p)
if q < 0:
runs.append((p, len(charmap)))
break
runs.append((p, q))
if runs is not None:
# use literal/range
for p, q in runs:
if q - p == 1:
out.append((LITERAL, p))
else:
out.append((RANGE, (p, q - 1)))
out += tail
# if the case was changed or new representation is more compact
if hascased or len(out) < len(charset):
return out, hascased
# else original character set is good enough
return charset, hascased
# use bitmap
if len(charmap) == 256:
data = _mk_bitmap(charmap)
out.append((CHARSET, data))
out += tail
return out, hascased
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 32-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (64 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of 256-bit chunks (8 words each).
# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.
# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
charmap = bytes(charmap) # should be hashable
comps = {}
mapping = bytearray(256)
block = 0
data = bytearray()
for i in range(0, 65536, 256):
chunk = charmap[i: i + 256]
if chunk in comps:
mapping[i // 256] = comps[chunk]
else:
mapping[i // 256] = comps[chunk] = block
block += 1
data += chunk
data = _mk_bitmap(data)
data[0:0] = [block] + _bytes_to_codes(mapping)
out.append((BIGCHARSET, data))
out += tail
return out, hascased
_CODEBITS = _sre.CODESIZE * 8
MAXCODE = (1 << _CODEBITS) - 1
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
s = bits.translate(_BITS_TRANS)[::-1]
return [_int(s[i - _CODEBITS: i], 2)
for i in range(len(s), 0, -_CODEBITS)]
def _bytes_to_codes(b):
# Convert block indices to word array
a = memoryview(b).cast('I')
assert a.itemsize == _sre.CODESIZE
assert len(a) * a.itemsize == len(b)
return a.tolist()
def _simple(p):
# check if this subpattern is a "simple" operator
if len(p) != 1:
return False
op, av = p[0]
if op is SUBPATTERN:
return av[0] is None and _simple(av[-1])
return op in _UNIT_CODES
def _generate_overlap_table(prefix):
"""
Generate an overlap table for the following prefix.
An overlap table is a table of the same size as the prefix which
informs about the potential self-overlap for each index in the prefix:
- if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
- if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
prefix[0:k]
"""
table = [0] * len(prefix)
for i in range(1, len(prefix)):
idx = table[i - 1]
while prefix[i] != prefix[idx]:
if idx == 0:
table[i] = 0
break
idx = table[idx - 1]
else:
table[i] = idx + 1
return table
def _get_iscased(flags):
if not flags & SRE_FLAG_IGNORECASE:
return None
elif flags & SRE_FLAG_UNICODE:
return _sre.unicode_iscased
else:
return _sre.ascii_iscased
def _get_literal_prefix(pattern, flags):
# look for literal prefix
prefix = []
prefixappend = prefix.append
prefix_skip = None
iscased = _get_iscased(flags)
for op, av in pattern.data:
if op is LITERAL:
if iscased and iscased(av):
break
prefixappend(av)
elif op is SUBPATTERN:
group, add_flags, del_flags, p = av
flags1 = _combine_flags(flags, add_flags, del_flags)
if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
break
prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
if prefix_skip is None:
if group is not None:
prefix_skip = len(prefix)
elif prefix_skip1 is not None:
prefix_skip = len(prefix) + prefix_skip1
prefix.extend(prefix1)
if not got_all:
break
else:
break
else:
return prefix, prefix_skip, True
return prefix, prefix_skip, False
def _get_charset_prefix(pattern, flags):
while True:
if not pattern.data:
return None
op, av = pattern.data[0]
if op is not SUBPATTERN:
break
group, add_flags, del_flags, pattern = av
flags = _combine_flags(flags, add_flags, del_flags)
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
return None
iscased = _get_iscased(flags)
if op is LITERAL:
if iscased and iscased(av):
return None
return [(op, av)]
elif op is BRANCH:
charset = []
charsetappend = charset.append
for p in av[1]:
if not p:
return None
op, av = p[0]
if op is LITERAL and not (iscased and iscased(av)):
charsetappend((op, av))
else:
return None
return charset
elif op is IN:
charset = av
if iscased:
for op, av in charset:
if op is LITERAL:
if iscased(av):
return None
elif op is RANGE:
if av[1] > 0xffff:
return None
if any(map(iscased, range(av[0], av[1]+1))):
return None
return charset
return None
def _compile_info(code, pattern, flags):
# internal: compile an info block. in the current version,
# this contains min/max pattern width, and an optional literal
# prefix or a character map
lo, hi = pattern.getwidth()
if hi > MAXCODE:
hi = MAXCODE
if lo == 0:
code.extend([INFO, 4, 0, lo, hi])
return
# look for a literal prefix
prefix = []
prefix_skip = 0
charset = [] # not used
if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
# look for literal prefix
prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
# if no prefix, look for charset prefix
if not prefix:
charset = _get_charset_prefix(pattern, flags)
## if prefix:
## print("*** PREFIX", prefix, prefix_skip)
## if charset:
## print("*** CHARSET", charset)
# add an info block
emit = code.append
emit(INFO)
skip = len(code); emit(0)
# literal flag
mask = 0
if prefix:
mask = SRE_INFO_PREFIX
if prefix_skip is None and got_all:
mask = mask | SRE_INFO_LITERAL
elif charset:
mask = mask | SRE_INFO_CHARSET
emit(mask)
# pattern length
if lo < MAXCODE:
emit(lo)
else:
emit(MAXCODE)
prefix = prefix[:MAXCODE]
emit(min(hi, MAXCODE))
# add literal prefix
if prefix:
emit(len(prefix)) # length
if prefix_skip is None:
prefix_skip = len(prefix)
emit(prefix_skip) # skip
code.extend(prefix)
# generate overlap table
code.extend(_generate_overlap_table(prefix))
elif charset:
charset, hascased = _optimize_charset(charset)
assert not hascased
_compile_charset(charset, flags, code)
code[skip] = len(code) - skip
def isstring(obj):
return isinstance(obj, (str, bytes))
def _code(p, flags):
flags = p.state.flags | flags
code = []
# compile info block
_compile_info(code, p, flags)
# compile the pattern
_compile(code, p.data, flags)
code.append(SUCCESS)
return code
def _hex_code(code):
return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
def dis(code):
import sys
labels = set()
level = 0
offset_width = len(str(len(code) - 1))
def dis_(start, end):
def print_(*args, to=None):
if to is not None:
labels.add(to)
args += ('(to %d)' % (to,),)
print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
end=' '*(level-1))
print(*args)
def print_2(*args):
print(end=' '*(offset_width + 2*level))
print(*args)
nonlocal level
level += 1
i = start
while i < end:
start = i
op = code[i]
i += 1
op = OPCODES[op]
if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
MAX_UNTIL, MIN_UNTIL, NEGATE):
print_(op)
elif op in (LITERAL, NOT_LITERAL,
LITERAL_IGNORE, NOT_LITERAL_IGNORE,
LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
arg = code[i]
i += 1
print_(op, '%#02x (%r)' % (arg, chr(arg)))
elif op is AT:
arg = code[i]
i += 1
arg = str(ATCODES[arg])
assert arg[:3] == 'AT_'
print_(op, arg[3:])
elif op is CATEGORY:
arg = code[i]
i += 1
arg = str(CHCODES[arg])
assert arg[:9] == 'CATEGORY_'
print_(op, arg[9:])
elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
skip = code[i]
print_(op, skip, to=i+skip)
dis_(i+1, i+skip)
i += skip
elif op in (RANGE, RANGE_UNI_IGNORE):
lo, hi = code[i: i+2]
i += 2
print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
elif op is CHARSET:
print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
i += 256//_CODEBITS
elif op is BIGCHARSET:
arg = code[i]
i += 1
mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
for x in code[i: i + 256//_sre.CODESIZE]))
print_(op, arg, mapping)
i += 256//_sre.CODESIZE
level += 1
for j in range(arg):
print_2(_hex_code(code[i: i + 256//_CODEBITS]))
i += 256//_CODEBITS
level -= 1
elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
GROUPREF_LOC_IGNORE):
arg = code[i]
i += 1
print_(op, arg)
elif op is JUMP:
skip = code[i]
print_(op, skip, to=i+skip)
i += 1
elif op is BRANCH:
skip = code[i]
print_(op, skip, to=i+skip)
while skip:
dis_(i+1, i+skip)
i += skip
start = i
skip = code[i]
if skip:
print_('branch', skip, to=i+skip)
else:
print_(FAILURE)
i += 1
elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
skip, min, max = code[i: i+3]
if max == MAXREPEAT:
max = 'MAXREPEAT'
print_(op, skip, min, max, to=i+skip)
dis_(i+3, i+skip)
i += skip
elif op is GROUPREF_EXISTS:
arg, skip = code[i: i+2]
print_(op, arg, skip, to=i+skip)
i += 2
elif op in (ASSERT, ASSERT_NOT):
skip, arg = code[i: i+2]
print_(op, skip, arg, to=i+skip)
dis_(i+2, i+skip)
i += skip
elif op is ATOMIC_GROUP:
skip = code[i]
print_(op, skip, to=i+skip)
dis_(i+1, i+skip)
i += skip
elif op is INFO:
skip, flags, min, max = code[i: i+4]
if max == MAXREPEAT:
max = 'MAXREPEAT'
print_(op, skip, bin(flags), min, max, to=i+skip)
start = i+4
if flags & SRE_INFO_PREFIX:
prefix_len, prefix_skip = code[i+4: i+6]
print_2(' prefix_skip', prefix_skip)
start = i + 6
prefix = code[start: start+prefix_len]
print_2(' prefix',
'[%s]' % ', '.join('%#02x' % x for x in prefix),
'(%r)' % ''.join(map(chr, prefix)))
start += prefix_len
print_2(' overlap', code[start: start+prefix_len])
start += prefix_len
if flags & SRE_INFO_CHARSET:
level += 1
print_2('in')
dis_(start, i+skip)
level -= 1
i += skip
else:
raise ValueError(op)
level -= 1
dis_(0, len(code))
def compile(p, flags=0):
# internal: convert pattern list to internal format
if isstring(p):
pattern = p
p = _parser.parse(p, flags)
else:
pattern = None
code = _code(p, flags)
if flags & SRE_FLAG_DEBUG:
print()
dis(code)
# map in either direction
groupindex = p.state.groupdict
indexgroup = [None] * p.state.groups
for k, i in groupindex.items():
indexgroup[i] = k
return _sre.compile(
pattern, flags | p.state.flags, code,
p.state.groups-1,
groupindex, tuple(indexgroup)
)

262
Lib/re/_constants.py Normal file
View File

@ -0,0 +1,262 @@
#
# Secret Labs' Regular Expression Engine
#
# various symbols used by the regular expression engine.
# run this script to update the _sre include files!
#
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# See the __init__.py file for information on usage and redistribution.
#
"""Internal support module for sre"""
# update when constants are added or removed
MAGIC = 20220318
from _sre import MAXREPEAT, MAXGROUPS
# SRE standard exception (access as sre.error)
# should this really be here?
class error(Exception):
"""Exception raised for invalid regular expressions.
Attributes:
msg: The unformatted error message
pattern: The regular expression pattern
pos: The index in the pattern where compilation failed (may be None)
lineno: The line corresponding to pos (may be None)
colno: The column corresponding to pos (may be None)
"""
__module__ = 're'
def __init__(self, msg, pattern=None, pos=None):
self.msg = msg
self.pattern = pattern
self.pos = pos
if pattern is not None and pos is not None:
msg = '%s at position %d' % (msg, pos)
if isinstance(pattern, str):
newline = '\n'
else:
newline = b'\n'
self.lineno = pattern.count(newline, 0, pos) + 1
self.colno = pos - pattern.rfind(newline, 0, pos)
if newline in pattern:
msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno)
else:
self.lineno = self.colno = None
super().__init__(msg)
class _NamedIntConstant(int):
def __new__(cls, value, name):
self = super(_NamedIntConstant, cls).__new__(cls, value)
self.name = name
return self
def __repr__(self):
return self.name
MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT')
def _makecodes(names):
names = names.strip().split()
items = [_NamedIntConstant(i, name) for i, name in enumerate(names)]
globals().update({item.name: item for item in items})
return items
# operators
# failure=0 success=1 (just because it looks better that way :-)
OPCODES = _makecodes("""
FAILURE SUCCESS
ANY ANY_ALL
ASSERT ASSERT_NOT
AT
BRANCH
CALL
CATEGORY
CHARSET BIGCHARSET
GROUPREF GROUPREF_EXISTS
IN
INFO
JUMP
LITERAL
MARK
MAX_UNTIL
MIN_UNTIL
NOT_LITERAL
NEGATE
RANGE
REPEAT
REPEAT_ONE
SUBPATTERN
MIN_REPEAT_ONE
ATOMIC_GROUP
POSSESSIVE_REPEAT
POSSESSIVE_REPEAT_ONE
GROUPREF_IGNORE
IN_IGNORE
LITERAL_IGNORE
NOT_LITERAL_IGNORE
GROUPREF_LOC_IGNORE
IN_LOC_IGNORE
LITERAL_LOC_IGNORE
NOT_LITERAL_LOC_IGNORE
GROUPREF_UNI_IGNORE
IN_UNI_IGNORE
LITERAL_UNI_IGNORE
NOT_LITERAL_UNI_IGNORE
RANGE_UNI_IGNORE
MIN_REPEAT MAX_REPEAT
""")
del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT
# positions
ATCODES = _makecodes("""
AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING
AT_BOUNDARY AT_NON_BOUNDARY
AT_END AT_END_LINE AT_END_STRING
AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY
AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY
""")
# categories
CHCODES = _makecodes("""
CATEGORY_DIGIT CATEGORY_NOT_DIGIT
CATEGORY_SPACE CATEGORY_NOT_SPACE
CATEGORY_WORD CATEGORY_NOT_WORD
CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK
CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD
CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT
CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE
CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD
CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK
""")
# replacement operations for "ignore case" mode
OP_IGNORE = {
LITERAL: LITERAL_IGNORE,
NOT_LITERAL: NOT_LITERAL_IGNORE,
}
OP_LOCALE_IGNORE = {
LITERAL: LITERAL_LOC_IGNORE,
NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
}
OP_UNICODE_IGNORE = {
LITERAL: LITERAL_UNI_IGNORE,
NOT_LITERAL: NOT_LITERAL_UNI_IGNORE,
}
AT_MULTILINE = {
AT_BEGINNING: AT_BEGINNING_LINE,
AT_END: AT_END_LINE
}
AT_LOCALE = {
AT_BOUNDARY: AT_LOC_BOUNDARY,
AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
}
AT_UNICODE = {
AT_BOUNDARY: AT_UNI_BOUNDARY,
AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
}
CH_LOCALE = {
CATEGORY_DIGIT: CATEGORY_DIGIT,
CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
CATEGORY_SPACE: CATEGORY_SPACE,
CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE,
CATEGORY_WORD: CATEGORY_LOC_WORD,
CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
CATEGORY_LINEBREAK: CATEGORY_LINEBREAK,
CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK
}
CH_UNICODE = {
CATEGORY_DIGIT: CATEGORY_UNI_DIGIT,
CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT,
CATEGORY_SPACE: CATEGORY_UNI_SPACE,
CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE,
CATEGORY_WORD: CATEGORY_UNI_WORD,
CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD,
CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK,
CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
}
# flags
SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
SRE_FLAG_IGNORECASE = 2 # case insensitive
SRE_FLAG_LOCALE = 4 # honour system locale
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
SRE_FLAG_DOTALL = 16 # treat target as a single string
SRE_FLAG_UNICODE = 32 # use unicode "locale"
SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
SRE_FLAG_DEBUG = 128 # debugging
SRE_FLAG_ASCII = 256 # use ascii "locale"
# flags for INFO primitive
SRE_INFO_PREFIX = 1 # has prefix
SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
SRE_INFO_CHARSET = 4 # pattern starts with character from given set
if __name__ == "__main__":
def dump(f, d, prefix):
items = sorted(d)
for item in items:
f.write("#define %s_%s %d\n" % (prefix, item, item))
with open("sre_constants.h", "w") as f:
f.write("""\
/*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
*
* NOTE: This file is generated by Lib/re/_constants.py. If you need
* to change anything in here, edit Lib/re/_constants.py and run it.
*
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
* See the _sre.c file for information on usage and redistribution.
*/
""")
f.write("#define SRE_MAGIC %d\n" % MAGIC)
dump(f, OPCODES, "SRE_OP")
dump(f, ATCODES, "SRE")
dump(f, CHCODES, "SRE")
f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE)
f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE)
f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE)
f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE)
f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL)
f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE)
f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE)
f.write("#define SRE_FLAG_DEBUG %d\n" % SRE_FLAG_DEBUG)
f.write("#define SRE_FLAG_ASCII %d\n" % SRE_FLAG_ASCII)
f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX)
f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL)
f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET)
print("done")

1079
Lib/re/_parser.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,800 +1,7 @@
#
# Secret Labs' Regular Expression Engine
#
# convert template to internal format
#
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#
import warnings
warnings.warn(f"module {__name__!r} is deprecated",
DeprecationWarning,
stacklevel=2)
"""Internal support module for sre"""
import _sre
import sre_parse
from sre_constants import *
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
_LITERAL_CODES = {LITERAL, NOT_LITERAL}
_SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
_REPEATING_CODES = {
MIN_REPEAT: (REPEAT, MIN_UNTIL, MIN_REPEAT_ONE),
MAX_REPEAT: (REPEAT, MAX_UNTIL, REPEAT_ONE),
POSSESSIVE_REPEAT: (POSSESSIVE_REPEAT, SUCCESS, POSSESSIVE_REPEAT_ONE),
}
# Sets of lowercase characters which have the same uppercase.
_equivalences = (
# LATIN SMALL LETTER I, LATIN SMALL LETTER DOTLESS I
(0x69, 0x131), # iı
# LATIN SMALL LETTER S, LATIN SMALL LETTER LONG S
(0x73, 0x17f), # sſ
# MICRO SIGN, GREEK SMALL LETTER MU
(0xb5, 0x3bc), # µμ
# COMBINING GREEK YPOGEGRAMMENI, GREEK SMALL LETTER IOTA, GREEK PROSGEGRAMMENI
(0x345, 0x3b9, 0x1fbe), # \u0345ι
# GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
(0x390, 0x1fd3), # ΐΐ
# GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS, GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
(0x3b0, 0x1fe3), # ΰΰ
# GREEK SMALL LETTER BETA, GREEK BETA SYMBOL
(0x3b2, 0x3d0), # βϐ
# GREEK SMALL LETTER EPSILON, GREEK LUNATE EPSILON SYMBOL
(0x3b5, 0x3f5), # εϵ
# GREEK SMALL LETTER THETA, GREEK THETA SYMBOL
(0x3b8, 0x3d1), # θϑ
# GREEK SMALL LETTER KAPPA, GREEK KAPPA SYMBOL
(0x3ba, 0x3f0), # κϰ
# GREEK SMALL LETTER PI, GREEK PI SYMBOL
(0x3c0, 0x3d6), # πϖ
# GREEK SMALL LETTER RHO, GREEK RHO SYMBOL
(0x3c1, 0x3f1), # ρϱ
# GREEK SMALL LETTER FINAL SIGMA, GREEK SMALL LETTER SIGMA
(0x3c2, 0x3c3), # ςσ
# GREEK SMALL LETTER PHI, GREEK PHI SYMBOL
(0x3c6, 0x3d5), # φϕ
# LATIN SMALL LETTER S WITH DOT ABOVE, LATIN SMALL LETTER LONG S WITH DOT ABOVE
(0x1e61, 0x1e9b), # ṡẛ
# LATIN SMALL LIGATURE LONG S T, LATIN SMALL LIGATURE ST
(0xfb05, 0xfb06), # ſtst
)
# Maps the lowercase code to lowercase codes which have the same uppercase.
_ignorecase_fixes = {i: tuple(j for j in t if i != j)
for t in _equivalences for i in t}
def _combine_flags(flags, add_flags, del_flags,
TYPE_FLAGS=sre_parse.TYPE_FLAGS):
if add_flags & TYPE_FLAGS:
flags &= ~TYPE_FLAGS
return (flags | add_flags) & ~del_flags
def _compile(code, pattern, flags):
# internal: compile a (sub)pattern
emit = code.append
_len = len
LITERAL_CODES = _LITERAL_CODES
REPEATING_CODES = _REPEATING_CODES
SUCCESS_CODES = _SUCCESS_CODES
ASSERT_CODES = _ASSERT_CODES
iscased = None
tolower = None
fixes = None
if flags & SRE_FLAG_IGNORECASE and not flags & SRE_FLAG_LOCALE:
if flags & SRE_FLAG_UNICODE:
iscased = _sre.unicode_iscased
tolower = _sre.unicode_tolower
fixes = _ignorecase_fixes
else:
iscased = _sre.ascii_iscased
tolower = _sre.ascii_tolower
for op, av in pattern:
if op in LITERAL_CODES:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
emit(av)
elif flags & SRE_FLAG_LOCALE:
emit(OP_LOCALE_IGNORE[op])
emit(av)
elif not iscased(av):
emit(op)
emit(av)
else:
lo = tolower(av)
if not fixes: # ascii
emit(OP_IGNORE[op])
emit(lo)
elif lo not in fixes:
emit(OP_UNICODE_IGNORE[op])
emit(lo)
else:
emit(IN_UNI_IGNORE)
skip = _len(code); emit(0)
if op is NOT_LITERAL:
emit(NEGATE)
for k in (lo,) + fixes[lo]:
emit(LITERAL)
emit(k)
emit(FAILURE)
code[skip] = _len(code) - skip
elif op is IN:
charset, hascased = _optimize_charset(av, iscased, tolower, fixes)
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
emit(IN_LOC_IGNORE)
elif not hascased:
emit(IN)
elif not fixes: # ascii
emit(IN_IGNORE)
else:
emit(IN_UNI_IGNORE)
skip = _len(code); emit(0)
_compile_charset(charset, flags, code)
code[skip] = _len(code) - skip
elif op is ANY:
if flags & SRE_FLAG_DOTALL:
emit(ANY_ALL)
else:
emit(ANY)
elif op in REPEATING_CODES:
if flags & SRE_FLAG_TEMPLATE:
raise error("internal: unsupported template operator %r" % (op,))
if _simple(av[2]):
emit(REPEATING_CODES[op][2])
skip = _len(code); emit(0)
emit(av[0])
emit(av[1])
_compile(code, av[2], flags)
emit(SUCCESS)
code[skip] = _len(code) - skip
else:
emit(REPEATING_CODES[op][0])
skip = _len(code); emit(0)
emit(av[0])
emit(av[1])
_compile(code, av[2], flags)
code[skip] = _len(code) - skip
emit(REPEATING_CODES[op][1])
elif op is SUBPATTERN:
group, add_flags, del_flags, p = av
if group:
emit(MARK)
emit((group-1)*2)
# _compile_info(code, p, _combine_flags(flags, add_flags, del_flags))
_compile(code, p, _combine_flags(flags, add_flags, del_flags))
if group:
emit(MARK)
emit((group-1)*2+1)
elif op is ATOMIC_GROUP:
# Atomic Groups are handled by starting with an Atomic
# Group op code, then putting in the atomic group pattern
# and finally a success op code to tell any repeat
# operations within the Atomic Group to stop eating and
# pop their stack if they reach it
emit(ATOMIC_GROUP)
skip = _len(code); emit(0)
_compile(code, av, flags)
emit(SUCCESS)
code[skip] = _len(code) - skip
elif op in SUCCESS_CODES:
emit(op)
elif op in ASSERT_CODES:
emit(op)
skip = _len(code); emit(0)
if av[0] >= 0:
emit(0) # look ahead
else:
lo, hi = av[1].getwidth()
if lo != hi:
raise error("look-behind requires fixed-width pattern")
emit(lo) # look behind
_compile(code, av[1], flags)
emit(SUCCESS)
code[skip] = _len(code) - skip
elif op is CALL:
emit(op)
skip = _len(code); emit(0)
_compile(code, av, flags)
emit(SUCCESS)
code[skip] = _len(code) - skip
elif op is AT:
emit(op)
if flags & SRE_FLAG_MULTILINE:
av = AT_MULTILINE.get(av, av)
if flags & SRE_FLAG_LOCALE:
av = AT_LOCALE.get(av, av)
elif flags & SRE_FLAG_UNICODE:
av = AT_UNICODE.get(av, av)
emit(av)
elif op is BRANCH:
emit(op)
tail = []
tailappend = tail.append
for av in av[1]:
skip = _len(code); emit(0)
# _compile_info(code, av, flags)
_compile(code, av, flags)
emit(JUMP)
tailappend(_len(code)); emit(0)
code[skip] = _len(code) - skip
emit(FAILURE) # end of branch
for tail in tail:
code[tail] = _len(code) - tail
elif op is CATEGORY:
emit(op)
if flags & SRE_FLAG_LOCALE:
av = CH_LOCALE[av]
elif flags & SRE_FLAG_UNICODE:
av = CH_UNICODE[av]
emit(av)
elif op is GROUPREF:
if not flags & SRE_FLAG_IGNORECASE:
emit(op)
elif flags & SRE_FLAG_LOCALE:
emit(GROUPREF_LOC_IGNORE)
elif not fixes: # ascii
emit(GROUPREF_IGNORE)
else:
emit(GROUPREF_UNI_IGNORE)
emit(av-1)
elif op is GROUPREF_EXISTS:
emit(op)
emit(av[0]-1)
skipyes = _len(code); emit(0)
_compile(code, av[1], flags)
if av[2]:
emit(JUMP)
skipno = _len(code); emit(0)
code[skipyes] = _len(code) - skipyes + 1
_compile(code, av[2], flags)
code[skipno] = _len(code) - skipno
else:
code[skipyes] = _len(code) - skipyes + 1
else:
raise error("internal: unsupported operand type %r" % (op,))
def _compile_charset(charset, flags, code):
# compile charset subprogram
emit = code.append
for op, av in charset:
emit(op)
if op is NEGATE:
pass
elif op is LITERAL:
emit(av)
elif op is RANGE or op is RANGE_UNI_IGNORE:
emit(av[0])
emit(av[1])
elif op is CHARSET:
code.extend(av)
elif op is BIGCHARSET:
code.extend(av)
elif op is CATEGORY:
if flags & SRE_FLAG_LOCALE:
emit(CH_LOCALE[av])
elif flags & SRE_FLAG_UNICODE:
emit(CH_UNICODE[av])
else:
emit(av)
else:
raise error("internal: unsupported set operator %r" % (op,))
emit(FAILURE)
def _optimize_charset(charset, iscased=None, fixup=None, fixes=None):
# internal: optimize character set
out = []
tail = []
charmap = bytearray(256)
hascased = False
for op, av in charset:
while True:
try:
if op is LITERAL:
if fixup:
lo = fixup(av)
charmap[lo] = 1
if fixes and lo in fixes:
for k in fixes[lo]:
charmap[k] = 1
if not hascased and iscased(av):
hascased = True
else:
charmap[av] = 1
elif op is RANGE:
r = range(av[0], av[1]+1)
if fixup:
if fixes:
for i in map(fixup, r):
charmap[i] = 1
if i in fixes:
for k in fixes[i]:
charmap[k] = 1
else:
for i in map(fixup, r):
charmap[i] = 1
if not hascased:
hascased = any(map(iscased, r))
else:
for i in r:
charmap[i] = 1
elif op is NEGATE:
out.append((op, av))
else:
tail.append((op, av))
except IndexError:
if len(charmap) == 256:
# character set contains non-UCS1 character codes
charmap += b'\0' * 0xff00
continue
# Character set contains non-BMP character codes.
if fixup:
hascased = True
# There are only two ranges of cased non-BMP characters:
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
# and for both ranges RANGE_UNI_IGNORE works.
if op is RANGE:
op = RANGE_UNI_IGNORE
tail.append((op, av))
break
# compress character map
runs = []
q = 0
while True:
p = charmap.find(1, q)
if p < 0:
break
if len(runs) >= 2:
runs = None
break
q = charmap.find(0, p)
if q < 0:
runs.append((p, len(charmap)))
break
runs.append((p, q))
if runs is not None:
# use literal/range
for p, q in runs:
if q - p == 1:
out.append((LITERAL, p))
else:
out.append((RANGE, (p, q - 1)))
out += tail
# if the case was changed or new representation is more compact
if hascased or len(out) < len(charset):
return out, hascased
# else original character set is good enough
return charset, hascased
# use bitmap
if len(charmap) == 256:
data = _mk_bitmap(charmap)
out.append((CHARSET, data))
out += tail
return out, hascased
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 32-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (64 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of 256-bit chunks (8 words each).
# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.
# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
charmap = bytes(charmap) # should be hashable
comps = {}
mapping = bytearray(256)
block = 0
data = bytearray()
for i in range(0, 65536, 256):
chunk = charmap[i: i + 256]
if chunk in comps:
mapping[i // 256] = comps[chunk]
else:
mapping[i // 256] = comps[chunk] = block
block += 1
data += chunk
data = _mk_bitmap(data)
data[0:0] = [block] + _bytes_to_codes(mapping)
out.append((BIGCHARSET, data))
out += tail
return out, hascased
_CODEBITS = _sre.CODESIZE * 8
MAXCODE = (1 << _CODEBITS) - 1
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
s = bits.translate(_BITS_TRANS)[::-1]
return [_int(s[i - _CODEBITS: i], 2)
for i in range(len(s), 0, -_CODEBITS)]
def _bytes_to_codes(b):
# Convert block indices to word array
a = memoryview(b).cast('I')
assert a.itemsize == _sre.CODESIZE
assert len(a) * a.itemsize == len(b)
return a.tolist()
def _simple(p):
# check if this subpattern is a "simple" operator
if len(p) != 1:
return False
op, av = p[0]
if op is SUBPATTERN:
return av[0] is None and _simple(av[-1])
return op in _UNIT_CODES
def _generate_overlap_table(prefix):
"""
Generate an overlap table for the following prefix.
An overlap table is a table of the same size as the prefix which
informs about the potential self-overlap for each index in the prefix:
- if overlap[i] == 0, prefix[i:] can't overlap prefix[0:...]
- if overlap[i] == k with 0 < k <= i, prefix[i-k+1:i+1] overlaps with
prefix[0:k]
"""
table = [0] * len(prefix)
for i in range(1, len(prefix)):
idx = table[i - 1]
while prefix[i] != prefix[idx]:
if idx == 0:
table[i] = 0
break
idx = table[idx - 1]
else:
table[i] = idx + 1
return table
def _get_iscased(flags):
if not flags & SRE_FLAG_IGNORECASE:
return None
elif flags & SRE_FLAG_UNICODE:
return _sre.unicode_iscased
else:
return _sre.ascii_iscased
def _get_literal_prefix(pattern, flags):
# look for literal prefix
prefix = []
prefixappend = prefix.append
prefix_skip = None
iscased = _get_iscased(flags)
for op, av in pattern.data:
if op is LITERAL:
if iscased and iscased(av):
break
prefixappend(av)
elif op is SUBPATTERN:
group, add_flags, del_flags, p = av
flags1 = _combine_flags(flags, add_flags, del_flags)
if flags1 & SRE_FLAG_IGNORECASE and flags1 & SRE_FLAG_LOCALE:
break
prefix1, prefix_skip1, got_all = _get_literal_prefix(p, flags1)
if prefix_skip is None:
if group is not None:
prefix_skip = len(prefix)
elif prefix_skip1 is not None:
prefix_skip = len(prefix) + prefix_skip1
prefix.extend(prefix1)
if not got_all:
break
else:
break
else:
return prefix, prefix_skip, True
return prefix, prefix_skip, False
def _get_charset_prefix(pattern, flags):
while True:
if not pattern.data:
return None
op, av = pattern.data[0]
if op is not SUBPATTERN:
break
group, add_flags, del_flags, pattern = av
flags = _combine_flags(flags, add_flags, del_flags)
if flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE:
return None
iscased = _get_iscased(flags)
if op is LITERAL:
if iscased and iscased(av):
return None
return [(op, av)]
elif op is BRANCH:
charset = []
charsetappend = charset.append
for p in av[1]:
if not p:
return None
op, av = p[0]
if op is LITERAL and not (iscased and iscased(av)):
charsetappend((op, av))
else:
return None
return charset
elif op is IN:
charset = av
if iscased:
for op, av in charset:
if op is LITERAL:
if iscased(av):
return None
elif op is RANGE:
if av[1] > 0xffff:
return None
if any(map(iscased, range(av[0], av[1]+1))):
return None
return charset
return None
def _compile_info(code, pattern, flags):
# internal: compile an info block. in the current version,
# this contains min/max pattern width, and an optional literal
# prefix or a character map
lo, hi = pattern.getwidth()
if hi > MAXCODE:
hi = MAXCODE
if lo == 0:
code.extend([INFO, 4, 0, lo, hi])
return
# look for a literal prefix
prefix = []
prefix_skip = 0
charset = [] # not used
if not (flags & SRE_FLAG_IGNORECASE and flags & SRE_FLAG_LOCALE):
# look for literal prefix
prefix, prefix_skip, got_all = _get_literal_prefix(pattern, flags)
# if no prefix, look for charset prefix
if not prefix:
charset = _get_charset_prefix(pattern, flags)
## if prefix:
## print("*** PREFIX", prefix, prefix_skip)
## if charset:
## print("*** CHARSET", charset)
# add an info block
emit = code.append
emit(INFO)
skip = len(code); emit(0)
# literal flag
mask = 0
if prefix:
mask = SRE_INFO_PREFIX
if prefix_skip is None and got_all:
mask = mask | SRE_INFO_LITERAL
elif charset:
mask = mask | SRE_INFO_CHARSET
emit(mask)
# pattern length
if lo < MAXCODE:
emit(lo)
else:
emit(MAXCODE)
prefix = prefix[:MAXCODE]
emit(min(hi, MAXCODE))
# add literal prefix
if prefix:
emit(len(prefix)) # length
if prefix_skip is None:
prefix_skip = len(prefix)
emit(prefix_skip) # skip
code.extend(prefix)
# generate overlap table
code.extend(_generate_overlap_table(prefix))
elif charset:
charset, hascased = _optimize_charset(charset)
assert not hascased
_compile_charset(charset, flags, code)
code[skip] = len(code) - skip
def isstring(obj):
return isinstance(obj, (str, bytes))
def _code(p, flags):
flags = p.state.flags | flags
code = []
# compile info block
_compile_info(code, p, flags)
# compile the pattern
_compile(code, p.data, flags)
code.append(SUCCESS)
return code
def _hex_code(code):
return '[%s]' % ', '.join('%#0*x' % (_sre.CODESIZE*2+2, x) for x in code)
def dis(code):
import sys
labels = set()
level = 0
offset_width = len(str(len(code) - 1))
def dis_(start, end):
def print_(*args, to=None):
if to is not None:
labels.add(to)
args += ('(to %d)' % (to,),)
print('%*d%s ' % (offset_width, start, ':' if start in labels else '.'),
end=' '*(level-1))
print(*args)
def print_2(*args):
print(end=' '*(offset_width + 2*level))
print(*args)
nonlocal level
level += 1
i = start
while i < end:
start = i
op = code[i]
i += 1
op = OPCODES[op]
if op in (SUCCESS, FAILURE, ANY, ANY_ALL,
MAX_UNTIL, MIN_UNTIL, NEGATE):
print_(op)
elif op in (LITERAL, NOT_LITERAL,
LITERAL_IGNORE, NOT_LITERAL_IGNORE,
LITERAL_UNI_IGNORE, NOT_LITERAL_UNI_IGNORE,
LITERAL_LOC_IGNORE, NOT_LITERAL_LOC_IGNORE):
arg = code[i]
i += 1
print_(op, '%#02x (%r)' % (arg, chr(arg)))
elif op is AT:
arg = code[i]
i += 1
arg = str(ATCODES[arg])
assert arg[:3] == 'AT_'
print_(op, arg[3:])
elif op is CATEGORY:
arg = code[i]
i += 1
arg = str(CHCODES[arg])
assert arg[:9] == 'CATEGORY_'
print_(op, arg[9:])
elif op in (IN, IN_IGNORE, IN_UNI_IGNORE, IN_LOC_IGNORE):
skip = code[i]
print_(op, skip, to=i+skip)
dis_(i+1, i+skip)
i += skip
elif op in (RANGE, RANGE_UNI_IGNORE):
lo, hi = code[i: i+2]
i += 2
print_(op, '%#02x %#02x (%r-%r)' % (lo, hi, chr(lo), chr(hi)))
elif op is CHARSET:
print_(op, _hex_code(code[i: i + 256//_CODEBITS]))
i += 256//_CODEBITS
elif op is BIGCHARSET:
arg = code[i]
i += 1
mapping = list(b''.join(x.to_bytes(_sre.CODESIZE, sys.byteorder)
for x in code[i: i + 256//_sre.CODESIZE]))
print_(op, arg, mapping)
i += 256//_sre.CODESIZE
level += 1
for j in range(arg):
print_2(_hex_code(code[i: i + 256//_CODEBITS]))
i += 256//_CODEBITS
level -= 1
elif op in (MARK, GROUPREF, GROUPREF_IGNORE, GROUPREF_UNI_IGNORE,
GROUPREF_LOC_IGNORE):
arg = code[i]
i += 1
print_(op, arg)
elif op is JUMP:
skip = code[i]
print_(op, skip, to=i+skip)
i += 1
elif op is BRANCH:
skip = code[i]
print_(op, skip, to=i+skip)
while skip:
dis_(i+1, i+skip)
i += skip
start = i
skip = code[i]
if skip:
print_('branch', skip, to=i+skip)
else:
print_(FAILURE)
i += 1
elif op in (REPEAT, REPEAT_ONE, MIN_REPEAT_ONE,
POSSESSIVE_REPEAT, POSSESSIVE_REPEAT_ONE):
skip, min, max = code[i: i+3]
if max == MAXREPEAT:
max = 'MAXREPEAT'
print_(op, skip, min, max, to=i+skip)
dis_(i+3, i+skip)
i += skip
elif op is GROUPREF_EXISTS:
arg, skip = code[i: i+2]
print_(op, arg, skip, to=i+skip)
i += 2
elif op in (ASSERT, ASSERT_NOT):
skip, arg = code[i: i+2]
print_(op, skip, arg, to=i+skip)
dis_(i+2, i+skip)
i += skip
elif op is ATOMIC_GROUP:
skip = code[i]
print_(op, skip, to=i+skip)
dis_(i+1, i+skip)
i += skip
elif op is INFO:
skip, flags, min, max = code[i: i+4]
if max == MAXREPEAT:
max = 'MAXREPEAT'
print_(op, skip, bin(flags), min, max, to=i+skip)
start = i+4
if flags & SRE_INFO_PREFIX:
prefix_len, prefix_skip = code[i+4: i+6]
print_2(' prefix_skip', prefix_skip)
start = i + 6
prefix = code[start: start+prefix_len]
print_2(' prefix',
'[%s]' % ', '.join('%#02x' % x for x in prefix),
'(%r)' % ''.join(map(chr, prefix)))
start += prefix_len
print_2(' overlap', code[start: start+prefix_len])
start += prefix_len
if flags & SRE_INFO_CHARSET:
level += 1
print_2('in')
dis_(start, i+skip)
level -= 1
i += skip
else:
raise ValueError(op)
level -= 1
dis_(0, len(code))
def compile(p, flags=0):
# internal: convert pattern list to internal format
if isstring(p):
pattern = p
p = sre_parse.parse(p, flags)
else:
pattern = None
code = _code(p, flags)
if flags & SRE_FLAG_DEBUG:
print()
dis(code)
# map in either direction
groupindex = p.state.groupdict
indexgroup = [None] * p.state.groups
for k, i in groupindex.items():
indexgroup[i] = k
return _sre.compile(
pattern, flags | p.state.flags, code,
p.state.groups-1,
groupindex, tuple(indexgroup)
)
from re import _compiler as _
globals().update({k: v for k, v in vars(_).items() if k[:2] != '__'})

View File

@ -1,262 +1,7 @@
#
# Secret Labs' Regular Expression Engine
#
# various symbols used by the regular expression engine.
# run this script to update the _sre include files!
#
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#
import warnings
warnings.warn(f"module {__name__!r} is deprecated",
DeprecationWarning,
stacklevel=2)
"""Internal support module for sre"""
# update when constants are added or removed
MAGIC = 20220318
from _sre import MAXREPEAT, MAXGROUPS
# SRE standard exception (access as sre.error)
# should this really be here?
class error(Exception):
"""Exception raised for invalid regular expressions.
Attributes:
msg: The unformatted error message
pattern: The regular expression pattern
pos: The index in the pattern where compilation failed (may be None)
lineno: The line corresponding to pos (may be None)
colno: The column corresponding to pos (may be None)
"""
__module__ = 're'
def __init__(self, msg, pattern=None, pos=None):
self.msg = msg
self.pattern = pattern
self.pos = pos
if pattern is not None and pos is not None:
msg = '%s at position %d' % (msg, pos)
if isinstance(pattern, str):
newline = '\n'
else:
newline = b'\n'
self.lineno = pattern.count(newline, 0, pos) + 1
self.colno = pos - pattern.rfind(newline, 0, pos)
if newline in pattern:
msg = '%s (line %d, column %d)' % (msg, self.lineno, self.colno)
else:
self.lineno = self.colno = None
super().__init__(msg)
class _NamedIntConstant(int):
def __new__(cls, value, name):
self = super(_NamedIntConstant, cls).__new__(cls, value)
self.name = name
return self
def __repr__(self):
return self.name
MAXREPEAT = _NamedIntConstant(MAXREPEAT, 'MAXREPEAT')
def _makecodes(names):
names = names.strip().split()
items = [_NamedIntConstant(i, name) for i, name in enumerate(names)]
globals().update({item.name: item for item in items})
return items
# operators
# failure=0 success=1 (just because it looks better that way :-)
OPCODES = _makecodes("""
FAILURE SUCCESS
ANY ANY_ALL
ASSERT ASSERT_NOT
AT
BRANCH
CALL
CATEGORY
CHARSET BIGCHARSET
GROUPREF GROUPREF_EXISTS
IN
INFO
JUMP
LITERAL
MARK
MAX_UNTIL
MIN_UNTIL
NOT_LITERAL
NEGATE
RANGE
REPEAT
REPEAT_ONE
SUBPATTERN
MIN_REPEAT_ONE
ATOMIC_GROUP
POSSESSIVE_REPEAT
POSSESSIVE_REPEAT_ONE
GROUPREF_IGNORE
IN_IGNORE
LITERAL_IGNORE
NOT_LITERAL_IGNORE
GROUPREF_LOC_IGNORE
IN_LOC_IGNORE
LITERAL_LOC_IGNORE
NOT_LITERAL_LOC_IGNORE
GROUPREF_UNI_IGNORE
IN_UNI_IGNORE
LITERAL_UNI_IGNORE
NOT_LITERAL_UNI_IGNORE
RANGE_UNI_IGNORE
MIN_REPEAT MAX_REPEAT
""")
del OPCODES[-2:] # remove MIN_REPEAT and MAX_REPEAT
# positions
ATCODES = _makecodes("""
AT_BEGINNING AT_BEGINNING_LINE AT_BEGINNING_STRING
AT_BOUNDARY AT_NON_BOUNDARY
AT_END AT_END_LINE AT_END_STRING
AT_LOC_BOUNDARY AT_LOC_NON_BOUNDARY
AT_UNI_BOUNDARY AT_UNI_NON_BOUNDARY
""")
# categories
CHCODES = _makecodes("""
CATEGORY_DIGIT CATEGORY_NOT_DIGIT
CATEGORY_SPACE CATEGORY_NOT_SPACE
CATEGORY_WORD CATEGORY_NOT_WORD
CATEGORY_LINEBREAK CATEGORY_NOT_LINEBREAK
CATEGORY_LOC_WORD CATEGORY_LOC_NOT_WORD
CATEGORY_UNI_DIGIT CATEGORY_UNI_NOT_DIGIT
CATEGORY_UNI_SPACE CATEGORY_UNI_NOT_SPACE
CATEGORY_UNI_WORD CATEGORY_UNI_NOT_WORD
CATEGORY_UNI_LINEBREAK CATEGORY_UNI_NOT_LINEBREAK
""")
# replacement operations for "ignore case" mode
OP_IGNORE = {
LITERAL: LITERAL_IGNORE,
NOT_LITERAL: NOT_LITERAL_IGNORE,
}
OP_LOCALE_IGNORE = {
LITERAL: LITERAL_LOC_IGNORE,
NOT_LITERAL: NOT_LITERAL_LOC_IGNORE,
}
OP_UNICODE_IGNORE = {
LITERAL: LITERAL_UNI_IGNORE,
NOT_LITERAL: NOT_LITERAL_UNI_IGNORE,
}
AT_MULTILINE = {
AT_BEGINNING: AT_BEGINNING_LINE,
AT_END: AT_END_LINE
}
AT_LOCALE = {
AT_BOUNDARY: AT_LOC_BOUNDARY,
AT_NON_BOUNDARY: AT_LOC_NON_BOUNDARY
}
AT_UNICODE = {
AT_BOUNDARY: AT_UNI_BOUNDARY,
AT_NON_BOUNDARY: AT_UNI_NON_BOUNDARY
}
CH_LOCALE = {
CATEGORY_DIGIT: CATEGORY_DIGIT,
CATEGORY_NOT_DIGIT: CATEGORY_NOT_DIGIT,
CATEGORY_SPACE: CATEGORY_SPACE,
CATEGORY_NOT_SPACE: CATEGORY_NOT_SPACE,
CATEGORY_WORD: CATEGORY_LOC_WORD,
CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
CATEGORY_LINEBREAK: CATEGORY_LINEBREAK,
CATEGORY_NOT_LINEBREAK: CATEGORY_NOT_LINEBREAK
}
CH_UNICODE = {
CATEGORY_DIGIT: CATEGORY_UNI_DIGIT,
CATEGORY_NOT_DIGIT: CATEGORY_UNI_NOT_DIGIT,
CATEGORY_SPACE: CATEGORY_UNI_SPACE,
CATEGORY_NOT_SPACE: CATEGORY_UNI_NOT_SPACE,
CATEGORY_WORD: CATEGORY_UNI_WORD,
CATEGORY_NOT_WORD: CATEGORY_UNI_NOT_WORD,
CATEGORY_LINEBREAK: CATEGORY_UNI_LINEBREAK,
CATEGORY_NOT_LINEBREAK: CATEGORY_UNI_NOT_LINEBREAK
}
# flags
SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
SRE_FLAG_IGNORECASE = 2 # case insensitive
SRE_FLAG_LOCALE = 4 # honour system locale
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
SRE_FLAG_DOTALL = 16 # treat target as a single string
SRE_FLAG_UNICODE = 32 # use unicode "locale"
SRE_FLAG_VERBOSE = 64 # ignore whitespace and comments
SRE_FLAG_DEBUG = 128 # debugging
SRE_FLAG_ASCII = 256 # use ascii "locale"
# flags for INFO primitive
SRE_INFO_PREFIX = 1 # has prefix
SRE_INFO_LITERAL = 2 # entire pattern is literal (given by prefix)
SRE_INFO_CHARSET = 4 # pattern starts with character from given set
if __name__ == "__main__":
def dump(f, d, prefix):
items = sorted(d)
for item in items:
f.write("#define %s_%s %d\n" % (prefix, item, item))
with open("sre_constants.h", "w") as f:
f.write("""\
/*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
*
* NOTE: This file is generated by sre_constants.py. If you need
* to change anything in here, edit sre_constants.py and run it.
*
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*
* See the _sre.c file for information on usage and redistribution.
*/
""")
f.write("#define SRE_MAGIC %d\n" % MAGIC)
dump(f, OPCODES, "SRE_OP")
dump(f, ATCODES, "SRE")
dump(f, CHCODES, "SRE")
f.write("#define SRE_FLAG_TEMPLATE %d\n" % SRE_FLAG_TEMPLATE)
f.write("#define SRE_FLAG_IGNORECASE %d\n" % SRE_FLAG_IGNORECASE)
f.write("#define SRE_FLAG_LOCALE %d\n" % SRE_FLAG_LOCALE)
f.write("#define SRE_FLAG_MULTILINE %d\n" % SRE_FLAG_MULTILINE)
f.write("#define SRE_FLAG_DOTALL %d\n" % SRE_FLAG_DOTALL)
f.write("#define SRE_FLAG_UNICODE %d\n" % SRE_FLAG_UNICODE)
f.write("#define SRE_FLAG_VERBOSE %d\n" % SRE_FLAG_VERBOSE)
f.write("#define SRE_FLAG_DEBUG %d\n" % SRE_FLAG_DEBUG)
f.write("#define SRE_FLAG_ASCII %d\n" % SRE_FLAG_ASCII)
f.write("#define SRE_INFO_PREFIX %d\n" % SRE_INFO_PREFIX)
f.write("#define SRE_INFO_LITERAL %d\n" % SRE_INFO_LITERAL)
f.write("#define SRE_INFO_CHARSET %d\n" % SRE_INFO_CHARSET)
print("done")
from re import _constants as _
globals().update({k: v for k, v in vars(_).items() if k[:2] != '__'})

File diff suppressed because it is too large Load Diff

View File

@ -221,7 +221,7 @@ class PyclbrTest(TestCase):
cm('cgi', ignore=('log',)) # set with = in module
cm('pickle', ignore=('partial', 'PickleBuffer'))
cm('aifc', ignore=('_aifc_params',)) # set with = in module
cm('sre_parse', ignore=('dump', 'groups', 'pos')) # from sre_constants import *; property
cm('re._parser', ignore=('dump', 'groups', 'pos')) # from ._constants import *; property
cm(
'pdb',
# pyclbr does not handle elegantly `typing` or properties

View File

@ -3,8 +3,8 @@ from test.support import (gc_collect, bigmemtest, _2G,
check_disallow_instantiation, is_emscripten)
import locale
import re
import sre_compile
import string
import sys
import time
import unittest
import warnings
@ -569,7 +569,7 @@ class ReTests(unittest.TestCase):
'two branches', 10)
def test_re_groupref_overflow(self):
from sre_constants import MAXGROUPS
from re._constants import MAXGROUPS
self.checkTemplateError('()', r'\g<%s>' % MAXGROUPS, 'xx',
'invalid group reference %d' % MAXGROUPS, 3)
self.checkPatternError(r'(?P<a>)(?(%d))' % MAXGROUPS,
@ -2433,7 +2433,7 @@ class ImplementationTest(unittest.TestCase):
tp.foo = 1
def test_overlap_table(self):
f = sre_compile._generate_overlap_table
f = re._compiler._generate_overlap_table
self.assertEqual(f(""), [])
self.assertEqual(f("a"), [0])
self.assertEqual(f("abcd"), [0, 0, 0, 0])
@ -2442,8 +2442,8 @@ class ImplementationTest(unittest.TestCase):
self.assertEqual(f("abcabdac"), [0, 0, 0, 1, 2, 0, 1, 0])
def test_signedness(self):
self.assertGreaterEqual(sre_compile.MAXREPEAT, 0)
self.assertGreaterEqual(sre_compile.MAXGROUPS, 0)
self.assertGreaterEqual(re._compiler.MAXREPEAT, 0)
self.assertGreaterEqual(re._compiler.MAXGROUPS, 0)
@cpython_only
def test_disallow_instantiation(self):
@ -2453,6 +2453,32 @@ class ImplementationTest(unittest.TestCase):
pat = re.compile("")
check_disallow_instantiation(self, type(pat.scanner("")))
def test_deprecated_modules(self):
deprecated = {
'sre_compile': ['compile', 'error',
'SRE_FLAG_IGNORECASE', 'SUBPATTERN',
'_compile_info'],
'sre_constants': ['error', 'SRE_FLAG_IGNORECASE', 'SUBPATTERN',
'_NamedIntConstant'],
'sre_parse': ['SubPattern', 'parse',
'SRE_FLAG_IGNORECASE', 'SUBPATTERN',
'_parse_sub'],
}
for name in deprecated:
with self.subTest(module=name):
sys.modules.pop(name, None)
with self.assertWarns(DeprecationWarning) as cm:
__import__(name)
self.assertEqual(str(cm.warnings[0].message),
f"module {name!r} is deprecated")
self.assertEqual(cm.warnings[0].filename, __file__)
self.assertIn(name, sys.modules)
mod = sys.modules[name]
self.assertEqual(mod.__name__, name)
self.assertEqual(mod.__package__, '')
for attr in deprecated[name]:
self.assertTrue(hasattr(mod, attr))
del sys.modules[name]
class ExternalTests(unittest.TestCase):

View File

@ -523,7 +523,7 @@ class StartupImportTests(unittest.TestCase):
self.assertIn('site', modules)
# http://bugs.python.org/issue19205
re_mods = {'re', '_sre', 'sre_compile', 'sre_constants', 'sre_parse'}
re_mods = {'re', '_sre', 're._compiler', 're._constants', 're._parser'}
self.assertFalse(modules.intersection(re_mods), stderr)
# http://bugs.python.org/issue9548

View File

@ -1862,6 +1862,7 @@ LIBSUBDIRS= asyncio \
logging \
multiprocessing multiprocessing/dummy \
pydoc_data \
re \
site-packages \
sqlite3 \
tkinter \

View File

@ -0,0 +1,2 @@
Convert the :mod:`re` module into a package. Deprecate modules ``sre_compile``,
``sre_constants`` and ``sre_parse``.

View File

@ -3,8 +3,8 @@
*
* regular expression matching engine
*
* NOTE: This file is generated by sre_constants.py. If you need
* to change anything in here, edit sre_constants.py and run it.
* NOTE: This file is generated by Lib/re/_constants.py. If you need
* to change anything in here, edit Lib/re/_constants.py and run it.
*
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
*