cpython/Lib/sre_parse.py

896 lines
32 KiB
Python
Raw Normal View History

#
# Secret Labs' Regular Expression Engine
#
# convert re-style regular expression to sre pattern
#
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
#
# See the sre.py file for information on usage and redistribution.
#
"""Internal support module for sre"""
# XXX: show string offset and offending character for all errors
from sre_constants import *
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"
DIGITS = frozenset("0123456789")
OCTDIGITS = frozenset("01234567")
HEXDIGITS = frozenset("0123456789abcdefABCDEF")
WHITESPACE = frozenset(" \t\n\r\v\f")
_REPEATCODES = frozenset({MIN_REPEAT, MAX_REPEAT})
_UNITCODES = frozenset({ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY})
ESCAPES = {
r"\a": (LITERAL, ord("\a")),
r"\b": (LITERAL, ord("\b")),
r"\f": (LITERAL, ord("\f")),
r"\n": (LITERAL, ord("\n")),
r"\r": (LITERAL, ord("\r")),
r"\t": (LITERAL, ord("\t")),
r"\v": (LITERAL, ord("\v")),
r"\\": (LITERAL, ord("\\"))
}
CATEGORIES = {
r"\A": (AT, AT_BEGINNING_STRING), # start of string
r"\b": (AT, AT_BOUNDARY),
r"\B": (AT, AT_NON_BOUNDARY),
r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
r"\Z": (AT, AT_END_STRING), # end of string
}
FLAGS = {
2000-06-29 05:58:44 -03:00
# standard flags
"i": SRE_FLAG_IGNORECASE,
"L": SRE_FLAG_LOCALE,
"m": SRE_FLAG_MULTILINE,
"s": SRE_FLAG_DOTALL,
"x": SRE_FLAG_VERBOSE,
2000-06-29 05:58:44 -03:00
# extensions
"a": SRE_FLAG_ASCII,
2000-06-29 05:58:44 -03:00
"t": SRE_FLAG_TEMPLATE,
"u": SRE_FLAG_UNICODE,
}
class Pattern:
# master pattern object. keeps track of global attributes
def __init__(self):
self.flags = 0
self.open = []
self.groups = 1
self.groupdict = {}
def opengroup(self, name=None):
gid = self.groups
self.groups = gid + 1
if self.groups > MAXGROUPS:
raise error("groups number is too large")
2002-06-01 21:40:05 -03:00
if name is not None:
ogid = self.groupdict.get(name, None)
if ogid is not None:
raise error("redefinition of group name %r as group %d; "
"was group %d" % (name, gid, ogid))
self.groupdict[name] = gid
self.open.append(gid)
return gid
def closegroup(self, gid):
self.open.remove(gid)
def checkgroup(self, gid):
return gid < self.groups and gid not in self.open
class SubPattern:
# a subpattern, in intermediate form
def __init__(self, pattern, data=None):
self.pattern = pattern
2002-06-01 21:40:05 -03:00
if data is None:
data = []
self.data = data
self.width = None
def dump(self, level=0):
nl = True
seqtypes = (tuple, list)
for op, av in self.data:
print(level*" " + str(op), end='')
if op is IN:
# member sublanguage
print()
for op, a in av:
print((level+1)*" " + str(op), a)
elif op is BRANCH:
print()
for i, a in enumerate(av[1]):
if i:
print(level*" " + "OR")
a.dump(level+1)
elif op is GROUPREF_EXISTS:
condgroup, item_yes, item_no = av
print('', condgroup)
item_yes.dump(level+1)
if item_no:
print(level*" " + "ELSE")
item_no.dump(level+1)
elif isinstance(av, seqtypes):
nl = False
for a in av:
if isinstance(a, SubPattern):
if not nl:
print()
a.dump(level+1)
nl = True
else:
if not nl:
print(' ', end='')
print(a, end='')
nl = False
if not nl:
print()
else:
print('', av)
def __repr__(self):
return repr(self.data)
def __len__(self):
return len(self.data)
def __delitem__(self, index):
del self.data[index]
def __getitem__(self, index):
Merged revisions 53005-53303 via svnmerge from svn+ssh://pythondev@svn.python.org/python/trunk ........ r53012 | walter.doerwald | 2006-12-12 22:55:31 +0100 (Tue, 12 Dec 2006) | 2 lines Fix typo. ........ r53023 | brett.cannon | 2006-12-13 23:31:37 +0100 (Wed, 13 Dec 2006) | 2 lines Remove an unneeded import of 'warnings'. ........ r53025 | brett.cannon | 2006-12-14 00:02:38 +0100 (Thu, 14 Dec 2006) | 2 lines Remove unneeded imports of 'warnings'. ........ r53026 | brett.cannon | 2006-12-14 00:09:53 +0100 (Thu, 14 Dec 2006) | 4 lines Add test.test_support.guard_warnings_filter . This function returns a context manager that protects warnings.filter from being modified once the context is exited. ........ r53029 | george.yoshida | 2006-12-14 03:22:44 +0100 (Thu, 14 Dec 2006) | 2 lines Note that guard_warnings_filter was added in 2.6 ........ r53031 | vinay.sajip | 2006-12-14 09:53:55 +0100 (Thu, 14 Dec 2006) | 1 line Added news on recent changes to logging ........ r53032 | andrew.kuchling | 2006-12-14 19:57:53 +0100 (Thu, 14 Dec 2006) | 1 line [Patch #1599256 from David Watson] check that os.fsync is available before using it ........ r53042 | kurt.kaiser | 2006-12-15 06:13:11 +0100 (Fri, 15 Dec 2006) | 6 lines 1. Avoid hang when encountering a duplicate in a completion list. Bug 1571112. 2. Duplicate some old entries from Python's NEWS to IDLE's NEWS.txt M AutoCompleteWindow.py M NEWS.txt ........ r53048 | andrew.kuchling | 2006-12-18 18:12:31 +0100 (Mon, 18 Dec 2006) | 1 line [Bug #1618083] Add missing word; make a few grammar fixes ........ r53050 | andrew.kuchling | 2006-12-18 18:16:05 +0100 (Mon, 18 Dec 2006) | 1 line Bump version ........ r53051 | andrew.kuchling | 2006-12-18 18:22:07 +0100 (Mon, 18 Dec 2006) | 1 line [Bug #1616726] Fix description of generator.close(); if you raise some random exception, the exception is raised and doesn't trigger a RuntimeError ........ r53052 | andrew.kuchling | 2006-12-18 18:38:14 +0100 (Mon, 18 Dec 2006) | 1 line Describe new methods in Queue module ........ r53053 | andrew.kuchling | 2006-12-18 20:22:24 +0100 (Mon, 18 Dec 2006) | 1 line [Patch #1615868 by Lars Gustaebel] Use Py_off_t to fix BZ2File.seek() for offsets > 2Gb ........ r53057 | andrew.kuchling | 2006-12-18 22:29:07 +0100 (Mon, 18 Dec 2006) | 1 line Fix markup ........ r53063 | thomas.wouters | 2006-12-19 09:17:50 +0100 (Tue, 19 Dec 2006) | 5 lines Make sre's SubPattern objects accept slice objects like it already accepts simple slices. ........ r53065 | andrew.kuchling | 2006-12-19 15:13:05 +0100 (Tue, 19 Dec 2006) | 6 lines [Patch #1618455 by Ben Maurer] Improve speed of HMAC by using str.translate() instead of a more general XOR that has to construct a list. Slightly modified from Maurer's patch: the _strxor() function is no longer necessary at all. ........ r53066 | andrew.kuchling | 2006-12-19 15:28:23 +0100 (Tue, 19 Dec 2006) | 9 lines [Bug #1613651] Document socket.recv_into, socket.recvfrom_into Also, the text for recvfrom told you to read recv() for an explanation of the 'flags' argument, but recv() just pointed you at the man page. Copied the man-page text to recvfrom(), recvfrom_into, recv_into to avoid the pointless redirection. I don't have LaTeX on this machine; hope my markup is OK. ........ r53067 | andrew.kuchling | 2006-12-19 15:29:04 +0100 (Tue, 19 Dec 2006) | 1 line Comment typo ........ r53068 | andrew.kuchling | 2006-12-19 16:11:41 +0100 (Tue, 19 Dec 2006) | 1 line [Patch #1617413 from Dug Song] Fix HTTP Basic authentication via HTTPS ........ r53071 | andrew.kuchling | 2006-12-19 16:18:12 +0100 (Tue, 19 Dec 2006) | 1 line [Patch #1600491 from Jim Jewett] Describe how to build help files on Windows ........ r53073 | andrew.kuchling | 2006-12-19 16:43:10 +0100 (Tue, 19 Dec 2006) | 6 lines [Patch #1587139 by kxroberto] Protect lock acquisition/release with try...finally to ensure the lock is always released. This could use the 'with' statement, but the patch uses 'finally'. 2.5 backport candidate. ........ r53074 | vinay.sajip | 2006-12-19 19:29:11 +0100 (Tue, 19 Dec 2006) | 1 line Updated documentation for findCaller() to indicate that a 3-tuple is now returned, rather than a 2-tuple. ........ r53090 | georg.brandl | 2006-12-19 23:06:46 +0100 (Tue, 19 Dec 2006) | 3 lines Patch #1484695: The tarfile module now raises a HeaderError exception if a buffer given to frombuf() is invalid. ........ r53099 | raymond.hettinger | 2006-12-20 07:42:06 +0100 (Wed, 20 Dec 2006) | 5 lines Bug #1590891: random.randrange don't return correct value for big number Needs to be backported. ........ r53106 | georg.brandl | 2006-12-20 12:55:16 +0100 (Wed, 20 Dec 2006) | 3 lines Testcase for patch #1484695. ........ r53110 | andrew.kuchling | 2006-12-20 20:48:20 +0100 (Wed, 20 Dec 2006) | 17 lines [Apply length-checking.diff from bug #1599254] Add length checking to single-file mailbox formats: before doing a flush() on a mailbox, seek to the end and verify its length is unchanged, raising ExternalClashError if the file's length has changed. This fix avoids potential data loss if some other process appends to the mailbox file after the table of contents has been generated; instead of overwriting the modified file, you'll get the exception. I also noticed that the self._lookup() call in self.flush() wasn't necessary (everything that sets self._pending to True also calls self.lookup()), and replaced it by an assertion. 2.5 backport candidate. ........ r53112 | andrew.kuchling | 2006-12-20 20:57:10 +0100 (Wed, 20 Dec 2006) | 1 line [Bug #1619674] Make sum() use the term iterable, not sequence ........ r53113 | andrew.kuchling | 2006-12-20 20:58:11 +0100 (Wed, 20 Dec 2006) | 1 line Two grammar fixes ........ r53115 | andrew.kuchling | 2006-12-20 21:11:12 +0100 (Wed, 20 Dec 2006) | 5 lines Some other built-in functions are described with 'sequence' arguments that should really be 'iterable'; this commit changes them. Did I miss any? Did I introduce any errors? ........ r53117 | andrew.kuchling | 2006-12-20 21:20:42 +0100 (Wed, 20 Dec 2006) | 1 line [Bug #1619680] in_dll() arguments are documented in the wrong order ........ r53120 | neal.norwitz | 2006-12-21 05:38:00 +0100 (Thu, 21 Dec 2006) | 1 line Lars asked for permission on on python-dev for work on tarfile.py ........ r53125 | andrew.kuchling | 2006-12-21 14:40:29 +0100 (Thu, 21 Dec 2006) | 1 line Mention the os.SEEK_* constants ........ r53129 | walter.doerwald | 2006-12-21 19:06:30 +0100 (Thu, 21 Dec 2006) | 2 lines Fix typo. ........ r53131 | thomas.heller | 2006-12-21 19:30:56 +0100 (Thu, 21 Dec 2006) | 3 lines Fix wrong markup of an argument in a method signature. Will backport. ........ r53137 | andrew.kuchling | 2006-12-22 01:50:56 +0100 (Fri, 22 Dec 2006) | 1 line Typo fix ........ r53139 | andrew.kuchling | 2006-12-22 14:25:02 +0100 (Fri, 22 Dec 2006) | 1 line [Bug #737202; fix from Titus Brown] Make CGIHTTPServer work for scripts in sub-directories ........ r53141 | andrew.kuchling | 2006-12-22 16:04:45 +0100 (Fri, 22 Dec 2006) | 6 lines [Bug #802128] Make the mode argument of dumbdbm actually work the way it's described, and add a test for it. 2.5 bugfix candidate, maybe; arguably this patch changes the API of dumbdbm and shouldn't be added in a point-release. ........ r53142 | andrew.kuchling | 2006-12-22 16:16:58 +0100 (Fri, 22 Dec 2006) | 6 lines [Bug #802128 continued] Modify mode depending on the process umask. Is there really no other way to read the umask than to set it? Hope this works on Windows... ........ r53145 | andrew.kuchling | 2006-12-22 17:43:26 +0100 (Fri, 22 Dec 2006) | 1 line [Bug #776202] Apply Walter Doerwald's patch to use text mode for encoded files ........ r53146 | andrew.kuchling | 2006-12-22 19:41:42 +0100 (Fri, 22 Dec 2006) | 9 lines [Patch #783050 from Patrick Lynch] The emulation of forkpty() is incorrect; the master should close the slave fd. Added a test to test_pty.py that reads from the master_fd after doing a pty.fork(); without the fix it hangs forever instead of raising an exception. (<crossing fingers for the buildbots>) 2.5 backport candidate. ........ r53147 | andrew.kuchling | 2006-12-22 20:06:16 +0100 (Fri, 22 Dec 2006) | 1 line [Patch #827559 from Chris Gonnerman] Make SimpleHTTPServer redirect when a directory URL is missing the trailing slash; this lets relative links work. ........ r53149 | andrew.kuchling | 2006-12-22 20:21:27 +0100 (Fri, 22 Dec 2006) | 1 line Darn; this test works when you run test_pty.py directly, but fails when regrtest runs it (the os.read() raises os.error). I can't figure out the cause, so am commenting out the test. ........ r53150 | andrew.kuchling | 2006-12-22 22:48:19 +0100 (Fri, 22 Dec 2006) | 1 line Frak; this test also fails ........ r53153 | lars.gustaebel | 2006-12-23 17:40:13 +0100 (Sat, 23 Dec 2006) | 5 lines Patch #1230446: tarfile.py: fix ExFileObject so that read() and tell() work correctly together with readline(). Will backport to 2.5. ........ r53155 | lars.gustaebel | 2006-12-23 18:57:23 +0100 (Sat, 23 Dec 2006) | 5 lines Patch #1262036: Prevent TarFiles from being added to themselves under certain conditions. Will backport to 2.5. ........ r53159 | andrew.kuchling | 2006-12-27 04:25:31 +0100 (Wed, 27 Dec 2006) | 4 lines [Part of patch #1182394] Move the HMAC blocksize to be a class-level constant; this allows changing it in a subclass. To accommodate this, copy() now uses __class__. Also add some text to a comment. ........ r53160 | andrew.kuchling | 2006-12-27 04:31:24 +0100 (Wed, 27 Dec 2006) | 1 line [Rest of patch #1182394] Add ._current() method so that we can use the written-in-C .hexdigest() method ........ r53161 | lars.gustaebel | 2006-12-27 11:30:46 +0100 (Wed, 27 Dec 2006) | 4 lines Patch #1504073: Fix tarfile.open() for mode "r" with a fileobj argument. Will backport to 2.5. ........ r53165 | neal.norwitz | 2006-12-28 05:39:20 +0100 (Thu, 28 Dec 2006) | 1 line Remove a stray (old) macro name left around (I guess) ........ r53188 | neal.norwitz | 2006-12-29 04:01:53 +0100 (Fri, 29 Dec 2006) | 1 line SF bug #1623890, fix argument name in docstring ........ r53200 | raymond.hettinger | 2006-12-30 05:01:17 +0100 (Sat, 30 Dec 2006) | 1 line For sets with cyclical reprs, emit an ellipsis instead of infinitely recursing. ........ r53232 | brett.cannon | 2007-01-04 01:23:49 +0100 (Thu, 04 Jan 2007) | 3 lines Add EnvironmentVarGuard to test.test_support. Provides a context manager to temporarily set or unset environment variables. ........ r53235 | neal.norwitz | 2007-01-04 07:25:31 +0100 (Thu, 04 Jan 2007) | 1 line SF #1627373, fix typo in CarbonEvt. ........ r53244 | raymond.hettinger | 2007-01-04 18:53:34 +0100 (Thu, 04 Jan 2007) | 1 line Fix stability of heapq's nlargest() and nsmallest(). ........ r53249 | martin.v.loewis | 2007-01-04 22:06:12 +0100 (Thu, 04 Jan 2007) | 3 lines Bug #1566280: Explicitly invoke threading._shutdown from Py_Main, to avoid relying on atexit. Will backport to 2.5. ........ r53252 | gregory.p.smith | 2007-01-05 02:59:42 +0100 (Fri, 05 Jan 2007) | 3 lines Support linking of the bsddb module against BerkeleyDB 4.5.x (will backport to 2.5) ........ r53253 | gregory.p.smith | 2007-01-05 03:06:17 +0100 (Fri, 05 Jan 2007) | 2 lines bump module version to match supported berkeleydb version ........ r53255 | neal.norwitz | 2007-01-05 06:25:22 +0100 (Fri, 05 Jan 2007) | 6 lines Prevent crash on shutdown which can occur if we are finalizing and the module dict has been cleared already and some object raises a warning (like in a __del__). Will backport. ........ r53258 | gregory.p.smith | 2007-01-05 08:21:35 +0100 (Fri, 05 Jan 2007) | 2 lines typo fix ........ r53260 | neal.norwitz | 2007-01-05 09:06:43 +0100 (Fri, 05 Jan 2007) | 1 line Add Collin Winter for access to update PEP 3107 ........ r53262 | andrew.kuchling | 2007-01-05 15:22:17 +0100 (Fri, 05 Jan 2007) | 1 line [Bug #1622533] Make docstrings raw strings because they contain control characters (\0, \1) ........ r53264 | andrew.kuchling | 2007-01-05 16:51:24 +0100 (Fri, 05 Jan 2007) | 1 line [Patch #1520904] Fix bsddb tests to write to the temp directory instead of the Lib/bsddb/test directory ........ r53279 | brett.cannon | 2007-01-05 22:45:09 +0100 (Fri, 05 Jan 2007) | 3 lines Silence a warning from gcc 4.0.1 by specifying a function's parameter list is 'void' instead of just a set of empty parentheses. ........ r53285 | raymond.hettinger | 2007-01-06 02:14:41 +0100 (Sat, 06 Jan 2007) | 2 lines SF# 1409443: Expand comment to cover the interaction between f->f_lasti and the PREDICT macros. ........ r53286 | anthony.baxter | 2007-01-06 05:45:54 +0100 (Sat, 06 Jan 2007) | 1 line update to (c) years to include 2007 ........ r53291 | neal.norwitz | 2007-01-06 22:24:35 +0100 (Sat, 06 Jan 2007) | 1 line Add Josiah to SF for maintaining asyncore/asynchat ........ r53293 | peter.astrand | 2007-01-07 09:53:46 +0100 (Sun, 07 Jan 2007) | 1 line Re-implemented fix for #1531862 once again, in a way that works with Python 2.2. Fixes bug #1603424. ........ r53295 | peter.astrand | 2007-01-07 15:34:16 +0100 (Sun, 07 Jan 2007) | 1 line Avoid O(N**2) bottleneck in _communicate_(). Fixes #1598181. ........ r53300 | raymond.hettinger | 2007-01-08 19:09:20 +0100 (Mon, 08 Jan 2007) | 1 line Fix zero-length corner case for iterating over a mutating deque. ........ r53301 | vinay.sajip | 2007-01-08 19:50:32 +0100 (Mon, 08 Jan 2007) | 4 lines Bare except clause removed from SMTPHandler.emit(). Now, only ImportError is trapped. Bare except clause removed from SocketHandler.createSocket(). Now, only socket.error is trapped. (SF #411881) ........ r53302 | vinay.sajip | 2007-01-08 19:51:46 +0100 (Mon, 08 Jan 2007) | 2 lines Bare except clause removed from LogRecord.__init__. Now, only ValueError, TypeError and AttributeError are trapped. (SF #411881) ........ r53303 | vinay.sajip | 2007-01-08 19:52:36 +0100 (Mon, 08 Jan 2007) | 1 line Added entries about removal of some bare except clauses from logging. ........
2007-01-09 19:18:33 -04:00
if isinstance(index, slice):
return SubPattern(self.pattern, self.data[index])
return self.data[index]
def __setitem__(self, index, code):
self.data[index] = code
def insert(self, index, code):
self.data.insert(index, code)
def append(self, code):
self.data.append(code)
def getwidth(self):
# determine the width (min, max) for this subpattern
if self.width is not None:
return self.width
lo = hi = 0
for op, av in self.data:
if op is BRANCH:
i = MAXREPEAT - 1
j = 0
for av in av[1]:
l, h = av.getwidth()
i = min(i, l)
j = max(j, h)
lo = lo + i
hi = hi + j
elif op is CALL:
i, j = av.getwidth()
lo = lo + i
hi = hi + j
elif op is SUBPATTERN:
i, j = av[1].getwidth()
lo = lo + i
hi = hi + j
elif op in _REPEATCODES:
i, j = av[2].getwidth()
lo = lo + i * av[0]
hi = hi + j * av[1]
elif op in _UNITCODES:
lo = lo + 1
hi = hi + 1
elif op == SUCCESS:
break
self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
return self.width
class Tokenizer:
def __init__(self, string):
self.istext = isinstance(string, str)
self.string = string
if not self.istext:
string = str(string, 'latin1')
self.decoded_string = string
self.index = 0
self.next = None
self.__next()
def __next(self):
index = self.index
try:
char = self.decoded_string[index]
except IndexError:
self.next = None
return
if char == "\\":
index += 1
try:
char += self.decoded_string[index]
except IndexError:
raise error("bogus escape (end of line)",
self.string, len(self.string) - 1) from None
self.index = index + 1
self.next = char
def match(self, char):
if char == self.next:
self.__next()
return True
return False
def get(self):
this = self.next
self.__next()
return this
def getwhile(self, n, charset):
result = ''
for _ in range(n):
c = self.next
if c not in charset:
break
result += c
self.__next()
return result
def getuntil(self, terminator):
result = ''
while True:
c = self.next
self.__next()
if c is None:
raise self.error("unterminated name")
if c == terminator:
break
result += c
return result
def tell(self):
return self.index - len(self.next or '')
def seek(self, index):
self.index = index
self.__next()
def error(self, msg, offset=0):
return error(msg, self.string, self.tell() - offset)
# The following three functions are not used in this module anymore, but we keep
# them here (with DeprecationWarnings) for backwards compatibility.
def isident(char):
import warnings
warnings.warn('sre_parse.isident() will be removed in 3.5',
DeprecationWarning, stacklevel=2)
return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
def isdigit(char):
import warnings
warnings.warn('sre_parse.isdigit() will be removed in 3.5',
DeprecationWarning, stacklevel=2)
return "0" <= char <= "9"
def isname(name):
import warnings
warnings.warn('sre_parse.isname() will be removed in 3.5',
DeprecationWarning, stacklevel=2)
# check that group name is a valid string
if not isident(name[0]):
return False
for char in name[1:]:
if not isident(char) and not isdigit(char):
return False
return True
def _class_escape(source, escape):
# handle escape code inside character class
code = ESCAPES.get(escape)
if code:
return code
code = CATEGORIES.get(escape)
if code and code[0] is IN:
return code
try:
c = escape[1:2]
if c == "x":
# hexadecimal escape (exactly two digits)
escape += source.getwhile(2, HEXDIGITS)
if len(escape) != 4:
raise ValueError
return LITERAL, int(escape[2:], 16)
elif c == "u" and source.istext:
# unicode escape (exactly four digits)
escape += source.getwhile(4, HEXDIGITS)
if len(escape) != 6:
raise ValueError
return LITERAL, int(escape[2:], 16)
elif c == "U" and source.istext:
# unicode escape (exactly eight digits)
escape += source.getwhile(8, HEXDIGITS)
if len(escape) != 10:
raise ValueError
c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code
return LITERAL, c
elif c in OCTDIGITS:
# octal escape (up to three digits)
escape += source.getwhile(2, OCTDIGITS)
c = int(escape[1:], 8)
if c > 0o377:
raise source.error('octal escape value %r outside of '
'range 0-0o377' % escape, len(escape))
return LITERAL, c
elif c in DIGITS:
raise ValueError
if len(escape) == 2:
return LITERAL, ord(escape[1])
except ValueError:
pass
raise source.error("bogus escape: %r" % escape, len(escape))
def _escape(source, escape, state):
# handle escape code in expression
code = CATEGORIES.get(escape)
if code:
return code
code = ESCAPES.get(escape)
if code:
return code
try:
c = escape[1:2]
if c == "x":
# hexadecimal escape
escape += source.getwhile(2, HEXDIGITS)
if len(escape) != 4:
raise ValueError
return LITERAL, int(escape[2:], 16)
elif c == "u" and source.istext:
# unicode escape (exactly four digits)
escape += source.getwhile(4, HEXDIGITS)
if len(escape) != 6:
raise ValueError
return LITERAL, int(escape[2:], 16)
elif c == "U" and source.istext:
# unicode escape (exactly eight digits)
escape += source.getwhile(8, HEXDIGITS)
if len(escape) != 10:
raise ValueError
c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code
return LITERAL, c
elif c == "0":
# octal escape
escape += source.getwhile(2, OCTDIGITS)
return LITERAL, int(escape[1:], 8)
elif c in DIGITS:
# octal escape *or* decimal group reference (sigh)
if source.next in DIGITS:
escape += source.get()
if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
source.next in OCTDIGITS):
# got three octal digits; this is an octal escape
escape += source.get()
c = int(escape[1:], 8)
if c > 0o377:
raise source.error('octal escape value %r outside of '
'range 0-0o377' % escape,
len(escape))
return LITERAL, c
# not an octal escape, so this is a group reference
group = int(escape[1:])
if group < state.groups:
if not state.checkgroup(group):
raise source.error("cannot refer to open group",
len(escape))
return GROUPREF, group
raise ValueError
if len(escape) == 2:
return LITERAL, ord(escape[1])
except ValueError:
pass
raise source.error("bogus escape: %r" % escape, len(escape))
def _parse_sub(source, state, nested=True):
# parse an alternation: a|b|c
items = []
itemsappend = items.append
sourcematch = source.match
while True:
itemsappend(_parse(source, state))
if not sourcematch("|"):
break
if nested and source.next is not None and source.next != ")":
raise source.error("pattern not properly closed")
if len(items) == 1:
return items[0]
subpattern = SubPattern(state)
subpatternappend = subpattern.append
# check if all items share a common prefix
while True:
prefix = None
for item in items:
if not item:
break
if prefix is None:
prefix = item[0]
elif item[0] != prefix:
break
else:
# all subitems start with a common "prefix".
# move it out of the branch
for item in items:
del item[0]
subpatternappend(prefix)
continue # check next one
break
# check if the branch can be replaced by a character set
for item in items:
if len(item) != 1 or item[0][0] is not LITERAL:
break
else:
# we can store this as a character set instead of a
# branch (the compiler may optimize this even more)
subpatternappend((IN, [item[0] for item in items]))
return subpattern
subpattern.append((BRANCH, (None, items)))
return subpattern
def _parse_sub_cond(source, state, condgroup):
2004-01-18 16:29:55 -04:00
item_yes = _parse(source, state)
if source.match("|"):
2004-01-18 16:29:55 -04:00
item_no = _parse(source, state)
if source.next == "|":
raise source.error("conditional backref with more than two branches")
else:
item_no = None
if source.next is not None and source.next != ")":
raise source.error("pattern not properly closed")
subpattern = SubPattern(state)
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern
def _parse(source, state):
# parse a simple pattern
subpattern = SubPattern(state)
# precompute constants into local variables
subpatternappend = subpattern.append
sourceget = source.get
sourcematch = source.match
_len = len
_ord = ord
verbose = state.flags & SRE_FLAG_VERBOSE
while True:
this = source.next
if this is None:
break # end of pattern
if this in "|)":
break # end of subpattern
sourceget()
if verbose:
# skip whitespace and comments
if this in WHITESPACE:
continue
if this == "#":
while True:
this = sourceget()
if this is None or this == "\n":
break
continue
if this[0] == "\\":
code = _escape(source, this, state)
subpatternappend(code)
elif this not in SPECIAL_CHARS:
subpatternappend((LITERAL, _ord(this)))
elif this == "[":
# character set
set = []
setappend = set.append
## if sourcematch(":"):
## pass # handle character classes
if sourcematch("^"):
setappend((NEGATE, None))
# check remaining characters
start = set[:]
while True:
this = sourceget()
if this is None:
raise source.error("unexpected end of regular expression")
if this == "]" and set != start:
break
elif this[0] == "\\":
code1 = _class_escape(source, this)
else:
code1 = LITERAL, _ord(this)
if sourcematch("-"):
# potential range
this = sourceget()
if this is None:
raise source.error("unexpected end of regular expression")
if this == "]":
if code1[0] is IN:
code1 = code1[1][0]
setappend(code1)
setappend((LITERAL, _ord("-")))
break
if this[0] == "\\":
code2 = _class_escape(source, this)
else:
code2 = LITERAL, _ord(this)
if code1[0] != LITERAL or code2[0] != LITERAL:
raise source.error("bad character range", len(this))
lo = code1[1]
hi = code2[1]
if hi < lo:
raise source.error("bad character range", len(this))
setappend((RANGE, (lo, hi)))
else:
if code1[0] is IN:
code1 = code1[1][0]
setappend(code1)
# XXX: <fl> should move set optimization to compiler!
if _len(set)==1 and set[0][0] is LITERAL:
subpatternappend(set[0]) # optimization
elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
subpatternappend((NOT_LITERAL, set[1][1])) # optimization
else:
# XXX: <fl> should add charmap optimization here
subpatternappend((IN, set))
elif this in REPEAT_CHARS:
# repeat previous item
here = source.tell()
if this == "?":
min, max = 0, 1
elif this == "*":
min, max = 0, MAXREPEAT
elif this == "+":
min, max = 1, MAXREPEAT
elif this == "{":
if source.next == "}":
subpatternappend((LITERAL, _ord(this)))
continue
min, max = 0, MAXREPEAT
lo = hi = ""
while source.next in DIGITS:
lo += sourceget()
if sourcematch(","):
while source.next in DIGITS:
hi += sourceget()
else:
hi = lo
if not sourcematch("}"):
subpatternappend((LITERAL, _ord(this)))
source.seek(here)
continue
if lo:
min = int(lo)
if min >= MAXREPEAT:
raise OverflowError("the repetition number is too large")
if hi:
max = int(hi)
if max >= MAXREPEAT:
raise OverflowError("the repetition number is too large")
if max < min:
raise source.error("bad repeat interval",
source.tell() - here)
else:
raise source.error("not supported", len(this))
# figure out which item to repeat
if subpattern:
item = subpattern[-1:]
else:
item = None
if not item or (_len(item) == 1 and item[0][0] is AT):
raise source.error("nothing to repeat",
source.tell() - here + len(this))
if item[0][0] in _REPEATCODES:
raise source.error("multiple repeat",
source.tell() - here + len(this))
if sourcematch("?"):
subpattern[-1] = (MIN_REPEAT, (min, max, item))
else:
subpattern[-1] = (MAX_REPEAT, (min, max, item))
elif this == ".":
subpatternappend((ANY, None))
elif this == "(":
group = 1
name = None
condgroup = None
if sourcematch("?"):
group = 0
# options
char = sourceget()
if char is None:
raise self.error("unexpected end of pattern")
if char == "P":
# python extensions
if sourcematch("<"):
# named group: skip forward to end of name
name = source.getuntil(">")
group = 1
if not name:
raise source.error("missing group name", 1)
if not name.isidentifier():
raise source.error("bad character in group name "
"%r" % name,
len(name) + 1)
elif sourcematch("="):
# named backreference
name = source.getuntil(")")
if not name:
raise source.error("missing group name", 1)
if not name.isidentifier():
raise source.error("bad character in backref "
"group name %r" % name,
len(name) + 1)
gid = state.groupdict.get(name)
if gid is None:
msg = "unknown group name: {0!r}".format(name)
raise source.error(msg, len(name) + 1)
subpatternappend((GROUPREF, gid))
continue
else:
char = sourceget()
if char is None:
raise source.error("unexpected end of pattern")
raise source.error("unknown specifier: ?P%s" % char,
len(char))
elif char == ":":
# non-capturing group
group = 2
elif char == "#":
# comment
while True:
if source.next is None:
raise source.error("unbalanced parenthesis")
if sourceget() == ")":
break
continue
elif char in "=!<":
# lookahead assertions
dir = 1
if char == "<":
char = sourceget()
if char is None or char not in "=!":
raise source.error("syntax error")
dir = -1 # lookbehind
p = _parse_sub(source, state)
if not sourcematch(")"):
raise source.error("unbalanced parenthesis")
if char == "=":
subpatternappend((ASSERT, (dir, p)))
else:
subpatternappend((ASSERT_NOT, (dir, p)))
continue
elif char == "(":
# conditional backreference group
condname = source.getuntil(")")
group = 2
if not condname:
raise source.error("missing group name", 1)
if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name: {0!r}".format(condname)
raise source.error(msg, len(condname) + 1)
else:
try:
condgroup = int(condname)
if condgroup < 0:
raise ValueError
except ValueError:
raise source.error("bad character in group name",
len(condname) + 1)
if not condgroup:
raise source.error("bad group number",
len(condname) + 1)
if condgroup >= MAXGROUPS:
raise source.error("the group number is too large",
len(condname) + 1)
elif char in FLAGS:
# flags
state.flags |= FLAGS[char]
while source.next in FLAGS:
state.flags |= FLAGS[sourceget()]
verbose = state.flags & SRE_FLAG_VERBOSE
else:
raise source.error("unexpected end of pattern")
if group:
# parse group contents
if group == 2:
# anonymous group
group = None
else:
try:
group = state.opengroup(name)
except error as err:
raise source.error(err.msg, len(name) + 1)
if condgroup:
p = _parse_sub_cond(source, state, condgroup)
else:
p = _parse_sub(source, state)
if not sourcematch(")"):
raise source.error("unbalanced parenthesis")
if group is not None:
state.closegroup(group)
subpatternappend((SUBPATTERN, (group, p)))
else:
while True:
char = sourceget()
if char is None:
raise source.error("unexpected end of pattern")
if char == ")":
break
raise source.error("unknown extension", len(char))
elif this == "^":
subpatternappend((AT, AT_BEGINNING))
elif this == "$":
subpattern.append((AT, AT_END))
else:
raise source.error("parser error", len(this))
return subpattern
def fix_flags(src, flags):
# Check and fix flags according to the type of pattern (str or bytes)
if isinstance(src, str):
if flags & SRE_FLAG_LOCALE:
import warnings
warnings.warn("LOCALE flag with a str pattern is deprecated. "
"Will be an error in 3.6",
DeprecationWarning, stacklevel=6)
if not flags & SRE_FLAG_ASCII:
flags |= SRE_FLAG_UNICODE
elif flags & SRE_FLAG_UNICODE:
raise ValueError("ASCII and UNICODE flags are incompatible")
else:
if flags & SRE_FLAG_UNICODE:
raise ValueError("can't use UNICODE flag with a bytes pattern")
if flags & SRE_FLAG_LOCALE and flags & SRE_FLAG_ASCII:
import warnings
warnings.warn("ASCII and LOCALE flags are incompatible. "
"Will be an error in 3.6",
DeprecationWarning, stacklevel=6)
return flags
def parse(str, flags=0, pattern=None):
# parse 're' pattern into list of (opcode, argument) tuples
source = Tokenizer(str)
if pattern is None:
pattern = Pattern()
pattern.flags = flags
pattern.str = str
p = _parse_sub(source, pattern, 0)
p.pattern.flags = fix_flags(str, p.pattern.flags)
if source.next is not None:
if source.next == ")":
raise source.error("unbalanced parenthesis")
else:
raise source.error("bogus characters at end of regular expression",
len(tail))
if flags & SRE_FLAG_DEBUG:
p.dump()
if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
# the VERBOSE flag was switched on inside the pattern. to be
# on the safe side, we'll parse the whole thing again...
return parse(str, p.pattern.flags)
return p
2000-06-29 05:58:44 -03:00
def parse_template(source, pattern):
# parse 're' replacement string into list of literals and
# group references
s = Tokenizer(source)
sget = s.get
groups = []
literals = []
literal = []
lappend = literal.append
def addgroup(index):
if literal:
literals.append(''.join(literal))
del literal[:]
groups.append((len(literals), index))
literals.append(None)
while True:
this = sget()
if this is None:
break # end of replacement string
if this[0] == "\\":
# group
c = this[1]
if c == "g":
name = ""
if s.match("<"):
name = s.getuntil(">")
if not name:
raise s.error("missing group name", 1)
try:
index = int(name)
if index < 0:
raise s.error("negative group number", len(name) + 1)
if index >= MAXGROUPS:
raise s.error("the group number is too large",
len(name) + 1)
except ValueError:
if not name.isidentifier():
raise s.error("bad character in group name",
len(name) + 1)
try:
index = pattern.groupindex[name]
except KeyError:
msg = "unknown group name: {0!r}".format(name)
raise IndexError(msg)
addgroup(index)
elif c == "0":
if s.next in OCTDIGITS:
this += sget()
if s.next in OCTDIGITS:
this += sget()
lappend(chr(int(this[1:], 8) & 0xff))
elif c in DIGITS:
isoctal = False
if s.next in DIGITS:
this += sget()
if (c in OCTDIGITS and this[2] in OCTDIGITS and
s.next in OCTDIGITS):
this += sget()
isoctal = True
c = int(this[1:], 8)
if c > 0o377:
raise s.error('octal escape value %r outside of '
'range 0-0o377' % this, len(this))
lappend(chr(c))
if not isoctal:
addgroup(int(this[1:]))
else:
try:
this = chr(ESCAPES[this][1])
except KeyError:
pass
lappend(this)
else:
lappend(this)
if literal:
literals.append(''.join(literal))
if not isinstance(source, str):
# The tokenizer implicitly decodes bytes objects as latin-1, we must
# therefore re-encode the final representation.
literals = [None if s is None else s.encode('latin-1') for s in literals]
return groups, literals
2000-06-29 05:58:44 -03:00
def expand_template(template, match):
g = match.group
empty = match.string[:0]
groups, literals = template
literals = literals[:]
try:
for index, group in groups:
literals[index] = g(group) or empty
except IndexError:
2007-08-29 22:19:48 -03:00
raise error("invalid group reference")
return empty.join(literals)