cpython/Lib/sre_parse.py

#
# Secret Labs' Regular Expression Engine
# $Id$
#
# convert re-style regular expression to SRE template.  the current
# implementation is somewhat incomplete, and not very fast.  should
# definitely be rewritten before Python 1.6 goes beta.
#
# Copyright (c) 1998-2000 by Secret Labs AB.  All rights reserved.
#
# This code can only be used for 1.6 alpha testing.  All other use
# require explicit permission from Secret Labs AB.
#
# Portions of this engine have been developed in cooperation with
# CNRI.  Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#

# FIXME: comments marked with the FIXME tag are open issues.  all such
# issues should be closed before the final beta.

import string, sys

from sre_constants import *

# characters that have a special meaning at the top level of a pattern.
# _parse treats any token whose first character is not listed here as
# a literal.
SPECIAL_CHARS = ".\\[{()*+?^$|"
# characters that start a repeat operator applying to the previous item
REPEAT_CHARS  = "*+?{"

# digit alphabets.  the tokenizer uses HEXDIGITS to delimit "\x"
# escapes; OCTDIGITS is not referenced by the code in this part of
# the file.
OCTDIGITS = "01234567"
HEXDIGITS = "0123456789abcdefABCDEF"

# escapes that stand for one literal character; each value is a ready
# made (LITERAL, character) item.
ESCAPES = {
    "\\a": (LITERAL, chr(7)),
    "\\b": (LITERAL, chr(8)),
    "\\f": (LITERAL, chr(12)),
    "\\n": (LITERAL, chr(10)),
    "\\r": (LITERAL, chr(13)),
    "\\t": (LITERAL, chr(9)),
    "\\v": (LITERAL, chr(11))
}

# escapes that stand for a position test (AT) or for a character
# category (wrapped in a one-element IN set).  "\\b" is listed both
# here and in ESCAPES: _fixescape consults ESCAPES first inside a
# character class (backspace) and CATEGORIES first everywhere else
# (word boundary).
CATEGORIES = {
    "\\A": (AT, AT_BEGINNING), # start of string
    "\\b": (AT, AT_BOUNDARY),
    "\\B": (AT, AT_NON_BOUNDARY),
    "\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),
    "\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),
    "\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),
    "\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),
    "\\w": (IN, [(CATEGORY, CATEGORY_WORD)]),
    "\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),
    "\\Z": (AT, AT_END), # end of string
}

class Pattern:
    # state shared by every subpattern of one regular expression: the
    # flags seen so far, the next free group number and the mapping
    # from group names to group numbers.
    # FIXME: <fl> rename class, and store flags in here too!
    def __init__(self):
        self.flags = []
        self.groups = 1 # number that the next group will receive
        self.groupdict = {}

    def getgroup(self, name=None):
        # hand out the next group number; a non-empty name is also
        # recorded in groupdict so it can be mapped back to the number
        allocated = self.groups
        self.groups = allocated + 1
        if name:
            self.groupdict[name] = allocated
        return allocated

    def setflag(self, flag):
        # remember a flag, keeping each flag at most once
        if flag in self.flags:
            return
        self.flags.append(flag)

class SubPattern:
    # a subpattern, in intermediate form
    def __init__(self, pattern, data=None):
	self.pattern = pattern
	if not data:
	    data = []
	self.data = data
	self.flags = []
	self.width = None
    def __repr__(self):
	return repr(self.data)
    def __len__(self):
	return len(self.data)
    def __delitem__(self, index):
	del self.data[index]
    def __getitem__(self, index):
	return self.data[index]
    def __setitem__(self, index, code):
	self.data[index] = code
    def __getslice__(self, start, stop):
	return SubPattern(self.pattern, self.data[start:stop])
    def insert(self, index, code):
	self.data.insert(index, code)
    def append(self, code):
	self.data.append(code)
    def getwidth(self):
	# determine the width (min, max) for this subpattern
	if self.width:
	    return self.width
	lo = hi = 0L
	for op, av in self.data:
	    if op is BRANCH:
		l = sys.maxint
		h = 0
		for av in av[1]:
		    i, j = av.getwidth()
		    l = min(l, i)
		    h = min(h, j)
		lo = lo + i
		hi = hi + j
	    elif op is CALL:
		i, j = av.getwidth()
		lo = lo + i
		hi = hi + j
	    elif op is SUBPATTERN:
		i, j = av[1].getwidth()
		lo = lo + i
		hi = hi + j
	    elif op in (MIN_REPEAT, MAX_REPEAT):
		i, j = av[2].getwidth()
		lo = lo + i * av[0]
		hi = hi + j * av[1]
	    elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
		lo = lo + 1
		hi = hi + 1
	    elif op == SUCCESS:
		break
	self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
	return self.width
    def set(self, flag):
	if not flag in self.flags:
	    self.flags.append(flag)
    def reset(self, flag):
	if flag in self.flags:
	    self.flags.remove(flag)

class Tokenizer:
    # splits a pattern string into tokens.  a token is either a single
    # character, or a backslash escape together with the hex or decimal
    # digits that follow it.  self.next always holds the upcoming token
    # and is None once the input is exhausted.
    def __init__(self, string):
        # raises SyntaxError ("bogus escape") if the pattern starts
        # with a lone trailing backslash
        self.string = list(string) # characters not yet consumed
        self.next = self.__next()
    def __next(self):
        # remove the next token from self.string and return it, or
        # return None at end of input.  raises SyntaxError if a
        # backslash is the last character of the pattern.
        if not self.string:
            return None
        char = self.string[0]
        if char[0] == "\\":
            try:
                c = self.string[1]
            except IndexError:
                raise SyntaxError("bogus escape")
            char = char + c
            # decide which characters may extend the escape
            if c == "x":
                # hexadecimal constant
                allowed = HEXDIGITS
            elif c in string.digits:
                # decimal (or octal) number
                # FIXME: if larger than current number of
                # groups, interpret as an octal number
                allowed = string.digits
            else:
                allowed = None
            if allowed:
                # swallow digits until one does not fit or the input
                # ends; use what we've got this far
                i = 2
                while i < len(self.string) and self.string[i] in allowed:
                    char = char + self.string[i]
                    i = i + 1
        del self.string[0:len(char)]
        return char
    def match(self, char):
        # if the upcoming token equals char, consume it and return 1;
        # otherwise leave it in place and return 0
        if char == self.next:
            self.next = self.__next()
            return 1
        return 0
    def match_set(self, set):
        # if the upcoming token is a member of set, consume it and
        # return 1; otherwise return 0.  at end of input self.next is
        # None, which must not reach the membership test: with a
        # string as set, "None in set" raises TypeError.
        if self.next is not None and self.next in set:
            self.next = self.__next()
            return 1
        return 0
    def get(self):
        # consume and return the upcoming token (None at end of input)
        this = self.next
        self.next = self.__next()
        return this

def _fixescape(escape, character_class=0):
    # convert an escape token to an (op, argument) tuple.
    #
    # escape is the complete token as produced by the tokenizer, with
    # the leading backslash.  character_class is true when the escape
    # was found inside a "[...]" set.
    #
    # raises SyntaxError ("bogus escape: ...", quoting the complete
    # token) if the escape cannot be interpreted.
    if character_class:
        # inside a character class, we'll look in the character
        # escapes dictionary first
        tables = (ESCAPES, CATEGORIES)
    else:
        tables = (CATEGORIES, ESCAPES)
    for table in tables:
        code = table.get(escape)
        if code:
            return code
    if not character_class and escape[1:2] != "0":
        # outside a character class a decimal number is a group
        # reference.  a number with a leading zero is left to the
        # octal branch below, so that "\0" gives a NUL character
        # instead of a reference to a non-existent "group 0".
        try:
            # FIXME: only valid if group <= current number of groups
            return GROUP, int(escape[1:])
        except ValueError:
            pass
    try:
        if escape[1:2] == "x":
            # only the last two hex digits count.  the digits are
            # sliced into a separate name so that the error message
            # below still shows the whole escape.
            hexdigits = escape[2:]
            return LITERAL, chr(string.atoi(hexdigits[-2:], 16) & 0xff)
        elif escape[1:2] in string.digits:
            # octal character code, reduced to eight bits
            return LITERAL, chr(string.atoi(escape[1:], 8) & 0xff)
        elif len(escape) == 2:
            # any other escaped character stands for itself
            return LITERAL, escape[1]
    except ValueError:
        # no digits after "\x", or a non-octal digit
        pass
    raise SyntaxError("bogus escape: %s" % repr(escape))

def _branch(subpattern, items):

    # form a branch operator from a set of items (FIXME: move this
    # optimization to the compiler module!)

    # check if all items share a common prefix
    while 1:
	prefix = None
	for item in items:
	    if not item:
		break
	    if prefix is None:
		prefix = item[0]
	    elif item[0] != prefix:
		break
	else:
	    # all subitems start with a common "prefix".
	    # move it out of the branch
	    for item in items:
		del item[0]
	    subpattern.append(prefix)
	    continue # check next one
	break

    # check if the branch can be replaced by a character set
    for item in items:
	if len(item) != 1 or item[0][0] != LITERAL:
	    break
    else:
	# we can store this as a character set instead of a
	# branch (FIXME: use a range if possible)
	set = []
	for item in items:
	    set.append(item[0])
	subpattern.append((IN, set))
	return

    subpattern.append((BRANCH, (None, items)))

# letters accepted in an inline "(?...)" flag group, mapped to the
# flag name recorded on the pattern
_INLINE_FLAGS = {
    "i": "i", "I": "i",
    "l": "l", "L": "l",
    "m": "m", "M": "m",
    "s": "s", "S": "s",
    "x": "x", "X": "x",
}

def _getdigits(source):
    # consume and return the (possibly empty) run of decimal digits at
    # the head of the token stream.  the None and length tests keep the
    # end-of-input marker and multi-character escape tokens away from
    # the "in" test on a string.
    digits = ""
    while source.next is not None and len(source.next) == 1 \
          and source.next in string.digits:
        digits = digits + source.get()
    return digits

def _parse_set(source):
    # parse the inside of a character set; the opening "[" has already
    # been consumed.  returns the list of set items, starting with
    # (NEGATE, None) for a "[^...]" set.  raises SyntaxError if the
    # pattern ends inside the set or a range is malformed.
    items = []
##  if source.match(":"):
##      pass # handle character classes
    if source.match("^"):
        items.append((NEGATE, None))
    # check remaining characters.  a "]" only closes the set once
    # something has been added, so "[]]" contains a literal "]".
    start = items[:]
    while 1:
        this = source.get()
        if this == "]" and items != start:
            break
        elif this and this[0] == "\\":
            code1 = _fixescape(this, 1)
        elif this:
            code1 = LITERAL, this
        else:
            raise SyntaxError("unexpected end of regular expression")
        if source.match("-"):
            # potential range
            this = source.get()
            if this is None:
                # pattern ended right after the "-"
                raise SyntaxError("unexpected end of regular expression")
            if this == "]":
                # "-" just before the closing bracket is a literal
                if code1[0] is IN:
                    code1 = code1[1][0] # unwrap category
                items.append(code1)
                items.append((LITERAL, "-"))
                break
            if this[0] == "\\":
                code2 = _fixescape(this, 1)
            else:
                code2 = LITERAL, this
            if code1[0] != LITERAL or code2[0] != LITERAL:
                raise SyntaxError("illegal range")
            if len(code1[1]) != 1 or len(code2[1]) != 1:
                raise SyntaxError("illegal range")
            items.append((RANGE, (code1[1], code2[1])))
        else:
            if code1[0] is IN:
                # categories come wrapped in a one-element IN set;
                # store the bare category item
                code1 = code1[1][0]
            items.append(code1)
    return items

def _parse(source, pattern, flags=()):
    # parse regular expression pattern into an operator list.
    #
    # source is the Tokenizer to read from and pattern the Pattern
    # object that collects group numbers and inline flags.  flags is
    # not examined here; it is only passed on to nested calls.
    #
    # parsing stops, without consuming the token, at "|" or ")" and at
    # the end of the input.  returns a SubPattern; raises SyntaxError
    # for malformed input.

    subpattern = SubPattern(pattern)

    while 1:

        if source.next in ("|", ")"):
            break # end of subpattern
        this = source.get()
        if this is None:
            break # end of pattern

        if this and this[0] not in SPECIAL_CHARS:
            subpattern.append((LITERAL, this))

        elif this == "[":
            # character set
            items = _parse_set(source)
            # FIXME: <fl> move set optimization to support function
            if len(items) == 1 and items[0][0] is LITERAL:
                subpattern.append(items[0]) # optimization
            elif len(items) == 2 and items[0][0] is NEGATE \
                 and items[1][0] is LITERAL:
                subpattern.append((NOT_LITERAL, items[1][1])) # optimization
            else:
                # FIXME: <fl> add charmap optimization
                subpattern.append((IN, items))

        elif this and this[0] in REPEAT_CHARS:
            # repeat previous item
            if this == "?":
                lo, hi = 0, 1
            elif this == "*":
                lo, hi = 0, sys.maxint
            elif this == "+":
                lo, hi = 1, sys.maxint
            elif this == "{":
                # "{m}", "{m,}", "{,n}" or "{m,n}"; a missing bound
                # keeps its default
                lo, hi = 0, sys.maxint
                lotext = _getdigits(source)
                if source.match(","):
                    hitext = _getdigits(source)
                else:
                    hitext = lotext
                if not source.match("}"):
                    raise SyntaxError("bogus range")
                if lotext:
                    lo = int(lotext)
                if hitext:
                    hi = int(hitext)
                if hi < lo:
                    raise SyntaxError("bad repeat interval")
            else:
                raise SyntaxError("not supported")
            # figure out which item to repeat: the last one that is
            # not a group marker
            # FIXME: should back up to the right mark, right?
            index = len(subpattern) - 1
            while index >= 0 and subpattern[index][0] is MARK:
                index = index - 1
            if index < 0:
                raise SyntaxError("nothing to repeat")
            item = subpattern[index:index+1]
            if source.match("?"):
                subpattern[index] = (MIN_REPEAT, (lo, hi, item))
            else:
                subpattern[index] = (MAX_REPEAT, (lo, hi, item))

        elif this == ".":
            subpattern.append((ANY, None))

        elif this == "(":
            # group: 0 = nothing to parse (flags and unsupported
            # extensions), 1 = numbered group, 2 = non-capturing group
            group = 1
            name = None
            if source.match("?"):
                group = 0
                # options
                if source.match("P"):
                    # named group: skip forward to end of name
                    if source.match("<"):
                        name = ""
                        while 1:
                            char = source.get()
                            if char in (">", None):
                                break
                            name = name + char
                        group = 1
                elif source.match(":"):
                    # non-capturing group
                    group = 2
                else:
                    # inline flags: record every flag letter, not
                    # just the first one.  the dictionary lookup is
                    # also safe at end of input (next is None).
                    while 1:
                        flag = _INLINE_FLAGS.get(source.next)
                        if flag is None:
                            break
                        pattern.setflag(flag)
                        source.get()
            if group:
                # parse group contents
                b = []
                if group == 2:
                    # anonymous group
                    group = None
                else:
                    group = pattern.getgroup(name)
                if group:
                    subpattern.append((MARK, (group-1)*2))
                while 1:
                    p = _parse(source, pattern, flags)
                    if source.match(")"):
                        if b:
                            # alternation: build it in a subpattern of
                            # its own, so that the group stays a
                            # single item that a following repeat
                            # operator applies to as a whole
                            b.append(p)
                            p = SubPattern(pattern)
                            _branch(p, b)
                        subpattern.append((SUBPATTERN, (group, p)))
                        break
                    elif source.match("|"):
                        b.append(p)
                    else:
                        raise SyntaxError("group not properly closed")
                if group:
                    subpattern.append((MARK, (group-1)*2+1))
            else:
                # skip whatever is left of the "(?...)" construct
                # FIXME: should this really be a while loop?
                while source.get() not in (")", None):
                    pass

        elif this == "^":
            subpattern.append((AT, AT_BEGINNING))

        elif this == "$":
            subpattern.append((AT, AT_END))

        elif this and this[0] == "\\":
            subpattern.append(_fixescape(this))

        else:
            raise SyntaxError("parser error")

    return subpattern

def parse(source, flags=()):
    # parse a complete pattern string and return it as a SubPattern.
    # flags is only handed on to _parse.  raises SyntaxError for
    # malformed patterns, including a ")" without a matching "(".
    tokens = Tokenizer(source)
    state = Pattern()
    alternatives = []
    while 1:
        p = _parse(tokens, state, flags)
        tail = tokens.get()
        if tail == "|":
            # top-level alternation: keep this part, parse the next
            alternatives.append(p)
            continue
        if tail == ")":
            raise SyntaxError("unbalanced parenthesis")
        if tail is not None:
            raise SyntaxError("bogus characters at end of regular expression")
        # end of pattern
        if alternatives:
            alternatives.append(p)
            p = SubPattern(state)
            _branch(p, alternatives)
        return p

if __name__ == "__main__":
    from pprint import pprint
    from testpatterns import PATTERNS
    a = b = c = 0
    for pattern, flags in PATTERNS:
	if flags:
	    continue
	print "-"*68
	try:
	    p = parse(pattern)
	    print repr(pattern), "->"
	    pprint(p.data)
	    import sre_compile
	    try:
		code = sre_compile.compile(p)
		c = c + 1
	    except:
		pass
	    a = a + 1
	except SyntaxError, v:
	    print "**", repr(pattern), v
	b = b + 1
    print "-"*68
    print a, "of", b, "patterns successfully parsed"
    print c, "of", b, "patterns successfully compiled"
Added Fredrik Lundh's sre module and its supporting cast. NOTE: THIS IS VERY ROUGH ALPHA CODE! 2000-03-31 10:58:54 -04:00			`#`
			`# Secret Labs' Regular Expression Engine`
			`# $Id$`
			`#`
			`# convert re-style regular expression to SRE template. the current`
			`# implementation is somewhat incomplete, and not very fast. should`
			`# definitely be rewritten before Python 1.6 goes beta.`
			`#`
			`# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.`
			`#`
			`# This code can only be used for 1.6 alpha testing. All other use`
			`# require explicit permission from Secret Labs AB.`
			`#`
			`# Portions of this engine have been developed in cooperation with`
			`# CNRI. Hewlett-Packard provided funding for 1.6 integration and`
			`# other compatibility work.`
			`#`

			`# FIXME: comments marked with the FIXME tag are open issues. all such`
			`# issues should be closed before the final beta.`

			`import string, sys`

			`from sre_constants import *`

			`SPECIAL_CHARS = ".\\[{()*+?^$\|"`
			`REPEAT_CHARS = "*+?{"`

			`OCTDIGITS = "01234567"`
			`HEXDIGITS = "0123456789abcdefABCDEF"`

			`ESCAPES = {`
			`"\\a": (LITERAL, chr(7)),`
			`"\\b": (LITERAL, chr(8)),`
			`"\\f": (LITERAL, chr(12)),`
			`"\\n": (LITERAL, chr(10)),`
			`"\\r": (LITERAL, chr(13)),`
			`"\\t": (LITERAL, chr(9)),`
			`"\\v": (LITERAL, chr(11))`
			`}`

			`CATEGORIES = {`
			`"\\A": (AT, AT_BEGINNING), # start of string`
			`"\\b": (AT, AT_BOUNDARY),`
			`"\\B": (AT, AT_NON_BOUNDARY),`
			`"\\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),`
			`"\\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),`
			`"\\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),`
			`"\\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),`
			`"\\w": (IN, [(CATEGORY, CATEGORY_WORD)]),`
			`"\\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),`
			`"\\Z": (AT, AT_END), # end of string`
			`}`

			`class Pattern:`
			`# FIXME: <fl> rename class, and store flags in here too!`
			`def __init__(self):`
			`self.flags = []`
			`self.groups = 1`
			`self.groupdict = {}`
			`def getgroup(self, name=None):`
			`gid = self.groups`
			`self.groups = gid + 1`
			`if name:`
			`self.groupdict[name] = gid`
			`return gid`
			`def setflag(self, flag):`
			`if flag not in self.flags:`
			`self.flags.append(flag)`

			`class SubPattern:`
			`# a subpattern, in intermediate form`
			`def __init__(self, pattern, data=None):`
			`self.pattern = pattern`
			`if not data:`
			`data = []`
			`self.data = data`
			`self.flags = []`
			`self.width = None`
			`def __repr__(self):`
			`return repr(self.data)`
			`def __len__(self):`
			`return len(self.data)`
			`def __delitem__(self, index):`
			`del self.data[index]`
			`def __getitem__(self, index):`
			`return self.data[index]`
			`def __setitem__(self, index, code):`
			`self.data[index] = code`
			`def __getslice__(self, start, stop):`
			`return SubPattern(self.pattern, self.data[start:stop])`
			`def insert(self, index, code):`
			`self.data.insert(index, code)`
			`def append(self, code):`
			`self.data.append(code)`
			`def getwidth(self):`
			`# determine the width (min, max) for this subpattern`
			`if self.width:`
			`return self.width`
			`lo = hi = 0L`
			`for op, av in self.data:`
			`if op is BRANCH:`
			`l = sys.maxint`
			`h = 0`
			`for av in av[1]:`
			`i, j = av.getwidth()`
			`l = min(l, i)`
			`h = min(h, j)`
			`lo = lo + i`
			`hi = hi + j`
			`elif op is CALL:`
			`i, j = av.getwidth()`
			`lo = lo + i`
			`hi = hi + j`
			`elif op is SUBPATTERN:`
			`i, j = av[1].getwidth()`
			`lo = lo + i`
			`hi = hi + j`
			`elif op in (MIN_REPEAT, MAX_REPEAT):`
			`i, j = av[2].getwidth()`
			`lo = lo + i * av[0]`
			`hi = hi + j * av[1]`
			`elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):`
			`lo = lo + 1`
			`hi = hi + 1`
			`elif op == SUCCESS:`
			`break`
			`self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))`
			`return self.width`
			`def set(self, flag):`
			`if not flag in self.flags:`
			`self.flags.append(flag)`
			`def reset(self, flag):`
			`if flag in self.flags:`
			`self.flags.remove(flag)`

			`class Tokenizer:`
			`def __init__(self, string):`
			`self.string = list(string)`
			`self.next = self.__next()`
			`def __next(self):`
			`if not self.string:`
			`return None`
			`char = self.string[0]`
			`if char[0] == "\\":`
			`try:`
			`c = self.string[1]`
			`except IndexError:`
			`raise SyntaxError, "bogus escape"`
			`char = char + c`
			`try:`
			`if c == "x":`
			`# hexadecimal constant`
			`for i in xrange(2, sys.maxint):`
			`c = self.string[i]`
			`if c not in HEXDIGITS:`
			`break`
			`char = char + c`
			`elif c in string.digits:`
			`# decimal (or octal) number`
			`for i in xrange(2, sys.maxint):`
			`c = self.string[i]`
			`# FIXME: if larger than current number of`
			`# groups, interpret as an octal number`
			`if c not in string.digits:`
			`break`
			`char = char + c`
			`except IndexError:`
			`pass # use what we've got this far`
			`del self.string[0:len(char)]`
			`return char`
			`def match(self, char):`
			`if char == self.next:`
			`self.next = self.__next()`
			`return 1`
			`return 0`
			`def match_set(self, set):`
			`if self.next in set:`
			`self.next = self.__next()`
			`return 1`
			`return 0`
			`def get(self):`
			`this = self.next`
			`self.next = self.__next()`
			`return this`

			`def _fixescape(escape, character_class=0):`
			`# convert escape to (type, value)`
			`if character_class:`
			`# inside a character class, we'll look in the character`
			`# escapes dictionary first`
			`code = ESCAPES.get(escape)`
			`if code:`
			`return code`
			`code = CATEGORIES.get(escape)`
			`else:`
			`code = CATEGORIES.get(escape)`
			`if code:`
			`return code`
			`code = ESCAPES.get(escape)`
			`if code:`
			`return code`
			`if not character_class:`
			`try:`
			`group = int(escape[1:])`
			`# FIXME: only valid if group <= current number of groups`
			`return GROUP, group`
			`except ValueError:`
			`pass`
			`try:`
			`if escape[1:2] == "x":`
			`escape = escape[2:]`
			`return LITERAL, chr(string.atoi(escape[-2:], 16) & 0xff)`
			`elif escape[1:2] in string.digits:`
			`return LITERAL, chr(string.atoi(escape[1:], 8) & 0xff)`
			`elif len(escape) == 2:`
			`return LITERAL, escape[1]`
			`except ValueError:`
			`pass`
			`raise SyntaxError, "bogus escape: %s" % repr(escape)`

			`def _branch(subpattern, items):`

			`# form a branch operator from a set of items (FIXME: move this`
			`# optimization to the compiler module!)`

			`# check if all items share a common prefix`
			`while 1:`
			`prefix = None`
			`for item in items:`
			`if not item:`
			`break`
			`if prefix is None:`
			`prefix = item[0]`
			`elif item[0] != prefix:`
			`break`
			`else:`
			`# all subitems start with a common "prefix".`
			`# move it out of the branch`
			`for item in items:`
			`del item[0]`
			`subpattern.append(prefix)`
			`continue # check next one`
			`break`

			`# check if the branch can be replaced by a character set`
			`for item in items:`
			`if len(item) != 1 or item[0][0] != LITERAL:`
			`break`
			`else:`
			`# we can store this as a character set instead of a`
			`# branch (FIXME: use a range if possible)`
			`set = []`
			`for item in items:`
			`set.append(item[0])`
			`subpattern.append((IN, set))`
			`return`

			`subpattern.append((BRANCH, (None, items)))`

			`def _parse(source, pattern, flags=()):`

			`# parse regular expression pattern into an operator list.`

			`subpattern = SubPattern(pattern)`

			`this = None`

			`while 1:`

			`if source.next in ("\|", ")"):`
			`break # end of subpattern`
			`this = source.get()`
			`if this is None:`
			`break # end of pattern`

			`if this and this[0] not in SPECIAL_CHARS:`
			`subpattern.append((LITERAL, this))`

			`elif this == "[":`
			`# character set`
			`set = []`
			`## if source.match(":"):`
			`## pass # handle character classes`
			`if source.match("^"):`
			`set.append((NEGATE, None))`
			`# check remaining characters`
			`start = set[:]`
			`while 1:`
			`this = source.get()`
			`if this == "]" and set != start:`
			`break`
			`elif this and this[0] == "\\":`
			`code1 = _fixescape(this, 1)`
			`elif this:`
			`code1 = LITERAL, this`
			`else:`
			`raise SyntaxError, "unexpected end of regular expression"`
			`if source.match("-"):`
			`# potential range`
			`this = source.get()`
			`if this == "]":`
			`set.append(code1)`
			`set.append((LITERAL, "-"))`
			`break`
			`else:`
			`if this[0] == "\\":`
			`code2 = _fixescape(this, 1)`
			`else:`
			`code2 = LITERAL, this`
			`if code1[0] != LITERAL or code2[0] != LITERAL:`
			`raise SyntaxError, "illegal range"`
			`if len(code1[1]) != 1 or len(code2[1]) != 1:`
			`raise SyntaxError, "illegal range"`
			`set.append((RANGE, (code1[1], code2[1])))`
			`else:`
			`if code1[0] is IN:`
			`code1 = code1[1][0]`
			`set.append(code1)`

			`# FIXME: <fl> move set optimization to support function`
			`if len(set)==1 and set[0][0] is LITERAL:`
			`subpattern.append(set[0]) # optimization`
			`elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:`
			`subpattern.append((NOT_LITERAL, set[1][1])) # optimization`
			`else:`
			`# FIXME: <fl> add charmap optimization`
			`subpattern.append((IN, set))`

			`elif this and this[0] in REPEAT_CHARS:`
			`# repeat previous item`
			`if this == "?":`
			`min, max = 0, 1`
			`elif this == "*":`
			`min, max = 0, sys.maxint`
			`elif this == "+":`
			`min, max = 1, sys.maxint`
			`elif this == "{":`
			`min, max = 0, sys.maxint`
			`lo = hi = ""`
			`while source.next in string.digits:`
			`lo = lo + source.get()`
			`if source.match(","):`
			`while source.next in string.digits:`
			`hi = hi + source.get()`
			`else:`
			`hi = lo`
			`if not source.match("}"):`
			`raise SyntaxError, "bogus range"`
			`if lo:`
			`min = int(lo)`
			`if hi:`
			`max = int(hi)`
			`# FIXME: <fl> check that hi >= lo!`
			`else:`
			`raise SyntaxError, "not supported"`
			`# figure out which item to repeat`
			`# FIXME: should back up to the right mark, right?`
			`if subpattern:`
			`index = len(subpattern)-1`
			`while subpattern[index][0] is MARK:`
			`index = index - 1`
			`item = subpattern[index:index+1]`
			`else:`
			`raise SyntaxError, "nothing to repeat"`
			`if source.match("?"):`
			`subpattern[index] = (MIN_REPEAT, (min, max, item))`
			`else:`
			`subpattern[index] = (MAX_REPEAT, (min, max, item))`
			`elif this == ".":`
			`subpattern.append((ANY, None))`
			`elif this == "(":`
			`group = 1`
			`name = None`
			`if source.match("?"):`
			`group = 0`
			`# options`
			`if source.match("P"):`
			`# named group: skip forward to end of name`
			`if source.match("<"):`
			`name = ""`
			`while 1:`
			`char = source.get()`
			`if char in (">", None):`
			`break`
			`name = name + char`
			`group = 1`
			`elif source.match(":"):`
			`# non-capturing group`
			`group = 2`
			`elif source.match_set("iI"):`
			`pattern.setflag("i")`
			`elif source.match_set("lL"):`
			`pattern.setflag("l")`
			`elif source.match_set("mM"):`
			`pattern.setflag("m")`
			`elif source.match_set("sS"):`
			`pattern.setflag("s")`
			`elif source.match_set("xX"):`
			`pattern.setflag("x")`
			`if group:`
			`# parse group contents`
			`b = []`
			`if group == 2:`
			`# anonymous group`
			`group = None`
			`else:`
			`group = pattern.getgroup(name)`
			`if group:`
			`subpattern.append((MARK, (group-1)*2))`
			`while 1:`
			`p = _parse(source, pattern, flags)`
			`if source.match(")"):`
			`if b:`
			`b.append(p)`
			`_branch(subpattern, b)`
			`else:`
			`subpattern.append((SUBPATTERN, (group, p)))`
			`break`
			`elif source.match("\|"):`
			`b.append(p)`
			`else:`
			`raise SyntaxError, "group not properly closed"`
			`if group:`
			`subpattern.append((MARK, (group-1)*2+1))`
			`else:`
			`# FIXME: should this really be a while loop?`
			`while source.get() not in (")", None):`
			`pass`

			`elif this == "^":`
			`subpattern.append((AT, AT_BEGINNING))`

			`elif this == "$":`
			`subpattern.append((AT, AT_END))`

			`elif this and this[0] == "\\":`
			`code =_fixescape(this)`
			`subpattern.append(code)`

			`else:`
			`raise SyntaxError, "parser error"`

			`return subpattern`

			`def parse(source, flags=()):`
			`s = Tokenizer(source)`
			`g = Pattern()`
			`b = []`
			`while 1:`
			`p = _parse(s, g, flags)`
			`tail = s.get()`
			`if tail == "\|":`
			`b.append(p)`
			`elif tail == ")":`
			`raise SyntaxError, "unbalanced parenthesis"`
			`elif tail is None:`
			`if b:`
			`b.append(p)`
			`p = SubPattern(g)`
			`_branch(p, b)`
			`break`
			`else:`
			`raise SyntaxError, "bogus characters at end of regular expression"`
			`return p`

			`if __name__ == "__main__":`
			`from pprint import pprint`
			`from testpatterns import PATTERNS`
			`a = b = c = 0`
			`for pattern, flags in PATTERNS:`
			`if flags:`
			`continue`
			`print "-"*68`
			`try:`
			`p = parse(pattern)`
			`print repr(pattern), "->"`
			`pprint(p.data)`
			`import sre_compile`
			`try:`
			`code = sre_compile.compile(p)`
			`c = c + 1`
			`except:`
			`pass`
			`a = a + 1`
			`except SyntaxError, v:`
			`print "**", repr(pattern), v`
			`b = b + 1`
			`print "-"*68`
			`print a, "of", b, "patterns successfully parsed"`
			`print c, "of", b, "patterns successfully compiled"`