#!/usr/bin/env python # -*- mode: python -*- # $Id$ import sys import string from pcre import * [ NORMAL, CHARCLASS, REPLACEMENT ] = range(3) [ CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER ] = range(9) # # First, the public part of the interface: # # pcre.error and re.error should be the same, since exceptions can be # raised from either module. # compilation flags I = IGNORECASE M = MULTILINE S = DOTALL X = VERBOSE # # # _cache = {} _MAXCACHE = 20 def _cachecompile(pattern, flags=0): key = (pattern, flags) try: return _cache[key] except KeyError: pass value = compile(pattern, flags) if len(_cache) >= _MAXCACHE: _cache.clear() _cache[key] = value return value def match(pattern, string, flags=0): return _cachecompile(pattern, flags).match(string) def search(pattern, string, flags=0): return _cachecompile(pattern, flags).search(string) def sub(pattern, repl, string, count=0): if type(pattern) == type(''): pattern = _cachecompile(pattern) return pattern.sub(repl, string, count) def subn(pattern, repl, string, count=0): if type(pattern) == type(''): pattern = _cachecompile(pattern) return pattern.subn(repl, string, count) def split(pattern, string, maxsplit=0): if type(pattern) == type(''): pattern = _cachecompile(pattern) return pattern.split(string, maxsplit) # # # class RegexObject: def __init__(self, pattern, flags, code, groupindex): self.code = code self.flags = flags self.pattern = pattern self.groupindex = groupindex def search(self, string, pos=0): regs = self.code.match(string, pos, 0) if regs is None: return None self.num_regs=len(regs) return MatchObject(self, string, pos, regs) def match(self, string, pos=0): regs = self.code.match(string, pos, ANCHORED) if regs is None: return None self.num_regs=len(regs)/2 return MatchObject(self, string, pos, regs) def sub(self, repl, string, count=0): return self.subn(repl, string, count)[0] def subn(self, repl, source, count=0): if count < 0: raise error, "negative substitution count" if count == 0: import sys count = sys.maxint if type(repl) == type(''): if '\\' in repl: repl = lambda m, r=repl: pcre_expand(m, r) else: repl = lambda m, r=repl: r n = 0 # Number of matches pos = 0 # Where to start searching lastmatch = -1 # End of last match results = [] # Substrings making up the result end = len(source) while n < count and pos <= end: m = self.search(source, pos) if not m: break i, j = m.span(0) if i == j == lastmatch: # Empty match adjacent to previous match pos = pos + 1 results.append(source[lastmatch:pos]) continue if pos < i: results.append(source[pos:i]) results.append(repl(m)) pos = lastmatch = j if i == j: # Last match was empty; don't try here again pos = pos + 1 results.append(source[lastmatch:pos]) n = n + 1 results.append(source[pos:]) return (string.join(results, ''), n) def split(self, source, maxsplit=0): if maxsplit < 0: raise error, "negative split count" if maxsplit == 0: import sys maxsplit = sys.maxint n = 0 pos = 0 lastmatch = 0 results = [] end = len(source) while n < maxsplit: m = self.search(source, pos) if not m: break i, j = m.span(0) if i == j: # Empty match if pos >= end: break pos = pos+1 continue results.append(source[lastmatch:i]) g = m.group() if g: results[len(results):] = list(g) pos = lastmatch = j results.append(source[lastmatch:]) return results class MatchObject: def __init__(self, re, string, pos, regs): self.re = re self.string = string self.pos = pos self.regs = regs def start(self, g): if type(g) == type(''): try: g = self.re.groupindex[g] except (KeyError, TypeError): raise IndexError, ('group "' + g + '" is undefined') return self.regs[g][0] def end(self, g): if type(g) == type(''): try: g = self.re.groupindex[g] except (KeyError, TypeError): raise IndexError, ('group "' + g + '" is undefined') return self.regs[g][1] def span(self, g): if type(g) == type(''): try: g = self.re.groupindex[g] except (KeyError, TypeError): raise IndexError, ('group "' + g + '" is undefined') return self.regs[g] def group(self, *groups): if len(groups) == 0: groups = range(1, self.re.num_regs) use_all = 1 else: use_all = 0 result = [] for g in groups: if type(g) == type(''): try: g = self.re.groupindex[g] except (KeyError, TypeError): raise IndexError, ('group "' + g + '" is undefined') if len(self.regs)<=g: raise IndexError, ('group "' + str(g) + '" is undefined') elif (self.regs[g][0] == -1) or (self.regs[g][1] == -1): result.append(None) else: result.append(self.string[self.regs[g][0]:self.regs[g][1]]) if use_all or len(result) > 1: return tuple(result) elif len(result) == 1: return result[0] else: return () def escape(pattern): result = [] alphanum=string.letters+'_'+string.digits for char in pattern: if char not in alphanum: result.append('\\') result.append(char) return string.join(result, '') def valid_identifier(id): import string if len(id) == 0: return 0 if id[0] not in string.letters+'_': return 0 for char in id[1:]: if not syntax_table[char] & word: return 0 return 1 def compile(pattern, flags=0): groupindex={} code=pcre_compile(pattern, flags, groupindex) return RegexObject(pattern, flags, code, groupindex) def _expand(m, repl): results = [] index = 0 size = len(repl) while index < size: found = string.find(repl, '\\', index) if found < 0: results.append(repl[index:]) break if found > index: results.append(repl[index:found]) escape_type, value, index = _expand_escape(repl, found+1, REPLACEMENT) if escape_type == CHAR: results.append(value) elif escape_type == MEMORY_REFERENCE: r = m.group(value) if r is None: raise error, ('group "' + str(value) + '" did not contribute ' 'to the match') results.append(m.group(value)) else: raise error, "bad escape in replacement" return string.join(results, '') def _expand_escape(pattern, index, context=NORMAL): if index >= len(pattern): raise error, 'escape ends too soon' elif pattern[index] == 't': return CHAR, chr(9), index + 1 elif pattern[index] == 'n': return CHAR, chr(10), index + 1 elif pattern[index] == 'v': return CHAR, chr(11), index + 1 elif pattern[index] == 'r': return CHAR, chr(13), index + 1 elif pattern[index] == 'f': return CHAR, chr(12), index + 1 elif pattern[index] == 'a': return CHAR, chr(7), index + 1 elif pattern[index] == 'x': # CAUTION: this is the Python rule, not the Perl rule! end = index + 1 # Skip over the 'x' character while (end < len(pattern)) and (pattern[end] in string.hexdigits): end = end + 1 if end == index: raise error, "\\x must be followed by hex digit(s)" # let Python evaluate it, so we don't incorrectly 2nd-guess # what it's doing (and Python in turn passes it on to sscanf, # so that *it* doesn't incorrectly 2nd-guess what C does!) char = eval ('"' + pattern[index-1:end] + '"') # assert len(char) == 1 return CHAR, char, end elif pattern[index] == 'b': if context != NORMAL: return CHAR, chr(8), index + 1 else: return WORD_BOUNDARY, '', index + 1 elif pattern[index] == 'B': if context != NORMAL: return CHAR, 'B', index + 1 else: return NOT_WORD_BOUNDARY, '', index + 1 elif pattern[index] == 'A': if context != NORMAL: return CHAR, 'A', index + 1 else: return BEGINNING_OF_BUFFER, '', index + 1 elif pattern[index] == 'Z': if context != NORMAL: return CHAR, 'Z', index + 1 else: return END_OF_BUFFER, '', index + 1 elif pattern[index] in 'GluLUQE': raise error, ('\\' + pattern[index] + ' is not allowed') elif pattern[index] == 'w': return CHAR, 'w', index + 1 elif pattern[index] == 'W': return CHAR, 'W', index + 1 elif pattern[index] == 's': return CHAR, 's', index + 1 elif pattern[index] == 'S': return CHAR, 'S', index + 1 elif pattern[index] == 'd': return CHAR, 'd', index + 1 elif pattern[index] == 'D': return CHAR, 'D', index + 1 elif pattern[index] in '0123456789': if pattern[index] == '0': if (index + 1 < len(pattern)) and \ (pattern[index + 1] in string.octdigits): if (index + 2 < len(pattern)) and \ (pattern[index + 2] in string.octdigits): value = string.atoi(pattern[index:index + 3], 8) index = index + 3 else: value = string.atoi(pattern[index:index + 2], 8) index = index + 2 else: value = 0 index = index + 1 if value > 255: raise error, 'octal value out of range' return CHAR, chr(value), index else: if (index + 1 < len(pattern)) and \ (pattern[index + 1] in string.digits): if (index + 2 < len(pattern)) and \ (pattern[index + 2] in string.octdigits) and \ (pattern[index + 1] in string.octdigits) and \ (pattern[index] in string.octdigits): value = string.atoi(pattern[index:index + 3], 8) if value > 255: raise error, 'octal value out of range' return CHAR, chr(value), index + 3 else: value = string.atoi(pattern[index:index + 2]) if (value < 1) or (value > 99): raise error, 'memory reference out of range' if context == CHARCLASS: raise error, ('cannot reference a register from ' 'inside a character class') return MEMORY_REFERENCE, value, index + 2 else: if context == CHARCLASS: raise error, ('cannot reference a register from ' 'inside a character class') value = string.atoi(pattern[index]) return MEMORY_REFERENCE, value, index + 1 elif pattern[index] == 'g': if context != REPLACEMENT: return CHAR, 'g', index + 1 index = index + 1 if index >= len(pattern): raise error, 'unfinished symbolic reference' if pattern[index] != '<': raise error, 'missing < in symbolic reference' index = index + 1 end = string.find(pattern, '>', index) if end == -1: raise error, 'unfinished symbolic reference' value = pattern[index:end] if not valid_identifier(value): raise error, 'illegal symbolic reference' return MEMORY_REFERENCE, value, end + 1 else: return CHAR, pattern[index], index + 1