# # Secret Labs' Regular Expression Engine # # convert template to internal format # # Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved. # # Portions of this engine have been developed in cooperation with # CNRI. Hewlett-Packard provided funding for 1.6 integration and # other compatibility work. # import array import _sre from sre_constants import * # find an array type code that matches the engine's code size for WORDSIZE in "BHil": if len(array.array(WORDSIZE, [0]).tostring()) == _sre.getcodesize(): break else: raise RuntimeError, "cannot find a useable array type" def _compile(code, pattern, flags): emit = code.append for op, av in pattern: if op is ANY: if flags & SRE_FLAG_DOTALL: emit(OPCODES[op]) else: emit(OPCODES[CATEGORY]) emit(CHCODES[CATEGORY_NOT_LINEBREAK]) elif op in (SUCCESS, FAILURE): emit(OPCODES[op]) elif op is AT: emit(OPCODES[op]) if flags & SRE_FLAG_MULTILINE: emit(ATCODES[AT_MULTILINE[av]]) else: emit(ATCODES[av]) elif op is BRANCH: emit(OPCODES[op]) tail = [] for av in av[1]: skip = len(code); emit(0) _compile(code, av, flags) emit(OPCODES[JUMP]) tail.append(len(code)); emit(0) code[skip] = len(code) - skip emit(0) # end of branch for tail in tail: code[tail] = len(code) - tail elif op is CALL: emit(OPCODES[op]) skip = len(code); emit(0) _compile(code, av, flags) emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip elif op is CATEGORY: emit(OPCODES[op]) if flags & SRE_FLAG_LOCALE: emit(CH_LOCALE[CHCODES[av]]) elif flags & SRE_FLAG_UNICODE: emit(CH_UNICODE[CHCODES[av]]) else: emit(CHCODES[av]) elif op is GROUP: if flags & SRE_FLAG_IGNORECASE: emit(OPCODES[OP_IGNORE[op]]) else: emit(OPCODES[op]) emit(av-1) elif op is IN: if flags & SRE_FLAG_IGNORECASE: emit(OPCODES[OP_IGNORE[op]]) def fixup(literal, flags=flags): return _sre.getlower(ord(literal), flags) else: emit(OPCODES[op]) fixup = ord skip = len(code); emit(0) for op, av in av: emit(OPCODES[op]) if op is NEGATE: pass elif op is LITERAL: emit(fixup(av)) elif op is RANGE: emit(fixup(av[0])) emit(fixup(av[1])) elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: emit(CH_LOCALE[CHCODES[av]]) elif flags & SRE_FLAG_UNICODE: emit(CH_UNICODE[CHCODES[av]]) else: emit(CHCODES[av]) else: raise error, "internal: unsupported set operator" emit(OPCODES[FAILURE]) code[skip] = len(code) - skip elif op in (LITERAL, NOT_LITERAL): if flags & SRE_FLAG_IGNORECASE: emit(OPCODES[OP_IGNORE[op]]) else: emit(OPCODES[op]) emit(ord(av)) elif op is MARK: emit(OPCODES[op]) emit(av) elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT): if flags & SRE_FLAG_TEMPLATE: emit(OPCODES[REPEAT]) skip = len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip else: lo, hi = av[2].getwidth() if lo == 0: raise error, "nothing to repeat" if 0 and lo == hi == 1 and op is MAX_REPEAT: # FIXME: need a better way to figure out when # it's safe to use this one (in the parser, probably) emit(OPCODES[MAX_REPEAT_ONE]) skip = len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip else: emit(OPCODES[op]) skip = len(code); emit(0) emit(av[0]) emit(av[1]) _compile(code, av[2], flags) emit(OPCODES[SUCCESS]) code[skip] = len(code) - skip elif op is SUBPATTERN: group = av[0] if group: emit(OPCODES[MARK]) emit((group-1)*2) _compile(code, av[1], flags) if group: emit(OPCODES[MARK]) emit((group-1)*2+1) else: raise ValueError, ("unsupported operand type", op) def compile(p, flags=0): # internal: convert pattern list to internal format if type(p) in (type(""), type(u"")): import sre_parse pattern = p p = sre_parse.parse(p) else: pattern = None flags = p.pattern.flags | flags code = [] _compile(code, p.data, flags) code.append(OPCODES[SUCCESS]) # FIXME: get rid of this limitation assert p.pattern.groups <= 100,\ "sorry, but this version only supports 100 named groups" return _sre.compile( pattern, flags, array.array(WORDSIZE, code).tostring(), p.pattern.groups-1, p.pattern.groupdict )