diff --git a/Lib/ni1.py b/Lib/ni1.py deleted file mode 100644 index 95c9b59cded..00000000000 --- a/Lib/ni1.py +++ /dev/null @@ -1,434 +0,0 @@ -"""New import scheme with package support. - -Quick Reference ---------------- - -- To enable package support, execute "import ni" before importing any - packages. Importing this module automatically installs the relevant - import hooks. - -- To create a package named spam containing sub-modules ham, bacon and - eggs, create a directory spam somewhere on Python's module search - path (i.e. spam's parent directory must be one of the directories in - sys.path or $PYTHONPATH); then create files ham.py, bacon.py and - eggs.py inside spam. - -- To import module ham from package spam and use function hamneggs() - from that module, you can either do - - import spam.ham # *not* "import spam" !!! - spam.ham.hamneggs() - - or - - from spam import ham - ham.hamneggs() - - or - - from spam.ham import hamneggs - hamneggs() - -- Importing just "spam" does not do what you expect: it creates an - empty package named spam if one does not already exist, but it does - not import spam's submodules. The only submodule that is guaranteed - to be imported is spam.__init__, if it exists. Note that - spam.__init__ is a submodule of package spam. It can reference to - spam's namespace via the '__.' prefix, for instance - - __.spam_inited = 1 # Set a package-level variable - - - -Theory of Operation -------------------- - -A Package is a module that can contain other modules. Packages can be -nested. Package introduce dotted names for modules, like P.Q.M, which -could correspond to a file P/Q/M.py found somewhere on sys.path. It -is possible to import a package itself, though this makes little sense -unless the package contains a module called __init__. - -A package has two variables that control the namespace used for -packages and modules, both initialized to sensible defaults the first -time the package is referenced. - -(1) A package's *module search path*, contained in the per-package -variable __path__, defines a list of *directories* where submodules or -subpackages of the package are searched. It is initialized to the -directory containing the package. Setting this variable to None makes -the module search path default to sys.path (this is not quite the same -as setting it to sys.path, since the latter won't track later -assignments to sys.path). - -(2) A package's *import domain*, contained in the per-package variable -__domain__, defines a list of *packages* that are searched (using -their respective module search paths) to satisfy imports. It is -initialized to the list cosisting of the package itself, its parent -package, its parent's parent, and so on, ending with the root package -(the nameless package containing all top-level packages and modules, -whose module search path is None, implying sys.path). - -The default domain implements a search algorithm called "expanding -search". An alternative search algorithm called "explicit search" -fixes the import search path to contain only the root package, -requiring the modules in the package to name all imported modules by -their full name. The convention of using '__' to refer to the current -package (both as a per-module variable and in module names) can be -used by packages using explicit search to refer to modules in the same -package; this combination is known as "explicit-relative search". - -The PackageImporter and PackageLoader classes together implement the -following policies: - -- There is a root package, whose name is ''. It cannot be imported - directly but may be referenced, e.g. by using '__' from a top-level - module. - -- In each module or package, the variable '__' contains a reference to - the parent package; in the root package, '__' points to itself. - -- In the name for imported modules (e.g. M in "import M" or "from M - import ..."), a leading '__' refers to the current package (i.e. - the package containing the current module); leading '__.__' and so - on refer to the current package's parent, and so on. The use of - '__' elsewhere in the module name is not supported. - -- Modules are searched using the "expanding search" algorithm by - virtue of the default value for __domain__. - -- If A.B.C is imported, A is searched using __domain__; then - subpackage B is searched in A using its __path__, and so on. - -- Built-in modules have priority: even if a file sys.py exists in a - package, "import sys" imports the built-in sys module. - -- The same holds for frozen modules, for better or for worse. - -- Submodules and subpackages are not automatically loaded when their - parent packages is loaded. - -- The construct "from package import *" is illegal. (It can still be - used to import names from a module.) - -- When "from package import module1, module2, ..." is used, those - modules are explicitly loaded. - -- When a package is loaded, if it has a submodule __init__, that - module is loaded. This is the place where required submodules can - be loaded, the __path__ variable extended, etc. The __init__ module - is loaded even if the package was loaded only in order to create a - stub for a sub-package: if "import P.Q.R" is the first reference to - P, and P has a submodule __init__, P.__init__ is loaded before P.Q - is even searched. - -Caveats: - -- It is possible to import a package that has no __init__ submodule; - this is not particularly useful but there may be useful applications - for it (e.g. to manipulate its search paths from the outside!). - -- There are no special provisions for os.chdir(). If you plan to use - os.chdir() before you have imported all your modules, it is better - not to have relative pathnames in sys.path. (This could actually be - fixed by changing the implementation of path_join() in the hook to - absolutize paths.) - -- Packages and modules are introduced in sys.modules as soon as their - loading is started. When the loading is terminated by an exception, - the sys.modules entries remain around. - -- There are no special measures to support mutually recursive modules, - but it will work under the same conditions where it works in the - flat module space system. - -- Sometimes dummy entries (whose value is None) are entered in - sys.modules, to indicate that a particular module does not exist -- - this is done to speed up the expanding search algorithm when a - module residing at a higher level is repeatedly imported (Python - promises that importing a previously imported module is cheap!) - -- Although dynamically loaded extensions are allowed inside packages, - the current implementation (hardcoded in the interpreter) of their - initialization may cause problems if an extension invokes the - interpreter during its initialization. - -- reload() may find another version of the module only if it occurs on - the package search path. Thus, it keeps the connection to the - package to which the module belongs, but may find a different file. - -XXX Need to have an explicit name for '', e.g. '__root__'. - -""" - - -import imp -import string -import sys -import __builtin__ - -import ihooks -from ihooks import ModuleLoader, ModuleImporter - - -class PackageLoader(ModuleLoader): - - """A subclass of ModuleLoader with package support. - - find_module_in_dir() will succeed if there's a subdirectory with - the given name; load_module() will create a stub for a package and - load its __init__ module if it exists. - - """ - - def find_module_in_dir(self, name, dir): - if dir is not None: - dirname = self.hooks.path_join(dir, name) - if self.hooks.path_isdir(dirname): - return None, dirname, ('', '', 'PACKAGE') - return ModuleLoader.find_module_in_dir(self, name, dir) - - def load_module(self, name, stuff): - file, filename, info = stuff - suff, mode, type = info - if type == 'PACKAGE': - return self.load_package(name, stuff) - if sys.modules.has_key(name): - m = sys.modules[name] - else: - sys.modules[name] = m = imp.new_module(name) - self.set_parent(m) - if type == imp.C_EXTENSION and '.' in name: - return self.load_dynamic(name, stuff) - else: - return ModuleLoader.load_module(self, name, stuff) - - def load_dynamic(self, name, stuff): - file, filename, (suff, mode, type) = stuff - # Hack around restriction in imp.load_dynamic() - i = string.rfind(name, '.') - tail = name[i+1:] - if sys.modules.has_key(tail): - save = sys.modules[tail] - else: - save = None - sys.modules[tail] = imp.new_module(name) - try: - m = imp.load_dynamic(tail, filename, file) - finally: - if save: - sys.modules[tail] = save - else: - del sys.modules[tail] - sys.modules[name] = m - return m - - def load_package(self, name, stuff): - file, filename, info = stuff - if sys.modules.has_key(name): - package = sys.modules[name] - else: - sys.modules[name] = package = imp.new_module(name) - package.__path__ = [filename] - self.init_package(package) - return package - - def init_package(self, package): - self.set_parent(package) - self.set_domain(package) - self.call_init_module(package) - - def set_parent(self, m): - name = m.__name__ - if '.' in name: - name = name[:string.rfind(name, '.')] - else: - name = '' - m.__ = sys.modules[name] - - def set_domain(self, package): - name = package.__name__ - package.__domain__ = domain = [name] - while '.' in name: - name = name[:string.rfind(name, '.')] - domain.append(name) - if name: - domain.append('') - - def call_init_module(self, package): - stuff = self.find_module('__init__', package.__path__) - if stuff: - m = self.load_module(package.__name__ + '.__init__', stuff) - package.__init__ = m - - -class PackageImporter(ModuleImporter): - - """Importer that understands packages and '__'.""" - - def __init__(self, loader = None, verbose = 0): - ModuleImporter.__init__(self, - loader or PackageLoader(None, verbose), verbose) - - def import_module(self, name, globals={}, locals={}, fromlist=[]): - if globals.has_key('__'): - package = globals['__'] - else: - # No calling context, assume in root package - package = sys.modules[''] - if name[:3] in ('__.', '__'): - p = package - name = name[3:] - while name[:3] in ('__.', '__'): - p = p.__ - name = name[3:] - if not name: - return self.finish(package, p, '', fromlist) - if '.' in name: - i = string.find(name, '.') - name, tail = name[:i], name[i:] - else: - tail = '' - mname = p.__name__ and p.__name__+'.'+name or name - m = self.get1(mname) - return self.finish(package, m, tail, fromlist) - if '.' in name: - i = string.find(name, '.') - name, tail = name[:i], name[i:] - else: - tail = '' - for pname in package.__domain__: - mname = pname and pname+'.'+name or name - m = self.get0(mname) - if m: break - else: - raise ImportError, "No such module %s" % name - return self.finish(m, m, tail, fromlist) - - def finish(self, module, m, tail, fromlist): - # Got ....A; now get ....A.B.C.D - yname = m.__name__ - if tail and sys.modules.has_key(yname + tail): # Fast path - yname, tail = yname + tail, '' - m = self.get1(yname) - while tail: - i = string.find(tail, '.', 1) - if i > 0: - head, tail = tail[:i], tail[i:] - else: - head, tail = tail, '' - yname = yname + head - m = self.get1(yname) - - # Got ....A.B.C.D; now finalize things depending on fromlist - if not fromlist: - return module - if '__' in fromlist: - raise ImportError, "Can't import __ from anywhere" - if not hasattr(m, '__path__'): return m - if '*' in fromlist: - raise ImportError, "Can't import * from a package" - for f in fromlist: - if hasattr(m, f): continue - fname = yname + '.' + f - self.get1(fname) - return m - - def get1(self, name): - m = self.get(name) - if not m: - raise ImportError, "No module named %s" % name - return m - - def get0(self, name): - m = self.get(name) - if not m: - sys.modules[name] = None - return m - - def get(self, name): - # Internal routine to get or load a module when its parent exists - if sys.modules.has_key(name): - return sys.modules[name] - if '.' in name: - i = string.rfind(name, '.') - head, tail = name[:i], name[i+1:] - else: - head, tail = '', name - path = sys.modules[head].__path__ - stuff = self.loader.find_module(tail, path) - if not stuff: - return None - sys.modules[name] = m = self.loader.load_module(name, stuff) - if head: - setattr(sys.modules[head], tail, m) - return m - - def reload(self, module): - name = module.__name__ - if '.' in name: - i = string.rfind(name, '.') - head, tail = name[:i], name[i+1:] - path = sys.modules[head].__path__ - else: - tail = name - path = sys.modules[''].__path__ - stuff = self.loader.find_module(tail, path) - if not stuff: - raise ImportError, "No module named %s" % name - return self.loader.load_module(name, stuff) - - def unload(self, module): - if hasattr(module, '__path__'): - raise ImportError, "don't know how to unload packages yet" - PackageImporter.unload(self, module) - - def install(self): - if not sys.modules.has_key(''): - sys.modules[''] = package = imp.new_module('') - package.__path__ = None - self.loader.init_package(package) - for m in sys.modules.values(): - if not m: continue - if not hasattr(m, '__'): - self.loader.set_parent(m) - ModuleImporter.install(self) - - -def install(v = 0): - ihooks.install(PackageImporter(None, v)) - -def uninstall(): - ihooks.uninstall() - -def ni(v = 0): - install(v) - -def no(): - uninstall() - -def test(): - import pdb - try: - testproper() - except: - sys.last_type, sys.last_value, sys.last_traceback = sys.exc_info() - print - print sys.last_type, ':', sys.last_value - print - pdb.pm() - -def testproper(): - install(1) - try: - import mactest - print dir(mactest) - raw_input('OK?') - finally: - uninstall() - - -if __name__ == '__main__': - test() -else: - install() diff --git a/Lib/re1.py b/Lib/re1.py deleted file mode 100644 index 6c24797ffd8..00000000000 --- a/Lib/re1.py +++ /dev/null @@ -1,1508 +0,0 @@ -#!/usr/bin/env python -# -*- mode: python -*- -# $Id$ - -import string -import reop - -# reop.error and re.error should be the same, since exceptions can be -# raised from either module. -error = reop.error # 're error' - -from reop import NORMAL, CHARCLASS, REPLACEMENT -from reop import CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET -from reop import WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER - -# compilation flags - -IGNORECASE = I = 0x01 - -MULTILINE = M = 0x02 -DOTALL = S = 0x04 -VERBOSE = X = 0x08 - -repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?', - '{n,}', '{n,}?', '{n,m}', '{n,m}?'] - -# -# -# - -def valid_identifier(id): - if len(id) == 0: - return 0 - if (not reop.syntax_table[id[0]] & reop.word) or \ - (reop.syntax_table[id[0]] & reop.digit): - return 0 - for char in id[1:]: - if not reop.syntax_table[char] & reop.word: - return 0 - return 1 - -# -# -# - -_cache = {} -_MAXCACHE = 20 - -def _cachecompile(pattern, flags=0): - key = (pattern, flags) - try: - return _cache[key] - except KeyError: - pass - value = compile(pattern, flags) - if len(_cache) >= _MAXCACHE: - _cache.clear() - _cache[key] = value - return value - -def match(pattern, string, flags=0): - return _cachecompile(pattern, flags).match(string) - -def search(pattern, string, flags=0): - return _cachecompile(pattern, flags).search(string) - -def sub(pattern, repl, string, count=0): - if type(pattern) == type(''): - pattern = _cachecompile(pattern) - return pattern.sub(repl, string, count) - -def subn(pattern, repl, string, count=0): - if type(pattern) == type(''): - pattern = _cachecompile(pattern) - return pattern.subn(repl, string, count) - -def split(pattern, string, maxsplit=0): - if type(pattern) == type(''): - pattern = _cachecompile(pattern) - return pattern.split(string, maxsplit) - -# -# -# - -def _expand(m, repl): - results = [] - index = 0 - size = len(repl) - while index < size: - found = string.find(repl, '\\', index) - if found < 0: - results.append(repl[index:]) - break - if found > index: - results.append(repl[index:found]) - escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT) - if escape_type == CHAR: - results.append(value) - elif escape_type == MEMORY_REFERENCE: - r = m.group(value) - if r is None: - raise error, ('group "' + str(value) + '" did not contribute ' - 'to the match') - results.append(m.group(value)) - else: - raise error, "bad escape in replacement" - return string.join(results, '') - -class RegexObject: - def __init__(self, pattern, flags, code, num_regs, groupindex): - self.code = code - self.num_regs = num_regs - self.flags = flags - self.pattern = pattern - self.groupindex = groupindex - self.fastmap = build_fastmap(code) - - if code[0].name == 'bol': - self.anchor = 1 - - elif code[0].name == 'begbuf': - self.anchor = 2 - - else: - self.anchor = 0 - - self.buffer = assemble(code) - def search(self, string, pos=0): - regs = reop.search(self.buffer, - self.num_regs, - self.flags, - self.fastmap.can_be_null, - self.fastmap.fastmap(), - self.anchor, - string, - pos) - if regs is None: - return None - - return MatchObject(self, - string, - pos, - regs) - - def match(self, string, pos=0): - regs = reop.match(self.buffer, - self.num_regs, - self.flags, - self.fastmap.can_be_null, - self.fastmap.fastmap(), - self.anchor, - string, - pos) - if regs is None: - return None - - return MatchObject(self, - string, - pos, - regs) - - def sub(self, repl, string, count=0): - return self.subn(repl, string, count)[0] - - def subn(self, repl, source, count=0): - if count < 0: - raise ValueError, "negative substibution count" - if count == 0: - import sys - count = sys.maxint - if type(repl) == type(''): - if '\\' in repl: - repl = lambda m, r=repl: _expand(m, r) - else: - repl = lambda m, r=repl: r - n = 0 # Number of matches - pos = 0 # Where to start searching - lastmatch = -1 # End of last match - results = [] # Substrings making up the result - end = len(source) - while n < count and pos <= end: - m = self.search(source, pos) - if not m: - break - i, j = m.span(0) - if i == j == lastmatch: - # Empty match adjacent to previous match - pos = pos + 1 - results.append(source[lastmatch:pos]) - continue - if pos < i: - results.append(source[pos:i]) - results.append(repl(m)) - pos = lastmatch = j - if i == j: - # Last match was empty; don't try here again - pos = pos + 1 - results.append(source[lastmatch:pos]) - n = n + 1 - results.append(source[pos:]) - return (string.join(results, ''), n) - - def split(self, source, maxsplit=0): - if maxsplit < 0: - raise error, "negative split count" - if maxsplit == 0: - import sys - maxsplit = sys.maxint - n = 0 - pos = 0 - lastmatch = 0 - results = [] - end = len(source) - while n < maxsplit: - m = self.search(source, pos) - if not m: - break - i, j = m.span(0) - if i == j: - # Empty match - if pos >= end: - break - pos = pos+1 - continue - results.append(source[lastmatch:i]) - g = m.group() - if g: - results[len(results):] = list(g) - pos = lastmatch = j - results.append(source[lastmatch:]) - return results - -class MatchObject: - def __init__(self, re, string, pos, regs): - self.re = re - self.string = string - self.pos = pos - self.regs = regs - - def start(self, g): - if type(g) == type(''): - try: - g = self.re.groupindex[g] - except (KeyError, TypeError): - raise IndexError, ('group "' + g + '" is undefined') - return self.regs[g][0] - - def end(self, g): - if type(g) == type(''): - try: - g = self.re.groupindex[g] - except (KeyError, TypeError): - raise IndexError, ('group "' + g + '" is undefined') - return self.regs[g][1] - - def span(self, g): - if type(g) == type(''): - try: - g = self.re.groupindex[g] - except (KeyError, TypeError): - raise IndexError, ('group "' + g + '" is undefined') - return self.regs[g] - - def group(self, *groups): - if len(groups) == 0: - groups = range(1, self.re.num_regs) - use_all = 1 - else: - use_all = 0 - result = [] - for g in groups: - if type(g) == type(''): - try: - g = self.re.groupindex[g] - except (KeyError, TypeError): - raise IndexError, ('group "' + g + '" is undefined') - if (self.regs[g][0] == -1) or (self.regs[g][1] == -1): - result.append(None) - else: - result.append(self.string[self.regs[g][0]:self.regs[g][1]]) - if use_all or len(result) > 1: - return tuple(result) - elif len(result) == 1: - return result[0] - else: - return () - -# -# A set of classes to make assembly a bit easier, if a bit verbose. -# - -class Instruction: - def __init__(self, opcode, size=1): - self.opcode = opcode - self.size = size - def assemble(self, position, labels): - return self.opcode - def __repr__(self): - return '%-15s' % (self.name) - -class End(Instruction): - name = 'end' - def __init__(self): - Instruction.__init__(self, chr(0)) - -class Bol(Instruction): - name = 'bol' - def __init__(self): - self.name = 'bol' - Instruction.__init__(self, chr(1)) - -class Eol(Instruction): - name = 'eol' - def __init__(self): - Instruction.__init__(self, chr(2)) - -class Set(Instruction): - name = 'set' - def __init__(self, set, flags=0): - self.set = set - if flags & IGNORECASE: self.set=map(string.lower, self.set) - if len(set)==1: - # If only one element, use the "exact" opcode (it'll be faster) - Instruction.__init__(self, chr(4), 2) - else: - # Use the "set" opcode - Instruction.__init__(self, chr(3), 33) - def assemble(self, position, labels): - if len(self.set)==1: - # If only one character in set, generate an "exact" opcode - return self.opcode + self.set[0] - result = self.opcode - temp = 0 - for i, c in map(lambda x: (x, chr(x)), range(256)): - if c in self.set: - temp = temp | (1 << (i & 7)) - if (i % 8) == 7: - result = result + chr(temp) - temp = 0 - return result - def __repr__(self): - result = '%-15s' % (self.name) - self.set.sort() - # XXX this should print more intelligently - for char in self.set: - result = result + char - return result - -class Exact(Instruction): - name = 'exact' - def __init__(self, char, flags): - self.char = char - if flags & IGNORECASE: self.char=string.lower(self.char) - Instruction.__init__(self, chr(4), 2) - def assemble(self, position, labels): - return self.opcode + self.char - def __repr__(self): - return '%-15s %s' % (self.name, `self.char`) - -class AnyChar(Instruction): - name = 'anychar' - def __init__(self): - Instruction.__init__(self, chr(5)) - def assemble(self, position, labels): - return self.opcode - -class MemoryInstruction(Instruction): - def __init__(self, opcode, register): - self.register = register - Instruction.__init__(self, opcode, 2) - def assemble(self, position, labels): - return self.opcode + chr(self.register) - def __repr__(self): - return '%-15s %i' % (self.name, self.register) - -class StartMemory(MemoryInstruction): - name = 'start_memory' - def __init__(self, register): - MemoryInstruction.__init__(self, chr(6), register) - -class EndMemory(MemoryInstruction): - name = 'end_memory' - def __init__(self, register): - MemoryInstruction.__init__(self, chr(7), register) - -class MatchMemory(MemoryInstruction): - name = 'match_memory' - def __init__(self, register): - MemoryInstruction.__init__(self, chr(8), register) - -class JumpInstruction(Instruction): - def __init__(self, opcode, label): - self.label = label - Instruction.__init__(self, opcode, 3) - def compute_offset(self, start, dest): - return dest - (start + 3) - def pack_offset(self, offset): - if offset > 32767: - raise error, 'offset out of range (pos)' - elif offset < -32768: - raise error, 'offset out of range (neg)' - elif offset < 0: - offset = offset + 65536 - return chr(offset & 0xff) + chr((offset >> 8) & 0xff) - def assemble(self, position, labels): - return self.opcode + \ - self.pack_offset(self.compute_offset(position, - labels[self.label])) - def __repr__(self): - return '%-15s %i' % (self.name, self.label) - -class Jump(JumpInstruction): - name = 'jump' - def __init__(self, label): - JumpInstruction.__init__(self, chr(9), label) - -class StarJump(JumpInstruction): - name = 'star_jump' - def __init__(self, label): - JumpInstruction.__init__(self, chr(10), label) - -class FailureJump(JumpInstruction): - name = 'failure_jump' - def __init__(self, label): - JumpInstruction.__init__(self, chr(11), label) - -class UpdateFailureJump(JumpInstruction): - name = 'update_failure_jump' - def __init__(self, label): - JumpInstruction.__init__(self, chr(12), label) - -class DummyFailureJump(JumpInstruction): - name = 'dummy_failure_jump' - def __init__(self, label): - JumpInstruction.__init__(self, chr(13), label) - -class BegBuf(Instruction): - name = 'begbuf' - def __init__(self): - Instruction.__init__(self, chr(14)) - -class EndBuf(Instruction): - name = 'endbuf' - def __init__(self): - Instruction.__init__(self, chr(15)) - -class WordBeg(Instruction): - name = 'wordbeg' - def __init__(self): - Instruction.__init__(self, chr(16)) - -class WordEnd(Instruction): - name = 'wordend' - def __init__(self): - Instruction.__init__(self, chr(17)) - -class WordBound(Instruction): - name = 'wordbound' - def __init__(self): - Instruction.__init__(self, chr(18)) - -class NotWordBound(Instruction): - name = 'notwordbound' - def __init__(self): - Instruction.__init__(self, chr(19)) - -class SyntaxSpec(Instruction): - name = 'syntaxspec' - def __init__(self, syntax): - self.syntax = syntax - Instruction.__init__(self, chr(20), 2) - def assemble(self, postition, labels): - return self.opcode + chr(self.syntax) - -class NotSyntaxSpec(Instruction): - name = 'notsyntaxspec' - def __init__(self, syntax): - self.syntax = syntax - Instruction.__init__(self, chr(21), 2) - def assemble(self, postition, labels): - return self.opcode + chr(self.syntax) - -class Label(Instruction): - name = 'label' - def __init__(self, label): - self.label = label - Instruction.__init__(self, '', 0) - def __repr__(self): - return '%-15s %i' % (self.name, self.label) - -class OpenParen(Instruction): - name = '(' - def __init__(self, register): - self.register = register - Instruction.__init__(self, '', 0) - def assemble(self, position, labels): - raise error, 'unmatched open parenthesis' - -class Alternation(Instruction): - name = '|' - def __init__(self): - Instruction.__init__(self, '', 0) - def assemble(self, position, labels): - raise error, 'an alternation was not taken care of' - -# -# -# - -def assemble(instructions): - labels = {} - position = 0 - pass1 = [] - for instruction in instructions: - if instruction.name == 'label': - labels[instruction.label] = position - else: - pass1.append((position, instruction)) - position = position + instruction.size - pass2 = '' - for position, instruction in pass1: - pass2 = pass2 + instruction.assemble(position, labels) - return pass2 - -# -# -# - -def escape(pattern): - result = [] - for char in pattern: - if not reop.syntax_table[char] & reop.word: - result.append('\\') - result.append(char) - return string.join(result, '') - -# -# -# - -def registers_used(instructions): - result = [] - for instruction in instructions: - if (instruction.name in ['set_memory', 'end_memory']) and \ - (instruction.register not in result): - result.append(instruction.register) - return result - -# -# -# - -class Fastmap: - def __init__(self): - self.map = ['\000']*256 - self.can_be_null = 0 - def add(self, char): - self.map[ord(char)] = '\001' - def fastmap(self): - return string.join(self.map, '') - def __getitem__(self, char): - return ord(self.map[ord(char)]) - def __repr__(self): - self.map.sort() - return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')' - -# -# -# - -def find_label(code, label): - line = 0 - for instruction in code: - if (instruction.name == 'label') and (instruction.label == label): - return line + 1 - line = line + 1 - -def build_fastmap_aux(code, pos, visited, fastmap): - if visited[pos]: - return - while 1: - instruction = code[pos] - visited[pos] = 1 - pos = pos + 1 - if instruction.name == 'end': - fastmap.can_be_null = 1 - return - elif instruction.name == 'syntaxspec': - for char in map(chr, range(256)): - if reop.syntax_table[char] & instruction.syntax: - fastmap.add(char) - return - elif instruction.name == 'notsyntaxspec': - for char in map(chr, range(256)): - if not reop.syntax_table[char] & instruction.syntax: - fastmap.add(char) - return - elif instruction.name == 'eol': - fastmap.add('\n') - if fastmap.can_be_null == 0: - fastmap.can_be_null = 2 - return - elif instruction.name == 'set': - for char in instruction.set: - fastmap.add(char) - return - elif instruction.name == 'exact': - fastmap.add(instruction.char) - elif instruction.name == 'anychar': - for char in map(chr, range(256)): - if char != '\n': - fastmap.add(char) - return - elif instruction.name == 'match_memory': - for char in map(chr, range(256)): - fastmap.add(char) - fastmap.can_be_null = 1 - return - elif instruction.name in ['jump', 'dummy_failure_jump', \ - 'update_failure_jump', 'star_jump']: - pos = find_label(code, instruction.label) - if visited[pos]: - return - visited[pos] = 1 - elif instruction.name == 'failure_jump': - build_fastmap_aux(code, - find_label(code, instruction.label), - visited, - fastmap) - -def build_fastmap(code, pos=0): - visited = [0] * len(code) - fastmap = Fastmap() - build_fastmap_aux(code, pos, visited, fastmap) - return fastmap - -# -# -# - -#[NORMAL, CHARCLASS, REPLACEMENT] = range(3) -#[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, -# NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9) - -def expand_escape(pattern, index, context=NORMAL): - if index >= len(pattern): - raise error, 'escape ends too soon' - - elif pattern[index] == 't': - return CHAR, chr(9), index + 1 - - elif pattern[index] == 'n': - return CHAR, chr(10), index + 1 - - elif pattern[index] == 'v': - return CHAR, chr(11), index + 1 - - elif pattern[index] == 'r': - return CHAR, chr(13), index + 1 - - elif pattern[index] == 'f': - return CHAR, chr(12), index + 1 - - elif pattern[index] == 'a': - return CHAR, chr(7), index + 1 - - elif pattern[index] == 'x': - # CAUTION: this is the Python rule, not the Perl rule! - end = index + 1 # Skip over the 'x' character - while (end < len(pattern)) and (pattern[end] in string.hexdigits): - end = end + 1 - if end == index: - raise error, "\\x must be followed by hex digit(s)" - # let Python evaluate it, so we don't incorrectly 2nd-guess - # what it's doing (and Python in turn passes it on to sscanf, - # so that *it* doesn't incorrectly 2nd-guess what C does!) - char = eval ('"' + pattern[index-1:end] + '"') - assert len(char) == 1 - return CHAR, char, end - - elif pattern[index] == 'b': - if context != NORMAL: - return CHAR, chr(8), index + 1 - else: - return WORD_BOUNDARY, '', index + 1 - - elif pattern[index] == 'B': - if context != NORMAL: - return CHAR, 'B', index + 1 - else: - return NOT_WORD_BOUNDARY, '', index + 1 - - elif pattern[index] == 'A': - if context != NORMAL: - return CHAR, 'A', index + 1 - else: - return BEGINNING_OF_BUFFER, '', index + 1 - - elif pattern[index] == 'Z': - if context != NORMAL: - return CHAR, 'Z', index + 1 - else: - return END_OF_BUFFER, '', index + 1 - - elif pattern[index] in 'GluLUQE': - raise error, ('\\' + pattern[index] + ' is not allowed') - - elif pattern[index] == 'w': - if context == NORMAL: - return SYNTAX, reop.word, index + 1 - elif context == CHARCLASS: - set = [] - for char in reop.syntax_table.keys(): - if reop.syntax_table[char] & reop.word: - set.append(char) - return SET, set, index + 1 - else: - return CHAR, 'w', index + 1 - - elif pattern[index] == 'W': - if context == NORMAL: - return NOT_SYNTAX, reop.word, index + 1 - elif context == CHARCLASS: - set = [] - for char in reop.syntax_table.keys(): - if not reop.syntax_table[char] & reop.word: - set.append(char) - return SET, set, index + 1 - else: - return CHAR, 'W', index + 1 - - elif pattern[index] == 's': - if context == NORMAL: - return SYNTAX, reop.whitespace, index + 1 - elif context == CHARCLASS: - set = [] - for char in reop.syntax_table.keys(): - if reop.syntax_table[char] & reop.whitespace: - set.append(char) - return SET, set, index + 1 - else: - return CHAR, 's', index + 1 - - elif pattern[index] == 'S': - if context == NORMAL: - return NOT_SYNTAX, reop.whitespace, index + 1 - elif context == CHARCLASS: - set = [] - for char in reop.syntax_table.keys(): - if not reop.syntax_table[char] & reop.whitespace: - set.append(char) - return SET, set, index + 1 - else: - return CHAR, 'S', index + 1 - - elif pattern[index] == 'd': - if context == NORMAL: - return SYNTAX, reop.digit, index + 1 - elif context == CHARCLASS: - set = [] - for char in reop.syntax_table.keys(): - if reop.syntax_table[char] & reop.digit: - set.append(char) - return SET, set, index + 1 - else: - return CHAR, 'd', index + 1 - - elif pattern[index] == 'D': - if context == NORMAL: - return NOT_SYNTAX, reop.digit, index + 1 - elif context == CHARCLASS: - set = [] - for char in reop.syntax_table.keys(): - if not reop.syntax_table[char] & reop.digit: - set.append(char) - return SET, set, index + 1 - else: - return CHAR, 'D', index + 1 - - elif pattern[index] in '0123456789': - - if pattern[index] == '0': - if (index + 1 < len(pattern)) and \ - (pattern[index + 1] in string.octdigits): - if (index + 2 < len(pattern)) and \ - (pattern[index + 2] in string.octdigits): - value = string.atoi(pattern[index:index + 3], 8) - index = index + 3 - - else: - value = string.atoi(pattern[index:index + 2], 8) - index = index + 2 - - else: - value = 0 - index = index + 1 - - if value > 255: - raise error, 'octal value out of range' - - return CHAR, chr(value), index - - else: - if (index + 1 < len(pattern)) and \ - (pattern[index + 1] in string.digits): - if (index + 2 < len(pattern)) and \ - (pattern[index + 2] in string.octdigits) and \ - (pattern[index + 1] in string.octdigits) and \ - (pattern[index] in string.octdigits): - value = string.atoi(pattern[index:index + 3], 8) - if value > 255: - raise error, 'octal value out of range' - - return CHAR, chr(value), index + 3 - - else: - value = string.atoi(pattern[index:index + 2]) - if (value < 1) or (value > 99): - raise error, 'memory reference out of range' - - if context == CHARCLASS: - raise error, ('cannot reference a register from ' - 'inside a character class') - return MEMORY_REFERENCE, value, index + 2 - - else: - if context == CHARCLASS: - raise error, ('cannot reference a register from ' - 'inside a character class') - - value = string.atoi(pattern[index]) - return MEMORY_REFERENCE, value, index + 1 - - elif pattern[index] == 'g': - if context != REPLACEMENT: - return CHAR, 'g', index + 1 - - index = index + 1 - if index >= len(pattern): - raise error, 'unfinished symbolic reference' - if pattern[index] != '<': - raise error, 'missing < in symbolic reference' - - index = index + 1 - end = string.find(pattern, '>', index) - if end == -1: - raise error, 'unfinished symbolic reference' - value = pattern[index:end] - if not valid_identifier(value): - raise error, 'illegal symbolic reference' - return MEMORY_REFERENCE, value, end + 1 - - else: - return CHAR, pattern[index], index + 1 - -def compile(pattern, flags=0): - stack = [] - label = 0 - register = 1 - groupindex = {} - lastop = '' - - # look for embedded pattern modifiers at the beginning of the pattern - - index = 0 - - if len(pattern) >= 3 and \ - (pattern[:2] == '(?') and \ - (pattern[2] in 'iImMsSxX'): - index = 2 - while (index < len(pattern)) and (pattern[index] != ')'): - if pattern[index] in 'iI': - flags = flags | IGNORECASE - elif pattern[index] in 'mM': - flags = flags | MULTILINE - elif pattern[index] in 'sS': - flags = flags | DOTALL - elif pattern[index] in 'xX': - flags = flags | VERBOSE - else: - raise error, 'unknown modifier' - index = index + 1 - index = index + 1 - - # compile the rest of the pattern - - while (index < len(pattern)): - char = pattern[index] - index = index + 1 - if char == '\\': - escape_type, value, index = expand_escape(pattern, index) - - if escape_type == CHAR: - stack.append([Exact(value, flags)]) - lastop = '\\' + value - - elif escape_type == MEMORY_REFERENCE: - if value >= register: - raise error, ('cannot reference a register ' - 'not yet used') - stack.append([MatchMemory(value)]) - lastop = '\\1' - - elif escape_type == BEGINNING_OF_BUFFER: - stack.append([BegBuf()]) - lastop = '\\A' - - elif escape_type == END_OF_BUFFER: - stack.append([EndBuf()]) - lastop = '\\Z' - - elif escape_type == WORD_BOUNDARY: - stack.append([WordBound()]) - lastop = '\\b' - - elif escape_type == NOT_WORD_BOUNDARY: - stack.append([NotWordBound()]) - lastop = '\\B' - - elif escape_type == SYNTAX: - stack.append([SyntaxSpec(value)]) - if value == reop.word: - lastop = '\\w' - elif value == reop.whitespace: - lastop = '\\s' - elif value == reop.digit: - lastop = '\\d' - else: - lastop = '\\?' - - elif escape_type == NOT_SYNTAX: - stack.append([NotSyntaxSpec(value)]) - if value == reop.word: - lastop = '\\W' - elif value == reop.whitespace: - lastop = '\\S' - elif value == reop.digit: - lastop = '\\D' - else: - lastop = '\\?' - - elif escape_type == SET: - raise error, 'cannot use set escape type here' - - else: - raise error, 'unknown escape type' - - elif char == '|': - expr = [] - - while (len(stack) != 0) and \ - (stack[-1][0].name != '(') and \ - (stack[-1][0].name != '|'): - expr = stack[-1] + expr - del stack[-1] - stack.append([FailureJump(label)] + \ - expr + \ - [Jump(-1), - Label(label)]) - stack.append([Alternation()]) - label = label + 1 - lastop = '|' - - elif char == '(': - if index >= len(pattern): - raise error, 'no matching close paren' - - elif pattern[index] == '?': - # Perl style (?...) extensions - index = index + 1 - if index >= len(pattern): - raise error, 'extension ends prematurely' - - elif pattern[index] == 'P': - # Python extensions - index = index + 1 - if index >= len(pattern): - raise error, 'extension ends prematurely' - - elif pattern[index] == '<': - # Handle Python symbolic group names (?P<...>...) - index = index + 1 - end = string.find(pattern, '>', index) - if end == -1: - raise error, 'no end to symbolic group name' - name = pattern[index:end] - if not valid_identifier(name): - raise error, ('symbolic group name must be a ' - 'valid identifier') - index = end + 1 - groupindex[name] = register - stack.append([OpenParen(register)]) - register = register + 1 - lastop = '(' - - elif pattern[index] == '=': - # backreference to symbolic group name - if index >= len(pattern): - raise error, '(?P= at the end of the pattern' - start = index + 1 - end = string.find(pattern, ')', start) - if end == -1: - raise error, 'no ) to end symbolic group name' - name = pattern[start:end] - if name not in groupindex.keys(): - raise error, ('symbolic group name ' + name + \ - ' has not been used yet') - stack.append([MatchMemory(groupindex[name])]) - index = end + 1 - lastop = '(?P=)' - - else: - raise error, ('unknown Python extension: ' + \ - pattern[index]) - - elif pattern[index] == ':': - # grouping, but no registers - index = index + 1 - stack.append([OpenParen(-1)]) - lastop = '(' - - elif pattern[index] == '#': - # comment - index = index + 1 - end = string.find(pattern, ')', index) - if end == -1: - raise error, 'no end to comment' - index = end + 1 - # do not change lastop - - elif pattern[index] == '=': - raise error, ('zero-width positive lookahead ' - 'assertion is unsupported') - - elif pattern[index] == '!': - raise error, ('zero-width negative lookahead ' - 'assertion is unsupported') - - elif pattern[index] in 'iImMsSxX': - raise error, ('embedded pattern modifiers are only ' - 'allowed at the beginning of the pattern') - - else: - raise error, 'unknown extension' - - else: - stack.append([OpenParen(register)]) - register = register + 1 - lastop = '(' - - elif char == ')': - # make one expression out of everything on the stack up to - # the marker left by the last parenthesis - expr = [] - while (len(stack) > 0) and (stack[-1][0].name != '('): - expr = stack[-1] + expr - del stack[-1] - - if len(stack) == 0: - raise error, 'too many close parens' - - # remove markers left by alternation - expr = filter(lambda x: x.name != '|', expr) - - # clean up jumps inserted by alternation - need_label = 0 - for i in range(len(expr)): - if (expr[i].name == 'jump') and (expr[i].label == -1): - expr[i] = Jump(label) - need_label = 1 - if need_label: - expr.append(Label(label)) - label = label + 1 - - if stack[-1][0].register > 0: - expr = [StartMemory(stack[-1][0].register)] + \ - expr + \ - [EndMemory(stack[-1][0].register)] - del stack[-1] - stack.append(expr) - lastop = ')' - - elif char == '{': - if len(stack) == 0: - raise error, 'no expression to repeat' - end = string.find(pattern, '}', index) - if end == -1: - raise error, ('no close curly bracket to match' - ' open curly bracket') - - fields = map(string.strip, - string.split(pattern[index:end], ',')) - index = end + 1 - - minimal = 0 - if (index < len(pattern)) and (pattern[index] == '?'): - minimal = 1 - index = index + 1 - - if len(fields) == 1: - # {n} or {n}? (there's really no difference) - try: - count = string.atoi(fields[0]) - except ValueError: - raise error, ('count must be an integer ' - 'inside curly braces') - if count > 65535: - raise error, 'repeat count out of range' - expr = [] - while count > 0: - expr = expr + stack[-1] - count = count - 1 - del stack[-1] - stack.append(expr) - if minimal: - lastop = '{n}?' - else: - lastop = '{n}' - - elif len(fields) == 2: - # {n,} or {n,m} - if fields[1] == '': - # {n,} - try: - min = string.atoi(fields[0]) - except ValueError: - raise error, ('minimum must be an integer ' - 'inside curly braces') - if min > 65535: - raise error, 'minimum repeat count out of range' - - expr = [] - while min > 0: - expr = expr + stack[-1] - min = min - 1 - if minimal: - expr = expr + \ - ([Jump(label + 1), - Label(label)] + \ - stack[-1] + \ - [Label(label + 1), - FailureJump(label)]) - lastop = '{n,}?' - else: - expr = expr + \ - ([Label(label), - FailureJump(label + 1)] + - stack[-1] + - [StarJump(label), - Label(label + 1)]) - lastop = '{n,}' - - del stack[-1] - stack.append(expr) - label = label + 2 - - else: - # {n,m} - try: - min = string.atoi(fields[0]) - except ValueError: - raise error, ('minimum must be an integer ' - 'inside curly braces') - try: - max = string.atoi(fields[1]) - except ValueError: - raise error, ('maximum must be an integer ' - 'inside curly braces') - if min > 65535: - raise error, ('minumim repeat count out ' - 'of range') - if max > 65535: - raise error, ('maximum repeat count out ' - 'of range') - if min > max: - raise error, ('minimum repeat count must be ' - 'less than the maximum ' - 'repeat count') - expr = [] - while min > 0: - expr = expr + stack[-1] - min = min - 1 - max = max - 1 - if minimal: - while max > 0: - expr = expr + \ - [FailureJump(label), - Jump(label + 1), - Label(label)] + \ - stack[-1] + \ - [Label(label + 1)] - max = max - 1 - label = label + 2 - del stack[-1] - stack.append(expr) - lastop = '{n,m}?' - else: - while max > 0: - expr = expr + \ - [FailureJump(label)] + \ - stack[-1] - max = max - 1 - del stack[-1] - stack.append(expr + [Label(label)]) - label = label + 1 - lastop = '{n,m}' - - else: - raise error, ('there need to be one or two fields ' - 'in a {} expression') - - elif char == '}': - raise error, 'unbalanced close curly brace' - - elif char == '*': - # Kleene closure - if len(stack) == 0: - raise error, '* needs something to repeat' - - if lastop in ['(', '|']: - raise error, '* needs something to repeat' - - if lastop in repetition_operators: - raise error, 'nested repetition operators' - - if (index < len(pattern)) and (pattern[index] == '?'): - # non-greedy matching - expr = [Jump(label + 1), - Label(label)] + \ - stack[-1] + \ - [Label(label + 1), - FailureJump(label)] - index = index + 1 - lastop = '*?' - else: - # greedy matching - expr = [Label(label), - FailureJump(label + 1)] + \ - stack[-1] + \ - [StarJump(label), - Label(label + 1)] - lastop = '*' - del stack[-1] - stack.append(expr) - label = label + 2 - - elif char == '+': - # positive closure - if len(stack) == 0: - raise error, '+ needs something to repeat' - - if lastop in ['(', '|']: - raise error, '+ needs something to repeat' - - if lastop in repetition_operators: - raise error, 'nested repetition operators' - - if (index < len(pattern)) and (pattern[index] == '?'): - # non-greedy - expr = [Label(label)] + \ - stack[-1] + \ - [FailureJump(label)] - label = label + 1 - index = index + 1 - lastop = '+?' - - else: - # greedy - expr = [DummyFailureJump(label + 1), - Label(label), - FailureJump(label + 2), - Label(label + 1)] + \ - stack[-1] + \ - [StarJump(label), - Label(label + 2)] - label = label + 3 - lastop = '+' - - del stack[-1] - stack.append(expr) - - elif char == '?': - if len(stack) == 0: - raise error, 'need something to be optional' - - if len(stack) == 0: - raise error, '? needs something to repeat' - - if lastop in ['(', '|']: - raise error, '? needs something to repeat' - - if lastop in repetition_operators: - raise error, 'nested repetition operators' - - if (index < len(pattern)) and (pattern[index] == '?'): - # non-greedy matching - expr = [FailureJump(label), - Jump(label + 1), - Label(label)] + \ - stack[-1] + \ - [Label(label + 1)] - label = label + 2 - index = index + 1 - lastop = '??' - - else: - # greedy matching - expr = [FailureJump(label)] + \ - stack[-1] + \ - [Label(label)] - label = label + 1 - lastop = '?' - - del stack[-1] - stack.append(expr) - - elif char == '.': - if flags & DOTALL: - stack.append([Set(map(chr, range(256)), flags)]) - else: - stack.append([AnyChar()]) - lastop = '.' - - elif char == '^': - if flags & MULTILINE: - stack.append([Bol()]) - else: - stack.append([BegBuf()]) - lastop = '^' - - elif char == '$': - if flags & MULTILINE: - stack.append([Eol()]) - else: - stack.append([EndBuf()]) - lastop = '$' - - elif char == '#': - if flags & VERBOSE: - # comment - index = index + 1 - end = string.find(pattern, '\n', index) - if end == -1: - index = len(pattern) - else: - index = end + 1 - # do not change lastop - else: - stack.append([Exact(char, flags)]) - lastop = '#' - - elif char in string.whitespace: - if not (flags & VERBOSE): - stack.append([Exact(char, flags)]) - lastop = char - - elif char == '[': - # compile character class - - if index >= len(pattern): - raise error, 'unclosed character class' - - negate = 0 - last = '' - set = [] - - if pattern[index] == '^': - negate = 1 - index = index + 1 - if index >= len(pattern): - raise error, 'unclosed character class' - - if pattern[index] == ']': - set.append(']') - index = index + 1 - if index >= len(pattern): - raise error, 'unclosed character class' - - elif pattern[index] == '-': - set.append('-') - index = index + 1 - if index >= len(pattern): - raise error, 'unclosed character class' - - while (index < len(pattern)) and (pattern[index] != ']'): - next = pattern[index] - index = index + 1 - if next == '-': - if index >= len(pattern): - raise error, 'incomplete range in character class' - - elif pattern[index] == ']': - set.append('-') - - else: - if last == '': - raise error, ('improper use of range in ' - 'character class') - - start = last - - if pattern[index] == '\\': - escape_type, - value, - index = expand_escape(pattern, - index + 1, - CHARCLASS) - - if escape_type == CHAR: - end = value - - else: - raise error, ('illegal escape in character ' - 'class range') - else: - end = pattern[index] - index = index + 1 - - if start > end: - raise error, ('range arguments out of order ' - 'in character class') - - for char in map(chr, range(ord(start), ord(end) + 1)): - if char not in set: - set.append(char) - - last = '' - - elif next == '\\': - # expand syntax meta-characters and add to set - if index >= len(pattern): - raise error, 'incomplete set' - - escape_type, value, index = expand_escape(pattern, - index, - CHARCLASS) - - if escape_type == CHAR: - set.append(value) - last = value - - elif escape_type == SET: - for char in value: - if char not in set: - set.append(char) - last = '' - - else: - raise error, 'illegal escape type in character class' - - else: - if next not in set: - set.append(next) - last = next - - if (index >= len(pattern)) or ( pattern[index] != ']'): - raise error, 'incomplete set' - - index = index + 1 - - if negate: - # If case is being ignored, then both upper- and lowercase - # versions of the letters must be excluded. - if flags & IGNORECASE: set=set+map(string.upper, set) - notset = [] - for char in map(chr, range(256)): - if char not in set: - notset.append(char) - if len(notset) == 0: - raise error, 'empty negated set' - stack.append([Set(notset, flags)]) - else: - if len(set) == 0: - raise error, 'empty set' - stack.append([Set(set, flags)]) - - lastop = '[]' - - else: - stack.append([Exact(char, flags)]) - lastop = char - - code = [] - while len(stack) > 0: - if stack[-1][0].name == '(': - raise error, 'too many open parens' - code = stack[-1] + code - del stack[-1] - if len(code) == 0: - raise error, 'no code generated' - code = filter(lambda x: x.name != '|', code) - need_label = 0 - for i in range(len(code)): - if (code[i].name == 'jump') and (code[i].label == -1): - code[i] = Jump(label) - need_label = 1 - if need_label: - code.append(Label(label)) - label = label + 1 - code.append(End()) -# print code - return RegexObject(pattern, flags, code, register, groupindex) - -# Replace expand_escape and _expand functions with their C equivalents. -# If you suspect bugs in the C versions, comment out the next two lines -expand_escape = reop.expand_escape -_expand = reop._expand