From 53187f32ebbb6156fa8e3ddf834dfde74cfa3208 Mon Sep 17 00:00:00 2001 From: Jeremy Hylton Date: Tue, 8 Feb 2000 19:01:29 +0000 Subject: [PATCH] now produces valid pyc files for a least a trivial subset of the language. CodeGenerator: * modify to track stack depth * add emit method that call's PythonVMCode's makeCodeObject * thread filenames through in hackish way * set flags for code objects for modules and functions XXX the docs for the flags seem out of date and/or incomplete PythonVMCode: * add doc string describing the elements of a real code object LineAddrTable: * creates an lnotab (no quite correctly though) --- Lib/compiler/pycodegen.py | 388 +++++++++++++++++++++------ Tools/compiler/compiler/pycodegen.py | 388 +++++++++++++++++++++------ 2 files changed, 614 insertions(+), 162 deletions(-) diff --git a/Lib/compiler/pycodegen.py b/Lib/compiler/pycodegen.py index 067ebc44bad..ee0c75b5223 100644 --- a/Lib/compiler/pycodegen.py +++ b/Lib/compiler/pycodegen.py @@ -11,6 +11,10 @@ import misc import marshal import new import string +import sys +import os +import stat +import struct def parse(path): f = open(path) @@ -60,7 +64,7 @@ class ASTVisitor: XXX Perhaps I can use a postorder walk for the code generator? """ - VERBOSE = 0 + VERBOSE = 1 def __init__(self): self.node = None @@ -101,9 +105,34 @@ class ASTVisitor: return meth(node) class CodeGenerator: - def __init__(self): - self.code = PythonVMCode() + def __init__(self, filename=None): + self.filename = filename + self.code = PythonVMCode(filename=filename) + self.code.setFlags(0) self.locals = misc.Stack() + # track the current and max stack size + # XXX does this belong here or in the PythonVMCode? + self.curStack = 0 + self.maxStack = 0 + + def emit(self): + """Create a Python code object + + XXX It is confusing that this method isn't related to the + method named emit in the PythonVMCode. + """ + return self.code.makeCodeObject(self.maxStack) + + def push(self, n): + self.curStack = self.curStack + n + if self.curStack > self.maxStack: + self.maxStack = self.curStack + + def pop(self, n): + if n >= self.curStack: + self.curStack = self.curStack - n + else: + self.curStack = 0 def visitDiscard(self, node): return 1 @@ -112,16 +141,16 @@ class CodeGenerator: lnf = walk(node.node, LocalNameFinder()) self.locals.push(lnf.getLocals()) self.visit(node.node) - self.code.emit('LOAD_CONST', 'None') + self.code.emit('LOAD_CONST', None) self.code.emit('RETURN_VALUE') return 1 def visitFunction(self, node): - codeBody = NestedCodeGenerator(node.code, node.argnames) - walk(node.code, codeBody) + codeBody = NestedCodeGenerator(node, filename=self.filename) + walk(node, codeBody) self.code.setLineNo(node.lineno) - self.code.emit('LOAD_CONST', codeBody.code) - self.code.emit('MAKE_FUNCTION') + self.code.emit('LOAD_CONST', codeBody) + self.code.emit('MAKE_FUNCTION', 0) self.code.emit('STORE_NAME', node.name) return 1 @@ -212,6 +241,7 @@ class CodeGenerator: self.visit(node.left) self.visit(node.right) self.code.emit(op) + self.pop(1) return 1 def visitAdd(self, node): @@ -232,9 +262,11 @@ class CodeGenerator: self.code.loadFast(node.name) else: self.code.loadGlobal(node.name) + self.push(1) def visitConst(self, node): self.code.loadConst(node.value) + self.push(1) def visitReturn(self, node): self.code.setLineNo(node.lineno) @@ -262,6 +294,7 @@ class CodeGenerator: for child in node.nodes: self.visit(child) self.code.emit('PRINT_ITEM') + self.pop(len(node.nodes)) return 1 def visitPrintnl(self, node): @@ -276,26 +309,38 @@ class NestedCodeGenerator(CodeGenerator): """ super_init = CodeGenerator.__init__ - def __init__(self, code, args): + def __init__(self, func, filename=''): """code and args of function or class being walked XXX need to separately pass to ASTVisitor. the constructor only uses the code object to find the local names + + Copies code form parent __init__ rather than calling it. """ - self.super_init() - lnf = walk(code, LocalNameFinder(args)) + self.name = func.name + self.super_init(filename) + args = func.argnames + self.code = PythonVMCode(len(args), name=func.name, + filename=filename) + if func.varargs: + self.code.setVarArgs() + if func.kwargs: + self.code.setKWArgs() + lnf = walk(func.code, LocalNameFinder(args)) self.locals.push(lnf.getLocals()) + def __repr__(self): + return "" % self.name + def visitFunction(self, node): lnf = walk(node.code, LocalNameFinder(node.argnames)) self.locals.push(lnf.getLocals()) # XXX need to handle def foo((a, b)): self.code.setLineNo(node.lineno) self.visit(node.code) - self.code.emit('LOAD_CONST', 'None') + self.code.emit('LOAD_CONST', None) self.code.emit('RETURN_VALUE') return 1 - class LocalNameFinder: def __init__(self, names=()): @@ -353,64 +398,86 @@ class ForwardRef: def resolve(self): return self.val -class CompiledModule: - """Store the code object for a compiled module +def add_hook(hooks, type, meth): + """Helper function for PythonVMCode _emit_hooks""" + l = hooks.get(type, []) + l.append(meth) + hooks[type] = l - XXX Not clear how the code objects will be stored. Seems possible - that a single code attribute is sufficient, because it will - contains references to all the need code objects. That might be - messy, though. - """ - MAGIC = (20121 | (ord('\r')<<16) | (ord('\n')<<24)) - - def __init__(self): - self.code = None - - def addCode(self, code): - """addCode(self: SelfType, code: PythonVMCode)""" - - def dump(self, path): - """create a .pyc file""" - f = open(path, 'wb') - f.write(self._pyc_header()) - marshal.dump(self.code, f) - f.close() - - def _pyc_header(self, path): - # compile.c uses marshal to write a long directly, with - # calling the interface that would also generate a 1-byte code - # to indicate the type of the value. simplest way to get the - # same effect is to call marshal and then skip the code. - buf = marshal.dumps(self.MAGIC)[1:] - # skip the mtime for now, since I don't have the write - # structure to pass the filename being compiled into this - # instance - return buf + chr(0) * 4 - class PythonVMCode: + """Creates Python code objects + + The new module is used to create the code object. The following + attribute definitions are included from the reference manual: + + co_name gives the function name + co_argcount is the number of positional arguments (including + arguments with default values) + co_nlocals is the number of local variables used by the function + (including arguments) + co_varnames is a tuple containing the names of the local variables + (starting with the argument names) + co_code is a string representing the sequence of bytecode instructions + co_consts is a tuple containing the literals used by the bytecode + co_names is a tuple containing the names used by the bytecode + co_filename is the filename from which the code was compiled + co_firstlineno is the first line number of the function + co_lnotab is a string encoding the mapping from byte code offsets + to line numbers (for detais see the source code of the + interpreter) + see code com_set_lineno and com_add_lnotab + it's a string with 2bytes per set_lineno + + co_stacksize is the required stack size (including local variables) + co_flags is an integer encoding a number of flags for the + interpreter. - def __init__(self): + The following flag bits are defined for co_flags: bit 2 is set if + the function uses the "*arguments" syntax to accept an arbitrary + number of positional arguments; bit 3 is set if the function uses + the "**keywords" syntax to accept arbitrary keyword arguments; + other bits are used internally or reserved for future use. + + If a code object represents a function, the first item in + co_consts is the documentation string of the function, or None if + undefined. + """ + + # XXX flag bits + VARARGS = 0x04 + KWARGS = 0x08 + + def __init__(self, argcount=0, name='?', filename='', + docstring=None): + # XXX why is the default value for flags 3? self.insts = [] # used by makeCodeObject - self.argcount = 0 + self.argcount = argcount self.code = '' - self.consts = [] - self.filename = '' - self.firstlineno = 0 - self.flags = 0 - self.lnotab = None - self.name = '' + self.consts = [docstring] + self.filename = filename + self.flags = 3 + self.name = name self.names = [] - self.nlocals = 0 - self.stacksize = 2 self.varnames = [] + # lnotab support + self.firstlineno = 0 + self.lastlineno = 0 + self.last_addr = 0 + self.lnotab = '' def __repr__(self): return "" % len(self.insts) - def emit(self, *args): - print "emit", args - self.insts.append(args) + def setFlags(self, val): + """XXX for module's function""" + self.flags = 0 + + def setVarArgs(self): + self.flags = self.flags | self.VARARGS + + def setKWArgs(self): + self.flags = self.flags | self.KWARGS def getCurInst(self): return len(self.insts) @@ -418,23 +485,70 @@ class PythonVMCode: def getNextInst(self): return len(self.insts) + 1 - def convert(self): - """Convert human-readable names to real bytecode""" - pass + def dump(self, io=sys.stdout): + i = 0 + for inst in self.insts: + if inst[0] == 'SET_LINENO': + io.write("\n") + io.write(" %3d " % i) + if len(inst) == 1: + io.write("%s\n" % inst) + else: + io.write("%-15.15s\t%s\n" % inst) + i = i + 1 - def makeCodeObject(self): - """Make a Python code object""" - code = [] + def makeCodeObject(self, stacksize): + """Make a Python code object + + This creates a Python code object using the new module. This + seems simpler than reverse-engineering the way marshal dumps + code objects into .pyc files. One of the key difficulties is + figuring out how to layout references to code objects that + appear on the VM stack; e.g. + 3 SET_LINENO 1 + 6 LOAD_CONST 0 ( 0 or line > 0: + # write the values in 1-byte chunks that sum + # to desired value + trunc_addr = addr + trunc_line = line + if trunc_addr > 255: + trunc_addr = 255 + if trunc_line > 255: + trunc_line = 255 + self.lnotab.append(trunc_addr) + self.lnotab.append(trunc_line) + addr = addr - trunc_addr + line = line - trunc_line + self.lastline = lineno + self.lastoff = self.codeOffset + + def getCode(self): + return string.join(self.code, '') + + def getTable(self): + return string.join(map(chr, self.lnotab), '') + +class CompiledModule: + """Store the code object for a compiled module + + XXX Not clear how the code objects will be stored. Seems possible + that a single code attribute is sufficient, because it will + contains references to all the need code objects. That might be + messy, though. + """ + MAGIC = (20121 | (ord('\r')<<16) | (ord('\n')<<24)) + + def __init__(self, source, filename): + self.source = source + self.filename = filename + + def compile(self): + t = transformer.Transformer() + self.ast = t.parsesuite(self.source) + cg = CodeGenerator(self.filename) + walk(self.ast, cg) + self.code = cg.emit() + + def dump(self, path): + """create a .pyc file""" + f = open(path, 'wb') + f.write(self._pyc_header()) + marshal.dump(self.code, f) + f.close() + + def _pyc_header(self): + # compile.c uses marshal to write a long directly, with + # calling the interface that would also generate a 1-byte code + # to indicate the type of the value. simplest way to get the + # same effect is to call marshal and then skip the code. + magic = marshal.dumps(self.MAGIC)[1:] + mtime = os.stat(self.filename)[stat.ST_MTIME] + mtime = struct.pack('i', mtime) + return magic + mtime + if __name__ == "__main__": - tree = parse('test.py') - cg = CodeGenerator() - ASTVisitor.VERBOSE = 1 - w = walk(tree, cg) - w.VERBOSE = 1 - for i in range(len(cg.code.insts)): - inst = cg.code.insts[i] - if inst[0] == 'SET_LINENO': - print - print "%4d" % i, inst - code = cg.code.makeCodeObject() + if len(sys.argv) > 1: + filename = sys.argv[1] + else: + filename = 'test.py' + buf = open(filename).read() + mod = CompiledModule(buf, filename) + mod.compile() + mod.dump(filename + 'c') diff --git a/Tools/compiler/compiler/pycodegen.py b/Tools/compiler/compiler/pycodegen.py index 067ebc44bad..ee0c75b5223 100644 --- a/Tools/compiler/compiler/pycodegen.py +++ b/Tools/compiler/compiler/pycodegen.py @@ -11,6 +11,10 @@ import misc import marshal import new import string +import sys +import os +import stat +import struct def parse(path): f = open(path) @@ -60,7 +64,7 @@ class ASTVisitor: XXX Perhaps I can use a postorder walk for the code generator? """ - VERBOSE = 0 + VERBOSE = 1 def __init__(self): self.node = None @@ -101,9 +105,34 @@ class ASTVisitor: return meth(node) class CodeGenerator: - def __init__(self): - self.code = PythonVMCode() + def __init__(self, filename=None): + self.filename = filename + self.code = PythonVMCode(filename=filename) + self.code.setFlags(0) self.locals = misc.Stack() + # track the current and max stack size + # XXX does this belong here or in the PythonVMCode? + self.curStack = 0 + self.maxStack = 0 + + def emit(self): + """Create a Python code object + + XXX It is confusing that this method isn't related to the + method named emit in the PythonVMCode. + """ + return self.code.makeCodeObject(self.maxStack) + + def push(self, n): + self.curStack = self.curStack + n + if self.curStack > self.maxStack: + self.maxStack = self.curStack + + def pop(self, n): + if n >= self.curStack: + self.curStack = self.curStack - n + else: + self.curStack = 0 def visitDiscard(self, node): return 1 @@ -112,16 +141,16 @@ class CodeGenerator: lnf = walk(node.node, LocalNameFinder()) self.locals.push(lnf.getLocals()) self.visit(node.node) - self.code.emit('LOAD_CONST', 'None') + self.code.emit('LOAD_CONST', None) self.code.emit('RETURN_VALUE') return 1 def visitFunction(self, node): - codeBody = NestedCodeGenerator(node.code, node.argnames) - walk(node.code, codeBody) + codeBody = NestedCodeGenerator(node, filename=self.filename) + walk(node, codeBody) self.code.setLineNo(node.lineno) - self.code.emit('LOAD_CONST', codeBody.code) - self.code.emit('MAKE_FUNCTION') + self.code.emit('LOAD_CONST', codeBody) + self.code.emit('MAKE_FUNCTION', 0) self.code.emit('STORE_NAME', node.name) return 1 @@ -212,6 +241,7 @@ class CodeGenerator: self.visit(node.left) self.visit(node.right) self.code.emit(op) + self.pop(1) return 1 def visitAdd(self, node): @@ -232,9 +262,11 @@ class CodeGenerator: self.code.loadFast(node.name) else: self.code.loadGlobal(node.name) + self.push(1) def visitConst(self, node): self.code.loadConst(node.value) + self.push(1) def visitReturn(self, node): self.code.setLineNo(node.lineno) @@ -262,6 +294,7 @@ class CodeGenerator: for child in node.nodes: self.visit(child) self.code.emit('PRINT_ITEM') + self.pop(len(node.nodes)) return 1 def visitPrintnl(self, node): @@ -276,26 +309,38 @@ class NestedCodeGenerator(CodeGenerator): """ super_init = CodeGenerator.__init__ - def __init__(self, code, args): + def __init__(self, func, filename=''): """code and args of function or class being walked XXX need to separately pass to ASTVisitor. the constructor only uses the code object to find the local names + + Copies code form parent __init__ rather than calling it. """ - self.super_init() - lnf = walk(code, LocalNameFinder(args)) + self.name = func.name + self.super_init(filename) + args = func.argnames + self.code = PythonVMCode(len(args), name=func.name, + filename=filename) + if func.varargs: + self.code.setVarArgs() + if func.kwargs: + self.code.setKWArgs() + lnf = walk(func.code, LocalNameFinder(args)) self.locals.push(lnf.getLocals()) + def __repr__(self): + return "" % self.name + def visitFunction(self, node): lnf = walk(node.code, LocalNameFinder(node.argnames)) self.locals.push(lnf.getLocals()) # XXX need to handle def foo((a, b)): self.code.setLineNo(node.lineno) self.visit(node.code) - self.code.emit('LOAD_CONST', 'None') + self.code.emit('LOAD_CONST', None) self.code.emit('RETURN_VALUE') return 1 - class LocalNameFinder: def __init__(self, names=()): @@ -353,64 +398,86 @@ class ForwardRef: def resolve(self): return self.val -class CompiledModule: - """Store the code object for a compiled module +def add_hook(hooks, type, meth): + """Helper function for PythonVMCode _emit_hooks""" + l = hooks.get(type, []) + l.append(meth) + hooks[type] = l - XXX Not clear how the code objects will be stored. Seems possible - that a single code attribute is sufficient, because it will - contains references to all the need code objects. That might be - messy, though. - """ - MAGIC = (20121 | (ord('\r')<<16) | (ord('\n')<<24)) - - def __init__(self): - self.code = None - - def addCode(self, code): - """addCode(self: SelfType, code: PythonVMCode)""" - - def dump(self, path): - """create a .pyc file""" - f = open(path, 'wb') - f.write(self._pyc_header()) - marshal.dump(self.code, f) - f.close() - - def _pyc_header(self, path): - # compile.c uses marshal to write a long directly, with - # calling the interface that would also generate a 1-byte code - # to indicate the type of the value. simplest way to get the - # same effect is to call marshal and then skip the code. - buf = marshal.dumps(self.MAGIC)[1:] - # skip the mtime for now, since I don't have the write - # structure to pass the filename being compiled into this - # instance - return buf + chr(0) * 4 - class PythonVMCode: + """Creates Python code objects + + The new module is used to create the code object. The following + attribute definitions are included from the reference manual: + + co_name gives the function name + co_argcount is the number of positional arguments (including + arguments with default values) + co_nlocals is the number of local variables used by the function + (including arguments) + co_varnames is a tuple containing the names of the local variables + (starting with the argument names) + co_code is a string representing the sequence of bytecode instructions + co_consts is a tuple containing the literals used by the bytecode + co_names is a tuple containing the names used by the bytecode + co_filename is the filename from which the code was compiled + co_firstlineno is the first line number of the function + co_lnotab is a string encoding the mapping from byte code offsets + to line numbers (for detais see the source code of the + interpreter) + see code com_set_lineno and com_add_lnotab + it's a string with 2bytes per set_lineno + + co_stacksize is the required stack size (including local variables) + co_flags is an integer encoding a number of flags for the + interpreter. - def __init__(self): + The following flag bits are defined for co_flags: bit 2 is set if + the function uses the "*arguments" syntax to accept an arbitrary + number of positional arguments; bit 3 is set if the function uses + the "**keywords" syntax to accept arbitrary keyword arguments; + other bits are used internally or reserved for future use. + + If a code object represents a function, the first item in + co_consts is the documentation string of the function, or None if + undefined. + """ + + # XXX flag bits + VARARGS = 0x04 + KWARGS = 0x08 + + def __init__(self, argcount=0, name='?', filename='', + docstring=None): + # XXX why is the default value for flags 3? self.insts = [] # used by makeCodeObject - self.argcount = 0 + self.argcount = argcount self.code = '' - self.consts = [] - self.filename = '' - self.firstlineno = 0 - self.flags = 0 - self.lnotab = None - self.name = '' + self.consts = [docstring] + self.filename = filename + self.flags = 3 + self.name = name self.names = [] - self.nlocals = 0 - self.stacksize = 2 self.varnames = [] + # lnotab support + self.firstlineno = 0 + self.lastlineno = 0 + self.last_addr = 0 + self.lnotab = '' def __repr__(self): return "" % len(self.insts) - def emit(self, *args): - print "emit", args - self.insts.append(args) + def setFlags(self, val): + """XXX for module's function""" + self.flags = 0 + + def setVarArgs(self): + self.flags = self.flags | self.VARARGS + + def setKWArgs(self): + self.flags = self.flags | self.KWARGS def getCurInst(self): return len(self.insts) @@ -418,23 +485,70 @@ class PythonVMCode: def getNextInst(self): return len(self.insts) + 1 - def convert(self): - """Convert human-readable names to real bytecode""" - pass + def dump(self, io=sys.stdout): + i = 0 + for inst in self.insts: + if inst[0] == 'SET_LINENO': + io.write("\n") + io.write(" %3d " % i) + if len(inst) == 1: + io.write("%s\n" % inst) + else: + io.write("%-15.15s\t%s\n" % inst) + i = i + 1 - def makeCodeObject(self): - """Make a Python code object""" - code = [] + def makeCodeObject(self, stacksize): + """Make a Python code object + + This creates a Python code object using the new module. This + seems simpler than reverse-engineering the way marshal dumps + code objects into .pyc files. One of the key difficulties is + figuring out how to layout references to code objects that + appear on the VM stack; e.g. + 3 SET_LINENO 1 + 6 LOAD_CONST 0 ( 0 or line > 0: + # write the values in 1-byte chunks that sum + # to desired value + trunc_addr = addr + trunc_line = line + if trunc_addr > 255: + trunc_addr = 255 + if trunc_line > 255: + trunc_line = 255 + self.lnotab.append(trunc_addr) + self.lnotab.append(trunc_line) + addr = addr - trunc_addr + line = line - trunc_line + self.lastline = lineno + self.lastoff = self.codeOffset + + def getCode(self): + return string.join(self.code, '') + + def getTable(self): + return string.join(map(chr, self.lnotab), '') + +class CompiledModule: + """Store the code object for a compiled module + + XXX Not clear how the code objects will be stored. Seems possible + that a single code attribute is sufficient, because it will + contains references to all the need code objects. That might be + messy, though. + """ + MAGIC = (20121 | (ord('\r')<<16) | (ord('\n')<<24)) + + def __init__(self, source, filename): + self.source = source + self.filename = filename + + def compile(self): + t = transformer.Transformer() + self.ast = t.parsesuite(self.source) + cg = CodeGenerator(self.filename) + walk(self.ast, cg) + self.code = cg.emit() + + def dump(self, path): + """create a .pyc file""" + f = open(path, 'wb') + f.write(self._pyc_header()) + marshal.dump(self.code, f) + f.close() + + def _pyc_header(self): + # compile.c uses marshal to write a long directly, with + # calling the interface that would also generate a 1-byte code + # to indicate the type of the value. simplest way to get the + # same effect is to call marshal and then skip the code. + magic = marshal.dumps(self.MAGIC)[1:] + mtime = os.stat(self.filename)[stat.ST_MTIME] + mtime = struct.pack('i', mtime) + return magic + mtime + if __name__ == "__main__": - tree = parse('test.py') - cg = CodeGenerator() - ASTVisitor.VERBOSE = 1 - w = walk(tree, cg) - w.VERBOSE = 1 - for i in range(len(cg.code.insts)): - inst = cg.code.insts[i] - if inst[0] == 'SET_LINENO': - print - print "%4d" % i, inst - code = cg.code.makeCodeObject() + if len(sys.argv) > 1: + filename = sys.argv[1] + else: + filename = 'test.py' + buf = open(filename).read() + mod = CompiledModule(buf, filename) + mod.compile() + mod.dump(filename + 'c')