555 lines
19 KiB
Python
Executable File
555 lines
19 KiB
Python
Executable File
#! /usr/bin/env python
|
||
|
||
"""Generate ESIS events based on a LaTeX source document and
|
||
configuration data.
|
||
|
||
The conversion is not strong enough to work with arbitrary LaTeX
|
||
documents; it has only been designed to work with the highly stylized
|
||
markup used in the standard Python documentation. A lot of
|
||
information about specific markup is encoded in the control table
|
||
passed to the convert() function; changing this table can allow this
|
||
tool to support additional LaTeX markups.
|
||
|
||
The format of the table is largely undocumented; see the commented
|
||
headers where the table is specified in main(). There is no provision
|
||
to load an alternate table from an external file.
|
||
"""
|
||
|
||
import errno
|
||
import getopt
|
||
import os
|
||
import re
|
||
import string
|
||
import sys
|
||
import UserList
|
||
import xml.sax.saxutils
|
||
|
||
from types import ListType, StringType, TupleType
|
||
|
||
try:
|
||
from xml.parsers.xmllib import XMLParser
|
||
except ImportError:
|
||
from xmllib import XMLParser
|
||
|
||
|
||
from esistools import encode
|
||
|
||
|
||
DEBUG = 0
|
||
|
||
|
||
class LaTeXFormatError(Exception):
|
||
pass
|
||
|
||
|
||
class LaTeXStackError(LaTeXFormatError):
|
||
def __init__(self, found, stack):
|
||
msg = "environment close for %s doesn't match;\n stack = %s" \
|
||
% (found, stack)
|
||
self.found = found
|
||
self.stack = stack[:]
|
||
LaTeXFormatError.__init__(self, msg)
|
||
|
||
|
||
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
|
||
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
|
||
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
|
||
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
|
||
_text_rx = re.compile(r"[^]~%\\{}]+")
|
||
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]", re.MULTILINE)
|
||
# _parameter_rx is this complicated to allow {...} inside a parameter;
|
||
# this is useful to match tabular layout specifications like {c|p{24pt}}
|
||
_parameter_rx = re.compile("[ \n]*{(([^{}}]|{[^}]*})*)}")
|
||
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
|
||
_start_group_rx = re.compile("[ \n]*{")
|
||
_start_optional_rx = re.compile("[ \n]*[[]")
|
||
|
||
|
||
ESCAPED_CHARS = "$%#^ {}&~"
|
||
|
||
|
||
def dbgmsg(msg):
|
||
if DEBUG:
|
||
sys.stderr.write(msg + "\n")
|
||
|
||
def pushing(name, point, depth):
|
||
dbgmsg("pushing <%s> at %s" % (name, point))
|
||
|
||
def popping(name, point, depth):
|
||
dbgmsg("popping </%s> at %s" % (name, point))
|
||
|
||
|
||
class _Stack(UserList.UserList):
|
||
def append(self, entry):
|
||
if type(entry) is not StringType:
|
||
raise LaTeXFormatError("cannot push non-string on stack: "
|
||
+ `entry`)
|
||
#dbgmsg("%s<%s>" % (" "*len(self.data), entry))
|
||
self.data.append(entry)
|
||
|
||
def pop(self, index=-1):
|
||
entry = self.data[index]
|
||
del self.data[index]
|
||
#dbgmsg("%s</%s>" % (" "*len(self.data), entry))
|
||
|
||
def __delitem__(self, index):
|
||
entry = self.data[index]
|
||
del self.data[index]
|
||
#dbgmsg("%s</%s>" % (" "*len(self.data), entry))
|
||
|
||
|
||
def new_stack():
|
||
if DEBUG:
|
||
return _Stack()
|
||
return []
|
||
|
||
|
||
class Conversion:
|
||
def __init__(self, ifp, ofp, table):
|
||
self.write = ofp.write
|
||
self.ofp = ofp
|
||
self.table = table
|
||
self.line = string.join([s.rstrip() for s in ifp.readlines()], "\n")
|
||
self.preamble = 1
|
||
|
||
def convert(self):
|
||
self.subconvert()
|
||
|
||
def subconvert(self, endchar=None, depth=0):
|
||
#
|
||
# Parses content, including sub-structures, until the character
|
||
# 'endchar' is found (with no open structures), or until the end
|
||
# of the input data is endchar is None.
|
||
#
|
||
stack = new_stack()
|
||
line = self.line
|
||
while line:
|
||
if line[0] == endchar and not stack:
|
||
self.line = line
|
||
return line
|
||
m = _comment_rx.match(line)
|
||
if m:
|
||
text = m.group(1)
|
||
if text:
|
||
self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
|
||
% encode(text))
|
||
line = line[m.end():]
|
||
continue
|
||
m = _begin_env_rx.match(line)
|
||
if m:
|
||
name = m.group(1)
|
||
entry = self.get_env_entry(name)
|
||
# re-write to use the macro handler
|
||
line = r"\%s %s" % (name, line[m.end():])
|
||
continue
|
||
m = _end_env_rx.match(line)
|
||
if m:
|
||
# end of environment
|
||
envname = m.group(1)
|
||
entry = self.get_entry(envname)
|
||
while stack and envname != stack[-1] \
|
||
and stack[-1] in entry.endcloses:
|
||
self.write(")%s\n" % stack.pop())
|
||
if stack and envname == stack[-1]:
|
||
self.write(")%s\n" % entry.outputname)
|
||
del stack[-1]
|
||
else:
|
||
raise LaTeXStackError(envname, stack)
|
||
line = line[m.end():]
|
||
continue
|
||
m = _begin_macro_rx.match(line)
|
||
if m:
|
||
# start of macro
|
||
macroname = m.group(1)
|
||
if macroname == "c":
|
||
# Ugh! This is a combining character...
|
||
endpos = m.end()
|
||
self.combining_char("c", line[endpos])
|
||
line = line[endpos + 1:]
|
||
continue
|
||
entry = self.get_entry(macroname)
|
||
if entry.verbatim:
|
||
# magic case!
|
||
pos = line.find("\\end{%s}" % macroname)
|
||
text = line[m.end(1):pos]
|
||
stack.append(entry.name)
|
||
self.write("(%s\n" % entry.outputname)
|
||
self.write("-%s\n" % encode(text))
|
||
self.write(")%s\n" % entry.outputname)
|
||
stack.pop()
|
||
line = line[pos + len("\\end{%s}" % macroname):]
|
||
continue
|
||
while stack and stack[-1] in entry.closes:
|
||
top = stack.pop()
|
||
topentry = self.get_entry(top)
|
||
if topentry.outputname:
|
||
self.write(")%s\n-\\n\n" % topentry.outputname)
|
||
#
|
||
if entry.outputname and entry.empty:
|
||
self.write("e\n")
|
||
#
|
||
params, optional, empty = self.start_macro(macroname)
|
||
# rip off the macroname
|
||
if params:
|
||
line = line[m.end(1):]
|
||
elif empty:
|
||
line = line[m.end(1):]
|
||
else:
|
||
line = line[m.end():]
|
||
opened = 0
|
||
implied_content = 0
|
||
|
||
# handle attribute mappings here:
|
||
for pentry in params:
|
||
if pentry.type == "attribute":
|
||
if pentry.optional:
|
||
m = _optional_rx.match(line)
|
||
if m and entry.outputname:
|
||
line = line[m.end():]
|
||
self.dump_attr(pentry, m.group(1))
|
||
elif pentry.text and entry.outputname:
|
||
# value supplied by conversion spec:
|
||
self.dump_attr(pentry, pentry.text)
|
||
else:
|
||
m = _parameter_rx.match(line)
|
||
if not m:
|
||
raise LaTeXFormatError(
|
||
"could not extract parameter %s for %s: %s"
|
||
% (pentry.name, macroname, `line[:100]`))
|
||
if entry.outputname:
|
||
self.dump_attr(pentry, m.group(1))
|
||
line = line[m.end():]
|
||
elif pentry.type == "child":
|
||
if pentry.optional:
|
||
m = _optional_rx.match(line)
|
||
if m:
|
||
line = line[m.end():]
|
||
if entry.outputname and not opened:
|
||
opened = 1
|
||
self.write("(%s\n" % entry.outputname)
|
||
stack.append(macroname)
|
||
stack.append(pentry.name)
|
||
self.write("(%s\n" % pentry.name)
|
||
self.write("-%s\n" % encode(m.group(1)))
|
||
self.write(")%s\n" % pentry.name)
|
||
stack.pop()
|
||
else:
|
||
if entry.outputname and not opened:
|
||
opened = 1
|
||
self.write("(%s\n" % entry.outputname)
|
||
stack.append(entry.name)
|
||
self.write("(%s\n" % pentry.name)
|
||
stack.append(pentry.name)
|
||
self.line = skip_white(line)[1:]
|
||
line = self.subconvert(
|
||
"}", len(stack) + depth + 1)[1:]
|
||
self.write(")%s\n" % stack.pop())
|
||
elif pentry.type == "content":
|
||
if pentry.implied:
|
||
implied_content = 1
|
||
else:
|
||
if entry.outputname and not opened:
|
||
opened = 1
|
||
self.write("(%s\n" % entry.outputname)
|
||
stack.append(entry.name)
|
||
line = skip_white(line)
|
||
if line[0] != "{":
|
||
raise LaTeXFormatError(
|
||
"missing content for " + macroname)
|
||
self.line = line[1:]
|
||
line = self.subconvert("}", len(stack) + depth + 1)
|
||
if line and line[0] == "}":
|
||
line = line[1:]
|
||
elif pentry.type == "text" and pentry.text:
|
||
if entry.outputname and not opened:
|
||
opened = 1
|
||
stack.append(entry.name)
|
||
self.write("(%s\n" % entry.outputname)
|
||
#dbgmsg("--- text: %s" % `pentry.text`)
|
||
self.write("-%s\n" % encode(pentry.text))
|
||
elif pentry.type == "entityref":
|
||
self.write("&%s\n" % pentry.name)
|
||
if entry.outputname:
|
||
if not opened:
|
||
self.write("(%s\n" % entry.outputname)
|
||
stack.append(entry.name)
|
||
if not implied_content:
|
||
self.write(")%s\n" % entry.outputname)
|
||
stack.pop()
|
||
continue
|
||
if line[0] == endchar and not stack:
|
||
self.line = line[1:]
|
||
return self.line
|
||
if line[0] == "}":
|
||
# end of macro or group
|
||
macroname = stack[-1]
|
||
if macroname:
|
||
conversion = self.table[macroname]
|
||
if conversion.outputname:
|
||
# otherwise, it was just a bare group
|
||
self.write(")%s\n" % conversion.outputname)
|
||
del stack[-1]
|
||
line = line[1:]
|
||
continue
|
||
if line[0] == "~":
|
||
# don't worry about the "tie" aspect of this command
|
||
line = line[1:]
|
||
self.write("- \n")
|
||
continue
|
||
if line[0] == "{":
|
||
stack.append("")
|
||
line = line[1:]
|
||
continue
|
||
if line[0] == "\\" and line[1] in ESCAPED_CHARS:
|
||
self.write("-%s\n" % encode(line[1]))
|
||
line = line[2:]
|
||
continue
|
||
if line[:2] == r"\\":
|
||
self.write("(BREAK\n)BREAK\n")
|
||
line = line[2:]
|
||
continue
|
||
if line[:2] == r"\_":
|
||
line = "_" + line[2:]
|
||
continue
|
||
if line[:2] in (r"\'", r'\"'):
|
||
# combining characters...
|
||
self.combining_char(line[1], line[2])
|
||
line = line[3:]
|
||
continue
|
||
m = _text_rx.match(line)
|
||
if m:
|
||
text = encode(m.group())
|
||
self.write("-%s\n" % text)
|
||
line = line[m.end():]
|
||
continue
|
||
# special case because of \item[]
|
||
# XXX can we axe this???
|
||
if line[0] == "]":
|
||
self.write("-]\n")
|
||
line = line[1:]
|
||
continue
|
||
# avoid infinite loops
|
||
extra = ""
|
||
if len(line) > 100:
|
||
extra = "..."
|
||
raise LaTeXFormatError("could not identify markup: %s%s"
|
||
% (`line[:100]`, extra))
|
||
while stack:
|
||
entry = self.get_entry(stack[-1])
|
||
if entry.closes:
|
||
self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
|
||
del stack[-1]
|
||
else:
|
||
break
|
||
if stack:
|
||
raise LaTeXFormatError("elements remain on stack: "
|
||
+ string.join(stack, ", "))
|
||
# otherwise we just ran out of input here...
|
||
|
||
# This is a really limited table of combinations, but it will have
|
||
# to do for now.
|
||
_combinations = {
|
||
("c", "c"): 0x00E7,
|
||
("'", "e"): 0x00E9,
|
||
('"', "o"): 0x00F6,
|
||
}
|
||
|
||
def combining_char(self, prefix, char):
|
||
ordinal = self._combinations[(prefix, char)]
|
||
self.write("-\\%%%d;\n" % ordinal)
|
||
|
||
def start_macro(self, name):
|
||
conversion = self.get_entry(name)
|
||
parameters = conversion.parameters
|
||
optional = parameters and parameters[0].optional
|
||
return parameters, optional, conversion.empty
|
||
|
||
def get_entry(self, name):
|
||
entry = self.table.get(name)
|
||
if entry is None:
|
||
dbgmsg("get_entry(%s) failing; building default entry!" % `name`)
|
||
# not defined; build a default entry:
|
||
entry = TableEntry(name)
|
||
entry.has_content = 1
|
||
entry.parameters.append(Parameter("content"))
|
||
self.table[name] = entry
|
||
return entry
|
||
|
||
def get_env_entry(self, name):
|
||
entry = self.table.get(name)
|
||
if entry is None:
|
||
# not defined; build a default entry:
|
||
entry = TableEntry(name, 1)
|
||
entry.has_content = 1
|
||
entry.parameters.append(Parameter("content"))
|
||
entry.parameters[-1].implied = 1
|
||
self.table[name] = entry
|
||
elif not entry.environment:
|
||
raise LaTeXFormatError(
|
||
name + " is defined as a macro; expected environment")
|
||
return entry
|
||
|
||
def dump_attr(self, pentry, value):
|
||
if not (pentry.name and value):
|
||
return
|
||
if _token_rx.match(value):
|
||
dtype = "TOKEN"
|
||
else:
|
||
dtype = "CDATA"
|
||
self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
|
||
|
||
|
||
def convert(ifp, ofp, table):
|
||
c = Conversion(ifp, ofp, table)
|
||
try:
|
||
c.convert()
|
||
except IOError, (err, msg):
|
||
if err != errno.EPIPE:
|
||
raise
|
||
|
||
|
||
def skip_white(line):
|
||
while line and line[0] in " %\n\t\r":
|
||
line = line[1:].lstrip()
|
||
return line
|
||
|
||
|
||
|
||
class TableEntry:
|
||
def __init__(self, name, environment=0):
|
||
self.name = name
|
||
self.outputname = name
|
||
self.environment = environment
|
||
self.empty = not environment
|
||
self.has_content = 0
|
||
self.verbatim = 0
|
||
self.auto_close = 0
|
||
self.parameters = []
|
||
self.closes = []
|
||
self.endcloses = []
|
||
|
||
class Parameter:
|
||
def __init__(self, type, name=None, optional=0):
|
||
self.type = type
|
||
self.name = name
|
||
self.optional = optional
|
||
self.text = ''
|
||
self.implied = 0
|
||
|
||
|
||
class TableParser(XMLParser):
|
||
def __init__(self, table=None):
|
||
if table is None:
|
||
table = {}
|
||
self.__table = table
|
||
self.__current = None
|
||
self.__buffer = ''
|
||
XMLParser.__init__(self)
|
||
|
||
def get_table(self):
|
||
for entry in self.__table.values():
|
||
if entry.environment and not entry.has_content:
|
||
p = Parameter("content")
|
||
p.implied = 1
|
||
entry.parameters.append(p)
|
||
entry.has_content = 1
|
||
return self.__table
|
||
|
||
def start_environment(self, attrs):
|
||
name = attrs["name"]
|
||
self.__current = TableEntry(name, environment=1)
|
||
self.__current.verbatim = attrs.get("verbatim") == "yes"
|
||
if attrs.has_key("outputname"):
|
||
self.__current.outputname = attrs.get("outputname")
|
||
self.__current.endcloses = attrs.get("endcloses", "").split()
|
||
def end_environment(self):
|
||
self.end_macro()
|
||
|
||
def start_macro(self, attrs):
|
||
name = attrs["name"]
|
||
self.__current = TableEntry(name)
|
||
self.__current.closes = attrs.get("closes", "").split()
|
||
if attrs.has_key("outputname"):
|
||
self.__current.outputname = attrs.get("outputname")
|
||
def end_macro(self):
|
||
self.__table[self.__current.name] = self.__current
|
||
self.__current = None
|
||
|
||
def start_attribute(self, attrs):
|
||
name = attrs.get("name")
|
||
optional = attrs.get("optional") == "yes"
|
||
if name:
|
||
p = Parameter("attribute", name, optional=optional)
|
||
else:
|
||
p = Parameter("attribute", optional=optional)
|
||
self.__current.parameters.append(p)
|
||
self.__buffer = ''
|
||
def end_attribute(self):
|
||
self.__current.parameters[-1].text = self.__buffer
|
||
|
||
def start_entityref(self, attrs):
|
||
name = attrs["name"]
|
||
p = Parameter("entityref", name)
|
||
self.__current.parameters.append(p)
|
||
|
||
def start_child(self, attrs):
|
||
name = attrs["name"]
|
||
p = Parameter("child", name, attrs.get("optional") == "yes")
|
||
self.__current.parameters.append(p)
|
||
self.__current.empty = 0
|
||
|
||
def start_content(self, attrs):
|
||
p = Parameter("content")
|
||
p.implied = attrs.get("implied") == "yes"
|
||
if self.__current.environment:
|
||
p.implied = 1
|
||
self.__current.parameters.append(p)
|
||
self.__current.has_content = 1
|
||
self.__current.empty = 0
|
||
|
||
def start_text(self, attrs):
|
||
self.__current.empty = 0
|
||
self.__buffer = ''
|
||
def end_text(self):
|
||
p = Parameter("text")
|
||
p.text = self.__buffer
|
||
self.__current.parameters.append(p)
|
||
|
||
def handle_data(self, data):
|
||
self.__buffer = self.__buffer + data
|
||
|
||
|
||
def load_table(fp, table=None):
|
||
parser = TableParser(table=table)
|
||
parser.feed(fp.read())
|
||
parser.close()
|
||
return parser.get_table()
|
||
|
||
|
||
def main():
|
||
global DEBUG
|
||
#
|
||
opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
|
||
for opt, arg in opts:
|
||
if opt in ("-D", "--debug"):
|
||
DEBUG = DEBUG + 1
|
||
if len(args) == 0:
|
||
ifp = sys.stdin
|
||
ofp = sys.stdout
|
||
elif len(args) == 1:
|
||
ifp = open(args)
|
||
ofp = sys.stdout
|
||
elif len(args) == 2:
|
||
ifp = open(args[0])
|
||
ofp = open(args[1], "w")
|
||
else:
|
||
usage()
|
||
sys.exit(2)
|
||
|
||
table = load_table(open(os.path.join(sys.path[0], 'conversion.xml')))
|
||
convert(ifp, ofp, table)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|