cpython/Doc/tools/sgmlconv/latex2esis.py

566 lines
20 KiB
Python
Raw Normal View History

1998-11-23 12:59:39 -04:00
#! /usr/bin/env python
"""Generate ESIS events based on a LaTeX source document and
configuration data.
The conversion is not strong enough to work with arbitrary LaTeX
documents; it has only been designed to work with the highly stylized
markup used in the standard Python documentation. A lot of
information about specific markup is encoded in the control table
passed to the convert() function; changing this table can allow this
tool to support additional LaTeX markups.
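The format of the table is largely undocumented. The table itself is
loaded by main() from the file conversion.xml found in the directory
named by sys.path[0]; see load_table() and the TableHandler class for
the elements that are recognized. There is no command-line option to
load an alternate table from a different file.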
1998-11-23 12:59:39 -04:00
"""
import errno
import getopt
import os
1998-11-23 12:59:39 -04:00
import re
import sys
2001-11-30 15:30:03 -04:00
import xml.sax
import xml.sax.saxutils
1998-11-23 12:59:39 -04:00
from esistools import encode
DEBUG = 0
class LaTeXFormatError(Exception):
    """Raised when the LaTeX input cannot be handled by the converter."""
class LaTeXStackError(LaTeXFormatError):
    """Raised when an environment close does not match the open elements.

    The offending environment name is kept in ``found`` and a copy of the
    element stack at the time of the error in ``stack``.
    """

    def __init__(self, found, stack):
        self.found = found
        # Keep a snapshot so later changes to the caller's stack do not
        # alter what the exception reports.
        self.stack = stack[:]
        LaTeXFormatError.__init__(
            self,
            "environment close for %s doesn't match;\n stack = %s"
            % (found, stack))
1998-11-23 12:59:39 -04:00
1998-11-23 12:59:39 -04:00
# Patterns used to recognize the pieces of LaTeX markup the converter
# understands.  All of them are applied with .match(), i.e. anchored at
# the start of the remaining input.

# \begin{name} and \end{name}; group 1 is the environment name.
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
# \macro or \macro*; group 1 is the macro name, group 2 is either the
# opening brace or the whitespace that follows the name.
_begin_macro_rx = re.compile(r"[\\]([a-zA-Z]+[*]?) ?({|\s*\n?)")
# A %-comment up to the end of the line; group 1 is the comment text.
_comment_rx = re.compile("%+ ?(.*)\n[ \t]*")
# A run of ordinary text (no markup-significant characters).
_text_rx = re.compile(r"[^]~%\\{}]+")
# An optional [...] parameter; group 1 is the text between the brackets.
# The opening bracket is written as \[ because a character set starting
# with a literal '[' raises a FutureWarning in newer versions of re.
_optional_rx = re.compile(r"\s*\[([^]]*)[]]", re.MULTILINE)
# _parameter_rx is this complicated to allow {...} inside a parameter;
# this is useful to match tabular layout specifications like {c|p{24pt}}
_parameter_rx = re.compile("[ \n]*{(([^{}]|{[^}]*})*)}")
# Values matching this are written as TOKEN attributes, others as CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
_start_group_rx = re.compile("[ \n]*{")
_start_optional_rx = re.compile(r"[ \n]*\[")

# Characters that may follow a backslash to produce the character itself.
ESCAPED_CHARS = "$%#^ {}&~"
1998-11-23 12:59:39 -04:00
def dbgmsg(msg):
    """Write *msg* and a newline to stderr, but only when DEBUG is set."""
    if not DEBUG:
        return
    sys.stderr.write(msg + "\n")
def pushing(name, point, depth):
    """Emit a debug message noting that element *name* is opened at *point*.

    *depth* is accepted but not used.
    """
    message = "pushing <%s> at %s" % (name, point)
    dbgmsg(message)
def popping(name, point, depth):
    """Emit a debug message noting that element *name* is closed at *point*.

    *depth* is accepted but not used.
    """
    message = "popping </%s> at %s" % (name, point)
    dbgmsg(message)
class _Stack(list):
    """List subclass used as the element stack when debugging is enabled.

    It behaves like a plain list, except that only strings may be
    appended; anything else raises LaTeXFormatError.  pop() and item
    deletion are overridden so that tracing calls can be hooked in.
    """

    def append(self, entry):
        """Push *entry*; raise LaTeXFormatError if it is not a string."""
        if not isinstance(entry, str):
            raise LaTeXFormatError("cannot push non-string on stack: %r"
                                   % (entry, ))
        #dbgmsg("%s<%s>" % (" " * len(self), entry))
        list.append(self, entry)

    def pop(self, index=-1):
        """Remove and return the entry at *index* (default: the last)."""
        entry = self[index]
        # Goes through __delitem__ so both removal paths share one hook.
        del self[index]
        # Callers format the popped name into the output, so it must be
        # returned just as list.pop() would.
        return entry

    def __delitem__(self, index):
        """Delete the entry at *index*."""
        list.__delitem__(self, index)
        #dbgmsg("%s</%s>" % (" " * len(self), entry))
def new_stack():
    """Return an empty element stack.

    A checking _Stack is used when DEBUG is set, a plain list otherwise.
    """
    if not DEBUG:
        return []
    return _Stack()
class Conversion:
    """Translate stylized LaTeX input into ESIS events.

    The complete input is read when the object is created; events are
    written through ``ofp.write``.  *table* maps macro and environment
    names to TableEntry objects; default entries are added to it in place
    for names it does not define.
    """

    def __init__(self, ifp, ofp, table):
        self.write = ofp.write
        self.ofp = ofp
        self.table = table
        # Trailing whitespace is removed from every line; the extra empty
        # string makes the joined text end with a newline whenever there
        # is any input at all.
        L = [s.rstrip() for s in ifp.readlines()]
        L.append("")
        self.line = "\n".join(L)
        self.preamble = 1

    def convert(self):
        """Convert the whole input."""
        self.subconvert()

    def subconvert(self, endchar=None, depth=0):
        """Convert input starting at ``self.line``.

        Parses content, including sub-structures, until the character
        *endchar* is found (with no open structures), or until the end
        of the input data if *endchar* is None.

        When *endchar* is found, ``self.line`` is set to the remaining
        input and that text is returned; it still starts with *endchar*,
        which the caller is expected to strip.  When the input runs out,
        an empty string is returned.  *depth* is only passed along to
        nested calls.

        Raises LaTeXFormatError (or LaTeXStackError) for markup that
        cannot be handled.
        """
        stack = new_stack()
        line = self.line
        while line:
            if line[0] == endchar and not stack:
                self.line = line
                return line
            m = _comment_rx.match(line)
            if m:
                text = m.group(1)
                if text:
                    self.write("(COMMENT\n- %s \n)COMMENT\n-\\n\n"
                               % encode(text))
                line = line[m.end():]
                continue
            m = _begin_env_rx.match(line)
            if m:
                name = m.group(1)
                # Called for its effects only: it rejects names that are
                # defined as macros and registers a default entry for
                # unknown environments.
                self.get_env_entry(name)
                # re-write to use the macro handler
                line = r"\%s %s" % (name, line[m.end():])
                continue
            m = _end_env_rx.match(line)
            if m:
                # end of environment
                envname = m.group(1)
                entry = self.get_entry(envname)
                while stack and envname != stack[-1] \
                      and stack[-1] in entry.endcloses:
                    self.write(")%s\n" % stack.pop())
                if stack and envname == stack[-1]:
                    self.write(")%s\n" % entry.outputname)
                    del stack[-1]
                else:
                    raise LaTeXStackError(envname, stack)
                line = line[m.end():]
                continue
            m = _begin_macro_rx.match(line)
            if m:
                # start of macro
                macroname = m.group(1)
                if macroname == "c":
                    # Ugh! This is a combining character...
                    endpos = m.end()
                    # Slicing (not indexing) so that running out of input
                    # is reported by combining_char() as a format error.
                    self.combining_char("c", line[endpos:endpos + 1])
                    line = line[endpos + 1:]
                    continue
                entry = self.get_entry(macroname)
                if entry.verbatim:
                    # magic case!  Everything up to the matching \end is
                    # copied through as text without further parsing.
                    endmarker = "\\end{%s}" % macroname
                    pos = line.find(endmarker)
                    if pos < 0:
                        raise LaTeXFormatError(
                            "missing %s for verbatim environment"
                            % endmarker)
                    text = line[m.end(1):pos]
                    stack.append(entry.name)
                    self.write("(%s\n" % entry.outputname)
                    self.write("-%s\n" % encode(text))
                    self.write(")%s\n" % entry.outputname)
                    stack.pop()
                    line = line[pos + len(endmarker):]
                    continue
                # Implicitly close open elements this macro terminates.
                while stack and stack[-1] in entry.closes:
                    top = stack.pop()
                    topentry = self.get_entry(top)
                    if topentry.outputname:
                        self.write(")%s\n-\\n\n" % topentry.outputname)
                #
                if entry.outputname and entry.empty:
                    self.write("e\n")
                #
                params, _unused_optional, empty = self.start_macro(macroname)
                # rip off the macroname
                if params or empty:
                    line = line[m.end(1):]
                else:
                    line = line[m.end():]
                opened = 0
                implied_content = 0

                # handle attribute mappings here:
                for pentry in params:
                    if pentry.type == "attribute":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m and entry.outputname:
                                line = line[m.end():]
                                self.dump_attr(pentry, m.group(1))
                        elif pentry.text and entry.outputname:
                            # value supplied by conversion spec:
                            self.dump_attr(pentry, pentry.text)
                        else:
                            m = _parameter_rx.match(line)
                            if not m:
                                raise LaTeXFormatError(
                                    "could not extract parameter %s for %s: %r"
                                    % (pentry.name, macroname, line[:100]))
                            if entry.outputname:
                                self.dump_attr(pentry, m.group(1))
                            line = line[m.end():]
                    elif pentry.type == "child":
                        if pentry.optional:
                            m = _optional_rx.match(line)
                            if m:
                                line = line[m.end():]
                                if entry.outputname and not opened:
                                    opened = 1
                                    self.write("(%s\n" % entry.outputname)
                                    stack.append(macroname)
                                stack.append(pentry.name)
                                self.write("(%s\n" % pentry.name)
                                self.write("-%s\n" % encode(m.group(1)))
                                self.write(")%s\n" % pentry.name)
                                stack.pop()
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            self.write("(%s\n" % pentry.name)
                            stack.append(pentry.name)
                            # Skip the opening brace, convert the group,
                            # then drop the closing brace that the nested
                            # call leaves at the front of its result.
                            self.line = skip_white(line)[1:]
                            line = self.subconvert(
                                "}", len(stack) + depth + 1)[1:]
                            self.write(")%s\n" % stack.pop())
                    elif pentry.type == "content":
                        if pentry.implied:
                            implied_content = 1
                        else:
                            if entry.outputname and not opened:
                                opened = 1
                                self.write("(%s\n" % entry.outputname)
                                stack.append(entry.name)
                            line = skip_white(line)
                            # line[:1] also covers running out of input.
                            if line[:1] != "{":
                                raise LaTeXFormatError(
                                    "missing content for " + macroname)
                            self.line = line[1:]
                            line = self.subconvert("}", len(stack) + depth + 1)
                            if line and line[0] == "}":
                                line = line[1:]
                    elif pentry.type == "text" and pentry.text:
                        if entry.outputname and not opened:
                            opened = 1
                            stack.append(entry.name)
                            self.write("(%s\n" % entry.outputname)
                        #dbgmsg("--- text: %r" % pentry.text)
                        self.write("-%s\n" % encode(pentry.text))
                    elif pentry.type == "entityref":
                        self.write("&%s\n" % pentry.name)
                if entry.outputname:
                    if not opened:
                        self.write("(%s\n" % entry.outputname)
                        stack.append(entry.name)
                    # With implied content the element stays open until
                    # something else closes it.
                    if not implied_content:
                        self.write(")%s\n" % entry.outputname)
                        stack.pop()
                continue
            if line[0] == "}":
                # end of macro or group
                if not stack:
                    raise LaTeXFormatError(
                        "unbalanced close brace: %r" % (line[:100], ))
                macroname = stack[-1]
                if macroname:
                    conversion = self.table[macroname]
                    if conversion.outputname:
                        # otherwise, it was just a bare group
                        self.write(")%s\n" % conversion.outputname)
                del stack[-1]
                line = line[1:]
                continue
            if line[0] == "~":
                # don't worry about the "tie" aspect of this command
                line = line[1:]
                self.write("- \n")
                continue
            if line[0] == "{":
                # A bare group is tracked with an empty name.
                stack.append("")
                line = line[1:]
                continue
            if line[0] == "\\" and line[1] in ESCAPED_CHARS:
                self.write("-%s\n" % encode(line[1]))
                line = line[2:]
                continue
            if line[:2] == r"\\":
                self.write("(BREAK\n)BREAK\n")
                line = line[2:]
                continue
            if line[:2] == r"\_":
                line = "_" + line[2:]
                continue
            if line[:2] in (r"\'", r'\"'):
                # combining characters...
                self.combining_char(line[1], line[2:3])
                line = line[3:]
                continue
            m = _text_rx.match(line)
            if m:
                text = encode(m.group())
                self.write("-%s\n" % text)
                line = line[m.end():]
                continue
            # special case because of \item[]
            # XXX can we axe this???
            if line[0] == "]":
                self.write("-]\n")
                line = line[1:]
                continue
            # avoid infinite loops
            extra = ""
            if len(line) > 100:
                extra = "..."
            raise LaTeXFormatError("could not identify markup: %r%s"
                                   % (line[:100], extra))
        # End of input: close whatever may be closed implicitly.
        while stack:
            entry = self.get_entry(stack[-1])
            if entry.closes:
                self.write(")%s\n-%s\n" % (entry.outputname, encode("\n")))
                del stack[-1]
            else:
                break
        if stack:
            raise LaTeXFormatError("elements remain on stack: "
                                   + ", ".join(stack))
        # otherwise we just ran out of input here...
        # Return a string so callers that slice the result keep working.
        return ""

    # This is a really limited table of combinations, but it will have
    # to do for now.
    _combinations = {
        ("c", "c"): 0x00E7,
        ("'", "e"): 0x00E9,
        ('"', "o"): 0x00F6,
        }

    def combining_char(self, prefix, char):
        """Write the character reference for accent *prefix* applied to *char*.

        Raises LaTeXFormatError when the combination is not listed in
        ``_combinations`` (this includes an empty *char*).
        """
        try:
            ordinal = self._combinations[(prefix, char)]
        except KeyError:
            raise LaTeXFormatError(
                "unsupported combining character: \\%s%s" % (prefix, char))
        self.write("-\\%%%d;\n" % ordinal)

    def start_macro(self, name):
        """Return ``(parameters, optional, empty)`` for the macro *name*.

        *optional* reflects whether the first parameter is optional.
        """
        conversion = self.get_entry(name)
        parameters = conversion.parameters
        optional = parameters and parameters[0].optional
        return parameters, optional, conversion.empty

    def get_entry(self, name):
        """Return the table entry for *name*.

        If the table has no such entry, a default macro entry with a
        single content parameter is created, stored in the table, and
        returned.
        """
        entry = self.table.get(name)
        if entry is None:
            dbgmsg("get_entry(%r) failing; building default entry!" % (name, ))
            # not defined; build a default entry:
            entry = TableEntry(name)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            self.table[name] = entry
        return entry

    def get_env_entry(self, name):
        """Return the table entry for the environment *name*.

        If the table has no such entry, a default environment entry with
        implied content is created, stored in the table, and returned.
        Raises LaTeXFormatError when *name* is defined as a macro.
        """
        entry = self.table.get(name)
        if entry is None:
            # not defined; build a default entry:
            entry = TableEntry(name, 1)
            entry.has_content = 1
            entry.parameters.append(Parameter("content"))
            entry.parameters[-1].implied = 1
            self.table[name] = entry
        elif not entry.environment:
            raise LaTeXFormatError(
                name + " is defined as a macro; expected environment")
        return entry

    def dump_attr(self, pentry, value):
        """Write an attribute event for parameter *pentry* with *value*.

        Nothing is written when the parameter has no name or the value is
        empty.  The declared type is TOKEN when the value matches
        ``_token_rx`` and CDATA otherwise.
        """
        if not (pentry.name and value):
            return
        if _token_rx.match(value):
            dtype = "TOKEN"
        else:
            dtype = "CDATA"
        self.write("A%s %s %s\n" % (pentry.name, dtype, encode(value)))
1999-07-29 19:42:27 -03:00
def convert(ifp, ofp, table):
    """Convert the LaTeX read from *ifp* to ESIS events written to *ofp*.

    *table* is the conversion table (see load_table()).  A broken pipe on
    output is ignored; every other I/O error is propagated.
    """
    c = Conversion(ifp, ofp, table)
    try:
        c.convert()
    except IOError as err:
        # Read errno from the exception instead of unpacking its
        # arguments, which fails unless there are exactly two of them.
        if err.errno != errno.EPIPE:
            raise
def skip_white(line):
    """Return *line* without its leading whitespace and ``%`` characters.

    Whenever the first character is a space, ``%``, newline, tab or
    carriage return, it is dropped together with any whitespace that
    directly follows it; this repeats until the text starts with some
    other character or is exhausted.
    """
    while True:
        if not line or line[0] not in " %\n\t\r":
            return line
        line = line[1:].lstrip()
class TableEntry:
    """Conversion information for one LaTeX macro or environment."""

    def __init__(self, name, environment=0):
        # The output name starts out equal to the LaTeX name.
        self.name = self.outputname = name
        # True for environments, false for macros.
        self.environment = environment
        # Macros start out as empty elements; environments never do.
        self.empty = not environment
        self.has_content = self.verbatim = self.auto_close = 0
        # Each instance gets its own lists: the Parameter objects, the
        # names closed when this one starts, and the names closed when
        # this environment ends.
        self.parameters, self.closes, self.endcloses = [], [], []
class Parameter:
    """Description of one parameter of a macro or environment.

    *type* is the kind of parameter; the converter distinguishes
    "attribute", "child", "content", "text" and "entityref".
    """

    def __init__(self, type, name=None, optional=0):
        self.type, self.name, self.optional = type, name, optional
        # Fixed text taken from the conversion specification, if any.
        self.text = ''
        # Set for content that is not delimited by braces in the input.
        self.implied = 0
2001-11-30 15:30:03 -04:00
class TableHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that builds the conversion table.

    Every element ``<tag>`` is dispatched to the methods ``start_tag``
    and ``end_tag`` when they exist; elements without such methods are
    ignored.  The finished table, a dictionary mapping names to
    TableEntry objects, is obtained with get_table().
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self.__table = {}
        self.__buffer = ''
        # Cache of (start, end) handler pairs, keyed by element name.
        self.__methods = {}
        # The entry being built by the enclosing macro or environment.
        self.__current = None

    def get_table(self):
        """Return the table, giving content-less environments implied content."""
        for entry in self.__table.values():
            if entry.environment and not entry.has_content:
                p = Parameter("content")
                p.implied = 1
                entry.parameters.append(p)
                entry.has_content = 1
        return self.__table

    def startElement(self, tag, attrs):
        try:
            start, end = self.__methods[tag]
        except KeyError:
            start = getattr(self, "start_" + tag, None)
            end = getattr(self, "end_" + tag, None)
            self.__methods[tag] = (start, end)
        if start:
            start(attrs)

    def endElement(self, tag):
        start, end = self.__methods[tag]
        if end:
            end()

    def endDocument(self):
        # Drop the cached bound methods once parsing is finished.
        self.__methods.clear()

    def characters(self, data):
        self.__buffer += data

    def start_environment(self, attrs):
        name = attrs["name"]
        self.__current = TableEntry(name, environment=1)
        self.__current.verbatim = attrs.get("verbatim") == "yes"
        # "in" is used because has_key() is not available on newer
        # Python versions.
        if "outputname" in attrs:
            self.__current.outputname = attrs.get("outputname")
        self.__current.endcloses = attrs.get("endcloses", "").split()

    def end_environment(self):
        self.end_macro()

    def start_macro(self, attrs):
        name = attrs["name"]
        self.__current = TableEntry(name)
        self.__current.closes = attrs.get("closes", "").split()
        if "outputname" in attrs:
            self.__current.outputname = attrs.get("outputname")

    def end_macro(self):
        """Store the finished entry; raise ValueError for a duplicate name."""
        name = self.__current.name
        if name in self.__table:
            raise ValueError("name %r already in use" % (name,))
        self.__table[name] = self.__current
        self.__current = None

    def start_attribute(self, attrs):
        name = attrs.get("name")
        optional = attrs.get("optional") == "yes"
        if name:
            p = Parameter("attribute", name, optional=optional)
        else:
            p = Parameter("attribute", optional=optional)
        self.__current.parameters.append(p)
        # Start collecting the element's character data afresh.
        self.__buffer = ''

    def end_attribute(self):
        self.__current.parameters[-1].text = self.__buffer

    def start_entityref(self, attrs):
        name = attrs["name"]
        p = Parameter("entityref", name)
        self.__current.parameters.append(p)

    def start_child(self, attrs):
        name = attrs["name"]
        p = Parameter("child", name, attrs.get("optional") == "yes")
        self.__current.parameters.append(p)
        self.__current.empty = 0

    def start_content(self, attrs):
        p = Parameter("content")
        p.implied = attrs.get("implied") == "yes"
        # Content of an environment is always implied.
        if self.__current.environment:
            p.implied = 1
        self.__current.parameters.append(p)
        self.__current.has_content = 1
        self.__current.empty = 0

    def start_text(self, attrs):
        self.__current.empty = 0
        self.__buffer = ''

    def end_text(self):
        p = Parameter("text")
        p.text = self.__buffer
        self.__current.parameters.append(p)
2001-11-30 15:30:03 -04:00
def load_table(fp):
    """Parse the XML conversion specification *fp* and return the table.

    The result is the dictionary built by TableHandler.
    """
    handler = TableHandler()
    xml.sax.parse(fp, handler)
    table = handler.get_table()
    return table
1998-11-23 12:59:39 -04:00
def main():
    """Command-line entry point.

    Usage: ``latex2esis.py [-D|--debug] [infile [outfile]]``.  Input
    defaults to stdin and output to stdout.  Each -D/--debug option
    increments the module-level DEBUG counter.  The conversion table is
    read from conversion.xml in the directory named by sys.path[0].
    With more than two file arguments a usage message is written to
    stderr and the process exits with status 2.
    """
    global DEBUG
    #
    opts, args = getopt.getopt(sys.argv[1:], "D", ["debug"])
    for opt, arg in opts:
        if opt in ("-D", "--debug"):
            DEBUG += 1
    if len(args) > 2:
        sys.stderr.write("usage: %s [-D|--debug] [infile [outfile]]\n"
                         % os.path.basename(sys.argv[0]))
        sys.exit(2)
    ifp = sys.stdin
    ofp = sys.stdout
    # Only files opened here are closed here; the standard streams are
    # left alone.
    opened = []
    try:
        if args:
            ifp = open(args[0])
            opened.append(ifp)
        if len(args) == 2:
            ofp = open(args[1], "w")
            opened.append(ofp)
        table_fp = open(os.path.join(sys.path[0], 'conversion.xml'))
        opened.append(table_fp)
        table = load_table(table_fp)
        convert(ifp, ofp, table)
    finally:
        for fp in opened:
            fp.close()
1998-11-23 12:59:39 -04:00
# Run the converter only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()