mirror of https://github.com/python/cpython
Moved into tools/sgmlconv/.
This commit is contained in:
parent
63de8f6d55
commit
30a68c7a2b
|
@ -0,0 +1,131 @@
|
|||
#! /usr/bin/env python
|
||||
|
||||
"""Convert ESIS events to SGML or XML markup.
|
||||
|
||||
This is limited, but seems sufficient for the ESIS generated by the
|
||||
latex2esis.py script when run over the Python documentation.
|
||||
"""
|
||||
__version__ = '$Revision$'
|
||||
|
||||
import errno
|
||||
import re
|
||||
import string
|
||||
|
||||
|
||||
# Matches a run of one or more characters that contains no backslash.
_data_rx = re.compile(r"[^\\][^\\]*")


def decode(s):
    r"""Undo the escaping used in ESIS data and attribute values.

    Text without backslashes is copied through unchanged.  The two-character
    sequence ``\n`` becomes a newline and ``\\`` becomes a single backslash.

    Raises ValueError for any other backslash sequence, including a lone
    backslash at the very end of the string.
    """
    pieces = []
    while s:
        m = _data_rx.match(s)
        if m:
            pieces.append(m.group())
            s = s[m.end():]
            continue
        # s starts with a backslash here.  Slicing (rather than indexing)
        # yields '' when nothing follows it, so a truncated escape is reported
        # as ValueError instead of escaping as IndexError.
        escape = s[1:2]
        if escape == "\\":
            pieces.append("\\")
        elif escape == "n":
            pieces.append("\n")
        else:
            raise ValueError("can't handle " + repr(s))
        s = s[2:]
    return "".join(pieces)
|
||||
|
||||
|
||||
def format_attrs(attrs):
    """Render an attribute mapping as the attribute part of a start tag.

    attrs maps attribute names to string values.  The result is '' for an
    empty mapping; otherwise it is one ' name="value"' chunk per attribute,
    ordered by name so the output is reproducible.

    Values are inserted verbatim.
    NOTE(review): a value containing '"' produces malformed markup; escaping
    is left alone here because the right entity depends on the target DTD.
    """
    # list() + sort() works whether items() returns a list or a view.
    pairs = list(attrs.items())
    pairs.sort()
    return "".join([' %s="%s"' % (name, value) for name, value in pairs])
|
||||
|
||||
|
||||
def do_convert(ifp, ofp, knownempties, xml=0):
    """Translate a stream of ESIS records into SGML or XML markup.

    ifp          -- input object; records are read one per line via readline()
    ofp          -- output object; markup is emitted via write()
    knownempties -- list of element names known to be empty.  It is updated
                    in place: any element announced by an "e" record is
                    appended if not already present.
    xml          -- when true, announced-empty elements are written as
                    <name/> and all other elements get an explicit end tag.
                    When false, end tags are omitted for names listed in
                    knownempties, and the short form </> is used when the
                    element being closed is the one most recently opened and
                    no newline-containing data was written since.

    The first character of each record selects its type:
      "-"  character data (escaped; see decode())
      "("  start of an element
      ")"  end of an element
      "A"  attribute ("name type value") for the next start tag
      "e"  the next element to be opened is empty
    Records of any other type are ignored.

    Raises ValueError for a malformed attribute record.
    """
    attrs = {}
    lastopened = None
    knownempty = 0
    lastempty = 0
    while True:
        line = ifp.readline()
        if not line:
            break

        rectype = line[0]
        data = line[1:]
        if data and data[-1] == "\n":
            data = data[:-1]

        if rectype == "-":
            data = decode(data)
            ofp.write(data)
            if "\n" in data:
                lastopened = None
            knownempty = 0
            lastempty = 0
        elif rectype == "(":
            if knownempty and xml:
                ofp.write("<%s%s/>" % (data, format_attrs(attrs)))
            else:
                ofp.write("<%s%s>" % (data, format_attrs(attrs)))
            if knownempty and data not in knownempties:
                # accumulate knowledge!
                knownempties.append(data)
            attrs = {}
            lastopened = data
            lastempty = knownempty
            knownempty = 0
        elif rectype == ")":
            if xml:
                if not lastempty:
                    ofp.write("</%s>" % data)
            elif data not in knownempties:
                if lastopened == data:
                    ofp.write("</>")
                else:
                    ofp.write("</%s>" % data)
            lastopened = None
            lastempty = 0
        elif rectype == "A":
            fields = data.split(" ", 2)
            if len(fields) == 3:
                # The declared type (second field) is not needed for output.
                attrs[fields[0]] = decode(fields[2])
            elif len(fields) == 2 and fields[1] == "IMPLIED":
                # Unspecified attribute: there is no value to emit.
                pass
            else:
                raise ValueError("malformed attribute record: " + repr(line))
        elif rectype == "e":
            knownempty = 1
|
||||
|
||||
|
||||
def sgml_convert(ifp, ofp, knownempties=()):
    """Convert ESIS read from ifp into SGML markup written to ofp.

    knownempties is copied into a fresh list before being handed to
    do_convert(), so the caller's sequence is never modified.
    """
    empties = list(knownempties)
    return do_convert(ifp, ofp, empties, xml=0)
|
||||
|
||||
|
||||
def xml_convert(ifp, ofp, knownempties=()):
    """Convert ESIS read from ifp into XML markup written to ofp.

    knownempties is copied into a fresh list before being handed to
    do_convert(), so the caller's sequence is never modified.
    """
    empties = list(knownempties)
    return do_convert(ifp, ofp, empties, xml=1)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Usage: esis2sgml.py [-x | --xml] [infile [outfile]]

    Reads ESIS from infile (default: standard input) and writes SGML, or XML
    when -x/--xml is given, to outfile (default: standard output).  Exits
    with status 2 after printing a usage message when too many arguments
    are supplied.  A broken output pipe (EPIPE) ends the run quietly.
    """
    import sys

    args = sys.argv[1:]
    use_xml = 0
    if args and args[0] in ("-x", "--xml"):
        use_xml = 1
        del args[0]
    if len(args) > 2:
        sys.stderr.write("usage: %s [-x | --xml] [infile [outfile]]\n"
                         % sys.argv[0])
        sys.exit(2)

    if use_xml:
        convert = xml_convert
    else:
        convert = sgml_convert

    ifp = sys.stdin
    ofp = sys.stdout
    try:
        if args:
            ifp = open(args[0])
        if len(args) == 2:
            ofp = open(args[1], "w")
        # knownempties is ignored in the XML version
        try:
            convert(ifp, ofp)
        except IOError as err:
            if err.errno != errno.EPIPE:
                raise
    finally:
        # Close only what this function opened; leave the std streams alone.
        if ofp is not sys.stdout:
            ofp.close()
        if ifp is not sys.stdin:
            ifp.close()


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,337 @@
|
|||
#! /usr/bin/env python
|
||||
|
||||
"""Generate ESIS events based on a LaTeX source document and configuration
|
||||
data.
|
||||
|
||||
|
||||
"""
|
||||
__version__ = '$Revision$'
|
||||
|
||||
import errno
|
||||
import re
|
||||
import string
|
||||
import StringIO
|
||||
import sys
|
||||
|
||||
|
||||
class Error(Exception):
    """Root of the exception hierarchy defined by this module."""


class LaTeXFormatError(Error):
    """Raised when the LaTeX input cannot be interpreted by the converter."""
|
||||
|
||||
|
||||
_begin_env_rx = re.compile(r"[\\]begin{([^}]*)}")
|
||||
_end_env_rx = re.compile(r"[\\]end{([^}]*)}")
|
||||
_begin_macro_rx = re.compile("[\\\\]([a-zA-Z]+[*]?)({|\\s*\n?)")
|
||||
_comment_rx = re.compile("%+[ \t]*(.*)\n")
|
||||
_text_rx = re.compile(r"[^]%\\{}]+")
|
||||
_optional_rx = re.compile(r"\s*[[]([^]]*)[]]")
|
||||
_parameter_rx = re.compile("[ \n]*{([^}]*)}")
|
||||
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
|
||||
_start_group_rx = re.compile("[ \n]*{")
|
||||
_start_optional_rx = re.compile("[ \n]*[[]")
|
||||
|
||||
|
||||
_charmap = {}
|
||||
for c in map(chr, range(256)):
|
||||
_charmap[c] = c
|
||||
_charmap["\n"] = r"\n"
|
||||
_charmap["\\"] = r"\\"
|
||||
del c
|
||||
|
||||
def encode(s):
|
||||
return string.join(map(_charmap.get, s), '')
|
||||
|
||||
|
||||
ESCAPED_CHARS = "$%#^ {}"
|
||||
|
||||
|
||||
class _DiscardSink:
    """Write-only file stand-in that throws away everything written to it."""

    def write(self, data):
        pass


def subconvert(line, ofp, table, discards, autoclosing, knownempty,
               endchar=None):
    r"""Convert LaTeX source text into ESIS records written to ofp.

    line        -- the LaTeX text still to be processed
    ofp         -- output object; records are emitted via write()
    table       -- dict mapping a macro name to (params, optional, empty):
                   params is a list describing the macro's arguments.  A
                   string entry names an attribute taken from a {...}
                   argument; a tuple entry opens a sub-element named by its
                   first item (\section-like); a list entry opens a normal
                   sub-element.  optional is true when the first entry
                   comes from a [...] argument; empty is true when the
                   element has no content.  Unlisted macros are treated as
                   ([], 0, 0).
    discards    -- names of macros whose start/end records are suppressed
    autoclosing -- names of elements that are closed implicitly when another
                   element of the same name starts (sectioning commands)
    knownempty  -- callable taking a macro name and returning true if the
                   element is known to be empty
    endchar     -- when given, conversion stops at this character if it
                   occurs outside any open group or macro

    Returns the text following endchar when endchar terminates the scan;
    returns None when the input is exhausted and no endchar was requested.

    Raises LaTeXFormatError for markup that cannot be identified,
    mismatched or unbalanced closers, an unterminated verbatim environment,
    or input that ends before the requested endchar is seen.
    """
    stack = []
    while line:
        if line[0] == endchar and not stack:
            return line[1:]
        m = _comment_rx.match(line)
        if m:
            text = m.group(1)
            if text:
                ofp.write("(COMMENT\n")
                ofp.write("- %s \n" % encode(text))
                ofp.write(")COMMENT\n")
            # The line break ending the comment is kept as data either way.
            ofp.write("-\\n\n")
            line = line[m.end():]
            continue
        m = _begin_env_rx.match(line)
        if m:
            # re-write to use the macro handler
            line = r"\%s %s" % (m.group(1), line[m.end():])
            continue
        m = _end_env_rx.match(line)
        if m:
            # end of environment
            envname = m.group(1)
            if envname == "document":
                # special magic
                for n in stack[1:]:
                    if n not in autoclosing:
                        raise LaTeXFormatError(
                            "open element on stack: " + repr(n))
                # should be more careful, but this is easier to code:
                stack = []
                ofp.write(")document\n")
            elif stack and envname == stack[-1]:
                ofp.write(")%s\n" % envname)
                del stack[-1]
            else:
                # Also covers an \end{...} when nothing is open at all.
                raise LaTeXFormatError("environment close doesn't match")
            line = line[m.end():]
            continue
        m = _begin_macro_rx.match(line)
        if m:
            # start of macro
            macroname = m.group(1)
            if macroname == "verbatim":
                # really magic case!
                terminator = "\\end{verbatim}"
                pos = line.find(terminator)
                if pos < 0:
                    # find() returns -1; slicing with it would silently
                    # drop or duplicate input instead of reporting it.
                    raise LaTeXFormatError(
                        "unterminated verbatim environment")
                text = line[m.end(1):pos]
                ofp.write("(verbatim\n")
                ofp.write("-%s\n" % encode(text))
                ofp.write(")verbatim\n")
                line = line[pos + len(terminator):]
                continue
            numbered = 1
            if macroname[-1] == "*":
                macroname = macroname[:-1]
                numbered = 0
            if macroname in autoclosing and macroname in stack:
                # Close everything opened since the previous element of the
                # same name, then that element itself.
                while stack[-1] != macroname:
                    if stack[-1] and stack[-1] not in discards:
                        ofp.write(")%s\n-\\n\n" % stack[-1])
                    del stack[-1]
                if macroname not in discards:
                    ofp.write("-\\n\n)%s\n-\\n\n" % macroname)
                del stack[-1]
            real_ofp = ofp
            if macroname in discards:
                # Records for discarded macros are written to a sink.
                ofp = _DiscardSink()
            #
            conversion = table.get(macroname, ([], 0, 0))
            params, optional, empty = conversion
            empty = empty or knownempty(macroname)
            if empty:
                ofp.write("e\n")
            if not numbered:
                ofp.write("Anumbered TOKEN no\n")
            # rip off the macroname
            if params:
                if optional and len(params) == 1:
                    line = line[m.end():]
                else:
                    line = line[m.end(1):]
            elif empty:
                line = line[m.end(1):]
            else:
                line = line[m.end():]
            #
            #  Very ugly special case to deal with \item[].  The catch is
            #  that this needs to occur outside the for loop that handles
            #  attribute parsing so we can 'continue' the outer loop.
            #
            if optional and params and isinstance(params[0], tuple):
                # the attribute name isn't used in this special case
                stack.append(macroname)
                ofp.write("(%s\n" % macroname)
                m = _start_optional_rx.match(line)
                if m:
                    line = line[m.end():]
                    line = subconvert(line, ofp, table, discards,
                                      autoclosing, knownempty, endchar="]")
                line = "}" + line
                # Undo any redirection to the discard sink before resuming.
                ofp = real_ofp
                continue
            # handle attribute mappings here:
            for attrname in params:
                if optional:
                    optional = 0
                    if isinstance(attrname, str):
                        m = _optional_rx.match(line)
                        if m:
                            line = line[m.end():]
                            ofp.write("A%s TOKEN %s\n"
                                      % (attrname, encode(m.group(1))))
                elif isinstance(attrname, tuple):
                    # This is a sub-element; but don't place the
                    # element we found on the stack (\section-like)
                    stack.append(macroname)
                    ofp.write("(%s\n" % macroname)
                    macroname = attrname[0]
                    m = _start_group_rx.match(line)
                    if m:
                        line = line[m.end():]
                elif isinstance(attrname, list):
                    # A normal subelement.
                    attrname = attrname[0]
                    stack.append(macroname)
                    stack.append(attrname)
                    ofp.write("(%s\n" % macroname)
                    macroname = attrname
                else:
                    m = _parameter_rx.match(line)
                    if not m:
                        raise LaTeXFormatError(
                            "could not extract parameter %s for %s: %s"
                            % (attrname, macroname, repr(line[:100])))
                    value = m.group(1)
                    if _token_rx.match(value):
                        dtype = "TOKEN"
                    else:
                        dtype = "CDATA"
                    ofp.write("A%s %s %s\n"
                              % (attrname, dtype, encode(value)))
                    line = line[m.end():]
            stack.append(macroname)
            ofp.write("(%s\n" % macroname)
            if empty:
                # Fake the closing brace so the element is closed at once.
                line = "}" + line
            ofp = real_ofp
            continue
        if line[0] == "}":
            # end of macro
            if not stack:
                raise LaTeXFormatError(
                    "unbalanced closing brace: " + repr(line[:100]))
            macroname = stack[-1]
            conversion = table.get(macroname)
            if macroname \
               and macroname not in discards \
               and not isinstance(conversion, str):
                # otherwise, it was just a bare group
                ofp.write(")%s\n" % stack[-1])
            del stack[-1]
            line = line[1:]
            continue
        if line[0] == "{":
            stack.append("")
            line = line[1:]
            continue
        # The length check keeps a lone trailing backslash from raising
        # IndexError; it is reported as unidentifiable markup below.
        if line[0] == "\\" and len(line) > 1 and line[1] in ESCAPED_CHARS:
            ofp.write("-%s\n" % encode(line[1]))
            line = line[2:]
            continue
        if line[:2] == r"\\":
            ofp.write("(BREAK\n)BREAK\n")
            line = line[2:]
            continue
        m = _text_rx.match(line)
        if m:
            text = encode(m.group())
            ofp.write("-%s\n" % text)
            line = line[m.end():]
            continue
        # special case because of \item[]
        if line[0] == "]":
            ofp.write("-]\n")
            line = line[1:]
            continue
        # avoid infinite loops
        extra = ""
        if len(line) > 100:
            extra = "..."
        raise LaTeXFormatError("could not identify markup: %s%s"
                               % (repr(line[:100]), extra))
    if endchar is not None:
        # A nested scan ran off the end of the input; returning None here
        # would make the caller fail with an unrelated TypeError.
        raise LaTeXFormatError(
            "input ended before closing " + repr(endchar))
|
||||
|
||||
|
||||
def convert(ifp, ofp, table=None, discards=(), autoclosing=(),
            knownempties=()):
    """Convert the LaTeX document read from ifp into ESIS written to ofp.

    ifp          -- input object; the whole document is obtained via read()
    ofp          -- output object passed through to subconvert()
    table        -- macro conversion table (see subconvert()); defaults to
                    an empty table
    discards     -- names of macros whose records are suppressed
    autoclosing  -- names of implicitly closed elements
    knownempties -- names of elements known to be empty

    An IOError with errno EPIPE (broken output pipe) is swallowed; any other
    IOError propagates.
    """
    if table is None:
        table = {}
    # A dict gives constant-time membership tests for the empty-element set.
    empties = {}
    for gi in knownempties:
        empties[gi] = gi
    try:
        subconvert(ifp.read(), ofp, table, discards, autoclosing,
                   empties.__contains__)
    except IOError as err:
        if err.errno != errno.EPIPE:
            raise
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Usage: latex2esis.py infile [outfile]

    Converts the LaTeX document in infile to ESIS on outfile (default:
    standard output) using the conversion table for the Python
    documentation markup.  Exits with status 2 after printing a usage
    message when the argument count is wrong.
    """
    argc = len(sys.argv)
    if argc not in (2, 3):
        sys.stderr.write("usage: %s infile [outfile]\n" % sys.argv[0])
        sys.exit(2)
    ifp = open(sys.argv[1])
    ofp = sys.stdout
    try:
        if argc == 3:
            ofp = open(sys.argv[2], "w")
        convert(ifp, ofp, {
            # entries are name
            #          -> ([list of attribute names], first_is_optional, empty)
            "cfuncdesc": (["type", "name", ("args",)], 0, 0),
            "chapter": ([("title",)], 0, 0),
            "chapter*": ([("title",)], 0, 0),
            "classdesc": (["name", ("constructor-args",)], 0, 0),
            "ctypedesc": (["name"], 0, 0),
            "cvardesc": (["type", "name"], 0, 0),
            "datadesc": (["name"], 0, 0),
            "declaremodule": (["id", "type", "name"], 1, 1),
            "deprecated": (["release"], 0, 1),
            "documentclass": (["classname"], 0, 1),
            "excdesc": (["name"], 0, 0),
            "funcdesc": (["name", ("args",)], 0, 0),
            "funcdescni": (["name", ("args",)], 0, 0),
            "indexii": (["ie1", "ie2"], 0, 1),
            "indexiii": (["ie1", "ie2", "ie3"], 0, 1),
            "indexiv": (["ie1", "ie2", "ie3", "ie4"], 0, 1),
            "input": (["source"], 0, 1),
            "item": ([("leader",)], 1, 0),
            "label": (["id"], 0, 1),
            "manpage": (["name", "section"], 0, 1),
            "memberdesc": (["class", "name"], 1, 0),
            "methoddesc": (["class", "name", ("args",)], 1, 0),
            "methoddescni": (["class", "name", ("args",)], 1, 0),
            "opcodedesc": (["name", "var"], 0, 0),
            "par": ([], 0, 1),
            "paragraph": ([("title",)], 0, 0),
            "rfc": (["number"], 0, 1),
            "section": ([("title",)], 0, 0),
            "seemodule": (["ref", "name"], 1, 0),
            "subparagraph": ([("title",)], 0, 0),
            "subsection": ([("title",)], 0, 0),
            "subsubsection": ([("title",)], 0, 0),
            "tableii": (["colspec", "style", "head1", "head2"], 0, 0),
            "tableiii": (["colspec", "style", "head1", "head2", "head3"],
                         0, 0),
            "tableiv": (["colspec", "style", "head1", "head2", "head3",
                         "head4"],
                        0, 0),
            "versionadded": (["version"], 0, 1),
            "versionchanged": (["version"], 0, 1),
            #
            "ABC": ([], 0, 1),
            "ASCII": ([], 0, 1),
            "C": ([], 0, 1),
            "Cpp": ([], 0, 1),
            "EOF": ([], 0, 1),
            "e": ([], 0, 1),
            "ldots": ([], 0, 1),
            "NULL": ([], 0, 1),
            "POSIX": ([], 0, 1),
            "UNIX": ([], 0, 1),
            #
            # Things that will actually be going away!
            #
            "fi": ([], 0, 1),
            "ifhtml": ([], 0, 1),
            "makeindex": ([], 0, 1),
            "makemodindex": ([], 0, 1),
            "maketitle": ([], 0, 1),
            "noindent": ([], 0, 1),
            "tableofcontents": ([], 0, 1),
            },
                discards=["fi", "ifhtml", "makeindex", "makemodindex",
                          "maketitle", "noindent", "tableofcontents"],
                autoclosing=["chapter", "section", "subsection",
                             "subsubsection", "paragraph", "subparagraph", ],
                knownempties=["appendix",
                              "maketitle", "makeindex", "makemodindex",
                              "localmoduletable"])
    finally:
        # Close only what this function opened; leave sys.stdout alone.
        if ofp is not sys.stdout:
            ofp.close()
        ifp.close()


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue