cpython/Doc/tools/sgmlconv/docfixer.py

#! /usr/bin/env python

"""Promote the IDs from <label/> elements to the enclosing section / chapter /
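whatever, then remove the <label/> elements.  This allows *ML style internal
linking rather than the bogus LaTeX model.

Note that <label/>s in <title> elements are promoted two steps, since the
<title> elements are artificially created from the section parameter, and the
label really refers to the sectioning construct.

Several further clean-up passes run on the same tree: text nodes are
normalized, the content following <args> / <constructor-args> is wrapped in
a <description>, <COMMENT> elements become real comments, the root element
is renamed after the document class, whitespace is tidied, and a trailing
"()" is stripped from <function>, <method> and <cfunction> text.
"""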
__version__ = '$Revision$'


import errno
import string
import sys
import xml.dom.core
import xml.dom.esis_builder


# Workaround to deal with invalid documents (multiple root elements).  This
# does not indicate a bug in the DOM implementation.
#
def get_documentElement(self):
    """Return a wrapper for the last element-typed child, or None.

    Every child of the underlying node is scanned; each element child
    replaces the previous candidate, so with several top-level elements
    the final one is returned.  None is returned when no child is an
    element.
    """
    found = None
    for child in self._node.children:
        if child.type != xml.dom.core.ELEMENT:
            continue
        # Later element children override earlier ones.
        found = xml.dom.core.Element(child, self, self)
    return found

xml.dom.core.Document.get_documentElement = get_documentElement


# Replace get_childNodes for the Document class; without this, children
# accessed from the Document object via .childNodes (no matter how many
# levels of access are used) will be given an ownerDocument of None.
#
def get_childNodes(self):
    """Wrap the document's raw child list in a NodeList.

    The document itself is handed to NodeList as both trailing constructor
    arguments.
    """
    raw_children = self._node.children
    return xml.dom.core.NodeList(raw_children, self, self)

xml.dom.core.Document.get_childNodes = get_childNodes


def get_first_element(doc, gi):
    """Return the first direct child element of doc whose tag name is gi.

    Only doc.childNodes is searched (no recursion).  Returns None when no
    child matches.
    """
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        if child.tagName == gi:
            return child
    return None

def extract_first_element(doc, gi):
    """Detach and return the first direct child element of doc named gi.

    Returns None, leaving doc unchanged, when get_first_element() finds no
    such child.
    """
    found = get_first_element(doc, gi)
    if found is None:
        return None
    doc.removeChild(found)
    return found


def simplify(doc):
    """Fold the stray top-level pieces of doc into its root element.

    Steps, in order:

    * The first top-level <documentclass> element is removed; its
      "classname" attribute becomes the new name of the first top-level
      <document> element (the name stays "document" when there is no
      <documentclass>).
    * The first top-level <title> and every top-level <input> element are
      removed from doc and re-inserted, in that order, at the start of
      doc.documentElement, each followed by a newline text node and with
      one extra newline text node in front of them all.
    * Text nodes at the very start of doc are removed.

    The document is modified in place; nothing is returned.
    """
    # Try to rationalize the document a bit, since these things are simply
    # not valid SGML/XML documents as they stand, and need a little work.
    documentclass = "document"
    inputs = []
    node = extract_first_element(doc, "documentclass")
    if node is not None:
        documentclass = node.getAttribute("classname")
    node = extract_first_element(doc, "title")
    if node is not None:
        inputs.append(node)
    # update the name of the root element
    node = get_first_element(doc, "document")
    if node is not None:
        node._node.name = documentclass
    while 1:
        node = extract_first_element(doc, "input")
        if node is None:
            break
        inputs.append(node)
    if inputs:
        docelem = doc.documentElement
        # Insert back-to-front so the nodes end up in their original order.
        inputs.reverse()
        for node in inputs:
            text = doc.createTextNode("\n")
            docelem.insertBefore(text, docelem.firstChild)
            docelem.insertBefore(node, text)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
    # Strip leading text nodes.  Stop when the document runs out of
    # children instead of dereferencing a missing first child.
    # NOTE(review): assumes firstChild is None for an empty document, as
    # the DOM specifies -- confirm against xml.dom.core.
    while 1:
        first = doc.firstChild
        if first is None or first.nodeType != xml.dom.core.TEXT:
            break
        doc.removeChild(first)


def cleanup_root_text(doc):
    """Remove top-level text nodes, except those right after a comment.

    The direct children of doc are scanned once; a text node is marked for
    removal unless the child immediately before it is a comment node.  The
    marked nodes are removed only after the scan has finished.
    """
    doomed = []
    after_comment = 0
    for child in doc.childNodes:
        kind = child.nodeType
        if kind == xml.dom.core.TEXT and not after_comment:
            doomed.append(child)
        # Only a comment shields the node that directly follows it.
        after_comment = (kind == xml.dom.core.COMMENT)
    for child in doomed:
        doc.removeChild(child)


def rewrite_desc_entries(doc, argname_gi):
    """Wrap the siblings of each argname_gi element in a <description>.

    For every element named argname_gi returned by
    doc.getElementsByTagName(), all children of its parent that are not
    themselves argname_gi elements are moved, in order, into a newly
    created <description> element.  The argname_gi element is kept
    (preceded by a newline-and-indent text node) when it has children and
    removed when it is empty.  The <description> is then appended to the
    parent, surrounded by whitespace text nodes.

    Text nodes are created with doc.createTextNode(), the same factory
    method simplify() uses, rather than the non-standard createText().
    """
    argnodes = doc.getElementsByTagName(argname_gi)
    for node in argnodes:
        parent = node.parentNode
        # Collect first, move afterwards, so the child list is not
        # modified while it is being iterated.
        nodes = []
        for n in parent.childNodes:
            if n.nodeType != xml.dom.core.ELEMENT or n.tagName != argname_gi:
                nodes.append(n)
        desc = doc.createElement("description")
        for n in nodes:
            parent.removeChild(n)
            desc.appendChild(n)
        if node.childNodes:
            # keep the <args>...</args>, newline & indent
            parent.insertBefore(doc.createTextNode("\n  "), node)
        else:
            # no arguments, remove the <args/> node
            parent.removeChild(node)
        parent.appendChild(doc.createTextNode("\n  "))
        parent.appendChild(desc)
        parent.appendChild(doc.createTextNode("\n"))

def handle_args(doc):
    """Run rewrite_desc_entries() for <args>, then <constructor-args>."""
    for gi in ("args", "constructor-args"):
        rewrite_desc_entries(doc, gi)


def handle_comments(doc, node=None):
    """Replace each <COMMENT> element below node with a comment node.

    doc is used to create the comment nodes; node is the subtree to
    process and defaults to doc itself.  The comment text is the data of
    the element's first child.  An empty <COMMENT> element becomes an
    empty comment instead of raising IndexError.  Elements other than
    <COMMENT> are processed recursively.
    """
    if node is None:
        node = doc
    for n in node.childNodes:
        if n.nodeType == xml.dom.core.ELEMENT:
            if n.tagName == "COMMENT":
                if n.childNodes:
                    data = n.childNodes[0].data
                else:
                    # <COMMENT/> with no content: nothing to copy.
                    data = ""
                comment = doc.createComment(data)
                node.replaceChild(comment, n)
            else:
                handle_comments(doc, n)


def handle_labels(doc):
    """Move the "id" of each <label> onto an enclosing element.

    For every <label> element with a non-empty "id" attribute, that id is
    set on the label's parent -- or on the parent's parent when the parent
    is a <title> -- and the label is removed from its parent.  Labels
    without an id are left where they are.
    """
    for label in doc.getElementsByTagName("label"):
        label_id = label.getAttribute("id")
        if label_id:
            holder = label.parentNode
            target = holder
            if holder.tagName == "title":
                # The label really refers to the construct owning the title.
                target = holder.parentNode
            target.setAttribute("id", label_id)
            # now, remove <label id="..."/> from its parent:
            holder.removeChild(label)


def fixup_trailing_whitespace(doc, wsmap):
    """Normalize the whitespace that ends selected elements.

    The tree is walked breadth-first starting at doc.  For each element
    whose tag name is a key of wsmap, if its last child is a text node,
    that text has its trailing whitespace stripped and the string
    wsmap[tagName] appended.  In addition, when a <title> element is
    visited and its parent's first child is an element, a newline-and-
    indent text node is inserted at the start of the parent.

    Elements without children are skipped rather than raising IndexError,
    and text nodes are created with doc.createTextNode(), the same factory
    method simplify() uses, rather than the non-standard createText().
    """
    queue = [doc]
    while queue:
        node = queue[0]
        del queue[0]
        if node.nodeType == xml.dom.core.ELEMENT \
           and wsmap.has_key(node.tagName):
            ws = wsmap[node.tagName]
            children = node.childNodes
            if children:
                # Reverse so index 0 is the last child; the second
                # reverse() puts the list back in its original order.
                children.reverse()
                if children[0].nodeType == xml.dom.core.TEXT:
                    data = string.rstrip(children[0].data) + ws
                    children[0].data = data
                children.reverse()
            # hack to get the title in place:
            if node.tagName == "title" \
               and node.parentNode.firstChild.nodeType == xml.dom.core.ELEMENT:
                node.parentNode.insertBefore(doc.createTextNode("\n  "),
                                             node.parentNode.firstChild)
        for child in node.childNodes:
            if child.nodeType == xml.dom.core.ELEMENT:
                queue.append(child)


def normalize(doc):
    """Call normalize() on every element that is a direct child of doc."""
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        child.normalize()


def cleanup_trailing_parens(doc, element_names):
    """Strip a trailing "()" from the text of the named elements.

    Elements are visited breadth-first from the top-level elements of doc.
    An element whose tag name is in element_names is rewritten only when
    its sole child is a text node ending in "()"; such elements are never
    descended into.  All other elements have their element children
    queued for inspection.
    """
    targets = {}
    for name in element_names:
        targets[name] = name
    pending = []
    for top in doc.childNodes:
        if top.nodeType == xml.dom.core.ELEMENT:
            pending.append(top)
    while pending:
        current = pending.pop(0)
        if not targets.has_key(current.tagName):
            # Not a target: keep looking further down the tree.
            for kid in current.childNodes:
                if kid.nodeType == xml.dom.core.ELEMENT:
                    pending.append(kid)
            continue
        kids = current.childNodes
        if len(kids) != 1:
            continue
        only = kids[0]
        if only.nodeType != xml.dom.core.TEXT:
            continue
        text = only.data
        if text[-2:] == "()":
            only.data = text[:-2]


def convert(ifp, ofp):
    p = xml.dom.esis_builder.EsisBuilder()
    p.feed(ifp.read())
    doc = p.document
    normalize(doc)
    handle_args(doc)
    handle_comments(doc)
    simplify(doc)
    handle_labels(doc)
    fixup_trailing_whitespace(doc, {
        "abstract": "\n",
        "title": "",
        "chapter": "\n\n",
        "section": "\n\n",
        "subsection": "\n\n",
        "subsubsection": "\n\n",
        "paragraph": "\n\n",
        "subparagraph": "\n\n",
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(doc, ["function", "method", "cfunction"])
    try:
        ofp.write(doc.toxml())
        ofp.write("\n")
    except IOError, (err, msg):
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other errors
        # should still be reported,
        if err != errno.EPIPE:
            raise


def main():
    """Command-line entry point: docfixer.py [input-file [output-file]].

    With no arguments the document is read from stdin and written to
    stdout; one argument names the input file; two arguments name the
    input and output files.  Any other argument count prints a usage
    message on stderr and exits with status 2.  Files opened here are
    closed again once convert() returns or raises.
    """
    argc = len(sys.argv)
    if not 1 <= argc <= 3:
        # There is no usage() helper in this file, so report the error
        # directly.
        sys.stderr.write("usage: %s [input-file [output-file]]\n"
                         % (sys.argv[:1] or ["docfixer.py"])[0])
        sys.exit(2)
    ifp = sys.stdin
    ofp = sys.stdout
    opened = []
    try:
        if argc >= 2:
            ifp = open(sys.argv[1])
            opened.append(ifp)
        if argc == 3:
            ofp = open(sys.argv[2], "w")
            opened.append(ofp)
        convert(ifp, ofp)
    finally:
        # Only close what was opened here; never stdin/stdout.
        for fp in opened:
            fp.close()


if __name__ == "__main__":
    main()
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 13:02:03 -04:00			`#! /usr/bin/env python`

			`"""Promote the IDs from <label/> elements to the enclosing section / chapter /`
			`whatever, then remove the <label/> elements. This allows *ML style internal`
			`linking rather than the bogus LaTeX model.`

			`Note that <label/>s in <title> elements are promoted two steps, since the`
			`<title> elements are artificially created from the section parameter, and the`
			`label really refers to the sectioning construct.`
			`"""`
			`__version__ = '$Revision$'`


			`import errno`
			`import string`
			`import sys`
			`import xml.dom.core`
			`import xml.dom.esis_builder`


			`# Workaround to deal with invalid documents (multiple root elements). This`
			`# does not indicate a bug in the DOM implementation.`
			`#`
			`def get_documentElement(self):`
			`docelem = None`
			`for n in self._node.children:`
			`if n.type == xml.dom.core.ELEMENT:`
			`docelem = xml.dom.core.Element(n, self, self)`
			`return docelem`

			`xml.dom.core.Document.get_documentElement = get_documentElement`


			`# Replace get_childNodes for the Document class; without this, children`
			`# accessed from the Document object via .childNodes (no matter how many`
			`# levels of access are used) will be given an ownerDocument of None.`
			`#`
			`def get_childNodes(self):`
			`return xml.dom.core.NodeList(self._node.children, self, self)`

			`xml.dom.core.Document.get_childNodes = get_childNodes`


			`def get_first_element(doc, gi):`
			`for n in doc.childNodes:`
			`if n.nodeType == xml.dom.core.ELEMENT and n.tagName == gi:`
			`return n`

			`def extract_first_element(doc, gi):`
			`node = get_first_element(doc, gi)`
			`if node is not None:`
			`doc.removeChild(node)`
			`return node`


			`def simplify(doc):`
			`# Try to rationalize the document a bit, since these things are simply`
			`# not valid SGML/XML documents as they stand, and need a little work.`
			`documentclass = "document"`
			`inputs = []`
			`node = extract_first_element(doc, "documentclass")`
			`if node is not None:`
			`documentclass = node.getAttribute("classname")`
			`node = extract_first_element(doc, "title")`
			`if node is not None:`
			`inputs.append(node)`
			`# update the name of the root element`
			`node = get_first_element(doc, "document")`
			`if node is not None:`
			`node._node.name = documentclass`
			`while 1:`
			`node = extract_first_element(doc, "input")`
			`if node is None:`
			`break`
			`inputs.append(node)`
			`if inputs:`
			`docelem = doc.documentElement`
			`inputs.reverse()`
			`for node in inputs:`
			`text = doc.createTextNode("\n")`
			`docelem.insertBefore(text, docelem.firstChild)`
			`docelem.insertBefore(node, text)`
			`docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)`
			`while doc.firstChild.nodeType == xml.dom.core.TEXT:`
			`doc.removeChild(doc.firstChild)`


			`def cleanup_root_text(doc):`
			`discards = []`
			`skip = 0`
			`for n in doc.childNodes:`
			`prevskip = skip`
			`skip = 0`
			`if n.nodeType == xml.dom.core.TEXT and not prevskip:`
			`discards.append(n)`
			`elif n.nodeType == xml.dom.core.COMMENT:`
			`skip = 1`
			`for node in discards:`
			`doc.removeChild(node)`


			`def rewrite_desc_entries(doc, argname_gi):`
			`argnodes = doc.getElementsByTagName(argname_gi)`
			`for node in argnodes:`
			`parent = node.parentNode`
			`nodes = []`
			`for n in parent.childNodes:`
			`if n.nodeType != xml.dom.core.ELEMENT or n.tagName != argname_gi:`
			`nodes.append(n)`
			`desc = doc.createElement("description")`
			`for n in nodes:`
			`parent.removeChild(n)`
			`desc.appendChild(n)`
			`if node.childNodes:`
			`# keep the <args>...</args>, newline & indent`
			`parent.insertBefore(doc.createText("\n "), node)`
			`else:`
			`# no arguments, remove the <args/> node`
			`parent.removeChild(node)`
			`parent.appendChild(doc.createText("\n "))`
			`parent.appendChild(desc)`
			`parent.appendChild(doc.createText("\n"))`

			`def handle_args(doc):`
			`rewrite_desc_entries(doc, "args")`
			`rewrite_desc_entries(doc, "constructor-args")`


			`def handle_comments(doc, node=None):`
			`if node is None:`
			`node = doc`
			`for n in node.childNodes:`
			`if n.nodeType == xml.dom.core.ELEMENT:`
			`if n.tagName == "COMMENT":`
			`comment = doc.createComment(n.childNodes[0].data)`
			`node.replaceChild(comment, n)`
			`else:`
			`handle_comments(doc, n)`


			`def handle_labels(doc):`
			`labels = doc.getElementsByTagName("label")`
			`for label in labels:`
			`id = label.getAttribute("id")`
			`if not id:`
			`continue`
			`parent = label.parentNode`
			`if parent.tagName == "title":`
			`parent.parentNode.setAttribute("id", id)`
			`else:`
			`parent.setAttribute("id", id)`
			`# now, remove <label id="..."/> from parent:`
			`parent.removeChild(label)`


Add some additional cleanup transformations. 1998-11-23 19:10:35 -04:00			`def fixup_trailing_whitespace(doc, wsmap):`
			`queue = [doc]`
			`while queue:`
			`node = queue[0]`
			`del queue[0]`
			`if node.nodeType == xml.dom.core.ELEMENT \`
			`and wsmap.has_key(node.tagName):`
			`ws = wsmap[node.tagName]`
			`children = node.childNodes`
			`children.reverse()`
			`if children[0].nodeType == xml.dom.core.TEXT:`
			`data = string.rstrip(children[0].data) + ws`
			`children[0].data = data`
			`children.reverse()`
			`# hack to get the title in place:`
			`if node.tagName == "title" \`
			`and node.parentNode.firstChild.nodeType == xml.dom.core.ELEMENT:`
			`node.parentNode.insertBefore(doc.createText("\n "),`
			`node.parentNode.firstChild)`
			`for child in node.childNodes:`
			`if child.nodeType == xml.dom.core.ELEMENT:`
			`queue.append(child)`


			`def normalize(doc):`
			`for node in doc.childNodes:`
			`if node.nodeType == xml.dom.core.ELEMENT:`
			`node.normalize()`


			`def cleanup_trailing_parens(doc, element_names):`
			`d = {}`
			`for gi in element_names:`
			`d[gi] = gi`
			`rewrite_element = d.has_key`
			`queue = []`
			`for node in doc.childNodes:`
			`if node.nodeType == xml.dom.core.ELEMENT:`
			`queue.append(node)`
			`while queue:`
			`node = queue[0]`
			`del queue[0]`
			`if rewrite_element(node.tagName):`
			`children = node.childNodes`
			`if len(children) == 1 \`
			`and children[0].nodeType == xml.dom.core.TEXT:`
			`data = children[0].data`
			`if data[-2:] == "()":`
			`children[0].data = data[:-2]`
			`else:`
			`for child in node.childNodes:`
			`if child.nodeType == xml.dom.core.ELEMENT:`
			`queue.append(child)`


Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 13:02:03 -04:00			`def convert(ifp, ofp):`
			`p = xml.dom.esis_builder.EsisBuilder()`
			`p.feed(ifp.read())`
			`doc = p.document`
Add some additional cleanup transformations. 1998-11-23 19:10:35 -04:00			`normalize(doc)`
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 13:02:03 -04:00			`handle_args(doc)`
			`handle_comments(doc)`
			`simplify(doc)`
			`handle_labels(doc)`
Add some additional cleanup transformations. 1998-11-23 19:10:35 -04:00			`fixup_trailing_whitespace(doc, {`
			`"abstract": "\n",`
			`"title": "",`
			`"chapter": "\n\n",`
			`"section": "\n\n",`
			`"subsection": "\n\n",`
			`"subsubsection": "\n\n",`
			`"paragraph": "\n\n",`
			`"subparagraph": "\n\n",`
			`})`
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 13:02:03 -04:00			`cleanup_root_text(doc)`
Add some additional cleanup transformations. 1998-11-23 19:10:35 -04:00			`cleanup_trailing_parens(doc, ["function", "method", "cfunction"])`
Script to squirrel around with the DOM tree of document fragments from the LaTeX-based ESIS streams to be a little better structured, and generally perform clean-up. Preliminary. 1998-11-23 13:02:03 -04:00			`try:`
			`ofp.write(doc.toxml())`
			`ofp.write("\n")`
			`except IOError, (err, msg):`
			`# Ignore EPIPE; it just means that whoever we're writing to stopped`
			`# reading. The rest of the output would be ignored. All other errors`
			`# should still be reported,`
			`if err != errno.EPIPE:`
			`raise`


			`def main():`
			`if len(sys.argv) == 1:`
			`ifp = sys.stdin`
			`ofp = sys.stdout`
			`elif len(sys.argv) == 2:`
			`ifp = open(sys.argv[1])`
			`ofp = sys.stdout`
			`elif len(sys.argv) == 3:`
			`ifp = open(sys.argv[1])`
			`ofp = open(sys.argv[2], "w")`
			`else:`
			`usage()`
			`sys.exit(2)`
			`convert(ifp, ofp)`


			`if __name__ == "__main__":`
			`main()`