From 55c3819e6a055264f1fd9853ab2ed022a84ef626 Mon Sep 17 00:00:00 2001
From: Fred Drake
Date: Thu, 29 Jun 2000 19:39:57 +0000
Subject: [PATCH] Paul Prescod : W3C DOM implementation for Python.

---
 Lib/xml/dom/minidom.py | 458 ++++++++++++++++++++++++++++++++++++++++++++++++++
 Lib/xml/dom/pulldom.py | 276 ++++++++++++++++++++++++++++++
 2 files changed, 734 insertions(+)
 create mode 100644 Lib/xml/dom/minidom.py
 create mode 100644 Lib/xml/dom/pulldom.py

diff --git a/Lib/xml/dom/minidom.py b/Lib/xml/dom/minidom.py
new file mode 100644
index 00000000000..32d2d2b1bfa
--- /dev/null
+++ b/Lib/xml/dom/minidom.py
@@ -0,0 +1,458 @@
+"""
+minidom.py -- a lightweight DOM implementation based on SAX.
+
+Todo:
+=====
+ * convenience methods for getting elements and text.
+ * more testing
+ * bring some of the writer and linearizer code into conformance with this
+   interface
+ * SAX 2 namespaces
+"""
+
+import pulldom
+import string
+from StringIO import StringIO
+import types
+
+
+class Node:
+    """Base class for all DOM nodes.
+
+    Children are kept in the ``childNodes`` list.  Attribute access that
+    fails normal lookup goes through ``__getattr__``, which maps ``node.foo``
+    onto ``node._get_foo()`` and ``node._get_foo`` onto a callable returning
+    ``node.foo``.
+    """
+
+    ELEMENT_NODE = 1
+    ATTRIBUTE_NODE = 2
+    TEXT_NODE = 3
+    CDATA_SECTION_NODE = 4
+    ENTITY_REFERENCE_NODE = 5
+    ENTITY_NODE = 6
+    PROCESSING_INSTRUCTION_NODE = 7
+    COMMENT_NODE = 8
+    DOCUMENT_NODE = 9
+    DOCUMENT_TYPE_NODE = 10
+    DOCUMENT_FRAGMENT_NODE = 11
+    NOTATION_NODE = 12
+
+    # Registry of live nodes, one id+class string each; unlink() removes
+    # the entry again.
+    allnodes = []
+
+    def __init__(self):
+        self.childNodes = []
+        Node.allnodes.append(repr(id(self)) + repr(self.__class__))
+
+    def __getattr__(self, key):
+        """Synthesize ``foo`` from ``_get_foo()`` and the other way round.
+
+        Raises AttributeError when neither spelling can be found.
+        """
+        if key[0:2] == "__":
+            raise AttributeError
+        # getattr should never call getattr!  The inGetAttr flag marks a
+        # lookup started from inside this method; a nested miss clears the
+        # flag and fails instead of recursing.
+        if self.__dict__.has_key("inGetAttr"):
+            del self.inGetAttr
+            raise AttributeError, key
+
+        prefix, attrname = key[:5], key[5:]
+        if prefix == "_get_":
+            self.inGetAttr = 1
+            found = hasattr(self, attrname)
+            # A failed nested lookup has already removed the flag, so only
+            # delete it when it is still there.
+            if self.__dict__.has_key("inGetAttr"):
+                del self.inGetAttr
+            if not found:
+                raise AttributeError, key
+            return (lambda self=self, attrname=attrname:
+                    getattr(self, attrname))
+        else:
+            self.inGetAttr = 1
+            try:
+                func = getattr(self, "_get_" + key)
+            except AttributeError:
+                raise AttributeError, key
+            del self.inGetAttr
+            return func()
+
+    def __nonzero__(self):
+        return 1
+
+    def toxml(self):
+        "Return the node serialized as an XML string."
+        writer = StringIO()
+        self.writexml(writer)
+        return writer.getvalue()
+
+    def hasChildNodes(self):
+        if self.childNodes:
+            return 1
+        else:
+            return 0
+
+    def insertBefore(self, newChild, refChild):
+        """Insert newChild in front of refChild.
+
+        A refChild of None appends newChild.  Raises ValueError if refChild
+        is not a child of this node.
+        """
+        if refChild is None:
+            self.appendChild(newChild)
+            return
+        index = self.childNodes.index(refChild)
+        self.childNodes.insert(index, newChild)
+
+    def appendChild(self, node):
+        self.childNodes.append(node)
+
+    def unlink(self):
+        """Break the reference cycles at and below this node.
+
+        Afterwards childNodes and attributes are None.
+        """
+        self.parentNode = None
+        while self.childNodes:
+            self.childNodes[-1].unlink()
+            del self.childNodes[-1]  # probably not most efficient!
+        self.childNodes = None
+        if self.attributes:
+            for attr in self.attributes.values():
+                attr.unlink()
+        self.attributes = None
+        index = Node.allnodes.index(repr(id(self)) + repr(self.__class__))
+        del Node.allnodes[index]
+
+
+def _write_data(writer, data):
+    "Writes datachars to writer."
+    data = string.replace(data, "&", "&amp;")
+    data = string.replace(data, "<", "&lt;")
+    data = string.replace(data, "\"", "&quot;")
+    data = string.replace(data, ">", "&gt;")
+    writer.write(data)
+
+
+def _nssplit(qualifiedName):
+    "Split a qualified name into (prefix, localName); prefix may be ''."
+    fields = string.split(qualifiedName, ':', 1)
+    if len(fields) == 2:
+        return fields[0], fields[1]
+    return '', fields[0]
+
+
+def _closeElement(element):
+    "Drop the parentNode link of element and of every node below it."
+    if element.__dict__.has_key("parentNode"):
+        del element.parentNode
+    for node in element.childNodes:
+        _closeElement(node)
+
+
+def _getElementsByTagNameHelper(parent, name, rc):
+    "Append the descendant elements of parent named name to rc; return rc."
+    for node in parent.childNodes:
+        if node.nodeType == Node.ELEMENT_NODE and \
+           (name == "*" or node.tagName == name):
+            rc.append(node)
+        _getElementsByTagNameHelper(node, name, rc)
+    return rc
+
+
+def _getElementsByTagNameNSHelper(parent, nsURI, localName, rc):
+    "Namespace-aware variant of _getElementsByTagNameHelper."
+    for node in parent.childNodes:
+        if node.nodeType == Node.ELEMENT_NODE:
+            if ((localName == "*" or node.localName == localName) and
+                (nsURI == "*" or node.namespaceURI == nsURI)):
+                rc.append(node)
+        _getElementsByTagNameNSHelper(node, nsURI, localName, rc)
+    return rc
+
+
+class Attr(Node):
+    "An attribute node; value and nodeValue are kept in sync."
+    nodeType = Node.ATTRIBUTE_NODE
+
+    def __init__(self, qName, namespaceURI="", prefix="",
+                 localName=None):
+        Node.__init__(self)
+        assert qName
+        # skip setattr for performance
+        self.__dict__["nodeName"] = self.__dict__["name"] = qName
+        self.__dict__["localName"] = localName or qName
+        self.__dict__["prefix"] = prefix
+        self.__dict__["namespaceURI"] = namespaceURI
+        # nodeValue and value are set elsewhere
+        self.attributes = None
+
+    def __setattr__(self, name, value):
+        if name in ("value", "nodeValue"):
+            self.__dict__["value"] = self.__dict__["nodeValue"] = value
+        else:
+            self.__dict__[name] = value
+
+
+class AttributeList:
+    # the attribute list is a transient interface to the underlying
+    # dictionaries.  mutations here will change the underlying element's
+    # dictionary
+    def __init__(self, attrs, attrsNS):
+        self.__attrs = attrs
+        self.__attrsNS = attrsNS
+        self.length = len(self.__attrs.keys())
+
+    def item(self, index):
+        "Return the value of attribute number index, or None if out of range."
+        try:
+            return self[self.keys()[index]]
+        except IndexError:
+            return None
+
+    def items(self):
+        "Return a list of (name, value) pairs."
+        return map(lambda node: (node.name, node.value),
+                   self.__attrs.values())
+
+    def itemsNS(self):
+        "Return a list of ((namespaceURI, localName), value) pairs."
+        return map(lambda node: ((node.namespaceURI, node.localName),
+                                 node.value),
+                   self.__attrs.values())
+
+    def keys(self):
+        return self.__attrs.keys()
+
+    def keysNS(self):
+        return self.__attrsNS.keys()
+
+    def values(self):
+        return self.__attrs.values()
+
+    def __len__(self):
+        return self.length
+
+    def __cmp__(self, other):
+        if self.__attrs is other.__attrs:
+            return 0
+        else:
+            return cmp(id(self), id(other))
+
+    #FIXME: is it appropriate to return .value?
+    def __getitem__(self, attname_or_tuple):
+        if type(attname_or_tuple) == type((1, 2)):
+            return self.__attrsNS[attname_or_tuple].value
+        else:
+            return self.__attrs[attname_or_tuple].value
+
+    def __setitem__(self, attname, value=None):
+        raise TypeError, "object does not support item assignment"
+
+
+class Element(Node):
+    "An element node with a tag name, attributes and child nodes."
+    nodeType = Node.ELEMENT_NODE
+
+    def __init__(self, tagName, namespaceURI="", prefix="",
+                 localName=None):
+        Node.__init__(self)
+        self.tagName = self.nodeName = tagName
+        self.localName = localName or tagName
+        self.prefix = prefix
+        self.namespaceURI = namespaceURI
+        self.nodeValue = None
+
+        self.__attrs = {}    # attributes are double-indexed:
+        self.__attrsNS = {}  #    tagName -> Attribute
+                             #    URI,localName -> Attribute
+        # in the future: consider lazy generation of attribute objects
+        #                this is too tricky for now because of headaches
+        #                with namespaces.
+
+    def getAttribute(self, attname):
+        return self.__attrs[attname].value
+
+    def getAttributeNS(self, namespaceURI, localName):
+        return self.__attrsNS[(namespaceURI, localName)].value
+
+    def setAttribute(self, attname, value):
+        attr = Attr(attname)
+        # for performance
+        attr.__dict__["value"] = attr.__dict__["nodeValue"] = value
+        self.setAttributeNode(attr)
+
+    def setAttributeNS(self, namespaceURI, qualifiedName, value):
+        prefix, localName = _nssplit(qualifiedName)
+        attr = Attr(qualifiedName, namespaceURI, prefix or None, localName)
+        # for performance
+        attr.__dict__["value"] = attr.__dict__["nodeValue"] = value
+        self.setAttributeNode(attr)
+
+    def setAttributeNode(self, attr):
+        self.__attrs[attr.name] = attr
+        self.__attrsNS[(attr.namespaceURI, attr.localName)] = attr
+
+    def removeAttribute(self, name):
+        attr = self.__attrs[name]
+        self.removeAttributeNode(attr)
+
+    def removeAttributeNS(self, namespaceURI, localName):
+        attr = self.__attrsNS[(namespaceURI, localName)]
+        self.removeAttributeNode(attr)
+
+    def removeAttributeNode(self, node):
+        del self.__attrs[node.name]
+        del self.__attrsNS[(node.namespaceURI, node.localName)]
+
+    def getElementsByTagName(self, name):
+        return _getElementsByTagNameHelper(self, name, [])
+
+    def getElementsByTagNameNS(self, namespaceURI, localName):
+        return _getElementsByTagNameNSHelper(self, namespaceURI,
+                                             localName, [])
+
+    def __repr__(self):
+        return "<DOM Element: %s at %s>" % (self.tagName, id(self))
+
+    def writexml(self, writer):
+        "Write the element, its attributes and its children to writer."
+        writer.write("<" + self.tagName)
+
+        attrs = self._get_attributes()
+        a_names = attrs.keys()
+        a_names.sort()
+
+        for a_name in a_names:
+            writer.write(" " + a_name + "=\"")
+            _write_data(writer, attrs[a_name])
+            writer.write("\"")
+        if self.childNodes:
+            writer.write(">")
+            for node in self.childNodes:
+                node.writexml(writer)
+            writer.write("</%s>" % self.tagName)
+        else:
+            writer.write("/>")
+
+    def _get_attributes(self):
+        return AttributeList(self.__attrs, self.__attrsNS)
+
+
+class Comment(Node):
+    nodeType = Node.COMMENT_NODE
+
+    def __init__(self, data):
+        Node.__init__(self)
+        self.data = self.nodeValue = data
+        self.nodeName = "#comment"
+        self.attributes = None
+
+    def writexml(self, writer):
+        writer.write("<!--%s-->" % self.data)
+
+
+class ProcessingInstruction(Node):
+    nodeType = Node.PROCESSING_INSTRUCTION_NODE
+
+    def __init__(self, target, data):
+        Node.__init__(self)
+        self.target = self.nodeName = target
+        self.data = self.nodeValue = data
+        self.attributes = None
+
+    def writexml(self, writer):
+        writer.write("<?%s %s?>" % (self.target, self.data))
+
+
+class Text(Node):
+    nodeType = Node.TEXT_NODE
+    nodeName = "#text"
+
+    def __init__(self, data):
+        Node.__init__(self)
+        self.data = self.nodeValue = data
+        self.attributes = None
+
+    def __repr__(self):
+        if len(self.data) > 10:
+            dotdotdot = "..."
+        else:
+            dotdotdot = ""
+        return "<DOM Text node \"%s%s\">" % (self.data[0:10], dotdotdot)
+
+    def writexml(self, writer):
+        _write_data(writer, self.data)
+
+
+class Document(Node):
+    nodeType = Node.DOCUMENT_NODE
+
+    def __init__(self):
+        Node.__init__(self)
+        self.documentElement = None
+        self.attributes = None
+        self.nodeName = "#document"
+        self.nodeValue = None
+
+    # The node classes double as factories: a class stored as a class
+    # attribute is not bound, so it is called without the document.
+    createElement = Element
+
+    createTextNode = Text
+
+    createComment = Comment
+
+    createProcessingInstruction = ProcessingInstruction
+
+    createAttribute = Attr
+
+    def createElementNS(self, namespaceURI, qualifiedName):
+        prefix, localName = _nssplit(qualifiedName)
+        return Element(qualifiedName, namespaceURI, prefix, localName)
+
+    def createAttributeNS(self, namespaceURI, qualifiedName):
+        prefix, localName = _nssplit(qualifiedName)
+        return Attr(qualifiedName, namespaceURI, prefix or None, localName)
+
+    def getElementsByTagNameNS(self, namespaceURI, localName):
+        return _getElementsByTagNameNSHelper(self, namespaceURI,
+                                             localName, [])
+
+    def close(self):
+        "Drop the parentNode links of all nodes in the document."
+        for node in self.childNodes:
+            _closeElement(node)
+
+    def unlink(self):
+        self.documentElement = None
+        Node.unlink(self)
+
+    def getElementsByTagName(self, name):
+        rc = []
+        _getElementsByTagNameHelper(self, name, rc)
+        return rc
+
+    def writexml(self, writer):
+        for node in self.childNodes:
+            node.writexml(writer)
+
+
+def _doparse(func, args, kwargs):
+    "Call a pulldom parse function and return the fully expanded root node."
+    events = apply(func, args, kwargs)
+    (toktype, rootNode) = events.getEvent()
+    events.expandNode(rootNode)
+    return rootNode
+
+
+def parse(*args, **kwargs):
+    return _doparse(pulldom.parse, args, kwargs)
+
+
+def parseString(*args, **kwargs):
+    return _doparse(pulldom.parseString, args, kwargs)
diff --git a/Lib/xml/dom/pulldom.py b/Lib/xml/dom/pulldom.py
new file mode 100644
index 00000000000..9c856469b1e
--- /dev/null
+++ b/Lib/xml/dom/pulldom.py
@@ -0,0 +1,276 @@
+import minidom
+import types
+import string
+import sys
+import pyexpat
+from xml.sax import ExpatParser
+
+#todo: SAX2/namespace handling
+
+START_ELEMENT = "START_ELEMENT"
+END_ELEMENT = "END_ELEMENT"
+COMMENT = "COMMENT"
+START_DOCUMENT = "START_DOCUMENT"
+END_DOCUMENT = "END_DOCUMENT"
+PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
+IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
+CHARACTERS = "CHARACTERS"
+
+
+class PullDOM:
+    """SAX handler that builds minidom nodes and queues (event, node) pairs.
+
+    The queue is a linked list of [item, next] cells: firstEvent is a dummy
+    head cell and lastEvent is the current tail.
+    """
+
+    def __init__(self):
+        self.firstEvent = [None, None]
+        self.lastEvent = self.firstEvent
+
+    def setDocumentLocator(self, locator):
+        pass
+
+    def _link(self, node):
+        "Parent node under the current node and chain it to the last sibling."
+        parent = self.curNode
+        node.parentNode = parent
+        if parent.childNodes:
+            node.previousSibling = parent.childNodes[-1]
+            node.previousSibling.nextSibling = node
+
+    def _queue(self, event, node):
+        "Append (event, node) to the tail of the event list."
+        self.lastEvent[1] = [(event, node), None]
+        self.lastEvent = self.lastEvent[1]
+
+    def startElement(self, tagName, attrs):
+        if not hasattr(self, "curNode"):
+            # FIXME: hack!
+            self.startDocument()
+
+        node = self.document.createElement(tagName)  #FIXME namespaces!
+        for attr in attrs.keys():
+            node.setAttribute(attr, attrs[attr])
+
+        self._link(node)
+        self.curNode = node
+        # FIXME: do I have to screen namespace attributes
+        self._queue(START_ELEMENT, node)
+
+    def endElement(self, name):
+        node = self.curNode
+        self._queue(END_ELEMENT, node)
+        self.curNode = node.parentNode
+
+    def comment(self, s):
+        node = self.document.createComment(s)
+        self._link(node)
+        self._queue(COMMENT, node)
+
+    def processingInstruction(self, target, data):
+        node = self.document.createProcessingInstruction(target, data)
+        self._link(node)
+        self._queue(PROCESSING_INSTRUCTION, node)
+
+    def ignorableWhitespace(self, chars):
+        node = self.document.createTextNode(chars)
+        self._link(node)
+        self._queue(IGNORABLE_WHITESPACE, node)
+
+    def characters(self, chars):
+        node = self.document.createTextNode(chars)
+        node.parentNode = self.curNode
+        self._queue(CHARACTERS, node)
+
+    def startDocument(self):
+        node = self.curNode = self.document = minidom.Document()
+        node.parentNode = None
+        self._queue(START_DOCUMENT, node)
+
+    def endDocument(self):
+        assert not self.curNode.parentNode
+        for node in self.curNode.childNodes:
+            if node.nodeType == node.ELEMENT_NODE:
+                self.document.documentElement = node
+        #if not self.document.documentElement:
+        #    raise Error, "No document element"
+
+        # Report the document node itself with the final event.
+        self._queue(END_DOCUMENT, self.document)
+
+
+class ErrorHandler:
+    def warning(self, exception):
+        print exception
+
+    def error(self, exception):
+        raise exception
+
+    def fatalError(self, exception):
+        raise exception
+
+
+class DOMEventStream:
+    "Pulls (event, node) pairs out of a parser fed bufsize bytes at a time."
+
+    def __init__(self, stream, parser, bufsize):
+        self.stream = stream
+        self.parser = parser
+        self.bufsize = bufsize
+        self.reset()
+
+    def reset(self):
+        self.pulldom = PullDOM()
+        self.parser.setContentHandler(self.pulldom)
+
+    def __getitem__(self, pos):
+        # pos is ignored; each call simply returns the next event.
+        rc = self.getEvent()
+        if rc:
+            return rc
+        raise IndexError
+
+    def expandNode(self, node):
+        "Consume events up to the end of node, attaching its descendants."
+        event = self.getEvent()
+        while event:
+            token, cur_node = event
+            if cur_node is node:
+                break
+
+            if token != END_ELEMENT:
+                cur_node.parentNode.childNodes.append(cur_node)
+            event = self.getEvent()
+        if node.nodeType == minidom.Node.DOCUMENT_NODE:
+            for child in node.childNodes:
+                if child.nodeType == minidom.Node.ELEMENT_NODE:
+                    node.documentElement = child
+
+    def getEvent(self):
+        "Return the next (event, node) pair, or None at the end of input."
+        if not self.pulldom.firstEvent[1]:
+            self.pulldom.lastEvent = self.pulldom.firstEvent
+        while not self.pulldom.firstEvent[1]:
+            buf = self.stream.read(self.bufsize)
+            if not buf:
+                #FIXME: why doesn't Expat close work?
+                #self.parser.close()
+                return None
+            self.parser.feed(buf)
+        rc = self.pulldom.firstEvent[1][0]
+        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
+        return rc
+
+# FIXME: sax2
+#def _getParser( ):
+#    from xml.sax.saxexts import make_parser
+#    # expat doesn't report errors properly! Figure it out
+#    return make_parser()
+#    return make_parser("xml.sax.drivers.drv_xmllib")
+
+
+
+def _getParser():
+    return ExpatParser()
+
+default_bufsize = (2**14) - 20
+
+# FIXME: move into sax package for common usage
+def parse(stream_or_string, parser=None, bufsize=default_bufsize):
+    if type(stream_or_string) == type(""):
+        stream = open(stream_or_string)
+    else:
+        stream = stream_or_string
+    if not parser:
+        parser = _getParser()
+    return DOMEventStream(stream, parser, bufsize)
+
+
+def parseString(string, parser=None):
+    try:
+        import cStringIO
+        stringio = cStringIO.StringIO
+    except ImportError:
+        import StringIO
+        stringio = StringIO.StringIO
+
+    bufsize = len(string)
+    buf = stringio(string)
+    if not parser:
+        parser = _getParser()
+    return DOMEventStream(buf, parser, bufsize)
+
+
+#FIXME: Use Lars' instead!!!
+class SAX_expat:
+    "SAX driver for the Pyexpat C module."
+
+    def __init__(self):
+        self.parser = pyexpat.ParserCreate()
+        self.started = 0
+
+    def setDocumentHandler(self, handler):
+        self.parser.StartElementHandler = handler.startElement
+        self.parser.EndElementHandler = handler.endElement
+        # NOTE(review): PullDOM has no datachars method -- confirm which
+        # handler protocol this driver is meant to serve.
+        self.parser.CharacterDataHandler = handler.datachars
+        self.parser.ProcessingInstructionHandler = \
+            handler.processingInstruction
+        self.doc_handler = handler
+
+    def setErrorHandler(self, handler):
+        self.err_handler = handler
+
+    # --- Locator methods. Only usable after errors.
+
+    def getLineNumber(self):
+        return self.parser.ErrorLineNumber
+
+    def getColumnNumber(self):
+        return self.parser.ErrorColumnNumber
+
+    # --- Internal
+
+    def __report_error(self):
+        msg = pyexpat.ErrorString(self.parser.ErrorCode)
+        self.err_handler.fatalError(msg)
+
+    # --- EXPERIMENTAL PYTHON SAX EXTENSIONS
+
+    def get_parser_name(self):
+        return "pyexpat"
+
+    def get_parser_version(self):
+        return "Unknown"
+
+    def get_driver_version(self):
+        # No version constant exists in this module.
+        return "Unknown"
+
+    def is_validating(self):
+        return 0
+
+    def is_dtd_reading(self):
+        return 0
+
+    def reset(self):
+        self.parser = pyexpat.ParserCreate()
+        # The driver has no callbacks of its own; re-attach those of the
+        # registered document handler to the new parser.
+        if hasattr(self, "doc_handler"):
+            self.setDocumentHandler(self.doc_handler)
+
+    def feed(self, data):
+        if not self.started:
+            self.doc_handler.startDocument()
+            self.started = 1
+        if not self.parser.Parse(data):
+            self.__report_error()
+
+    def close(self):
+        if not self.parser.Parse("", 1):
+            self.__report_error()
+        self.doc_handler.endDocument()
+        self.parser = None