"""Pull-style DOM construction on top of an incremental SAX parser.

``PullDOM`` is a SAX content handler that turns parser callbacks into a
queue of ``(event_name, node)`` pairs.  ``DOMEventStream`` feeds a parser
from a stream in ``bufsize`` chunks and hands those pairs out one at a
time; ``parse`` and ``parseString`` are the convenience entry points.
"""

import types
import string
import sys

# This module expects ``minidom`` to be importable as a sibling module.
# Fall back to the package-qualified standard-library location when it is
# not, so the module still imports outside that layout.
try:
    import minidom
except ImportError:
    from xml.dom import minidom

# Same idea for the Expat driver: try the original location first.
try:
    from xml.sax import ExpatParser
except ImportError:
    from xml.sax.expatreader import ExpatParser

#todo: SAX2/namespace handling

START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"


class PullDOM:
    """SAX content handler that queues ``(event_name, node)`` pairs.

    Events are kept in a singly linked list of two-item lists
    ``[payload, next_cell]``.  ``firstEvent`` is a sentinel cell whose
    payload is ``None``; ``lastEvent`` is the cell new events are linked
    after.  Nodes get ``parentNode`` (and, for most node kinds, sibling
    links) set, but are not inserted into the parent's ``childNodes``
    here -- ``DOMEventStream.expandNode`` does that on request.
    """

    def __init__(self):
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent

    def _appendEvent(self, token, node):
        """Link a new ``(token, node)`` cell after ``lastEvent``."""
        cell = [(token, node), None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def _attach(self, node):
        """Point ``node`` at the current node and its last existing child."""
        parent = self.curNode
        node.parentNode = parent
        if parent.childNodes:
            node.previousSibling = parent.childNodes[-1]
            node.previousSibling.nextSibling = node

    def setDocumentLocator(self, locator):
        """Ignore the locator; position information is not tracked."""
        pass

    def startElement(self, tagName, attrs):
        """Create an element, make it current and queue START_ELEMENT."""
        if not hasattr(self, "curNode"):
            # FIXME: hack!  Covers parsers that never call startDocument().
            self.startDocument()

        node = self.document.createElement(tagName)  # FIXME namespaces!
        # FIXME: do I have to screen namespace attributes
        for attr in attrs.keys():
            node.setAttribute(attr, attrs[attr])

        self._attach(node)
        self.curNode = node
        self._appendEvent(START_ELEMENT, node)

    def endElement(self, name):
        """Queue END_ELEMENT for the current node and step to its parent."""
        node = self.curNode
        self._appendEvent(END_ELEMENT, node)
        self.curNode = node.parentNode

    def comment(self, s):
        """Create a comment node and queue COMMENT."""
        node = self.document.createComment(s)
        self._attach(node)
        self._appendEvent(COMMENT, node)

    def processingInstruction(self, target, data):
        """Create a processing-instruction node and queue its event."""
        node = self.document.createProcessingInstruction(target, data)
        self._attach(node)
        self._appendEvent(PROCESSING_INSTRUCTION, node)

    def ignorableWhitespace(self, chars):
        """Create a text node for ``chars`` and queue IGNORABLE_WHITESPACE.

        The whole ``chars`` string is used; this callback receives no
        start/length arguments, so there is nothing to slice by.
        """
        node = self.document.createTextNode(chars)
        self._attach(node)
        self._appendEvent(IGNORABLE_WHITESPACE, node)

    def characters(self, chars):
        """Create a text node for ``chars`` and queue CHARACTERS.

        Only ``parentNode`` is set; sibling links are not touched here.
        """
        node = self.document.createTextNode(chars)
        node.parentNode = self.curNode
        self._appendEvent(CHARACTERS, node)

    def startDocument(self):
        """Create the document, make it current and queue START_DOCUMENT."""
        node = self.curNode = self.document = minidom.Document()
        node.parentNode = None
        self._appendEvent(START_DOCUMENT, node)

    def endDocument(self):
        """Record the document element, if any, and queue END_DOCUMENT.

        The event carries the document itself, independent of whether the
        document has any children.
        """
        assert not self.curNode.parentNode
        for child in self.curNode.childNodes:
            if child.nodeType == child.ELEMENT_NODE:
                self.document.documentElement = child
        #if not self.document.documentElement:
        #    raise Error, "No document element"
        self._appendEvent(END_DOCUMENT, self.document)


class ErrorHandler:
    """Error handler that prints warnings and raises everything else."""

    def warning(self, exception):
        # Parenthesised single-argument form prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print(exception)

    def error(self, exception):
        raise exception

    def fatalError(self, exception):
        raise exception


class DOMEventStream:
    """Sequence-like stream of ``(event_name, node)`` pairs.

    Data is read from ``stream`` in chunks of ``bufsize`` and fed to
    ``parser``, whose content handler is a fresh ``PullDOM``.  Indexing
    (and therefore ``for`` iteration) returns the next event regardless
    of the index passed in, and raises IndexError when input runs out.
    """

    def __init__(self, stream, parser, bufsize):
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        self.reset()

    def reset(self):
        """Install a new ``PullDOM`` as the parser's content handler."""
        self.pulldom = PullDOM()
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Return the next event; ``pos`` is ignored."""
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def expandNode(self, node):
        """Consume events up to the one whose node is ``node``.

        Every node seen on the way, other than on END_ELEMENT events, is
        appended to its parent so the subtree becomes a real DOM tree.
        """
        event = self.getEvent()
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                cur_node.parentNode.appendChild(cur_node)
            event = self.getEvent()

    def getEvent(self):
        """Return the next ``(event_name, node)`` pair, or None at EOF."""
        pulldom = self.pulldom
        if not pulldom.firstEvent[1]:
            # Queue drained: new events must link directly after the
            # sentinel again.
            pulldom.lastEvent = pulldom.firstEvent
        while not pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                #FIXME: why doesn't Expat close work?
                #self.parser.close()
                return None
            self.parser.feed(buf)
        rc = pulldom.firstEvent[1][0]
        pulldom.firstEvent[1] = pulldom.firstEvent[1][1]
        return rc


# FIXME: sax2 -- a make_parser()-based factory (xml.sax.saxexts) was tried
# here, but expat did not report errors properly through it.
def _getParser():
    """Return a new default parser instance."""
    return ExpatParser()


default_bufsize = (2**14) - 20


# FIXME: move into sax package for common usage
def parse(stream_or_string, parser=None, bufsize=default_bufsize):
    """Return a ``DOMEventStream`` over a file name or an open stream.

    A plain string is treated as a file name and opened; anything else is
    used as a stream with a ``read(size)`` method.  ``parser`` defaults to
    a new Expat parser.
    """
    if isinstance(stream_or_string, str):
        stream = open(stream_or_string)
    else:
        stream = stream_or_string
    if not parser:
        parser = _getParser()
    return DOMEventStream(stream, parser, bufsize)


def parseString(string, parser=None):
    """Return a ``DOMEventStream`` over the XML text in ``string``.

    ``parser`` defaults to a new Expat parser; a parser passed in by the
    caller is used as given.  The buffer size is the length of the text,
    so the whole document is fed in one read.
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            from io import StringIO

    bufsize = len(string)
    buf = StringIO(string)
    if not parser:
        parser = _getParser()
    return DOMEventStream(buf, parser, bufsize)