cpython/Lib/xml/dom/pulldom.py

352 lines
12 KiB
Python
Raw Normal View History

import xml.sax
import xml.sax.handler
import types
try:
_StringTypes = [types.StringType, types.UnicodeType]
except AttributeError:
_StringTypes = [types.StringType]
2000-09-24 02:21:58 -03:00
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"
2000-09-24 18:54:14 -03:00
class PullDOM(xml.sax.ContentHandler):
_locator = None
document = None
def __init__(self, documentFactory=None):
from xml.dom import XML_NAMESPACE
self.documentFactory = documentFactory
2000-09-24 02:21:58 -03:00
self.firstEvent = [None, None]
self.lastEvent = self.firstEvent
self.elementStack = []
self.push = self.elementStack.append
try:
self.pop = self.elementStack.pop
except AttributeError:
# use class' pop instead
pass
self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
2000-09-24 18:54:14 -03:00
self._current_context = self._ns_contexts[-1]
self.pending_events = []
def pop(self):
result = self.elementStack[-1]
2001-01-27 04:47:37 -04:00
del self.elementStack[-1]
return result
def setDocumentLocator(self, locator):
self._locator = locator
2000-09-24 18:54:14 -03:00
def startPrefixMapping(self, prefix, uri):
if not hasattr(self, '_xmlns_attrs'):
self._xmlns_attrs = []
self._xmlns_attrs.append((prefix or 'xmlns', uri))
2000-09-24 18:54:14 -03:00
self._ns_contexts.append(self._current_context.copy())
self._current_context[uri] = prefix or None
2000-09-24 18:54:14 -03:00
def endPrefixMapping(self, prefix):
self._current_context = self._ns_contexts.pop()
2000-09-24 18:54:14 -03:00
def startElementNS(self, name, tagName , attrs):
# Retrieve xml namespace declaration attributes.
xmlns_uri = 'http://www.w3.org/2000/xmlns/'
xmlns_attrs = getattr(self, '_xmlns_attrs', None)
if xmlns_attrs is not None:
for aname, value in xmlns_attrs:
attrs._attrs[(xmlns_uri, aname)] = value
self._xmlns_attrs = []
uri, localname = name
if uri:
2000-09-24 18:54:14 -03:00
# When using namespaces, the reader may or may not
# provide us with the original name. If not, create
# *a* valid tagName from the current context.
if tagName is None:
prefix = self._current_context[uri]
if prefix:
tagName = prefix + ":" + localname
else:
tagName = localname
if self.document:
node = self.document.createElementNS(uri, tagName)
else:
node = self.buildDocument(uri, tagName)
2000-09-24 18:54:14 -03:00
else:
# When the tagname is not prefixed, it just appears as
# localname
if self.document:
node = self.document.createElement(localname)
else:
node = self.buildDocument(None, localname)
2000-09-24 18:54:14 -03:00
for aname,value in attrs.items():
a_uri, a_localname = aname
if a_uri == xmlns_uri:
if a_localname == 'xmlns':
qname = a_localname
else:
qname = 'xmlns:' + a_localname
attr = self.document.createAttributeNS(a_uri, qname)
node.setAttributeNodeNS(attr)
elif a_uri:
prefix = self._current_context[a_uri]
if prefix:
qname = prefix + ":" + a_localname
else:
qname = a_localname
attr = self.document.createAttributeNS(a_uri, qname)
node.setAttributeNodeNS(attr)
2000-09-24 18:54:14 -03:00
else:
attr = self.document.createAttribute(a_localname)
node.setAttributeNode(attr)
2000-09-24 18:54:14 -03:00
attr.value = value
2000-09-24 02:21:58 -03:00
self.lastEvent[1] = [(START_ELEMENT, node), None]
self.lastEvent = self.lastEvent[1]
self.push(node)
2000-09-24 18:54:14 -03:00
def endElementNS(self, name, tagName):
self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
2000-09-24 02:21:58 -03:00
self.lastEvent = self.lastEvent[1]
def startElement(self, name, attrs):
if self.document:
node = self.document.createElement(name)
else:
node = self.buildDocument(None, name)
for aname,value in attrs.items():
attr = self.document.createAttribute(aname)
attr.value = value
node.setAttributeNode(attr)
self.lastEvent[1] = [(START_ELEMENT, node), None]
self.lastEvent = self.lastEvent[1]
self.push(node)
def endElement(self, name):
self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
self.lastEvent = self.lastEvent[1]
2000-09-24 02:21:58 -03:00
def comment(self, s):
if self.document:
node = self.document.createComment(s)
self.lastEvent[1] = [(COMMENT, node), None]
self.lastEvent = self.lastEvent[1]
else:
event = [(COMMENT, s), None]
self.pending_events.append(event)
2000-09-24 02:21:58 -03:00
def processingInstruction(self, target, data):
if self.document:
node = self.document.createProcessingInstruction(target, data)
self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
self.lastEvent = self.lastEvent[1]
else:
event = [(PROCESSING_INSTRUCTION, target, data), None]
self.pending_events.append(event)
2000-09-24 02:21:58 -03:00
def ignorableWhitespace(self, chars):
node = self.document.createTextNode(chars)
2000-09-24 02:21:58 -03:00
self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
self.lastEvent = self.lastEvent[1]
def characters(self, chars):
node = self.document.createTextNode(chars)
self.lastEvent[1] = [(CHARACTERS, node), None]
self.lastEvent = self.lastEvent[1]
def startDocument(self):
if self.documentFactory is None:
import xml.dom.minidom
self.documentFactory = xml.dom.minidom.Document.implementation
def buildDocument(self, uri, tagname):
# Can't do that in startDocument, since we need the tagname
# XXX: obtain DocumentType
node = self.documentFactory.createDocument(uri, tagname, None)
self.document = node
2000-09-24 02:21:58 -03:00
self.lastEvent[1] = [(START_DOCUMENT, node), None]
self.lastEvent = self.lastEvent[1]
self.push(node)
# Put everything we have seen so far into the document
for e in self.pending_events:
if e[0][0] == PROCESSING_INSTRUCTION:
_,target,data = e[0]
n = self.document.createProcessingInstruction(target, data)
e[0] = (PROCESSING_INSTRUCTION, n)
elif e[0][0] == COMMENT:
n = self.document.createComment(e[0][1])
e[0] = (COMMENT, n)
else:
raise AssertionError("Unknown pending event ",e[0][0])
self.lastEvent[1] = e
self.lastEvent = e
self.pending_events = None
return node.firstChild
2000-09-24 02:21:58 -03:00
def endDocument(self):
self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
self.pop()
def clear(self):
"clear(): Explicitly release parsing structures"
self.document = None
class ErrorHandler:
2000-09-24 02:21:58 -03:00
def warning(self, exception):
print exception
2000-09-24 02:21:58 -03:00
def error(self, exception):
raise exception
2000-09-24 02:21:58 -03:00
def fatalError(self, exception):
raise exception
class DOMEventStream:
2000-09-24 02:21:58 -03:00
def __init__(self, stream, parser, bufsize):
self.stream = stream
self.parser = parser
self.bufsize = bufsize
if not hasattr(self.parser, 'feed'):
self.getEvent = self._slurp
self.reset()
2000-09-24 02:21:58 -03:00
def reset(self):
self.pulldom = PullDOM()
2000-09-24 18:54:14 -03:00
# This content handler relies on namespace support
self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
2000-09-24 02:21:58 -03:00
self.parser.setContentHandler(self.pulldom)
2000-09-24 02:21:58 -03:00
def __getitem__(self, pos):
rc = self.getEvent()
if rc:
return rc
raise IndexError
def next(self):
rc = self.getEvent()
if rc:
return rc
raise StopIteration
def __iter__(self):
return self
2000-09-24 02:21:58 -03:00
def expandNode(self, node):
event = self.getEvent()
parents = [node]
while event:
2000-09-24 02:21:58 -03:00
token, cur_node = event
if cur_node is node:
return
if token != END_ELEMENT:
parents[-1].appendChild(cur_node)
if token == START_ELEMENT:
parents.append(cur_node)
elif token == END_ELEMENT:
del parents[-1]
2000-09-24 02:21:58 -03:00
event = self.getEvent()
def getEvent(self):
# use IncrementalParser interface, so we get the desired
# pull effect
if not self.pulldom.firstEvent[1]:
2000-09-24 02:21:58 -03:00
self.pulldom.lastEvent = self.pulldom.firstEvent
while not self.pulldom.firstEvent[1]:
buf = self.stream.read(self.bufsize)
if not buf:
self.parser.close()
return None
2000-09-24 02:21:58 -03:00
self.parser.feed(buf)
rc = self.pulldom.firstEvent[1][0]
self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
return rc
def _slurp(self):
""" Fallback replacement for getEvent() using the
standard SAX2 interface, which means we slurp the
SAX events into memory (no performance gain, but
we are compatible to all SAX parsers).
"""
self.parser.parse(self.stream)
self.getEvent = self._emit
return self._emit()
def _emit(self):
""" Fallback replacement for getEvent() that emits
the events that _slurp() read previously.
"""
rc = self.pulldom.firstEvent[1][0]
self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
return rc
def clear(self):
"""clear(): Explicitly release parsing objects"""
self.pulldom.clear()
del self.pulldom
self.parser = None
self.stream = None
class SAX2DOM(PullDOM):
def startElementNS(self, name, tagName , attrs):
PullDOM.startElementNS(self, name, tagName, attrs)
curNode = self.elementStack[-1]
parentNode = self.elementStack[-2]
parentNode.appendChild(curNode)
def startElement(self, name, attrs):
PullDOM.startElement(self, name, attrs)
curNode = self.elementStack[-1]
parentNode = self.elementStack[-2]
parentNode.appendChild(curNode)
def processingInstruction(self, target, data):
PullDOM.processingInstruction(self, target, data)
node = self.lastEvent[0][1]
parentNode = self.elementStack[-1]
parentNode.appendChild(node)
def ignorableWhitespace(self, chars):
PullDOM.ignorableWhitespace(self, chars)
node = self.lastEvent[0][1]
parentNode = self.elementStack[-1]
parentNode.appendChild(node)
def characters(self, chars):
PullDOM.characters(self, chars)
node = self.lastEvent[0][1]
parentNode = self.elementStack[-1]
parentNode.appendChild(node)
2000-09-24 02:21:58 -03:00
default_bufsize = (2 ** 14) - 20
def parse(stream_or_string, parser=None, bufsize=None):
if bufsize is None:
bufsize = default_bufsize
if type(stream_or_string) in _StringTypes:
2000-09-24 02:21:58 -03:00
stream = open(stream_or_string)
else:
2000-09-24 02:21:58 -03:00
stream = stream_or_string
if not parser:
2000-09-24 18:54:14 -03:00
parser = xml.sax.make_parser()
2000-09-24 02:21:58 -03:00
return DOMEventStream(stream, parser, bufsize)
2000-09-24 02:21:58 -03:00
def parseString(string, parser=None):
try:
2000-09-24 02:21:58 -03:00
from cStringIO import StringIO
except ImportError:
2000-09-24 02:21:58 -03:00
from StringIO import StringIO
2000-09-24 02:21:58 -03:00
bufsize = len(string)
buf = StringIO(string)
2000-09-24 18:54:14 -03:00
if not parser:
parser = xml.sax.make_parser()
2000-09-24 02:21:58 -03:00
return DOMEventStream(buf, parser, bufsize)