Issue #2175: SAX parsers now support the character stream of an InputSource object.

This commit is contained in:
Serhiy Storchaka 2015-04-02 21:00:13 +03:00
parent 278ba2690c
commit 61de087f0f
7 changed files with 64 additions and 12 deletions

View File

@ -100,8 +100,10 @@ The :class:`XMLReader` interface supports the following methods:
system identifier (a string identifying the input source -- typically a file system identifier (a string identifying the input source -- typically a file
name or an URL), a file-like object, or an :class:`InputSource` object. When name or an URL), a file-like object, or an :class:`InputSource` object. When
:meth:`parse` returns, the input is completely processed, and the parser object :meth:`parse` returns, the input is completely processed, and the parser object
can be discarded or reset. As a limitation, the current implementation only can be discarded or reset.
accepts byte streams; processing of character streams is for further study.
.. versionchanged:: 3.5
Added support for character streams.
.. method:: XMLReader.getContentHandler() .. method:: XMLReader.getContentHandler()
@ -288,8 +290,7 @@ InputSource Objects
.. method:: InputSource.setByteStream(bytefile) .. method:: InputSource.setByteStream(bytefile)
Set the byte stream (a Python file-like object which does not perform Set the byte stream (a :term:`binary file`) for this input source.
byte-to-character conversion) for this input source.
The SAX parser will ignore this if there is also a character stream specified, The SAX parser will ignore this if there is also a character stream specified,
but it will use a byte stream in preference to opening a URI connection itself. but it will use a byte stream in preference to opening a URI connection itself.
@ -308,8 +309,7 @@ InputSource Objects
.. method:: InputSource.setCharacterStream(charfile) .. method:: InputSource.setCharacterStream(charfile)
Set the character stream for this input source. (The stream must be a Python 1.6 Set the character stream (a :term:`text file`) for this input source.
Unicode-wrapped file-like that performs conversion to strings.)
If there is a character stream specified, the SAX parser will ignore any byte If there is a character stream specified, the SAX parser will ignore any byte
stream and will not attempt to open a URI connection to the system identifier. stream and will not attempt to open a URI connection to the system identifier.

View File

@ -499,6 +499,13 @@ xmlrpc
* :class:`xmlrpc.client.ServerProxy` is now a :term:`context manager`. * :class:`xmlrpc.client.ServerProxy` is now a :term:`context manager`.
(Contributed by Claudiu Popa in :issue:`20627`.) (Contributed by Claudiu Popa in :issue:`20627`.)
xml.sax
-------
* SAX parsers now support the character stream of an
:class:`~xml.sax.xmlreader.InputSource` object.
(Contributed by Serhiy Storchaka in :issue:`2175`.)
faulthandler faulthandler
------------ ------------

View File

@ -185,12 +185,24 @@ class PrepareInputSourceTest(unittest.TestCase):
def make_byte_stream(self): def make_byte_stream(self):
return BytesIO(b"This is a byte stream.") return BytesIO(b"This is a byte stream.")
def make_character_stream(self):
return StringIO("This is a character stream.")
def checkContent(self, stream, content): def checkContent(self, stream, content):
self.assertIsNotNone(stream) self.assertIsNotNone(stream)
self.assertEqual(stream.read(), content) self.assertEqual(stream.read(), content)
stream.close() stream.close()
def test_character_stream(self):
# If the source is an InputSource with a character stream, use it.
src = InputSource(self.file)
src.setCharacterStream(self.make_character_stream())
prep = prepare_input_source(src)
self.assertIsNone(prep.getByteStream())
self.checkContent(prep.getCharacterStream(),
"This is a character stream.")
def test_byte_stream(self): def test_byte_stream(self):
# If the source is an InputSource that does not have a character # If the source is an InputSource that does not have a character
# stream but does have a byte stream, use the byte stream. # stream but does have a byte stream, use the byte stream.
@ -225,6 +237,14 @@ class PrepareInputSourceTest(unittest.TestCase):
self.checkContent(prep.getByteStream(), self.checkContent(prep.getByteStream(),
b"This is a byte stream.") b"This is a byte stream.")
def test_text_file(self):
# If the source is a text file-like object, use it as a character
# stream.
prep = prepare_input_source(self.make_character_stream())
self.assertIsNone(prep.getByteStream())
self.checkContent(prep.getCharacterStream(),
"This is a character stream.")
# ===== XMLGenerator # ===== XMLGenerator
@ -904,6 +924,19 @@ class ExpatReaderTest(XmlTestBase):
self.assertEqual(result.getvalue(), xml_test_out) self.assertEqual(result.getvalue(), xml_test_out)
def test_expat_inpsource_character_stream(self):
parser = create_parser()
result = BytesIO()
xmlgen = XMLGenerator(result)
parser.setContentHandler(xmlgen)
inpsrc = InputSource()
with open(TEST_XMLFILE, 'rt', encoding='iso-8859-1') as f:
inpsrc.setCharacterStream(f)
parser.parse(inpsrc)
self.assertEqual(result.getvalue(), xml_test_out)
# ===== IncrementalParser support # ===== IncrementalParser support
def test_expat_incremental(self): def test_expat_incremental(self):

View File

@ -219,9 +219,14 @@ class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):
self._parsing = 0 self._parsing = 0
# break cycle created by expat handlers pointing to our methods # break cycle created by expat handlers pointing to our methods
self._parser = None self._parser = None
bs = self._source.getByteStream() try:
if bs is not None: file = self._source.getCharacterStream()
bs.close() if file is not None:
file.close()
finally:
file = self._source.getByteStream()
if file is not None:
file.close()
def _reset_cont_handler(self): def _reset_cont_handler(self):
self._parser.ProcessingInstructionHandler = \ self._parser.ProcessingInstructionHandler = \

View File

@ -345,11 +345,14 @@ def prepare_input_source(source, base=""):
elif hasattr(source, "read"): elif hasattr(source, "read"):
f = source f = source
source = xmlreader.InputSource() source = xmlreader.InputSource()
source.setByteStream(f) if isinstance(f.read(0), str):
source.setCharacterStream(f)
else:
source.setByteStream(f)
if hasattr(f, "name") and isinstance(f.name, str): if hasattr(f, "name") and isinstance(f.name, str):
source.setSystemId(f.name) source.setSystemId(f.name)
if source.getByteStream() is None: if source.getCharacterStream() is None and source.getByteStream() is None:
sysid = source.getSystemId() sysid = source.getSystemId()
basehead = os.path.dirname(os.path.normpath(base)) basehead = os.path.dirname(os.path.normpath(base))
sysidfilename = os.path.join(basehead, sysid) sysidfilename = os.path.join(basehead, sysid)

View File

@ -117,7 +117,9 @@ class IncrementalParser(XMLReader):
source = saxutils.prepare_input_source(source) source = saxutils.prepare_input_source(source)
self.prepareParser(source) self.prepareParser(source)
file = source.getByteStream() file = source.getCharacterStream()
if file is None:
file = source.getByteStream()
buffer = file.read(self._bufsize) buffer = file.read(self._bufsize)
while buffer: while buffer:
self.feed(buffer) self.feed(buffer)

View File

@ -16,6 +16,8 @@ Core and Builtins
Library Library
------- -------
- Issue #2175: SAX parsers now support the character stream of an InputSource object.
- Issue #16840: Tkinter now supports 64-bit integers added in Tcl 8.4 and - Issue #16840: Tkinter now supports 64-bit integers added in Tcl 8.4 and
arbitrary precision integers added in Tcl 8.5. arbitrary precision integers added in Tcl 8.5.