bpo-35018: Sax parser should provide user access to lexical handlers (GH-20958)
Co-Authored-By: Jonathan Gossage <jgossage@gmail.com>
This commit is contained in:
parent
67acf74c4e
commit
e28b8c9387
|
@ -11,12 +11,12 @@
|
|||
|
||||
--------------
|
||||
|
||||
The SAX API defines four kinds of handlers: content handlers, DTD handlers,
|
||||
error handlers, and entity resolvers. Applications normally only need to
|
||||
implement those interfaces whose events they are interested in; they can
|
||||
implement the interfaces in a single object or in multiple objects. Handler
|
||||
implementations should inherit from the base classes provided in the module
|
||||
:mod:`xml.sax.handler`, so that all methods get default implementations.
|
||||
The SAX API defines five kinds of handlers: content handlers, DTD handlers,
|
||||
error handlers, entity resolvers and lexical handlers. Applications normally
|
||||
only need to implement those interfaces whose events they are interested in;
|
||||
they can implement the interfaces in a single object or in multiple objects.
|
||||
Handler implementations should inherit from the base classes provided in the
|
||||
module :mod:`xml.sax.handler`, so that all methods get default implementations.
|
||||
|
||||
|
||||
.. class:: ContentHandler
|
||||
|
@ -47,6 +47,12 @@ implementations should inherit from the base classes provided in the module
|
|||
application. The methods of this object control whether errors are immediately
|
||||
converted to exceptions or are handled in some other way.
|
||||
|
||||
|
||||
.. class:: LexicalHandler
|
||||
|
||||
Interface used by the parser to represent low-frequency events which may not
|
||||
be of interest to many applications.
|
||||
|
||||
In addition to these classes, :mod:`xml.sax.handler` provides symbolic constants
|
||||
for the feature and property names.
|
||||
|
||||
|
@ -114,7 +120,7 @@ for the feature and property names.
|
|||
.. data:: property_lexical_handler
|
||||
|
||||
| value: ``"http://xml.org/sax/properties/lexical-handler"``
|
||||
| data type: xml.sax.sax2lib.LexicalHandler (not supported in Python 2)
|
||||
| data type: xml.sax.handler.LexicalHandler (not supported in Python 2)
|
||||
| description: An optional extension handler for lexical events like
|
||||
comments.
|
||||
| access: read/write
|
||||
|
@ -413,3 +419,45 @@ the passed-in exception object.
|
|||
information will continue to be passed to the application. Raising an exception
|
||||
in this method will cause parsing to end.
|
||||
|
||||
|
||||
.. _lexical-handler-objects:
|
||||
|
||||
LexicalHandler Objects
|
||||
----------------------
|
||||
Optional SAX2 handler for lexical events.
|
||||
|
||||
This handler is used to obtain lexical information about an XML
|
||||
document. Lexical information includes information describing the
|
||||
document encoding used and XML comments embedded in the document, as
|
||||
well as section boundaries for the DTD and for any CDATA sections.
|
||||
The lexical handlers are used in the same manner as content handlers.
|
||||
|
||||
Set the LexicalHandler of an XMLReader by using the setProperty method
|
||||
with the property identifier
|
||||
``'http://xml.org/sax/properties/lexical-handler'``.
|
||||
|
||||
|
||||
.. method:: LexicalHandler.comment(content)
|
||||
|
||||
Reports a comment anywhere in the document (including the DTD and
|
||||
outside the document element).
|
||||
|
||||
.. method:: LexicalHandler.startDTD(name, public_id, system_id)
|
||||
|
||||
Reports the start of the DTD declarations if the document has an
|
||||
associated DTD.
|
||||
|
||||
.. method:: LexicalHandler.endDTD()
|
||||
|
||||
Reports the end of the DTD declarations.
|
||||
|
||||
.. method:: LexicalHandler.startCDATA()
|
||||
|
||||
Reports the start of a CDATA marked section.
|
||||
|
||||
The contents of the CDATA marked section will be reported through
|
||||
the characters handler.
|
||||
|
||||
.. method:: LexicalHandler.endCDATA()
|
||||
|
||||
Reports the end of a CDATA marked section.
|
||||
|
|
|
@ -139,6 +139,13 @@ Add :data:`sys.orig_argv` attribute: the list of the original command line
|
|||
arguments passed to the Python executable.
|
||||
(Contributed by Victor Stinner in :issue:`23427`.)
|
||||
|
||||
xml
|
||||
---
|
||||
|
||||
Add a :class:`~xml.sax.handler.LexicalHandler` class to the
|
||||
:mod:`xml.sax.handler` module.
|
||||
(Contributed by Jonathan Gossage and Zackery Spytz in :issue:`35018`.)
|
||||
|
||||
|
||||
Optimizations
|
||||
=============
|
||||
|
|
|
@ -13,7 +13,8 @@ except SAXReaderNotAvailable:
|
|||
from xml.sax.saxutils import XMLGenerator, escape, unescape, quoteattr, \
|
||||
XMLFilterBase, prepare_input_source
|
||||
from xml.sax.expatreader import create_parser
|
||||
from xml.sax.handler import feature_namespaces, feature_external_ges
|
||||
from xml.sax.handler import (feature_namespaces, feature_external_ges,
|
||||
LexicalHandler)
|
||||
from xml.sax.xmlreader import InputSource, AttributesImpl, AttributesNSImpl
|
||||
from io import BytesIO, StringIO
|
||||
import codecs
|
||||
|
@ -1356,6 +1357,155 @@ class XmlReaderTest(XmlTestBase):
|
|||
self.assertEqual(attrs.getQNameByName((ns_uri, "attr")), "ns:attr")
|
||||
|
||||
|
||||
class LexicalHandlerTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.parser = None
|
||||
|
||||
self.specified_version = '1.0'
|
||||
self.specified_encoding = 'UTF-8'
|
||||
self.specified_doctype = 'wish'
|
||||
self.specified_entity_names = ('nbsp', 'source', 'target')
|
||||
self.specified_comment = ('Comment in a DTD',
|
||||
'Really! You think so?')
|
||||
self.test_data = StringIO()
|
||||
self.test_data.write('<?xml version="{}" encoding="{}"?>\n'.
|
||||
format(self.specified_version,
|
||||
self.specified_encoding))
|
||||
self.test_data.write('<!DOCTYPE {} [\n'.
|
||||
format(self.specified_doctype))
|
||||
self.test_data.write('<!-- {} -->\n'.
|
||||
format(self.specified_comment[0]))
|
||||
self.test_data.write('<!ELEMENT {} (to,from,heading,body,footer)>\n'.
|
||||
format(self.specified_doctype))
|
||||
self.test_data.write('<!ELEMENT to (#PCDATA)>\n')
|
||||
self.test_data.write('<!ELEMENT from (#PCDATA)>\n')
|
||||
self.test_data.write('<!ELEMENT heading (#PCDATA)>\n')
|
||||
self.test_data.write('<!ELEMENT body (#PCDATA)>\n')
|
||||
self.test_data.write('<!ELEMENT footer (#PCDATA)>\n')
|
||||
self.test_data.write('<!ENTITY {} " ">\n'.
|
||||
format(self.specified_entity_names[0]))
|
||||
self.test_data.write('<!ENTITY {} "Written by: Alexander.">\n'.
|
||||
format(self.specified_entity_names[1]))
|
||||
self.test_data.write('<!ENTITY {} "Hope it gets to: Aristotle.">\n'.
|
||||
format(self.specified_entity_names[2]))
|
||||
self.test_data.write(']>\n')
|
||||
self.test_data.write('<{}>'.format(self.specified_doctype))
|
||||
self.test_data.write('<to>Aristotle</to>\n')
|
||||
self.test_data.write('<from>Alexander</from>\n')
|
||||
self.test_data.write('<heading>Supplication</heading>\n')
|
||||
self.test_data.write('<body>Teach me patience!</body>\n')
|
||||
self.test_data.write('<footer>&{};&{};&{};</footer>\n'.
|
||||
format(self.specified_entity_names[1],
|
||||
self.specified_entity_names[0],
|
||||
self.specified_entity_names[2]))
|
||||
self.test_data.write('<!-- {} -->\n'.format(self.specified_comment[1]))
|
||||
self.test_data.write('</{}>\n'.format(self.specified_doctype))
|
||||
self.test_data.seek(0)
|
||||
|
||||
# Data received from handlers - to be validated
|
||||
self.version = None
|
||||
self.encoding = None
|
||||
self.standalone = None
|
||||
self.doctype = None
|
||||
self.publicID = None
|
||||
self.systemID = None
|
||||
self.end_of_dtd = False
|
||||
self.comments = []
|
||||
|
||||
def test_handlers(self):
|
||||
class TestLexicalHandler(LexicalHandler):
|
||||
def __init__(self, test_harness, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.test_harness = test_harness
|
||||
|
||||
def startDTD(self, doctype, publicID, systemID):
|
||||
self.test_harness.doctype = doctype
|
||||
self.test_harness.publicID = publicID
|
||||
self.test_harness.systemID = systemID
|
||||
|
||||
def endDTD(self):
|
||||
self.test_harness.end_of_dtd = True
|
||||
|
||||
def comment(self, text):
|
||||
self.test_harness.comments.append(text)
|
||||
|
||||
self.parser = create_parser()
|
||||
self.parser.setContentHandler(ContentHandler())
|
||||
self.parser.setProperty(
|
||||
'http://xml.org/sax/properties/lexical-handler',
|
||||
TestLexicalHandler(self))
|
||||
source = InputSource()
|
||||
source.setCharacterStream(self.test_data)
|
||||
self.parser.parse(source)
|
||||
self.assertEqual(self.doctype, self.specified_doctype)
|
||||
self.assertIsNone(self.publicID)
|
||||
self.assertIsNone(self.systemID)
|
||||
self.assertTrue(self.end_of_dtd)
|
||||
self.assertEqual(len(self.comments),
|
||||
len(self.specified_comment))
|
||||
self.assertEqual(f' {self.specified_comment[0]} ', self.comments[0])
|
||||
|
||||
|
||||
class CDATAHandlerTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.parser = None
|
||||
self.specified_chars = []
|
||||
self.specified_chars.append(('Parseable character data', False))
|
||||
self.specified_chars.append(('<> &% - assorted other XML junk.', True))
|
||||
self.char_index = 0 # Used to index specified results within handlers
|
||||
self.test_data = StringIO()
|
||||
self.test_data.write('<root_doc>\n')
|
||||
self.test_data.write('<some_pcdata>\n')
|
||||
self.test_data.write(f'{self.specified_chars[0][0]}\n')
|
||||
self.test_data.write('</some_pcdata>\n')
|
||||
self.test_data.write('<some_cdata>\n')
|
||||
self.test_data.write(f'<![CDATA[{self.specified_chars[1][0]}]]>\n')
|
||||
self.test_data.write('</some_cdata>\n')
|
||||
self.test_data.write('</root_doc>\n')
|
||||
self.test_data.seek(0)
|
||||
|
||||
# Data received from handlers - to be validated
|
||||
self.chardata = []
|
||||
self.in_cdata = False
|
||||
|
||||
def test_handlers(self):
|
||||
class TestLexicalHandler(LexicalHandler):
|
||||
def __init__(self, test_harness, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.test_harness = test_harness
|
||||
|
||||
def startCDATA(self):
|
||||
self.test_harness.in_cdata = True
|
||||
|
||||
def endCDATA(self):
|
||||
self.test_harness.in_cdata = False
|
||||
|
||||
class TestCharHandler(ContentHandler):
|
||||
def __init__(self, test_harness, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.test_harness = test_harness
|
||||
|
||||
def characters(self, content):
|
||||
if content != '\n':
|
||||
h = self.test_harness
|
||||
t = h.specified_chars[h.char_index]
|
||||
h.assertEqual(t[0], content)
|
||||
h.assertEqual(t[1], h.in_cdata)
|
||||
h.char_index += 1
|
||||
|
||||
self.parser = create_parser()
|
||||
self.parser.setContentHandler(TestCharHandler(self))
|
||||
self.parser.setProperty(
|
||||
'http://xml.org/sax/properties/lexical-handler',
|
||||
TestLexicalHandler(self))
|
||||
source = InputSource()
|
||||
source.setCharacterStream(self.test_data)
|
||||
self.parser.parse(source)
|
||||
|
||||
self.assertFalse(self.in_cdata)
|
||||
self.assertEqual(self.char_index, 2)
|
||||
|
||||
|
||||
def test_main():
|
||||
run_unittest(MakeParserTest,
|
||||
ParseTest,
|
||||
|
@ -1368,7 +1518,10 @@ def test_main():
|
|||
StreamReaderWriterXmlgenTest,
|
||||
ExpatReaderTest,
|
||||
ErrorReportingTest,
|
||||
XmlReaderTest)
|
||||
XmlReaderTest,
|
||||
LexicalHandlerTest,
|
||||
CDATAHandlerTest)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
test_main()
|
||||
|
|
|
@ -340,3 +340,48 @@ all_properties = [property_lexical_handler,
|
|||
property_xml_string,
|
||||
property_encoding,
|
||||
property_interning_dict]
|
||||
|
||||
|
||||
class LexicalHandler:
    """Optional SAX2 handler for lexical events.

    This handler is used to obtain lexical information about an XML
    document, that is, information about how the document was encoded
    (as opposed to what it contains, which is reported to the
    ContentHandler), such as comments and CDATA marked section
    boundaries.

    To set the LexicalHandler of an XMLReader, use the setProperty
    method with the property identifier
    'http://xml.org/sax/properties/lexical-handler'.

    Every method defined here does nothing; subclasses override the
    events they are interested in.
    """

    def comment(self, content):
        """Report a comment anywhere in the document.

        This includes comments in the DTD and outside the document
        element.

        content is a string that holds the contents of the comment.
        """

    def startDTD(self, name, public_id, system_id):
        """Report the start of the DTD declarations.

        The event is only reported if the document has an associated DTD.

        A startEntity event will be reported before declaration events
        from the external DTD subset are reported, and this can be
        used to infer from which subset DTD declarations derive.

        name is the name of the document element type, public_id the
        public identifier of the DTD (or None if none was supplied)
        and system_id the system identifier of the external subset (or
        None if none was supplied).
        """

    def endDTD(self):
        """Report the end of the DTD declarations."""

    def startCDATA(self):
        """Report the beginning of a CDATA marked section.

        The contents of the CDATA marked section will be reported
        through the characters event.
        """

    def endCDATA(self):
        """Report the end of a CDATA marked section."""
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
Add the :class:`xml.sax.handler.LexicalHandler` class that is present in
|
||||
other SAX XML implementations.
|
Loading…
Reference in New Issue