From dde3eebdaa8d2c51971ca704d53af7cbcda8bb34 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Wed, 1 May 2019 21:49:58 +0200 Subject: [PATCH] bpo-36676: Namespace prefix aware parsing support for the ET.XMLParser target (GH-12885) * bpo-36676: Implement namespace prefix aware parsing support for the XMLParser target in ElementTree. --- Doc/library/xml.etree.elementtree.rst | 22 ++- Lib/test/test_xml_etree.py | 93 +++++++++++- Lib/xml/etree/ElementTree.py | 30 +++- .../2019-04-20-13-10-34.bpo-36676.XF4Egb.rst | 3 + Modules/_elementtree.c | 140 +++++++++++++++--- 5 files changed, 258 insertions(+), 30 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2019-04-20-13-10-34.bpo-36676.XF4Egb.rst diff --git a/Doc/library/xml.etree.elementtree.rst b/Doc/library/xml.etree.elementtree.rst index c9e04c2fc8f..66090af00fa 100644 --- a/Doc/library/xml.etree.elementtree.rst +++ b/Doc/library/xml.etree.elementtree.rst @@ -1086,7 +1086,7 @@ TreeBuilder Objects In addition, a custom :class:`TreeBuilder` object can provide the - following method: + following methods: .. method:: doctype(name, pubid, system) @@ -1096,6 +1096,23 @@ TreeBuilder Objects .. versionadded:: 3.2 + .. method:: start_ns(prefix, uri) + + Is called whenever the parser encounters a new namespace declaration, + before the ``start()`` callback for the opening element that defines it. + *prefix* is ``''`` for the default namespace and the declared + namespace prefix name otherwise. *uri* is the namespace URI. + + .. versionadded:: 3.8 + + .. method:: end_ns(prefix) + + Is called after the ``end()`` callback of an element that declared + a namespace prefix mapping, with the name of the *prefix* that went + out of scope. + + .. versionadded:: 3.8 + .. _elementtree-xmlparser-objects: @@ -1131,7 +1148,8 @@ XMLParser Objects :meth:`XMLParser.feed` calls *target*\'s ``start(tag, attrs_dict)`` method for each opening tag, its ``end(tag)`` method for each closing tag, and data - is processed by method ``data(data)``. :meth:`XMLParser.close` calls + is processed by method ``data(data)``. For further supported callback + methods, see the :class:`TreeBuilder` class. :meth:`XMLParser.close` calls *target*\'s method ``close()``. :class:`XMLParser` can be used not only for building a tree structure. This is an example of counting the maximum depth of an XML file:: diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 8a228b8ccd6..0abc42a173d 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -14,12 +14,13 @@ import locale import operator import pickle import sys +import textwrap import types import unittest import warnings import weakref -from itertools import product +from itertools import product, islice from test import support from test.support import TESTFN, findfile, import_fresh_module, gc_collect, swap_attr @@ -694,12 +695,17 @@ class ElementTreeTest(unittest.TestCase): self.append(("pi", target, data)) def comment(self, data): self.append(("comment", data)) + def start_ns(self, prefix, uri): + self.append(("start-ns", prefix, uri)) + def end_ns(self, prefix): + self.append(("end-ns", prefix)) builder = Builder() parser = ET.XMLParser(target=builder) parser.feed(data) self.assertEqual(builder, [ ('pi', 'pi', 'data'), ('comment', ' comment '), + ('start-ns', '', 'namespace'), ('start', '{namespace}root'), ('start', '{namespace}element'), ('end', '{namespace}element'), @@ -708,8 +714,30 @@ class ElementTreeTest(unittest.TestCase): ('start', '{namespace}empty-element'), ('end', '{namespace}empty-element'), ('end', '{namespace}root'), + ('end-ns', ''), ]) + def test_custom_builder_only_end_ns(self): + class Builder(list): + def end_ns(self, prefix): + self.append(("end-ns", prefix)) + + builder = Builder() + parser = ET.XMLParser(target=builder) + parser.feed(textwrap.dedent("""\ + + + + text + texttail + + + """)) + self.assertEqual(builder, [ + ('end-ns', 'a'), + ('end-ns', 'p'), + ('end-ns', ''), + ]) # Element.getchildren() and ElementTree.getiterator() are deprecated. @checkwarnings(("This method will be removed in future versions. " @@ -1194,14 +1222,19 @@ class XMLPullParserTest(unittest.TestCase): for i in range(0, len(data), chunk_size): parser.feed(data[i:i+chunk_size]) - def assert_events(self, parser, expected): + def assert_events(self, parser, expected, max_events=None): self.assertEqual( [(event, (elem.tag, elem.text)) - for event, elem in parser.read_events()], + for event, elem in islice(parser.read_events(), max_events)], expected) - def assert_event_tags(self, parser, expected): - events = parser.read_events() + def assert_event_tuples(self, parser, expected, max_events=None): + self.assertEqual( + list(islice(parser.read_events(), max_events)), + expected) + + def assert_event_tags(self, parser, expected, max_events=None): + events = islice(parser.read_events(), max_events) self.assertEqual([(action, elem.tag) for action, elem in events], expected) @@ -1276,6 +1309,56 @@ class XMLPullParserTest(unittest.TestCase): self.assertEqual(list(parser.read_events()), [('end-ns', None)]) self.assertIsNone(parser.close()) + def test_ns_events_start(self): + parser = ET.XMLPullParser(events=('start-ns', 'start', 'end')) + self._feed(parser, "\n") + self.assert_event_tuples(parser, [ + ('start-ns', ('', 'abc')), + ('start-ns', ('p', 'xyz')), + ], max_events=2) + self.assert_event_tags(parser, [ + ('start', '{abc}tag'), + ], max_events=1) + + self._feed(parser, "\n") + self.assert_event_tags(parser, [ + ('start', '{abc}child'), + ('end', '{abc}child'), + ]) + + self._feed(parser, "\n") + parser.close() + self.assert_event_tags(parser, [ + ('end', '{abc}tag'), + ]) + + def test_ns_events_start_end(self): + parser = ET.XMLPullParser(events=('start-ns', 'start', 'end', 'end-ns')) + self._feed(parser, "\n") + self.assert_event_tuples(parser, [ + ('start-ns', ('', 'abc')), + ('start-ns', ('p', 'xyz')), + ], max_events=2) + self.assert_event_tags(parser, [ + ('start', '{abc}tag'), + ], max_events=1) + + self._feed(parser, "\n") + self.assert_event_tags(parser, [ + ('start', '{abc}child'), + ('end', '{abc}child'), + ]) + + self._feed(parser, "\n") + parser.close() + self.assert_event_tags(parser, [ + ('end', '{abc}tag'), + ], max_events=1) + self.assert_event_tuples(parser, [ + ('end-ns', None), + ('end-ns', None), + ]) + def test_events(self): parser = ET.XMLPullParser(events=()) self._feed(parser, "\n") diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index c6400480f5b..5b26ac72fd1 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -1518,6 +1518,10 @@ class XMLParser: parser.StartElementHandler = self._start if hasattr(target, 'end'): parser.EndElementHandler = self._end + if hasattr(target, 'start_ns'): + parser.StartNamespaceDeclHandler = self._start_ns + if hasattr(target, 'end_ns'): + parser.EndNamespaceDeclHandler = self._end_ns if hasattr(target, 'data'): parser.CharacterDataHandler = target.data # miscellaneous callbacks @@ -1559,12 +1563,24 @@ class XMLParser: append((event, end(tag))) parser.EndElementHandler = handler elif event_name == "start-ns": - def handler(prefix, uri, event=event_name, append=append): - append((event, (prefix or "", uri or ""))) + # TreeBuilder does not implement .start_ns() + if hasattr(self.target, "start_ns"): + def handler(prefix, uri, event=event_name, append=append, + start_ns=self._start_ns): + append((event, start_ns(prefix, uri))) + else: + def handler(prefix, uri, event=event_name, append=append): + append((event, (prefix or '', uri or ''))) parser.StartNamespaceDeclHandler = handler elif event_name == "end-ns": - def handler(prefix, event=event_name, append=append): - append((event, None)) + # TreeBuilder does not implement .end_ns() + if hasattr(self.target, "end_ns"): + def handler(prefix, event=event_name, append=append, + end_ns=self._end_ns): + append((event, end_ns(prefix))) + else: + def handler(prefix, event=event_name, append=append): + append((event, None)) parser.EndNamespaceDeclHandler = handler elif event_name == 'comment': def handler(text, event=event_name, append=append, self=self): @@ -1595,6 +1611,12 @@ class XMLParser: self._names[key] = name return name + def _start_ns(self, prefix, uri): + return self.target.start_ns(prefix or '', uri or '') + + def _end_ns(self, prefix): + return self.target.end_ns(prefix or '') + def _start(self, tag, attr_list): # Handler for expat's StartElementHandler. Since ordered_attributes # is set, the attributes are reported as a list of alternating diff --git a/Misc/NEWS.d/next/Library/2019-04-20-13-10-34.bpo-36676.XF4Egb.rst b/Misc/NEWS.d/next/Library/2019-04-20-13-10-34.bpo-36676.XF4Egb.rst new file mode 100644 index 00000000000..e0bede81eec --- /dev/null +++ b/Misc/NEWS.d/next/Library/2019-04-20-13-10-34.bpo-36676.XF4Egb.rst @@ -0,0 +1,3 @@ +The XMLParser() in xml.etree.ElementTree provides namespace prefix context to the +parser target if it defines the callback methods "start_ns()" and/or "end_ns()". +Patch by Stefan Behnel. diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c index 5481c616787..b69e3a45fe3 100644 --- a/Modules/_elementtree.c +++ b/Modules/_elementtree.c @@ -2911,6 +2911,39 @@ treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text) return NULL; } +LOCAL(PyObject*) +treebuilder_handle_start_ns(TreeBuilderObject* self, PyObject* prefix, PyObject* uri) +{ + PyObject* parcel; + + if (self->events_append && self->start_ns_event_obj) { + parcel = PyTuple_Pack(2, prefix, uri); + if (!parcel) { + return NULL; + } + + if (treebuilder_append_event(self, self->start_ns_event_obj, parcel) < 0) { + Py_DECREF(parcel); + return NULL; + } + Py_DECREF(parcel); + } + + Py_RETURN_NONE; +} + +LOCAL(PyObject*) +treebuilder_handle_end_ns(TreeBuilderObject* self, PyObject* prefix) +{ + if (self->events_append && self->end_ns_event_obj) { + if (treebuilder_append_event(self, self->end_ns_event_obj, prefix) < 0) { + return NULL; + } + } + + Py_RETURN_NONE; +} + /* -------------------------------------------------------------------- */ /* methods (in alphabetical order) */ @@ -3046,6 +3079,8 @@ typedef struct { PyObject *names; + PyObject *handle_start_ns; + PyObject *handle_end_ns; PyObject *handle_start; PyObject *handle_data; PyObject *handle_end; @@ -3357,42 +3392,89 @@ expat_end_handler(XMLParserObject* self, const XML_Char* tag_in) } static void -expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix, - const XML_Char *uri) +expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix_in, + const XML_Char *uri_in) { - TreeBuilderObject *target = (TreeBuilderObject*) self->target; - PyObject *parcel; + PyObject* res = NULL; + PyObject* uri; + PyObject* prefix; + PyObject* stack[2]; if (PyErr_Occurred()) return; - if (!target->events_append || !target->start_ns_event_obj) - return; + if (!uri_in) + uri_in = ""; + if (!prefix_in) + prefix_in = ""; - if (!uri) - uri = ""; - if (!prefix) - prefix = ""; + if (TreeBuilder_CheckExact(self->target)) { + /* shortcut - TreeBuilder does not actually implement .start_ns() */ + TreeBuilderObject *target = (TreeBuilderObject*) self->target; - parcel = Py_BuildValue("ss", prefix, uri); - if (!parcel) - return; - treebuilder_append_event(target, target->start_ns_event_obj, parcel); - Py_DECREF(parcel); + if (target->events_append && target->start_ns_event_obj) { + prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict"); + if (!prefix) + return; + uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict"); + if (!uri) { + Py_DECREF(prefix); + return; + } + + res = treebuilder_handle_start_ns(target, prefix, uri); + Py_DECREF(uri); + Py_DECREF(prefix); + } + } else if (self->handle_start_ns) { + prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict"); + if (!prefix) + return; + uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict"); + if (!uri) { + Py_DECREF(prefix); + return; + } + + stack[0] = prefix; + stack[1] = uri; + res = _PyObject_FastCall(self->handle_start_ns, stack, 2); + Py_DECREF(uri); + Py_DECREF(prefix); + } + + Py_XDECREF(res); } static void expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in) { - TreeBuilderObject *target = (TreeBuilderObject*) self->target; + PyObject *res = NULL; + PyObject* prefix; if (PyErr_Occurred()) return; - if (!target->events_append) - return; + if (!prefix_in) + prefix_in = ""; - treebuilder_append_event(target, target->end_ns_event_obj, Py_None); + if (TreeBuilder_CheckExact(self->target)) { + /* shortcut - TreeBuilder does not actually implement .end_ns() */ + TreeBuilderObject *target = (TreeBuilderObject*) self->target; + + if (target->events_append && target->end_ns_event_obj) { + res = treebuilder_handle_end_ns(target, Py_None); + } + } else if (self->handle_end_ns) { + prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict"); + if (!prefix) + return; + + res = _PyObject_FastCall(self->handle_end_ns, &prefix, 1); + Py_DECREF(prefix); + } + + Py_XDECREF(res); } static void @@ -3546,6 +3628,7 @@ xmlparser_new(PyTypeObject *type, PyObject *args, PyObject *kwds) if (self) { self->parser = NULL; self->target = self->entity = self->names = NULL; + self->handle_start_ns = self->handle_end_ns = NULL; self->handle_start = self->handle_data = self->handle_end = NULL; self->handle_comment = self->handle_pi = self->handle_close = NULL; self->handle_doctype = NULL; @@ -3614,6 +3697,14 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target, } self->target = target; + self->handle_start_ns = PyObject_GetAttrString(target, "start_ns"); + if (ignore_attribute_error(self->handle_start_ns)) { + return -1; + } + self->handle_end_ns = PyObject_GetAttrString(target, "end_ns"); + if (ignore_attribute_error(self->handle_end_ns)) { + return -1; + } self->handle_start = PyObject_GetAttrString(target, "start"); if (ignore_attribute_error(self->handle_start)) { return -1; @@ -3645,6 +3736,12 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target, /* configure parser */ EXPAT(SetUserData)(self->parser, self); + if (self->handle_start_ns || self->handle_end_ns) + EXPAT(SetNamespaceDeclHandler)( + self->parser, + (XML_StartNamespaceDeclHandler) expat_start_ns_handler, + (XML_EndNamespaceDeclHandler) expat_end_ns_handler + ); EXPAT(SetElementHandler)( self->parser, (XML_StartElementHandler) expat_start_handler, @@ -3689,6 +3786,9 @@ xmlparser_gc_traverse(XMLParserObject *self, visitproc visit, void *arg) Py_VISIT(self->handle_end); Py_VISIT(self->handle_data); Py_VISIT(self->handle_start); + Py_VISIT(self->handle_start_ns); + Py_VISIT(self->handle_end_ns); + Py_VISIT(self->handle_doctype); Py_VISIT(self->target); Py_VISIT(self->entity); @@ -3712,6 +3812,8 @@ xmlparser_gc_clear(XMLParserObject *self) Py_CLEAR(self->handle_end); Py_CLEAR(self->handle_data); Py_CLEAR(self->handle_start); + Py_CLEAR(self->handle_start_ns); + Py_CLEAR(self->handle_end_ns); Py_CLEAR(self->handle_doctype); Py_CLEAR(self->target);