bpo-36676: Namespace prefix aware parsing support for the ET.XMLParser target (GH-12885)

* bpo-36676: Implement namespace prefix aware parsing support for the XMLParser target in ElementTree.
This commit is contained in:
Stefan Behnel 2019-05-01 21:49:58 +02:00 committed by GitHub
parent 43851a202c
commit dde3eebdaa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 258 additions and 30 deletions

View File

@ -1086,7 +1086,7 @@ TreeBuilder Objects
In addition, a custom :class:`TreeBuilder` object can provide the In addition, a custom :class:`TreeBuilder` object can provide the
following method: following methods:
.. method:: doctype(name, pubid, system) .. method:: doctype(name, pubid, system)
@ -1096,6 +1096,23 @@ TreeBuilder Objects
.. versionadded:: 3.2 .. versionadded:: 3.2
.. method:: start_ns(prefix, uri)
Is called whenever the parser encounters a new namespace declaration,
before the ``start()`` callback for the opening element that defines it.
*prefix* is ``''`` for the default namespace and the declared
namespace prefix name otherwise. *uri* is the namespace URI.
.. versionadded:: 3.8
.. method:: end_ns(prefix)
Is called after the ``end()`` callback of an element that declared
a namespace prefix mapping, with the name of the *prefix* that went
out of scope.
.. versionadded:: 3.8
.. _elementtree-xmlparser-objects: .. _elementtree-xmlparser-objects:
@ -1131,7 +1148,8 @@ XMLParser Objects
:meth:`XMLParser.feed` calls *target*\'s ``start(tag, attrs_dict)`` method :meth:`XMLParser.feed` calls *target*\'s ``start(tag, attrs_dict)`` method
for each opening tag, its ``end(tag)`` method for each closing tag, and data for each opening tag, its ``end(tag)`` method for each closing tag, and data
is processed by method ``data(data)``. :meth:`XMLParser.close` calls is processed by method ``data(data)``. For further supported callback
methods, see the :class:`TreeBuilder` class. :meth:`XMLParser.close` calls
*target*\'s method ``close()``. :class:`XMLParser` can be used not only for *target*\'s method ``close()``. :class:`XMLParser` can be used not only for
building a tree structure. This is an example of counting the maximum depth building a tree structure. This is an example of counting the maximum depth
of an XML file:: of an XML file::

View File

@ -14,12 +14,13 @@ import locale
import operator import operator
import pickle import pickle
import sys import sys
import textwrap
import types import types
import unittest import unittest
import warnings import warnings
import weakref import weakref
from itertools import product from itertools import product, islice
from test import support from test import support
from test.support import TESTFN, findfile, import_fresh_module, gc_collect, swap_attr from test.support import TESTFN, findfile, import_fresh_module, gc_collect, swap_attr
@ -694,12 +695,17 @@ class ElementTreeTest(unittest.TestCase):
self.append(("pi", target, data)) self.append(("pi", target, data))
def comment(self, data): def comment(self, data):
self.append(("comment", data)) self.append(("comment", data))
def start_ns(self, prefix, uri):
self.append(("start-ns", prefix, uri))
def end_ns(self, prefix):
self.append(("end-ns", prefix))
builder = Builder() builder = Builder()
parser = ET.XMLParser(target=builder) parser = ET.XMLParser(target=builder)
parser.feed(data) parser.feed(data)
self.assertEqual(builder, [ self.assertEqual(builder, [
('pi', 'pi', 'data'), ('pi', 'pi', 'data'),
('comment', ' comment '), ('comment', ' comment '),
('start-ns', '', 'namespace'),
('start', '{namespace}root'), ('start', '{namespace}root'),
('start', '{namespace}element'), ('start', '{namespace}element'),
('end', '{namespace}element'), ('end', '{namespace}element'),
@ -708,8 +714,30 @@ class ElementTreeTest(unittest.TestCase):
('start', '{namespace}empty-element'), ('start', '{namespace}empty-element'),
('end', '{namespace}empty-element'), ('end', '{namespace}empty-element'),
('end', '{namespace}root'), ('end', '{namespace}root'),
('end-ns', ''),
]) ])
def test_custom_builder_only_end_ns(self):
    # Parser target that defines end_ns() and no other callback, so the
    # only entries it can record are ("end-ns", prefix) tuples.
    class Builder(list):
        def end_ns(self, prefix):
            self.append(("end-ns", prefix))

    builder = Builder()
    parser = ET.XMLParser(target=builder)
    # The root element declares the default namespace first, then the
    # prefixes 'p' and 'a'.
    parser.feed(textwrap.dedent("""\
<?pi data?>
<!-- comment -->
<root xmlns='namespace' xmlns:p='pns' xmlns:a='ans'>
<a:element key='value'>text</a:element>
<p:element>text</p:element>tail
<empty-element/>
</root>
"""))
    # The prefixes are expected back in the reverse of their declaration
    # order, with '' standing for the default namespace.
    self.assertEqual(builder, [
        ('end-ns', 'a'),
        ('end-ns', 'p'),
        ('end-ns', ''),
    ])
# Element.getchildren() and ElementTree.getiterator() are deprecated. # Element.getchildren() and ElementTree.getiterator() are deprecated.
@checkwarnings(("This method will be removed in future versions. " @checkwarnings(("This method will be removed in future versions. "
@ -1194,14 +1222,19 @@ class XMLPullParserTest(unittest.TestCase):
for i in range(0, len(data), chunk_size): for i in range(0, len(data), chunk_size):
parser.feed(data[i:i+chunk_size]) parser.feed(data[i:i+chunk_size])
def assert_events(self, parser, expected): def assert_events(self, parser, expected, max_events=None):
self.assertEqual( self.assertEqual(
[(event, (elem.tag, elem.text)) [(event, (elem.tag, elem.text))
for event, elem in parser.read_events()], for event, elem in islice(parser.read_events(), max_events)],
expected) expected)
def assert_event_tags(self, parser, expected): def assert_event_tuples(self, parser, expected, max_events=None):
events = parser.read_events() self.assertEqual(
list(islice(parser.read_events(), max_events)),
expected)
def assert_event_tags(self, parser, expected, max_events=None):
events = islice(parser.read_events(), max_events)
self.assertEqual([(action, elem.tag) for action, elem in events], self.assertEqual([(action, elem.tag) for action, elem in events],
expected) expected)
@ -1276,6 +1309,56 @@ class XMLPullParserTest(unittest.TestCase):
self.assertEqual(list(parser.read_events()), [('end-ns', None)]) self.assertEqual(list(parser.read_events()), [('end-ns', None)])
self.assertIsNone(parser.close()) self.assertIsNone(parser.close())
def test_ns_events_start(self):
    # Requests 'start-ns', 'start' and 'end' (but not 'end-ns') and checks
    # the events produced after each chunk of input is fed.
    pull_parser = ET.XMLPullParser(events=('start-ns', 'start', 'end'))

    # Opening tag: both namespace declarations are read first (two events),
    # then the 'start' event of the element itself (one event).
    self._feed(pull_parser, "<tag xmlns='abc' xmlns:p='xyz'>\n")
    declared_namespaces = [
        ('start-ns', ('', 'abc')),
        ('start-ns', ('p', 'xyz')),
    ]
    self.assert_event_tuples(pull_parser, declared_namespaces, max_events=2)
    self.assert_event_tags(pull_parser, [('start', '{abc}tag')], max_events=1)

    # Self-closing child: a start/end pair in the default namespace.
    self._feed(pull_parser, "<child />\n")
    child_events = [
        ('start', '{abc}child'),
        ('end', '{abc}child'),
    ]
    self.assert_event_tags(pull_parser, child_events)

    # Closing the root: only its 'end' event is expected, since 'end-ns'
    # was not requested.
    self._feed(pull_parser, "</tag>\n")
    pull_parser.close()
    self.assert_event_tags(pull_parser, [('end', '{abc}tag')])
def test_ns_events_start_end(self):
    # Same scenario as the 'start-ns'-only test, but with 'end-ns' requested
    # as well, so closing the root must also report the namespaces ending.
    pull_parser = ET.XMLPullParser(
        events=('start-ns', 'start', 'end', 'end-ns'))

    # Opening tag: two namespace declarations, then the element's 'start'.
    self._feed(pull_parser, "<tag xmlns='abc' xmlns:p='xyz'>\n")
    declared_namespaces = [
        ('start-ns', ('', 'abc')),
        ('start-ns', ('p', 'xyz')),
    ]
    self.assert_event_tuples(pull_parser, declared_namespaces, max_events=2)
    self.assert_event_tags(pull_parser, [('start', '{abc}tag')], max_events=1)

    # Self-closing child: a start/end pair in the default namespace.
    self._feed(pull_parser, "<child />\n")
    child_events = [
        ('start', '{abc}child'),
        ('end', '{abc}child'),
    ]
    self.assert_event_tags(pull_parser, child_events)

    # Closing the root: its 'end' event comes first and is read on its own,
    # because the 'end-ns' payload is None and has no .tag attribute.
    self._feed(pull_parser, "</tag>\n")
    pull_parser.close()
    self.assert_event_tags(pull_parser, [('end', '{abc}tag')], max_events=1)

    # One ('end-ns', None) event per namespace declared on the root.
    self.assert_event_tuples(pull_parser, [('end-ns', None)] * 2)
def test_events(self): def test_events(self):
parser = ET.XMLPullParser(events=()) parser = ET.XMLPullParser(events=())
self._feed(parser, "<root/>\n") self._feed(parser, "<root/>\n")

View File

@ -1518,6 +1518,10 @@ class XMLParser:
parser.StartElementHandler = self._start parser.StartElementHandler = self._start
if hasattr(target, 'end'): if hasattr(target, 'end'):
parser.EndElementHandler = self._end parser.EndElementHandler = self._end
if hasattr(target, 'start_ns'):
parser.StartNamespaceDeclHandler = self._start_ns
if hasattr(target, 'end_ns'):
parser.EndNamespaceDeclHandler = self._end_ns
if hasattr(target, 'data'): if hasattr(target, 'data'):
parser.CharacterDataHandler = target.data parser.CharacterDataHandler = target.data
# miscellaneous callbacks # miscellaneous callbacks
@ -1559,12 +1563,24 @@ class XMLParser:
append((event, end(tag))) append((event, end(tag)))
parser.EndElementHandler = handler parser.EndElementHandler = handler
elif event_name == "start-ns": elif event_name == "start-ns":
def handler(prefix, uri, event=event_name, append=append): # TreeBuilder does not implement .start_ns()
append((event, (prefix or "", uri or ""))) if hasattr(self.target, "start_ns"):
def handler(prefix, uri, event=event_name, append=append,
start_ns=self._start_ns):
append((event, start_ns(prefix, uri)))
else:
def handler(prefix, uri, event=event_name, append=append):
append((event, (prefix or '', uri or '')))
parser.StartNamespaceDeclHandler = handler parser.StartNamespaceDeclHandler = handler
elif event_name == "end-ns": elif event_name == "end-ns":
def handler(prefix, event=event_name, append=append): # TreeBuilder does not implement .end_ns()
append((event, None)) if hasattr(self.target, "end_ns"):
def handler(prefix, event=event_name, append=append,
end_ns=self._end_ns):
append((event, end_ns(prefix)))
else:
def handler(prefix, event=event_name, append=append):
append((event, None))
parser.EndNamespaceDeclHandler = handler parser.EndNamespaceDeclHandler = handler
elif event_name == 'comment': elif event_name == 'comment':
def handler(text, event=event_name, append=append, self=self): def handler(text, event=event_name, append=append, self=self):
@ -1595,6 +1611,12 @@ class XMLParser:
self._names[key] = name self._names[key] = name
return name return name
def _start_ns(self, prefix, uri):
    """Relay a namespace declaration to the target's ``start_ns()``.

    A falsy *prefix* or *uri* is replaced with ``''`` before the call.
    The return value of the target's ``start_ns()`` is passed back
    unchanged.
    """
    ns_prefix = prefix if prefix else ''
    ns_uri = uri if uri else ''
    return self.target.start_ns(ns_prefix, ns_uri)
def _end_ns(self, prefix):
    """Relay the end of a namespace mapping to the target's ``end_ns()``.

    A falsy *prefix* is replaced with ``''`` before the call.  The return
    value of the target's ``end_ns()`` is passed back unchanged.
    """
    ns_prefix = prefix if prefix else ''
    return self.target.end_ns(ns_prefix)
def _start(self, tag, attr_list): def _start(self, tag, attr_list):
# Handler for expat's StartElementHandler. Since ordered_attributes # Handler for expat's StartElementHandler. Since ordered_attributes
# is set, the attributes are reported as a list of alternating # is set, the attributes are reported as a list of alternating

View File

@ -0,0 +1,3 @@
The XMLParser() in xml.etree.ElementTree provides namespace prefix context to the
parser target if it defines the callback methods "start_ns()" and/or "end_ns()".
Patch by Stefan Behnel.

View File

@ -2911,6 +2911,39 @@ treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text)
return NULL; return NULL;
} }
/* Queue a start-ns event carrying a (prefix, uri) tuple.
 *
 * Nothing is queued unless both self->events_append and
 * self->start_ns_event_obj are set.  Returns a new reference to None on
 * success, or NULL if building the tuple or appending the event fails.
 */
LOCAL(PyObject*)
treebuilder_handle_start_ns(TreeBuilderObject* self, PyObject* prefix, PyObject* uri)
{
    if (self->events_append && self->start_ns_event_obj) {
        int status;
        PyObject* event_arg = PyTuple_Pack(2, prefix, uri);

        if (event_arg == NULL) {
            return NULL;
        }
        status = treebuilder_append_event(self, self->start_ns_event_obj, event_arg);
        /* The local reference to the tuple is dropped on both outcomes. */
        Py_DECREF(event_arg);
        if (status < 0) {
            return NULL;
        }
    }
    Py_RETURN_NONE;
}
/* Queue an end-ns event whose payload is the given prefix object.
 *
 * Nothing is queued unless both self->events_append and
 * self->end_ns_event_obj are set.  Returns a new reference to None on
 * success, or NULL if appending the event fails.
 */
LOCAL(PyObject*)
treebuilder_handle_end_ns(TreeBuilderObject* self, PyObject* prefix)
{
    if (!self->events_append || !self->end_ns_event_obj) {
        Py_RETURN_NONE;
    }
    if (treebuilder_append_event(self, self->end_ns_event_obj, prefix) < 0) {
        return NULL;
    }
    Py_RETURN_NONE;
}
/* -------------------------------------------------------------------- */ /* -------------------------------------------------------------------- */
/* methods (in alphabetical order) */ /* methods (in alphabetical order) */
@ -3046,6 +3079,8 @@ typedef struct {
PyObject *names; PyObject *names;
PyObject *handle_start_ns;
PyObject *handle_end_ns;
PyObject *handle_start; PyObject *handle_start;
PyObject *handle_data; PyObject *handle_data;
PyObject *handle_end; PyObject *handle_end;
@ -3357,42 +3392,89 @@ expat_end_handler(XMLParserObject* self, const XML_Char* tag_in)
} }
static void static void
expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix, expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix_in,
const XML_Char *uri) const XML_Char *uri_in)
{ {
TreeBuilderObject *target = (TreeBuilderObject*) self->target; PyObject* res = NULL;
PyObject *parcel; PyObject* uri;
PyObject* prefix;
PyObject* stack[2];
if (PyErr_Occurred()) if (PyErr_Occurred())
return; return;
if (!target->events_append || !target->start_ns_event_obj) if (!uri_in)
return; uri_in = "";
if (!prefix_in)
prefix_in = "";
if (!uri) if (TreeBuilder_CheckExact(self->target)) {
uri = ""; /* shortcut - TreeBuilder does not actually implement .start_ns() */
if (!prefix) TreeBuilderObject *target = (TreeBuilderObject*) self->target;
prefix = "";
parcel = Py_BuildValue("ss", prefix, uri); if (target->events_append && target->start_ns_event_obj) {
if (!parcel) prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
return; if (!prefix)
treebuilder_append_event(target, target->start_ns_event_obj, parcel); return;
Py_DECREF(parcel); uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict");
if (!uri) {
Py_DECREF(prefix);
return;
}
res = treebuilder_handle_start_ns(target, prefix, uri);
Py_DECREF(uri);
Py_DECREF(prefix);
}
} else if (self->handle_start_ns) {
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
if (!prefix)
return;
uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict");
if (!uri) {
Py_DECREF(prefix);
return;
}
stack[0] = prefix;
stack[1] = uri;
res = _PyObject_FastCall(self->handle_start_ns, stack, 2);
Py_DECREF(uri);
Py_DECREF(prefix);
}
Py_XDECREF(res);
} }
static void static void
expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in) expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in)
{ {
TreeBuilderObject *target = (TreeBuilderObject*) self->target; PyObject *res = NULL;
PyObject* prefix;
if (PyErr_Occurred()) if (PyErr_Occurred())
return; return;
if (!target->events_append) if (!prefix_in)
return; prefix_in = "";
treebuilder_append_event(target, target->end_ns_event_obj, Py_None); if (TreeBuilder_CheckExact(self->target)) {
/* shortcut - TreeBuilder does not actually implement .end_ns() */
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
if (target->events_append && target->end_ns_event_obj) {
res = treebuilder_handle_end_ns(target, Py_None);
}
} else if (self->handle_end_ns) {
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
if (!prefix)
return;
res = _PyObject_FastCall(self->handle_end_ns, &prefix, 1);
Py_DECREF(prefix);
}
Py_XDECREF(res);
} }
static void static void
@ -3546,6 +3628,7 @@ xmlparser_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
if (self) { if (self) {
self->parser = NULL; self->parser = NULL;
self->target = self->entity = self->names = NULL; self->target = self->entity = self->names = NULL;
self->handle_start_ns = self->handle_end_ns = NULL;
self->handle_start = self->handle_data = self->handle_end = NULL; self->handle_start = self->handle_data = self->handle_end = NULL;
self->handle_comment = self->handle_pi = self->handle_close = NULL; self->handle_comment = self->handle_pi = self->handle_close = NULL;
self->handle_doctype = NULL; self->handle_doctype = NULL;
@ -3614,6 +3697,14 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target,
} }
self->target = target; self->target = target;
self->handle_start_ns = PyObject_GetAttrString(target, "start_ns");
if (ignore_attribute_error(self->handle_start_ns)) {
return -1;
}
self->handle_end_ns = PyObject_GetAttrString(target, "end_ns");
if (ignore_attribute_error(self->handle_end_ns)) {
return -1;
}
self->handle_start = PyObject_GetAttrString(target, "start"); self->handle_start = PyObject_GetAttrString(target, "start");
if (ignore_attribute_error(self->handle_start)) { if (ignore_attribute_error(self->handle_start)) {
return -1; return -1;
@ -3645,6 +3736,12 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target,
/* configure parser */ /* configure parser */
EXPAT(SetUserData)(self->parser, self); EXPAT(SetUserData)(self->parser, self);
if (self->handle_start_ns || self->handle_end_ns)
EXPAT(SetNamespaceDeclHandler)(
self->parser,
(XML_StartNamespaceDeclHandler) expat_start_ns_handler,
(XML_EndNamespaceDeclHandler) expat_end_ns_handler
);
EXPAT(SetElementHandler)( EXPAT(SetElementHandler)(
self->parser, self->parser,
(XML_StartElementHandler) expat_start_handler, (XML_StartElementHandler) expat_start_handler,
@ -3689,6 +3786,9 @@ xmlparser_gc_traverse(XMLParserObject *self, visitproc visit, void *arg)
Py_VISIT(self->handle_end); Py_VISIT(self->handle_end);
Py_VISIT(self->handle_data); Py_VISIT(self->handle_data);
Py_VISIT(self->handle_start); Py_VISIT(self->handle_start);
Py_VISIT(self->handle_start_ns);
Py_VISIT(self->handle_end_ns);
Py_VISIT(self->handle_doctype);
Py_VISIT(self->target); Py_VISIT(self->target);
Py_VISIT(self->entity); Py_VISIT(self->entity);
@ -3712,6 +3812,8 @@ xmlparser_gc_clear(XMLParserObject *self)
Py_CLEAR(self->handle_end); Py_CLEAR(self->handle_end);
Py_CLEAR(self->handle_data); Py_CLEAR(self->handle_data);
Py_CLEAR(self->handle_start); Py_CLEAR(self->handle_start);
Py_CLEAR(self->handle_start_ns);
Py_CLEAR(self->handle_end_ns);
Py_CLEAR(self->handle_doctype); Py_CLEAR(self->handle_doctype);
Py_CLEAR(self->target); Py_CLEAR(self->target);