bpo-36676: Namespace prefix aware parsing support for the ET.XMLParser target (GH-12885)

* bpo-36676: Implement namespace prefix aware parsing support for the XMLParser target in ElementTree.
This commit is contained in:
Stefan Behnel 2019-05-01 21:49:58 +02:00 committed by GitHub
parent 43851a202c
commit dde3eebdaa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 258 additions and 30 deletions

View File

@ -1086,7 +1086,7 @@ TreeBuilder Objects
In addition, a custom :class:`TreeBuilder` object can provide the
following method:
following methods:
.. method:: doctype(name, pubid, system)
@ -1096,6 +1096,23 @@ TreeBuilder Objects
.. versionadded:: 3.2
.. method:: start_ns(prefix, uri)
Is called whenever the parser encounters a new namespace declaration,
before the ``start()`` callback for the opening element that defines it.
*prefix* is ``''`` for the default namespace and the declared
namespace prefix name otherwise. *uri* is the namespace URI.
.. versionadded:: 3.8
.. method:: end_ns(prefix)
Is called after the ``end()`` callback of an element that declared
a namespace prefix mapping, with the name of the *prefix* that went
out of scope.
.. versionadded:: 3.8
.. _elementtree-xmlparser-objects:
@ -1131,7 +1148,8 @@ XMLParser Objects
:meth:`XMLParser.feed` calls *target*\'s ``start(tag, attrs_dict)`` method
for each opening tag, its ``end(tag)`` method for each closing tag, and data
is processed by method ``data(data)``. :meth:`XMLParser.close` calls
is processed by method ``data(data)``. For further supported callback
methods, see the :class:`TreeBuilder` class. :meth:`XMLParser.close` calls
*target*\'s method ``close()``. :class:`XMLParser` can be used not only for
building a tree structure. This is an example of counting the maximum depth
of an XML file::

View File

@ -14,12 +14,13 @@ import locale
import operator
import pickle
import sys
import textwrap
import types
import unittest
import warnings
import weakref
from itertools import product
from itertools import product, islice
from test import support
from test.support import TESTFN, findfile, import_fresh_module, gc_collect, swap_attr
@ -694,12 +695,17 @@ class ElementTreeTest(unittest.TestCase):
self.append(("pi", target, data))
def comment(self, data):
self.append(("comment", data))
def start_ns(self, prefix, uri):
self.append(("start-ns", prefix, uri))
def end_ns(self, prefix):
self.append(("end-ns", prefix))
builder = Builder()
parser = ET.XMLParser(target=builder)
parser.feed(data)
self.assertEqual(builder, [
('pi', 'pi', 'data'),
('comment', ' comment '),
('start-ns', '', 'namespace'),
('start', '{namespace}root'),
('start', '{namespace}element'),
('end', '{namespace}element'),
@ -708,8 +714,30 @@ class ElementTreeTest(unittest.TestCase):
('start', '{namespace}empty-element'),
('end', '{namespace}empty-element'),
('end', '{namespace}root'),
('end-ns', ''),
])
# Verify that a parser target which implements only end_ns() (and none of
# the other target callbacks) still receives an end-ns notification for
# each namespace prefix declared in the document.
def test_custom_builder_only_end_ns(self):
# Minimal target: a list that records nothing but end_ns() calls.
class Builder(list):
def end_ns(self, prefix):
self.append(("end-ns", prefix))
builder = Builder()
parser = ET.XMLParser(target=builder)
# The root element below declares the default namespace plus the
# prefixes 'p' and 'a'.
parser.feed(textwrap.dedent("""\
<?pi data?>
<!-- comment -->
<root xmlns='namespace' xmlns:p='pns' xmlns:a='ans'>
<a:element key='value'>text</a:element>
<p:element>text</p:element>tail
<empty-element/>
</root>
"""))
# Only end-ns entries are expected; the default namespace is reported
# with the empty string as its prefix.
self.assertEqual(builder, [
('end-ns', 'a'),
('end-ns', 'p'),
('end-ns', ''),
])
# Element.getchildren() and ElementTree.getiterator() are deprecated.
@checkwarnings(("This method will be removed in future versions. "
@ -1194,14 +1222,19 @@ class XMLPullParserTest(unittest.TestCase):
for i in range(0, len(data), chunk_size):
parser.feed(data[i:i+chunk_size])
def assert_events(self, parser, expected):
def assert_events(self, parser, expected, max_events=None):
self.assertEqual(
[(event, (elem.tag, elem.text))
for event, elem in parser.read_events()],
for event, elem in islice(parser.read_events(), max_events)],
expected)
def assert_event_tags(self, parser, expected):
events = parser.read_events()
def assert_event_tuples(self, parser, expected, max_events=None):
self.assertEqual(
list(islice(parser.read_events(), max_events)),
expected)
def assert_event_tags(self, parser, expected, max_events=None):
events = islice(parser.read_events(), max_events)
self.assertEqual([(action, elem.tag) for action, elem in events],
expected)
@ -1276,6 +1309,56 @@ class XMLPullParserTest(unittest.TestCase):
self.assertEqual(list(parser.read_events()), [('end-ns', None)])
self.assertIsNone(parser.close())
def test_ns_events_start(self):
    """Pull parser reports 'start-ns' events ahead of the element events.

    Only 'start-ns', 'start' and 'end' are requested here, so no
    'end-ns' events are expected after the closing tag.
    """
    declared_namespaces = [
        ('start-ns', ('', 'abc')),
        ('start-ns', ('p', 'xyz')),
    ]
    root_opened = [('start', '{abc}tag')]
    child_events = [
        ('start', '{abc}child'),
        ('end', '{abc}child'),
    ]
    root_closed = [('end', '{abc}tag')]

    pull = ET.XMLPullParser(events=('start-ns', 'start', 'end'))

    # Opening tag: both namespace declarations are read first, then the
    # start event of the element that declared them.
    self._feed(pull, "<tag xmlns='abc' xmlns:p='xyz'>\n")
    self.assert_event_tuples(pull, declared_namespaces, max_events=2)
    self.assert_event_tags(pull, root_opened, max_events=1)

    # The child inherits the default namespace.
    self._feed(pull, "<child />\n")
    self.assert_event_tags(pull, child_events)

    self._feed(pull, "</tag>\n")
    pull.close()
    self.assert_event_tags(pull, root_closed)
def test_ns_events_start_end(self):
    """Pull parser reports both 'start-ns' and 'end-ns' events.

    With the default TreeBuilder target the 'end-ns' payload is None,
    one event per namespace declared on the root element.
    """
    requested = ('start-ns', 'start', 'end', 'end-ns')
    declared_namespaces = [
        ('start-ns', ('', 'abc')),
        ('start-ns', ('p', 'xyz')),
    ]
    root_opened = [('start', '{abc}tag')]
    child_events = [
        ('start', '{abc}child'),
        ('end', '{abc}child'),
    ]
    root_closed = [('end', '{abc}tag')]
    namespaces_closed = [
        ('end-ns', None),
        ('end-ns', None),
    ]

    pull = ET.XMLPullParser(events=requested)

    # Opening tag: namespace declarations first, then the element start.
    self._feed(pull, "<tag xmlns='abc' xmlns:p='xyz'>\n")
    self.assert_event_tuples(pull, declared_namespaces, max_events=2)
    self.assert_event_tags(pull, root_opened, max_events=1)

    self._feed(pull, "<child />\n")
    self.assert_event_tags(pull, child_events)

    # Closing tag: the element end event is read on its own (max_events=1)
    # because the following 'end-ns' events do not carry elements.
    self._feed(pull, "</tag>\n")
    pull.close()
    self.assert_event_tags(pull, root_closed, max_events=1)
    self.assert_event_tuples(pull, namespaces_closed)
def test_events(self):
parser = ET.XMLPullParser(events=())
self._feed(parser, "<root/>\n")

View File

@ -1518,6 +1518,10 @@ class XMLParser:
parser.StartElementHandler = self._start
if hasattr(target, 'end'):
parser.EndElementHandler = self._end
if hasattr(target, 'start_ns'):
parser.StartNamespaceDeclHandler = self._start_ns
if hasattr(target, 'end_ns'):
parser.EndNamespaceDeclHandler = self._end_ns
if hasattr(target, 'data'):
parser.CharacterDataHandler = target.data
# miscellaneous callbacks
@ -1559,12 +1563,24 @@ class XMLParser:
append((event, end(tag)))
parser.EndElementHandler = handler
elif event_name == "start-ns":
def handler(prefix, uri, event=event_name, append=append):
append((event, (prefix or "", uri or "")))
# TreeBuilder does not implement .start_ns()
if hasattr(self.target, "start_ns"):
def handler(prefix, uri, event=event_name, append=append,
start_ns=self._start_ns):
append((event, start_ns(prefix, uri)))
else:
def handler(prefix, uri, event=event_name, append=append):
append((event, (prefix or '', uri or '')))
parser.StartNamespaceDeclHandler = handler
elif event_name == "end-ns":
def handler(prefix, event=event_name, append=append):
append((event, None))
# TreeBuilder does not implement .end_ns()
if hasattr(self.target, "end_ns"):
def handler(prefix, event=event_name, append=append,
end_ns=self._end_ns):
append((event, end_ns(prefix)))
else:
def handler(prefix, event=event_name, append=append):
append((event, None))
parser.EndNamespaceDeclHandler = handler
elif event_name == 'comment':
def handler(text, event=event_name, append=append, self=self):
@ -1595,6 +1611,12 @@ class XMLParser:
self._names[key] = name
return name
def _start_ns(self, prefix, uri):
return self.target.start_ns(prefix or '', uri or '')
def _end_ns(self, prefix):
return self.target.end_ns(prefix or '')
def _start(self, tag, attr_list):
# Handler for expat's StartElementHandler. Since ordered_attributes
# is set, the attributes are reported as a list of alternating

View File

@ -0,0 +1,3 @@
The XMLParser() in xml.etree.ElementTree provides namespace prefix context to the
parser target if it defines the callback methods "start_ns()" and/or "end_ns()".
Patch by Stefan Behnel.

View File

@ -2911,6 +2911,39 @@ treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text)
return NULL;
}
LOCAL(PyObject*)
treebuilder_handle_start_ns(TreeBuilderObject* self, PyObject* prefix, PyObject* uri)
{
    /* Record a "start-ns" event carrying a (prefix, uri) tuple.
     *
     * Nothing is recorded unless both an events_append callable and a
     * start-ns event object are set on the builder. Returns None on
     * success, or NULL if building the tuple or appending the event fails.
     */
    PyObject* pair;
    int status;

    if (!self->events_append || !self->start_ns_event_obj) {
        Py_RETURN_NONE;
    }

    pair = PyTuple_Pack(2, prefix, uri);
    if (pair == NULL) {
        return NULL;
    }

    /* Our reference to the tuple is released whether or not the append
       succeeded. */
    status = treebuilder_append_event(self, self->start_ns_event_obj, pair);
    Py_DECREF(pair);
    if (status < 0) {
        return NULL;
    }

    Py_RETURN_NONE;
}
LOCAL(PyObject*)
treebuilder_handle_end_ns(TreeBuilderObject* self, PyObject* prefix)
{
    /* Record an "end-ns" event whose payload is the given prefix object.
     *
     * Nothing is recorded unless both an events_append callable and an
     * end-ns event object are set on the builder. Returns None on success,
     * or NULL if appending the event fails.
     */
    if (!self->events_append || !self->end_ns_event_obj) {
        Py_RETURN_NONE;
    }

    if (treebuilder_append_event(self, self->end_ns_event_obj, prefix) < 0) {
        return NULL;
    }

    Py_RETURN_NONE;
}
/* -------------------------------------------------------------------- */
/* methods (in alphabetical order) */
@ -3046,6 +3079,8 @@ typedef struct {
PyObject *names;
PyObject *handle_start_ns;
PyObject *handle_end_ns;
PyObject *handle_start;
PyObject *handle_data;
PyObject *handle_end;
@ -3357,42 +3392,89 @@ expat_end_handler(XMLParserObject* self, const XML_Char* tag_in)
}
static void
expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix,
const XML_Char *uri)
expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix_in,
const XML_Char *uri_in)
{
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
PyObject *parcel;
PyObject* res = NULL;
PyObject* uri;
PyObject* prefix;
PyObject* stack[2];
if (PyErr_Occurred())
return;
if (!target->events_append || !target->start_ns_event_obj)
return;
if (!uri_in)
uri_in = "";
if (!prefix_in)
prefix_in = "";
if (!uri)
uri = "";
if (!prefix)
prefix = "";
if (TreeBuilder_CheckExact(self->target)) {
/* shortcut - TreeBuilder does not actually implement .start_ns() */
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
parcel = Py_BuildValue("ss", prefix, uri);
if (!parcel)
return;
treebuilder_append_event(target, target->start_ns_event_obj, parcel);
Py_DECREF(parcel);
if (target->events_append && target->start_ns_event_obj) {
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
if (!prefix)
return;
uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict");
if (!uri) {
Py_DECREF(prefix);
return;
}
res = treebuilder_handle_start_ns(target, prefix, uri);
Py_DECREF(uri);
Py_DECREF(prefix);
}
} else if (self->handle_start_ns) {
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
if (!prefix)
return;
uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict");
if (!uri) {
Py_DECREF(prefix);
return;
}
stack[0] = prefix;
stack[1] = uri;
res = _PyObject_FastCall(self->handle_start_ns, stack, 2);
Py_DECREF(uri);
Py_DECREF(prefix);
}
Py_XDECREF(res);
}
static void
expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in)
{
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
PyObject *res = NULL;
PyObject* prefix;
if (PyErr_Occurred())
return;
if (!target->events_append)
return;
if (!prefix_in)
prefix_in = "";
treebuilder_append_event(target, target->end_ns_event_obj, Py_None);
if (TreeBuilder_CheckExact(self->target)) {
/* shortcut - TreeBuilder does not actually implement .end_ns() */
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
if (target->events_append && target->end_ns_event_obj) {
res = treebuilder_handle_end_ns(target, Py_None);
}
} else if (self->handle_end_ns) {
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
if (!prefix)
return;
res = _PyObject_FastCall(self->handle_end_ns, &prefix, 1);
Py_DECREF(prefix);
}
Py_XDECREF(res);
}
static void
@ -3546,6 +3628,7 @@ xmlparser_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
if (self) {
self->parser = NULL;
self->target = self->entity = self->names = NULL;
self->handle_start_ns = self->handle_end_ns = NULL;
self->handle_start = self->handle_data = self->handle_end = NULL;
self->handle_comment = self->handle_pi = self->handle_close = NULL;
self->handle_doctype = NULL;
@ -3614,6 +3697,14 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target,
}
self->target = target;
self->handle_start_ns = PyObject_GetAttrString(target, "start_ns");
if (ignore_attribute_error(self->handle_start_ns)) {
return -1;
}
self->handle_end_ns = PyObject_GetAttrString(target, "end_ns");
if (ignore_attribute_error(self->handle_end_ns)) {
return -1;
}
self->handle_start = PyObject_GetAttrString(target, "start");
if (ignore_attribute_error(self->handle_start)) {
return -1;
@ -3645,6 +3736,12 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target,
/* configure parser */
EXPAT(SetUserData)(self->parser, self);
if (self->handle_start_ns || self->handle_end_ns)
EXPAT(SetNamespaceDeclHandler)(
self->parser,
(XML_StartNamespaceDeclHandler) expat_start_ns_handler,
(XML_EndNamespaceDeclHandler) expat_end_ns_handler
);
EXPAT(SetElementHandler)(
self->parser,
(XML_StartElementHandler) expat_start_handler,
@ -3689,6 +3786,9 @@ xmlparser_gc_traverse(XMLParserObject *self, visitproc visit, void *arg)
Py_VISIT(self->handle_end);
Py_VISIT(self->handle_data);
Py_VISIT(self->handle_start);
Py_VISIT(self->handle_start_ns);
Py_VISIT(self->handle_end_ns);
Py_VISIT(self->handle_doctype);
Py_VISIT(self->target);
Py_VISIT(self->entity);
@ -3712,6 +3812,8 @@ xmlparser_gc_clear(XMLParserObject *self)
Py_CLEAR(self->handle_end);
Py_CLEAR(self->handle_data);
Py_CLEAR(self->handle_start);
Py_CLEAR(self->handle_start_ns);
Py_CLEAR(self->handle_end_ns);
Py_CLEAR(self->handle_doctype);
Py_CLEAR(self->target);