bpo-36676: Namespace prefix aware parsing support for the ET.XMLParser target (GH-12885)
* bpo-36676: Implement namespace prefix aware parsing support for the XMLParser target in ElementTree.
This commit is contained in:
parent
43851a202c
commit
dde3eebdaa
|
@ -1086,7 +1086,7 @@ TreeBuilder Objects
|
|||
|
||||
|
||||
In addition, a custom :class:`TreeBuilder` object can provide the
|
||||
following method:
|
||||
following methods:
|
||||
|
||||
.. method:: doctype(name, pubid, system)
|
||||
|
||||
|
@ -1096,6 +1096,23 @@ TreeBuilder Objects
|
|||
|
||||
.. versionadded:: 3.2
|
||||
|
||||
.. method:: start_ns(prefix, uri)
|
||||
|
||||
This method is called whenever the parser encounters a new namespace declaration,
|
||||
before the ``start()`` callback for the opening element that defines it.
|
||||
*prefix* is ``''`` for the default namespace and the declared
|
||||
namespace prefix name otherwise. *uri* is the namespace URI.
|
||||
|
||||
.. versionadded:: 3.8
|
||||
|
||||
.. method:: end_ns(prefix)
|
||||
|
||||
This method is called after the ``end()`` callback of an element that declared
|
||||
a namespace prefix mapping, with the name of the *prefix* that went
|
||||
out of scope.
|
||||
|
||||
.. versionadded:: 3.8
|
||||
|
||||
|
||||
.. _elementtree-xmlparser-objects:
|
||||
|
||||
|
@ -1131,7 +1148,8 @@ XMLParser Objects
|
|||
|
||||
:meth:`XMLParser.feed` calls *target*\'s ``start(tag, attrs_dict)`` method
|
||||
for each opening tag, its ``end(tag)`` method for each closing tag, and data
|
||||
is processed by method ``data(data)``. :meth:`XMLParser.close` calls
|
||||
is processed by method ``data(data)``. For further supported callback
|
||||
methods, see the :class:`TreeBuilder` class. :meth:`XMLParser.close` calls
|
||||
*target*\'s method ``close()``. :class:`XMLParser` can be used for more than
|
||||
building a tree structure. This example counts the maximum depth
|
||||
of an XML file::
|
||||
|
|
|
@ -14,12 +14,13 @@ import locale
|
|||
import operator
|
||||
import pickle
|
||||
import sys
|
||||
import textwrap
|
||||
import types
|
||||
import unittest
|
||||
import warnings
|
||||
import weakref
|
||||
|
||||
from itertools import product
|
||||
from itertools import product, islice
|
||||
from test import support
|
||||
from test.support import TESTFN, findfile, import_fresh_module, gc_collect, swap_attr
|
||||
|
||||
|
@ -694,12 +695,17 @@ class ElementTreeTest(unittest.TestCase):
|
|||
self.append(("pi", target, data))
|
||||
def comment(self, data):
|
||||
self.append(("comment", data))
|
||||
def start_ns(self, prefix, uri):
|
||||
self.append(("start-ns", prefix, uri))
|
||||
def end_ns(self, prefix):
|
||||
self.append(("end-ns", prefix))
|
||||
builder = Builder()
|
||||
parser = ET.XMLParser(target=builder)
|
||||
parser.feed(data)
|
||||
self.assertEqual(builder, [
|
||||
('pi', 'pi', 'data'),
|
||||
('comment', ' comment '),
|
||||
('start-ns', '', 'namespace'),
|
||||
('start', '{namespace}root'),
|
||||
('start', '{namespace}element'),
|
||||
('end', '{namespace}element'),
|
||||
|
@ -708,8 +714,30 @@ class ElementTreeTest(unittest.TestCase):
|
|||
('start', '{namespace}empty-element'),
|
||||
('end', '{namespace}empty-element'),
|
||||
('end', '{namespace}root'),
|
||||
('end-ns', ''),
|
||||
])
|
||||
|
||||
def test_custom_builder_only_end_ns(self):
|
||||
class Builder(list):
|
||||
def end_ns(self, prefix):
|
||||
self.append(("end-ns", prefix))
|
||||
|
||||
builder = Builder()
|
||||
parser = ET.XMLParser(target=builder)
|
||||
parser.feed(textwrap.dedent("""\
|
||||
<?pi data?>
|
||||
<!-- comment -->
|
||||
<root xmlns='namespace' xmlns:p='pns' xmlns:a='ans'>
|
||||
<a:element key='value'>text</a:element>
|
||||
<p:element>text</p:element>tail
|
||||
<empty-element/>
|
||||
</root>
|
||||
"""))
|
||||
self.assertEqual(builder, [
|
||||
('end-ns', 'a'),
|
||||
('end-ns', 'p'),
|
||||
('end-ns', ''),
|
||||
])
|
||||
|
||||
# Element.getchildren() and ElementTree.getiterator() are deprecated.
|
||||
@checkwarnings(("This method will be removed in future versions. "
|
||||
|
@ -1194,14 +1222,19 @@ class XMLPullParserTest(unittest.TestCase):
|
|||
for i in range(0, len(data), chunk_size):
|
||||
parser.feed(data[i:i+chunk_size])
|
||||
|
||||
def assert_events(self, parser, expected):
|
||||
def assert_events(self, parser, expected, max_events=None):
|
||||
self.assertEqual(
|
||||
[(event, (elem.tag, elem.text))
|
||||
for event, elem in parser.read_events()],
|
||||
for event, elem in islice(parser.read_events(), max_events)],
|
||||
expected)
|
||||
|
||||
def assert_event_tags(self, parser, expected):
|
||||
events = parser.read_events()
|
||||
def assert_event_tuples(self, parser, expected, max_events=None):
|
||||
self.assertEqual(
|
||||
list(islice(parser.read_events(), max_events)),
|
||||
expected)
|
||||
|
||||
def assert_event_tags(self, parser, expected, max_events=None):
|
||||
events = islice(parser.read_events(), max_events)
|
||||
self.assertEqual([(action, elem.tag) for action, elem in events],
|
||||
expected)
|
||||
|
||||
|
@ -1276,6 +1309,56 @@ class XMLPullParserTest(unittest.TestCase):
|
|||
self.assertEqual(list(parser.read_events()), [('end-ns', None)])
|
||||
self.assertIsNone(parser.close())
|
||||
|
||||
def test_ns_events_start(self):
|
||||
parser = ET.XMLPullParser(events=('start-ns', 'start', 'end'))
|
||||
self._feed(parser, "<tag xmlns='abc' xmlns:p='xyz'>\n")
|
||||
self.assert_event_tuples(parser, [
|
||||
('start-ns', ('', 'abc')),
|
||||
('start-ns', ('p', 'xyz')),
|
||||
], max_events=2)
|
||||
self.assert_event_tags(parser, [
|
||||
('start', '{abc}tag'),
|
||||
], max_events=1)
|
||||
|
||||
self._feed(parser, "<child />\n")
|
||||
self.assert_event_tags(parser, [
|
||||
('start', '{abc}child'),
|
||||
('end', '{abc}child'),
|
||||
])
|
||||
|
||||
self._feed(parser, "</tag>\n")
|
||||
parser.close()
|
||||
self.assert_event_tags(parser, [
|
||||
('end', '{abc}tag'),
|
||||
])
|
||||
|
||||
def test_ns_events_start_end(self):
|
||||
parser = ET.XMLPullParser(events=('start-ns', 'start', 'end', 'end-ns'))
|
||||
self._feed(parser, "<tag xmlns='abc' xmlns:p='xyz'>\n")
|
||||
self.assert_event_tuples(parser, [
|
||||
('start-ns', ('', 'abc')),
|
||||
('start-ns', ('p', 'xyz')),
|
||||
], max_events=2)
|
||||
self.assert_event_tags(parser, [
|
||||
('start', '{abc}tag'),
|
||||
], max_events=1)
|
||||
|
||||
self._feed(parser, "<child />\n")
|
||||
self.assert_event_tags(parser, [
|
||||
('start', '{abc}child'),
|
||||
('end', '{abc}child'),
|
||||
])
|
||||
|
||||
self._feed(parser, "</tag>\n")
|
||||
parser.close()
|
||||
self.assert_event_tags(parser, [
|
||||
('end', '{abc}tag'),
|
||||
], max_events=1)
|
||||
self.assert_event_tuples(parser, [
|
||||
('end-ns', None),
|
||||
('end-ns', None),
|
||||
])
|
||||
|
||||
def test_events(self):
|
||||
parser = ET.XMLPullParser(events=())
|
||||
self._feed(parser, "<root/>\n")
|
||||
|
|
|
@ -1518,6 +1518,10 @@ class XMLParser:
|
|||
parser.StartElementHandler = self._start
|
||||
if hasattr(target, 'end'):
|
||||
parser.EndElementHandler = self._end
|
||||
if hasattr(target, 'start_ns'):
|
||||
parser.StartNamespaceDeclHandler = self._start_ns
|
||||
if hasattr(target, 'end_ns'):
|
||||
parser.EndNamespaceDeclHandler = self._end_ns
|
||||
if hasattr(target, 'data'):
|
||||
parser.CharacterDataHandler = target.data
|
||||
# miscellaneous callbacks
|
||||
|
@ -1559,12 +1563,24 @@ class XMLParser:
|
|||
append((event, end(tag)))
|
||||
parser.EndElementHandler = handler
|
||||
elif event_name == "start-ns":
|
||||
def handler(prefix, uri, event=event_name, append=append):
|
||||
append((event, (prefix or "", uri or "")))
|
||||
# TreeBuilder does not implement .start_ns()
|
||||
if hasattr(self.target, "start_ns"):
|
||||
def handler(prefix, uri, event=event_name, append=append,
|
||||
start_ns=self._start_ns):
|
||||
append((event, start_ns(prefix, uri)))
|
||||
else:
|
||||
def handler(prefix, uri, event=event_name, append=append):
|
||||
append((event, (prefix or '', uri or '')))
|
||||
parser.StartNamespaceDeclHandler = handler
|
||||
elif event_name == "end-ns":
|
||||
def handler(prefix, event=event_name, append=append):
|
||||
append((event, None))
|
||||
# TreeBuilder does not implement .end_ns()
|
||||
if hasattr(self.target, "end_ns"):
|
||||
def handler(prefix, event=event_name, append=append,
|
||||
end_ns=self._end_ns):
|
||||
append((event, end_ns(prefix)))
|
||||
else:
|
||||
def handler(prefix, event=event_name, append=append):
|
||||
append((event, None))
|
||||
parser.EndNamespaceDeclHandler = handler
|
||||
elif event_name == 'comment':
|
||||
def handler(text, event=event_name, append=append, self=self):
|
||||
|
@ -1595,6 +1611,12 @@ class XMLParser:
|
|||
self._names[key] = name
|
||||
return name
|
||||
|
||||
def _start_ns(self, prefix, uri):
|
||||
return self.target.start_ns(prefix or '', uri or '')
|
||||
|
||||
def _end_ns(self, prefix):
|
||||
return self.target.end_ns(prefix or '')
|
||||
|
||||
def _start(self, tag, attr_list):
|
||||
# Handler for expat's StartElementHandler. Since ordered_attributes
|
||||
# is set, the attributes are reported as a list of alternating
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
The XMLParser() in xml.etree.ElementTree provides namespace prefix context to the
|
||||
parser target if it defines the callback methods "start_ns()" and/or "end_ns()".
|
||||
Patch by Stefan Behnel.
|
|
@ -2911,6 +2911,39 @@ treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
LOCAL(PyObject*)
|
||||
treebuilder_handle_start_ns(TreeBuilderObject* self, PyObject* prefix, PyObject* uri)
|
||||
{
|
||||
PyObject* parcel;
|
||||
|
||||
if (self->events_append && self->start_ns_event_obj) {
|
||||
parcel = PyTuple_Pack(2, prefix, uri);
|
||||
if (!parcel) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (treebuilder_append_event(self, self->start_ns_event_obj, parcel) < 0) {
|
||||
Py_DECREF(parcel);
|
||||
return NULL;
|
||||
}
|
||||
Py_DECREF(parcel);
|
||||
}
|
||||
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
LOCAL(PyObject*)
|
||||
treebuilder_handle_end_ns(TreeBuilderObject* self, PyObject* prefix)
|
||||
{
|
||||
if (self->events_append && self->end_ns_event_obj) {
|
||||
if (treebuilder_append_event(self, self->end_ns_event_obj, prefix) < 0) {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
Py_RETURN_NONE;
|
||||
}
|
||||
|
||||
/* -------------------------------------------------------------------- */
|
||||
/* methods (in alphabetical order) */
|
||||
|
||||
|
@ -3046,6 +3079,8 @@ typedef struct {
|
|||
|
||||
PyObject *names;
|
||||
|
||||
PyObject *handle_start_ns;
|
||||
PyObject *handle_end_ns;
|
||||
PyObject *handle_start;
|
||||
PyObject *handle_data;
|
||||
PyObject *handle_end;
|
||||
|
@ -3357,42 +3392,89 @@ expat_end_handler(XMLParserObject* self, const XML_Char* tag_in)
|
|||
}
|
||||
|
||||
static void
|
||||
expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix,
|
||||
const XML_Char *uri)
|
||||
expat_start_ns_handler(XMLParserObject* self, const XML_Char* prefix_in,
|
||||
const XML_Char *uri_in)
|
||||
{
|
||||
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
|
||||
PyObject *parcel;
|
||||
PyObject* res = NULL;
|
||||
PyObject* uri;
|
||||
PyObject* prefix;
|
||||
PyObject* stack[2];
|
||||
|
||||
if (PyErr_Occurred())
|
||||
return;
|
||||
|
||||
if (!target->events_append || !target->start_ns_event_obj)
|
||||
return;
|
||||
if (!uri_in)
|
||||
uri_in = "";
|
||||
if (!prefix_in)
|
||||
prefix_in = "";
|
||||
|
||||
if (!uri)
|
||||
uri = "";
|
||||
if (!prefix)
|
||||
prefix = "";
|
||||
if (TreeBuilder_CheckExact(self->target)) {
|
||||
/* shortcut - TreeBuilder does not actually implement .start_ns() */
|
||||
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
|
||||
|
||||
parcel = Py_BuildValue("ss", prefix, uri);
|
||||
if (!parcel)
|
||||
return;
|
||||
treebuilder_append_event(target, target->start_ns_event_obj, parcel);
|
||||
Py_DECREF(parcel);
|
||||
if (target->events_append && target->start_ns_event_obj) {
|
||||
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
|
||||
if (!prefix)
|
||||
return;
|
||||
uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict");
|
||||
if (!uri) {
|
||||
Py_DECREF(prefix);
|
||||
return;
|
||||
}
|
||||
|
||||
res = treebuilder_handle_start_ns(target, prefix, uri);
|
||||
Py_DECREF(uri);
|
||||
Py_DECREF(prefix);
|
||||
}
|
||||
} else if (self->handle_start_ns) {
|
||||
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
|
||||
if (!prefix)
|
||||
return;
|
||||
uri = PyUnicode_DecodeUTF8(uri_in, strlen(uri_in), "strict");
|
||||
if (!uri) {
|
||||
Py_DECREF(prefix);
|
||||
return;
|
||||
}
|
||||
|
||||
stack[0] = prefix;
|
||||
stack[1] = uri;
|
||||
res = _PyObject_FastCall(self->handle_start_ns, stack, 2);
|
||||
Py_DECREF(uri);
|
||||
Py_DECREF(prefix);
|
||||
}
|
||||
|
||||
Py_XDECREF(res);
|
||||
}
|
||||
|
||||
static void
|
||||
expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in)
|
||||
{
|
||||
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
|
||||
PyObject *res = NULL;
|
||||
PyObject* prefix;
|
||||
|
||||
if (PyErr_Occurred())
|
||||
return;
|
||||
|
||||
if (!target->events_append)
|
||||
return;
|
||||
if (!prefix_in)
|
||||
prefix_in = "";
|
||||
|
||||
treebuilder_append_event(target, target->end_ns_event_obj, Py_None);
|
||||
if (TreeBuilder_CheckExact(self->target)) {
|
||||
/* shortcut - TreeBuilder does not actually implement .end_ns() */
|
||||
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
|
||||
|
||||
if (target->events_append && target->end_ns_event_obj) {
|
||||
res = treebuilder_handle_end_ns(target, Py_None);
|
||||
}
|
||||
} else if (self->handle_end_ns) {
|
||||
prefix = PyUnicode_DecodeUTF8(prefix_in, strlen(prefix_in), "strict");
|
||||
if (!prefix)
|
||||
return;
|
||||
|
||||
res = _PyObject_FastCall(self->handle_end_ns, &prefix, 1);
|
||||
Py_DECREF(prefix);
|
||||
}
|
||||
|
||||
Py_XDECREF(res);
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -3546,6 +3628,7 @@ xmlparser_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
|||
if (self) {
|
||||
self->parser = NULL;
|
||||
self->target = self->entity = self->names = NULL;
|
||||
self->handle_start_ns = self->handle_end_ns = NULL;
|
||||
self->handle_start = self->handle_data = self->handle_end = NULL;
|
||||
self->handle_comment = self->handle_pi = self->handle_close = NULL;
|
||||
self->handle_doctype = NULL;
|
||||
|
@ -3614,6 +3697,14 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target,
|
|||
}
|
||||
self->target = target;
|
||||
|
||||
self->handle_start_ns = PyObject_GetAttrString(target, "start_ns");
|
||||
if (ignore_attribute_error(self->handle_start_ns)) {
|
||||
return -1;
|
||||
}
|
||||
self->handle_end_ns = PyObject_GetAttrString(target, "end_ns");
|
||||
if (ignore_attribute_error(self->handle_end_ns)) {
|
||||
return -1;
|
||||
}
|
||||
self->handle_start = PyObject_GetAttrString(target, "start");
|
||||
if (ignore_attribute_error(self->handle_start)) {
|
||||
return -1;
|
||||
|
@ -3645,6 +3736,12 @@ _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *target,
|
|||
|
||||
/* configure parser */
|
||||
EXPAT(SetUserData)(self->parser, self);
|
||||
if (self->handle_start_ns || self->handle_end_ns)
|
||||
EXPAT(SetNamespaceDeclHandler)(
|
||||
self->parser,
|
||||
(XML_StartNamespaceDeclHandler) expat_start_ns_handler,
|
||||
(XML_EndNamespaceDeclHandler) expat_end_ns_handler
|
||||
);
|
||||
EXPAT(SetElementHandler)(
|
||||
self->parser,
|
||||
(XML_StartElementHandler) expat_start_handler,
|
||||
|
@ -3689,6 +3786,9 @@ xmlparser_gc_traverse(XMLParserObject *self, visitproc visit, void *arg)
|
|||
Py_VISIT(self->handle_end);
|
||||
Py_VISIT(self->handle_data);
|
||||
Py_VISIT(self->handle_start);
|
||||
Py_VISIT(self->handle_start_ns);
|
||||
Py_VISIT(self->handle_end_ns);
|
||||
Py_VISIT(self->handle_doctype);
|
||||
|
||||
Py_VISIT(self->target);
|
||||
Py_VISIT(self->entity);
|
||||
|
@ -3712,6 +3812,8 @@ xmlparser_gc_clear(XMLParserObject *self)
|
|||
Py_CLEAR(self->handle_end);
|
||||
Py_CLEAR(self->handle_data);
|
||||
Py_CLEAR(self->handle_start);
|
||||
Py_CLEAR(self->handle_start_ns);
|
||||
Py_CLEAR(self->handle_end_ns);
|
||||
Py_CLEAR(self->handle_doctype);
|
||||
|
||||
Py_CLEAR(self->target);
|
||||
|
|
Loading…
Reference in New Issue