bpo-28238: Implement "{*}tag" and "{ns}*" wildcard tag selection support for ElementPath, and extend the surrounding tests and docs. (GH-12997)
This commit is contained in:
parent
cf48e55f7f
commit
47541689cc
|
@ -399,6 +399,12 @@ module. We'll be using the ``countrydata`` XML document from the
|
||||||
# All 'neighbor' nodes that are the second child of their parent
|
# All 'neighbor' nodes that are the second child of their parent
|
||||||
root.findall(".//neighbor[2]")
|
root.findall(".//neighbor[2]")
|
||||||
|
|
||||||
|
For XML with namespaces, use the usual qualified ``{namespace}tag`` notation::
|
||||||
|
|
||||||
|
# All dublin-core "title" tags in the document
|
||||||
|
root.findall(".//{http://purl.org/dc/elements/1.1/}title")
|
||||||
|
|
||||||
|
|
||||||
Supported XPath syntax
|
Supported XPath syntax
|
||||||
^^^^^^^^^^^^^^^^^^^^^^
|
^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
@ -411,9 +417,16 @@ Supported XPath syntax
|
||||||
| | For example, ``spam`` selects all child elements |
|
| | For example, ``spam`` selects all child elements |
|
||||||
| | named ``spam``, and ``spam/egg`` selects all |
|
| | named ``spam``, and ``spam/egg`` selects all |
|
||||||
| | grandchildren named ``egg`` in all children named |
|
| | grandchildren named ``egg`` in all children named |
|
||||||
| | ``spam``. |
|
| | ``spam``. ``{namespace}*`` selects all tags in the |
|
||||||
|
| | given namespace, ``{*}spam`` selects tags named |
|
||||||
|
| | ``spam`` in any (or no) namespace, and ``{}*`` |
|
||||||
|
| | only selects tags that are not in a namespace. |
|
||||||
|
| | |
|
||||||
|
| | .. versionchanged:: 3.8 |
|
||||||
|
| | Support for star-wildcards was added. |
|
||||||
+-----------------------+------------------------------------------------------+
|
+-----------------------+------------------------------------------------------+
|
||||||
| ``*`` | Selects all child elements. For example, ``*/egg`` |
|
| ``*`` | Selects all child elements, including comments and |
|
||||||
|
| | processing instructions. For example, ``*/egg`` |
|
||||||
| | selects all grandchildren named ``egg``. |
|
| | selects all grandchildren named ``egg``. |
|
||||||
+-----------------------+------------------------------------------------------+
|
+-----------------------+------------------------------------------------------+
|
||||||
| ``.`` | Selects the current node. This is mostly useful |
|
| ``.`` | Selects the current node. This is mostly useful |
|
||||||
|
|
|
@ -532,6 +532,11 @@ xml
|
||||||
external entities by default.
|
external entities by default.
|
||||||
(Contributed by Christian Heimes in :issue:`17239`.)
|
(Contributed by Christian Heimes in :issue:`17239`.)
|
||||||
|
|
||||||
|
* The ``.find*()`` methods in the :mod:`xml.etree.ElementTree` module
|
||||||
|
support wildcard searches like ``{*}tag`` which ignores the namespace
|
||||||
|
and ``{namespace}*`` which returns all tags in the given namespace.
|
||||||
|
(Contributed by Stefan Behnel in :issue:`28238`.)
|
||||||
|
|
||||||
* The :mod:`xml.etree.ElementTree` module provides a new function
|
* The :mod:`xml.etree.ElementTree` module provides a new function
|
||||||
:func:`~xml.etree.ElementTree.canonicalize()` that implements C14N 2.0.
|
:func:`~xml.etree.ElementTree.canonicalize()` that implements C14N 2.0.
|
||||||
(Contributed by Stefan Behnel in :issue:`13611`.)
|
(Contributed by Stefan Behnel in :issue:`13611`.)
|
||||||
|
|
|
@ -1137,16 +1137,21 @@ class ElementTreeTest(unittest.TestCase):
|
||||||
def test_xpath_tokenizer(self):
|
def test_xpath_tokenizer(self):
|
||||||
# Test the XPath tokenizer.
|
# Test the XPath tokenizer.
|
||||||
from xml.etree import ElementPath
|
from xml.etree import ElementPath
|
||||||
def check(p, expected):
|
def check(p, expected, namespaces=None):
|
||||||
self.assertEqual([op or tag
|
self.assertEqual([op or tag
|
||||||
for op, tag in ElementPath.xpath_tokenizer(p)],
|
for op, tag in ElementPath.xpath_tokenizer(p, namespaces)],
|
||||||
expected)
|
expected)
|
||||||
|
|
||||||
# tests from the xml specification
|
# tests from the xml specification
|
||||||
check("*", ['*'])
|
check("*", ['*'])
|
||||||
|
check("{ns}*", ['{ns}*'])
|
||||||
|
check("{}*", ['{}*'])
|
||||||
|
check("{*}tag", ['{*}tag'])
|
||||||
|
check("{*}*", ['{*}*'])
|
||||||
check("text()", ['text', '()'])
|
check("text()", ['text', '()'])
|
||||||
check("@name", ['@', 'name'])
|
check("@name", ['@', 'name'])
|
||||||
check("@*", ['@', '*'])
|
check("@*", ['@', '*'])
|
||||||
|
check("@{ns}attr", ['@', '{ns}attr'])
|
||||||
check("para[1]", ['para', '[', '1', ']'])
|
check("para[1]", ['para', '[', '1', ']'])
|
||||||
check("para[last()]", ['para', '[', 'last', '()', ']'])
|
check("para[last()]", ['para', '[', 'last', '()', ']'])
|
||||||
check("*/para", ['*', '/', 'para'])
|
check("*/para", ['*', '/', 'para'])
|
||||||
|
@ -1158,6 +1163,7 @@ class ElementTreeTest(unittest.TestCase):
|
||||||
check("//olist/item", ['//', 'olist', '/', 'item'])
|
check("//olist/item", ['//', 'olist', '/', 'item'])
|
||||||
check(".", ['.'])
|
check(".", ['.'])
|
||||||
check(".//para", ['.', '//', 'para'])
|
check(".//para", ['.', '//', 'para'])
|
||||||
|
check(".//{*}tag", ['.', '//', '{*}tag'])
|
||||||
check("..", ['..'])
|
check("..", ['..'])
|
||||||
check("../@lang", ['..', '/', '@', 'lang'])
|
check("../@lang", ['..', '/', '@', 'lang'])
|
||||||
check("chapter[title]", ['chapter', '[', 'title', ']'])
|
check("chapter[title]", ['chapter', '[', 'title', ']'])
|
||||||
|
@ -1168,6 +1174,8 @@ class ElementTreeTest(unittest.TestCase):
|
||||||
check("{http://spam}egg", ['{http://spam}egg'])
|
check("{http://spam}egg", ['{http://spam}egg'])
|
||||||
check("./spam.egg", ['.', '/', 'spam.egg'])
|
check("./spam.egg", ['.', '/', 'spam.egg'])
|
||||||
check(".//{http://spam}egg", ['.', '//', '{http://spam}egg'])
|
check(".//{http://spam}egg", ['.', '//', '{http://spam}egg'])
|
||||||
|
check("./xsd:type", ['.', '/', '{http://www.w3.org/2001/XMLSchema}type'],
|
||||||
|
{'xsd': 'http://www.w3.org/2001/XMLSchema'})
|
||||||
|
|
||||||
def test_processinginstruction(self):
|
def test_processinginstruction(self):
|
||||||
# Test ProcessingInstruction directly
|
# Test ProcessingInstruction directly
|
||||||
|
@ -2669,6 +2677,50 @@ class ElementFindTest(unittest.TestCase):
|
||||||
self.assertEqual(len(root.findall(".//xx:b", namespaces=nsmap)), 2)
|
self.assertEqual(len(root.findall(".//xx:b", namespaces=nsmap)), 2)
|
||||||
self.assertEqual(len(root.findall(".//b", namespaces=nsmap)), 1)
|
self.assertEqual(len(root.findall(".//b", namespaces=nsmap)), 1)
|
||||||
|
|
||||||
|
def test_findall_wildcard(self):
    # Exercise the '{*}tag', '{ns}*', '{}*' and '{*}*' selectors, first on
    # direct children and then on descendants ('.//').
    doc = ET.XML('''
        <a xmlns:x="X" xmlns:y="Y">
          <x:b><c/></x:b>
          <b/>
          <c><x:b/><b/></c><y:b/>
        </a>''')
    doc.append(ET.Comment('test'))

    def found(path):
        # Summary of everything findall() returns for *path*.
        return summarize_list(doc.findall(path))

    # (path, expected summary) pairs with literal expectations.
    expectations = [
        ("{*}b", ['{X}b', 'b', '{Y}b']),
        ("{*}c", ['c']),
        ("{X}*", ['{X}b']),
        ("{Y}*", ['{Y}b']),
        ("{}*", ['b', 'c']),
        ("{}b", ['b']),  # only for consistency
        ("{*}*", ['{X}b', 'b', 'c', '{Y}b']),
        (".//{*}b", ['{X}b', 'b', '{X}b', 'b', '{Y}b']),
        (".//{*}c", ['c', 'c']),
        (".//{X}*", ['{X}b', '{X}b']),
        (".//{Y}*", ['{Y}b']),
        (".//{}*", ['c', 'b', 'c', 'b']),
        (".//{}b", ['b', 'b']),  # only for consistency
    ]
    for path, expected in expectations:
        self.assertEqual(found(path), expected)

    # An empty namespace prefix must behave exactly like the bare tag.
    for with_empty_ns, bare in [("{}b", "b"), (".//{}b", ".//b")]:
        self.assertEqual(found(with_empty_ns), found(bare))

    # This is an unfortunate difference, but that's how find('*') works:
    # '*' also returns the trailing comment node, '{*}*' does not.
    star_plus_comment = doc.findall("{*}*") + [doc[-1]]
    self.assertEqual(summarize_list(star_plus_comment), found("*"))
|
||||||
|
|
||||||
def test_bad_find(self):
|
def test_bad_find(self):
|
||||||
e = ET.XML(SAMPLE_XML)
|
e = ET.XML(SAMPLE_XML)
|
||||||
with self.assertRaisesRegex(SyntaxError, 'cannot use absolute path'):
|
with self.assertRaisesRegex(SyntaxError, 'cannot use absolute path'):
|
||||||
|
|
|
@ -99,13 +99,70 @@ def get_parent_map(context):
|
||||||
parent_map[e] = p
|
parent_map[e] = p
|
||||||
return parent_map
|
return parent_map
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _is_wildcard_tag(tag):
    """Return True if *tag* uses star-wildcard syntax.

    Two spellings count as wildcards: a leading ``{*}`` (wildcard in the
    namespace position) and a trailing ``}*`` (wildcard in the name position).
    """
    if tag[:3] == '{*}':
        return True
    return tag[-2:] == '}*'
|
||||||
|
|
||||||
|
|
||||||
|
def _prepare_tag(tag):
    """Build a selector generator function for a wildcard tag pattern.

    Supported patterns:

    * ``{*}*``     -- every element whose tag is a string
    * ``{}*``      -- every element whose string tag does not start with '{'
    * ``{*}name``  -- ``name`` itself, or any string tag ending in ``}name``
    * ``{ns}*``    -- every string tag starting with ``{ns}``

    Returns a function ``select(context, result)`` that lazily yields the
    matching elements from the iterable *result*; *context* is accepted
    but not used by the selectors built here.

    Raises RuntimeError if *tag* is none of the patterns above.
    """
    # Bind the builtins to locals once; the selectors below call them for
    # every candidate element.
    _isinstance, _str = isinstance, str
    if tag == '{*}*':
        # Same as '*', but no comments or processing instructions.
        # It can be a surprise that '*' includes those, but there is no
        # justification for '{*}*' doing the same.
        def select(context, result):
            for elem in result:
                if _isinstance(elem.tag, _str):
                    yield elem
    elif tag == '{}*':
        # Any tag that is not in a namespace.
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                # Slice instead of indexing so that an empty-string tag is
                # treated as "no namespace" rather than raising IndexError.
                if _isinstance(el_tag, _str) and el_tag[:1] != '{':
                    yield elem
    elif tag[:3] == '{*}':
        # The tag in any (or no) namespace.
        suffix = tag[2:]  # '}name'
        no_ns = slice(-len(suffix), None)
        tag = tag[3:]
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                # Either the bare name (no namespace), or a qualified name
                # whose tail is '}name'.
                if el_tag == tag or (
                        _isinstance(el_tag, _str) and el_tag[no_ns] == suffix):
                    yield elem
    elif tag[-2:] == '}*':
        # Any tag in the given namespace.
        ns = tag[:-1]  # '{ns}'
        ns_only = slice(None, len(ns))
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                if _isinstance(el_tag, _str) and el_tag[ns_only] == ns:
                    yield elem
    else:
        raise RuntimeError(f"internal parser error, got {tag}")
    return select
|
||||||
|
|
||||||
|
|
||||||
def prepare_child(next, token):
|
def prepare_child(next, token):
|
||||||
tag = token[1]
|
tag = token[1]
|
||||||
def select(context, result):
|
if _is_wildcard_tag(tag):
|
||||||
for elem in result:
|
select_tag = _prepare_tag(tag)
|
||||||
for e in elem:
|
def select(context, result):
|
||||||
if e.tag == tag:
|
def select_child(result):
|
||||||
yield e
|
for elem in result:
|
||||||
|
yield from elem
|
||||||
|
return select_tag(context, select_child(result))
|
||||||
|
else:
|
||||||
|
if tag[:2] == '{}':
|
||||||
|
tag = tag[2:] # '{}tag' == 'tag'
|
||||||
|
def select(context, result):
|
||||||
|
for elem in result:
|
||||||
|
for e in elem:
|
||||||
|
if e.tag == tag:
|
||||||
|
yield e
|
||||||
return select
|
return select
|
||||||
|
|
||||||
def prepare_star(next, token):
|
def prepare_star(next, token):
|
||||||
|
@ -130,11 +187,24 @@ def prepare_descendant(next, token):
|
||||||
tag = token[1]
|
tag = token[1]
|
||||||
else:
|
else:
|
||||||
raise SyntaxError("invalid descendant")
|
raise SyntaxError("invalid descendant")
|
||||||
def select(context, result):
|
|
||||||
for elem in result:
|
if _is_wildcard_tag(tag):
|
||||||
for e in elem.iter(tag):
|
select_tag = _prepare_tag(tag)
|
||||||
if e is not elem:
|
def select(context, result):
|
||||||
yield e
|
def select_child(result):
|
||||||
|
for elem in result:
|
||||||
|
for e in elem.iter():
|
||||||
|
if e is not elem:
|
||||||
|
yield e
|
||||||
|
return select_tag(context, select_child(result))
|
||||||
|
else:
|
||||||
|
if tag[:2] == '{}':
|
||||||
|
tag = tag[2:] # '{}tag' == 'tag'
|
||||||
|
def select(context, result):
|
||||||
|
for elem in result:
|
||||||
|
for e in elem.iter(tag):
|
||||||
|
if e is not elem:
|
||||||
|
yield e
|
||||||
return select
|
return select
|
||||||
|
|
||||||
def prepare_parent(next, token):
|
def prepare_parent(next, token):
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
The ``.find*()`` methods of xml.etree.ElementTree can now search for
|
||||||
|
wildcards like ``{*}tag`` and ``{ns}*`` that match a tag in any namespace
|
||||||
|
or all tags in a namespace. Patch by Stefan Behnel.
|
|
@ -1149,6 +1149,13 @@ checkpath(PyObject* tag)
|
||||||
const Py_ssize_t len = PyUnicode_GET_LENGTH(tag);
|
const Py_ssize_t len = PyUnicode_GET_LENGTH(tag);
|
||||||
void *data = PyUnicode_DATA(tag);
|
void *data = PyUnicode_DATA(tag);
|
||||||
unsigned int kind = PyUnicode_KIND(tag);
|
unsigned int kind = PyUnicode_KIND(tag);
|
||||||
|
if (len >= 3 && PyUnicode_READ(kind, data, 0) == '{' && (
|
||||||
|
PyUnicode_READ(kind, data, 1) == '}' || (
|
||||||
|
PyUnicode_READ(kind, data, 1) == '*' &&
|
||||||
|
PyUnicode_READ(kind, data, 2) == '}'))) {
|
||||||
|
/* wildcard: '{}tag' or '{*}tag' */
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
for (i = 0; i < len; i++) {
|
for (i = 0; i < len; i++) {
|
||||||
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||||
if (ch == '{')
|
if (ch == '{')
|
||||||
|
@ -1162,7 +1169,13 @@ checkpath(PyObject* tag)
|
||||||
}
|
}
|
||||||
if (PyBytes_Check(tag)) {
|
if (PyBytes_Check(tag)) {
|
||||||
char *p = PyBytes_AS_STRING(tag);
|
char *p = PyBytes_AS_STRING(tag);
|
||||||
for (i = 0; i < PyBytes_GET_SIZE(tag); i++) {
|
const Py_ssize_t len = PyBytes_GET_SIZE(tag);
|
||||||
|
if (len >= 3 && p[0] == '{' && (
|
||||||
|
p[1] == '}' || p[1] == '*' && p[2] == '}')) {
|
||||||
|
/* wildcard: '{}tag' or '{*}tag' */
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
for (i = 0; i < len; i++) {
|
||||||
if (p[i] == '{')
|
if (p[i] == '{')
|
||||||
check = 0;
|
check = 0;
|
||||||
else if (p[i] == '}')
|
else if (p[i] == '}')
|
||||||
|
|
Loading…
Reference in New Issue