bpo-28238: Implement "{*}tag" and "{ns}*" wildcard tag selection support for ElementPath, and extend the surrounding tests and docs. (GH-12997)

2019-05-03 20:58:16 +02:00 · 2019-05-03 20:58:16 +02:00 · 47541689cc
parent cf48e55f7f
commit 47541689cc
6 changed files with 171 additions and 15 deletions
--- a/Doc/library/xml.etree.elementtree.rst
+++ b/Doc/library/xml.etree.elementtree.rst
@ -399,6 +399,12 @@ module.  We'll be using the ``countrydata`` XML document from the
   # All 'neighbor' nodes that are the second child of their parent
   root.findall(".//neighbor[2]")
 For XML with namespaces, use the usual qualified ``{namespace}tag`` notation::
   # All dublin-core "title" tags in the document
   root.findall(".//{http://purl.org/dc/elements/1.1/}title")
 Supported XPath syntax
 ^^^^^^^^^^^^^^^^^^^^^^
@ -411,9 +417,16 @@ Supported XPath syntax
 |                       | For example, ``spam`` selects all child elements     |
 |                       | named ``spam``, and ``spam/egg`` selects all         |
 |                       | grandchildren named ``egg`` in all children named    |
-|                       | ``spam``.                                            |
+|                       | ``spam``.  ``{namespace}*`` selects all tags in the  |
 |                       | given namespace, ``{*}spam`` selects tags named      |
 |                       | ``spam`` in any (or no) namespace, and ``{}*``       |
 |                       | only selects tags that are not in a namespace.       |
 |                       |                                                      |
 |                       | .. versionchanged:: 3.8                              |
 |                       |    Support for star-wildcards was added.             |
 +-----------------------+------------------------------------------------------+
-| ``*``                 | Selects all child elements.  For example, ``*/egg``  |
+| ``*``                 | Selects all child elements, including comments and   |
 |                       | processing instructions.  For example, ``*/egg``     |
 |                       | selects all grandchildren named ``egg``.             |
 +-----------------------+------------------------------------------------------+
 | ``.``                 | Selects the current node.  This is mostly useful     |
--- a/Doc/whatsnew/3.8.rst
+++ b/Doc/whatsnew/3.8.rst
@ -532,6 +532,11 @@ xml
  external entities by default.
  (Contributed by Christian Heimes in :issue:`17239`.)
 * The ``.find*()`` methods in the :mod:`xml.etree.ElementTree` module
  support wildcard searches like ``{*}tag`` which ignores the namespace
  and ``{namespace}*`` which returns all tags in the given namespace.
  (Contributed by Stefan Behnel in :issue:`28238`.)
 * The :mod:`xml.etree.ElementTree` module provides a new function
  :func:`~xml.etree.ElementTree.canonicalize()` that implements C14N 2.0.
  (Contributed by Stefan Behnel in :issue:`13611`.)
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@ -1137,16 +1137,21 @@ class ElementTreeTest(unittest.TestCase):
    def test_xpath_tokenizer(self):
        # Test the XPath tokenizer.
        from xml.etree import ElementPath
-        def check(p, expected):
+        def check(p, expected, namespaces=None):
            self.assertEqual([op or tag
-                              for op, tag in ElementPath.xpath_tokenizer(p)],
+                              for op, tag in ElementPath.xpath_tokenizer(p, namespaces)],
                             expected)
        # tests from the xml specification
        check("*", ['*'])
        check("{ns}*", ['{ns}*'])
        check("{}*", ['{}*'])
        check("{*}tag", ['{*}tag'])
        check("{*}*", ['{*}*'])
        check("text()", ['text', '()'])
        check("@name", ['@', 'name'])
        check("@*", ['@', '*'])
        check("@{ns}attr", ['@', '{ns}attr'])
        check("para[1]", ['para', '[', '1', ']'])
        check("para[last()]", ['para', '[', 'last', '()', ']'])
        check("*/para", ['*', '/', 'para'])
@ -1158,6 +1163,7 @@ class ElementTreeTest(unittest.TestCase):
        check("//olist/item", ['//', 'olist', '/', 'item'])
        check(".", ['.'])
        check(".//para", ['.', '//', 'para'])
        check(".//{*}tag", ['.', '//', '{*}tag'])
        check("..", ['..'])
        check("../@lang", ['..', '/', '@', 'lang'])
        check("chapter[title]", ['chapter', '[', 'title', ']'])
@ -1168,6 +1174,8 @@ class ElementTreeTest(unittest.TestCase):
        check("{http://spam}egg", ['{http://spam}egg'])
        check("./spam.egg", ['.', '/', 'spam.egg'])
        check(".//{http://spam}egg", ['.', '//', '{http://spam}egg'])
        check("./xsd:type", ['.', '/', '{http://www.w3.org/2001/XMLSchema}type'],
              {'xsd': 'http://www.w3.org/2001/XMLSchema'})
    def test_processinginstruction(self):
        # Test ProcessingInstruction directly
@ -2669,6 +2677,50 @@ class ElementFindTest(unittest.TestCase):
        self.assertEqual(len(root.findall(".//xx:b", namespaces=nsmap)), 2)
        self.assertEqual(len(root.findall(".//b", namespaces=nsmap)), 1)
    def test_findall_wildcard(self):
        """Exercise findall() with '{*}tag', '{ns}*', '{}*' and '{*}*' paths."""
        root = ET.XML('''
            <a xmlns:x="X" xmlns:y="Y">
                <x:b><c/></x:b>
                <b/>
                <c><x:b/><b/></c><y:b/>
            </a>''')
        # A trailing comment child, to show how '*' and '{*}*' differ.
        root.append(ET.Comment('test'))

        def found(path):
            # Summary of everything findall() returns for *path*.
            return summarize_list(root.findall(path))

        check = self.assertEqual

        # --- Direct children of the root ---
        check(found("{*}b"), ['{X}b', 'b', '{Y}b'])
        check(found("{*}c"), ['c'])

        check(found("{X}*"), ['{X}b'])
        check(found("{Y}*"), ['{Y}b'])
        check(found("{}*"), ['b', 'c'])
        # '{}b' is accepted only for consistency; it means plain 'b'.
        check(found("{}b"), ['b'])
        check(found("{}b"), found("b"))
        check(found("{*}*"), ['{X}b', 'b', 'c', '{Y}b'])
        # Unfortunate difference: find('*') also returns the comment node,
        # while '{*}*' does not, so add it back before comparing.
        check(summarize_list(root.findall("{*}*") + [root[-1]]),
              found("*"))

        # --- All descendants ---
        check(found(".//{*}b"), ['{X}b', 'b', '{X}b', 'b', '{Y}b'])
        check(found(".//{*}c"), ['c', 'c'])

        check(found(".//{X}*"), ['{X}b', '{X}b'])
        check(found(".//{Y}*"), ['{Y}b'])
        check(found(".//{}*"), ['c', 'b', 'c', 'b'])
        # Again only for consistency: './/{}b' means './/b'.
        check(found(".//{}b"), ['b', 'b'])
        check(found(".//{}b"), found(".//b"))
    def test_bad_find(self):
        e = ET.XML(SAMPLE_XML)
        with self.assertRaisesRegex(SyntaxError, 'cannot use absolute path'):
--- a/Lib/xml/etree/ElementPath.py
+++ b/Lib/xml/etree/ElementPath.py
@ -99,13 +99,70 @@ def get_parent_map(context):
                parent_map[e] = p
    return parent_map
 def _is_wildcard_tag(tag):
    """Return True if *tag* is written with a star wildcard.

    Two spellings count: a leading '{*}' (wildcard namespace) and a
    trailing '}*' (wildcard local name after a '{...}' part).
    """
    if tag[:3] == '{*}':
        return True
    return tag[-2:] == '}*'
 def _prepare_tag(tag):
    """Build a filter for one wildcard tag pattern.

    Supported patterns:
      '{*}*'     any element with a string tag
      '{}*'      any element whose tag is not in a namespace
      '{*}name'  'name' in any namespace, or in none
      '{ns}*'    any element in namespace 'ns'

    Returns a generator function ``select(context, result)`` that yields
    the elements of *result* whose tag matches.  Elements with a
    non-string tag never match a wildcard pattern.

    Raises RuntimeError if *tag* is none of the patterns above.
    """
    # Bound to locals because they are called once per candidate element.
    _isinstance, _str = isinstance, str
    if tag == '{*}*':
        # Same as '*', but no comments or processing instructions.
        # It can be a surprise that '*' includes those, but there is no
        # justification for '{*}*' doing the same.
        def select(context, result):
            for elem in result:
                if _isinstance(elem.tag, _str):
                    yield elem
    elif tag == '{}*':
        # Any tag that is not in a namespace.
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                # Slice rather than index: an empty tag string is not in a
                # namespace either and must not raise IndexError.
                if _isinstance(el_tag, _str) and el_tag[:1] != '{':
                    yield elem
    elif tag[:3] == '{*}':
        # The tag in any (or no) namespace.
        suffix = tag[2:]  # '}name'
        no_ns = slice(-len(suffix), None)
        tag = tag[3:]
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                # Either the bare name, or any '{...}name'.
                if el_tag == tag or _isinstance(el_tag, _str) and el_tag[no_ns] == suffix:
                    yield elem
    elif tag[-2:] == '}*':
        # Any tag in the given namespace.
        ns = tag[:-1]  # '{ns}'
        ns_only = slice(None, len(ns))
        def select(context, result):
            for elem in result:
                el_tag = elem.tag
                if _isinstance(el_tag, _str) and el_tag[ns_only] == ns:
                    yield elem
    else:
        raise RuntimeError(f"internal parser error, got {tag}")
    return select
 def prepare_child(next, token):
    tag = token[1]
-    def select(context, result):
+    if _is_wildcard_tag(tag):
-        for elem in result:
+        select_tag = _prepare_tag(tag)
-            for e in elem:
+        def select(context, result):
-                if e.tag == tag:
+            def select_child(result):
-                    yield e
+                for elem in result:
                    yield from elem
            return select_tag(context, select_child(result))
    else:
        if tag[:2] == '{}':
            tag = tag[2:]  # '{}tag' == 'tag'
        def select(context, result):
            for elem in result:
                for e in elem:
                    if e.tag == tag:
                        yield e
    return select
 def prepare_star(next, token):
@ -130,11 +187,24 @@ def prepare_descendant(next, token):
        tag = token[1]
    else:
        raise SyntaxError("invalid descendant")
-    def select(context, result):
+
-        for elem in result:
+    if _is_wildcard_tag(tag):
-            for e in elem.iter(tag):
+        select_tag = _prepare_tag(tag)
-                if e is not elem:
+        def select(context, result):
-                    yield e
+            def select_child(result):
                for elem in result:
                    for e in elem.iter():
                        if e is not elem:
                            yield e
            return select_tag(context, select_child(result))
    else:
        if tag[:2] == '{}':
            tag = tag[2:]  # '{}tag' == 'tag'
        def select(context, result):
            for elem in result:
                for e in elem.iter(tag):
                    if e is not elem:
                        yield e
    return select
 def prepare_parent(next, token):
--- a/Misc/NEWS.d/next/Library/2019-04-28-15-01-29.bpo-28238.gdk38f.rst
+++ b/Misc/NEWS.d/next/Library/2019-04-28-15-01-29.bpo-28238.gdk38f.rst
@ -0,0 +1,3 @@
 The ``.find*()`` methods of xml.etree.ElementTree can now search for
 wildcards like ``{*}tag`` and ``{ns}*`` that match a tag in any namespace
 or all tags in a namespace.  Patch by Stefan Behnel.
--- a/Modules/_elementtree.c
+++ b/Modules/_elementtree.c
@ -1149,6 +1149,13 @@ checkpath(PyObject* tag)
        const Py_ssize_t len = PyUnicode_GET_LENGTH(tag);
        void *data = PyUnicode_DATA(tag);
        unsigned int kind = PyUnicode_KIND(tag);
        if (len >= 3 && PyUnicode_READ(kind, data, 0) == '{' && (
                PyUnicode_READ(kind, data, 1) == '}' || (
                PyUnicode_READ(kind, data, 1) == '*' &&
                PyUnicode_READ(kind, data, 2) == '}'))) {
            /* wildcard: '{}tag' or '{*}tag' */
            return 1;
        }
        for (i = 0; i < len; i++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch == '{')
@ -1162,7 +1169,13 @@ checkpath(PyObject* tag)
    }
    if (PyBytes_Check(tag)) {
        char *p = PyBytes_AS_STRING(tag);
-        for (i = 0; i < PyBytes_GET_SIZE(tag); i++) {
+        const Py_ssize_t len = PyBytes_GET_SIZE(tag);
        if (len >= 3 && p[0] == '{' && (
                p[1] == '}' || p[1] == '*' && p[2] == '}')) {
            /* wildcard: '{}tag' or '{*}tag' */
            return 1;
        }
        for (i = 0; i < len; i++) {
            if (p[i] == '{')
                check = 0;
            else if (p[i] == '}')