From 64d11e60f23f6b1435704adb87ebf818e5a4c0c1 Mon Sep 17 00:00:00 2001 From: Eli Bendersky Date: Fri, 15 Jun 2012 07:42:50 +0300 Subject: [PATCH] Replace the iter/itertext methods of Element in _elementtree with true C implementations, instead of the bootstrapped Python code. In addition to being cleaner (removing the last remains of the bootstrapping code in _elementtree), this gives a 10x performance boost for iter() on large documents. Also reorganized the tests a bit to be more robust. --- Lib/test/test_xml_etree.py | 247 +++++++++++++----------- Lib/test/test_xml_etree_c.py | 28 +-- Lib/xml/etree/ElementTree.py | 6 +- Modules/_elementtree.c | 362 ++++++++++++++++++++++++++--------- 4 files changed, 414 insertions(+), 229 deletions(-) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 49a5633a830..bee63292568 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -23,7 +23,8 @@ import weakref from test import support from test.support import findfile, import_fresh_module, gc_collect -pyET = import_fresh_module('xml.etree.ElementTree', blocked=['_elementtree']) +pyET = None +ET = None SIMPLE_XMLFILE = findfile("simple.xml", subdir="xmltestdata") try: @@ -209,10 +210,8 @@ def interface(): These methods return an iterable. See bug 6472. - >>> check_method(element.iter("tag").__next__) >>> check_method(element.iterfind("tag").__next__) >>> check_method(element.iterfind("*").__next__) - >>> check_method(tree.iter("tag").__next__) >>> check_method(tree.iterfind("tag").__next__) >>> check_method(tree.iterfind("*").__next__) @@ -291,42 +290,6 @@ def cdata(): 'hello' """ -# Only with Python implementation -def simplefind(): - """ - Test find methods using the elementpath fallback. - - >>> ElementTree = pyET - - >>> CurrentElementPath = ElementTree.ElementPath - >>> ElementTree.ElementPath = ElementTree._SimpleElementPath() - >>> elem = ElementTree.XML(SAMPLE_XML) - >>> elem.find("tag").tag - 'tag' - >>> ElementTree.ElementTree(elem).find("tag").tag - 'tag' - >>> elem.findtext("tag") - 'text' - >>> elem.findtext("tog") - >>> elem.findtext("tog", "default") - 'default' - >>> ElementTree.ElementTree(elem).findtext("tag") - 'text' - >>> summarize_list(elem.findall("tag")) - ['tag', 'tag'] - >>> summarize_list(elem.findall(".//tag")) - ['tag', 'tag', 'tag'] - - Path syntax doesn't work in this case. - - >>> elem.find("section/tag") - >>> elem.findtext("section/tag") - >>> summarize_list(elem.findall("section/tag")) - [] - - >>> ElementTree.ElementPath = CurrentElementPath - """ - def find(): """ Test find methods (including xpath syntax). @@ -1002,36 +965,6 @@ def methods(): '1 < 2\n' """ -def iterators(): - """ - Test iterators. - - >>> e = ET.XML("this is a paragraph...") - >>> summarize_list(e.iter()) - ['html', 'body', 'i'] - >>> summarize_list(e.find("body").iter()) - ['body', 'i'] - >>> summarize(next(e.iter())) - 'html' - >>> "".join(e.itertext()) - 'this is a paragraph...' - >>> "".join(e.find("body").itertext()) - 'this is a paragraph.' - >>> next(e.itertext()) - 'this is a ' - - Method iterparse should return an iterator. See bug 6472. - - >>> sourcefile = serialize(e, to_string=False) - >>> next(ET.iterparse(sourcefile)) # doctest: +ELLIPSIS - ('end', ) - - >>> tree = ET.ElementTree(None) - >>> tree.iter() - Traceback (most recent call last): - AttributeError: 'NoneType' object has no attribute 'iter' - """ - ENTITY_XML = """\ @@ -1339,6 +1272,7 @@ XINCLUDE["default.xml"] = """\ """.format(html.escape(SIMPLE_XMLFILE, True)) + def xinclude_loader(href, parse="xml", encoding=None): try: data = XINCLUDE[href] @@ -1411,22 +1345,6 @@ def xinclude(): >>> # print(serialize(document)) # C5 """ -def xinclude_default(): - """ - >>> from xml.etree import ElementInclude - - >>> document = xinclude_loader("default.xml") - >>> ElementInclude.include(document) - >>> print(serialize(document)) # default - -

Example.

- - text - texttail - - -
- """ # # badly formatted xi:include tags @@ -1917,9 +1835,8 @@ class ElementTreeTest(unittest.TestCase): self.assertIsInstance(ET.QName, type) self.assertIsInstance(ET.ElementTree, type) self.assertIsInstance(ET.Element, type) - # XXX issue 14128 with C ElementTree - # self.assertIsInstance(ET.TreeBuilder, type) - # self.assertIsInstance(ET.XMLParser, type) + self.assertIsInstance(ET.TreeBuilder, type) + self.assertIsInstance(ET.XMLParser, type) def test_Element_subclass_trivial(self): class MyElement(ET.Element): @@ -1953,6 +1870,73 @@ class ElementTreeTest(unittest.TestCase): self.assertEqual(mye.newmethod(), 'joe') +class ElementIterTest(unittest.TestCase): + def _ilist(self, elem, tag=None): + return summarize_list(elem.iter(tag)) + + def test_basic(self): + doc = ET.XML("this is a paragraph...") + self.assertEqual(self._ilist(doc), ['html', 'body', 'i']) + self.assertEqual(self._ilist(doc.find('body')), ['body', 'i']) + self.assertEqual(next(doc.iter()).tag, 'html') + self.assertEqual(''.join(doc.itertext()), 'this is a paragraph...') + self.assertEqual(''.join(doc.find('body').itertext()), + 'this is a paragraph.') + self.assertEqual(next(doc.itertext()), 'this is a ') + + # iterparse should return an iterator + sourcefile = serialize(doc, to_string=False) + self.assertEqual(next(ET.iterparse(sourcefile))[0], 'end') + + tree = ET.ElementTree(None) + self.assertRaises(AttributeError, tree.iter) + + def test_corners(self): + # single root, no subelements + a = ET.Element('a') + self.assertEqual(self._ilist(a), ['a']) + + # one child + b = ET.SubElement(a, 'b') + self.assertEqual(self._ilist(a), ['a', 'b']) + + # one child and one grandchild + c = ET.SubElement(b, 'c') + self.assertEqual(self._ilist(a), ['a', 'b', 'c']) + + # two children, only first with grandchild + d = ET.SubElement(a, 'd') + self.assertEqual(self._ilist(a), ['a', 'b', 'c', 'd']) + + # replace first child by second + a[0] = a[1] + del a[1] + self.assertEqual(self._ilist(a), ['a', 'd']) + + def test_iter_by_tag(self): + doc = ET.XML(''' + + + bedroom1 + bedroom2 + + nothing here + + + bedroom8 + + ''') + + self.assertEqual(self._ilist(doc, 'room'), ['room'] * 3) + self.assertEqual(self._ilist(doc, 'house'), ['house'] * 2) + + # make sure both tag=None and tag='*' return all tags + all_tags = ['document', 'house', 'room', 'room', + 'shed', 'house', 'room'] + self.assertEqual(self._ilist(doc), all_tags) + self.assertEqual(self._ilist(doc, '*'), all_tags) + + class TreeBuilderTest(unittest.TestCase): sample1 = (' +

Example.

+ + text + texttail + + +''') class XMLParserTest(unittest.TestCase): sample1 = '22' sample2 = ('>> cElementTree = cET - >>> e = cElementTree.Element('a') - >>> getattr(e, '\uD800') # doctest: +ELLIPSIS - Traceback (most recent call last): - ... - UnicodeEncodeError: ... - - >>> p = cElementTree.XMLParser() - >>> p.version.split()[0] - 'Expat' - >>> getattr(p, '\uD800') - Traceback (most recent call last): - ... - AttributeError: 'XMLParser' object has no attribute '\ud800' - """ - - class MiscTests(unittest.TestCase): # Issue #8651. @support.bigmemtest(size=support._2G + 100, memuse=1) @@ -46,6 +21,7 @@ class MiscTests(unittest.TestCase): finally: data = None + @unittest.skipUnless(cET, 'requires _elementtree') class TestAliasWorking(unittest.TestCase): # Test that the cET alias module is alive @@ -53,6 +29,7 @@ class TestAliasWorking(unittest.TestCase): e = cET_alias.Element('foo') self.assertEqual(e.tag, 'foo') + @unittest.skipUnless(cET, 'requires _elementtree') class TestAcceleratorImported(unittest.TestCase): # Test that the C accelerator was imported, as expected @@ -67,7 +44,6 @@ def test_main(): from test import test_xml_etree, test_xml_etree_c # Run the tests specific to the C implementation - support.run_doctest(test_xml_etree_c, verbosity=True) support.run_unittest( MiscTests, TestAliasWorking, diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index e068fc2443d..47766253e54 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -916,11 +916,7 @@ def _namespaces(elem, default_namespace=None): _raise_serialization_error(qname) # populate qname and namespaces table - try: - iterate = elem.iter - except AttributeError: - iterate = elem.getiterator # cET compatibility - for elem in iterate(): + for elem in elem.iter(): tag = elem.tag if isinstance(tag, QName): if tag.text not in qnames: diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c index 103e778eea7..f0b5a3fa242 100644 --- a/Modules/_elementtree.c +++ b/Modules/_elementtree.c @@ -103,8 +103,6 @@ do { memory -= size; printf("%8d - %s\n", memory, comment); } while (0) /* glue functions (see the init function for details) */ static PyObject* elementtree_parseerror_obj; static PyObject* elementtree_deepcopy_obj; -static PyObject* elementtree_iter_obj; -static PyObject* elementtree_itertext_obj; static PyObject* elementpath_obj; /* helpers */ @@ -1109,67 +1107,32 @@ element_getchildren(ElementObject* self, PyObject* args) return list; } -static PyObject* -element_iter(ElementObject* self, PyObject* args) -{ - PyObject* result; +static PyObject * +create_elementiter(ElementObject *self, PyObject *tag, int gettext); + + +static PyObject * +element_iter(ElementObject *self, PyObject *args) +{ PyObject* tag = Py_None; if (!PyArg_ParseTuple(args, "|O:iter", &tag)) return NULL; - if (!elementtree_iter_obj) { - PyErr_SetString( - PyExc_RuntimeError, - "iter helper not found" - ); - return NULL; - } - - args = PyTuple_New(2); - if (!args) - return NULL; - - Py_INCREF(self); PyTuple_SET_ITEM(args, 0, (PyObject*) self); - Py_INCREF(tag); PyTuple_SET_ITEM(args, 1, (PyObject*) tag); - - result = PyObject_CallObject(elementtree_iter_obj, args); - - Py_DECREF(args); - - return result; + return create_elementiter(self, tag, 0); } static PyObject* element_itertext(ElementObject* self, PyObject* args) { - PyObject* result; - if (!PyArg_ParseTuple(args, ":itertext")) return NULL; - if (!elementtree_itertext_obj) { - PyErr_SetString( - PyExc_RuntimeError, - "itertext helper not found" - ); - return NULL; - } - - args = PyTuple_New(1); - if (!args) - return NULL; - - Py_INCREF(self); PyTuple_SET_ITEM(args, 0, (PyObject*) self); - - result = PyObject_CallObject(elementtree_itertext_obj, args); - - Py_DECREF(args); - - return result; + return create_elementiter(self, Py_None, 1); } + static PyObject* element_getitem(PyObject* self_, Py_ssize_t index) { @@ -1790,6 +1753,267 @@ static PyTypeObject Element_Type = { 0, /* tp_free */ }; +/******************************* Element iterator ****************************/ + +/* ElementIterObject represents the iteration state over an XML element in + * pre-order traversal. To keep track of which sub-element should be returned + * next, a stack of parents is maintained. This is a standard stack-based + * iterative pre-order traversal of a tree. + * The stack is managed using a single-linked list starting at parent_stack. + * Each stack node contains the saved parent to which we should return after + * the current one is exhausted, and the next child to examine in that parent. + */ +typedef struct ParentLocator_t { + ElementObject *parent; + Py_ssize_t child_index; + struct ParentLocator_t *next; +} ParentLocator; + +typedef struct { + PyObject_HEAD + ParentLocator *parent_stack; + ElementObject *root_element; + PyObject *sought_tag; + int root_done; + int gettext; +} ElementIterObject; + + +static void +elementiter_dealloc(ElementIterObject *it) +{ + ParentLocator *p = it->parent_stack; + while (p) { + ParentLocator *temp = p; + Py_XDECREF(p->parent); + p = p->next; + PyObject_Free(temp); + } + + Py_XDECREF(it->sought_tag); + Py_XDECREF(it->root_element); + + PyObject_GC_UnTrack(it); + PyObject_GC_Del(it); +} + +static int +elementiter_traverse(ElementIterObject *it, visitproc visit, void *arg) +{ + ParentLocator *p = it->parent_stack; + while (p) { + Py_VISIT(p->parent); + p = p->next; + } + + Py_VISIT(it->root_element); + Py_VISIT(it->sought_tag); + return 0; +} + +/* Helper function for elementiter_next. Add a new parent to the parent stack. + */ +static ParentLocator * +parent_stack_push_new(ParentLocator *stack, ElementObject *parent) +{ + ParentLocator *new_node = PyObject_Malloc(sizeof(ParentLocator)); + if (new_node) { + new_node->parent = parent; + Py_INCREF(parent); + new_node->child_index = 0; + new_node->next = stack; + } + return new_node; +} + +static PyObject * +elementiter_next(ElementIterObject *it) +{ + /* Sub-element iterator. + * + * A short note on gettext: this function serves both the iter() and + * itertext() methods to avoid code duplication. However, there are a few + * small differences in the way these iterations work. Namely: + * - itertext() only yields text from nodes that have it, and continues + * iterating when a node doesn't have text (so it doesn't return any + * node like iter()) + * - itertext() also has to handle tail, after finishing with all the + * children of a node. + */ + + while (1) { + /* Handle the case reached in the beginning and end of iteration, where + * the parent stack is empty. The root_done flag gives us indication + * whether we've just started iterating (so root_done is 0), in which + * case the root is returned. If root_done is 1 and we're here, the + * iterator is exhausted. + */ + if (!it->parent_stack->parent) { + if (it->root_done) { + PyErr_SetNone(PyExc_StopIteration); + return NULL; + } else { + it->parent_stack = parent_stack_push_new(it->parent_stack, + it->root_element); + if (!it->parent_stack) { + PyErr_NoMemory(); + return NULL; + } + + it->root_done = 1; + if (it->sought_tag == Py_None || + PyObject_RichCompareBool(it->root_element->tag, + it->sought_tag, Py_EQ) == 1) { + if (it->gettext) { + PyObject *text = JOIN_OBJ(it->root_element->text); + if (PyObject_IsTrue(text)) { + Py_INCREF(text); + return text; + } + } else { + Py_INCREF(it->root_element); + return (PyObject *)it->root_element; + } + } + } + } + + /* See if there are children left to traverse in the current parent. If + * yes, visit the next child. If not, pop the stack and try again. + */ + ElementObject *cur_parent = it->parent_stack->parent; + Py_ssize_t child_index = it->parent_stack->child_index; + if (cur_parent->extra && child_index < cur_parent->extra->length) { + ElementObject *child = (ElementObject *) + cur_parent->extra->children[child_index]; + it->parent_stack->child_index++; + it->parent_stack = parent_stack_push_new(it->parent_stack, + child); + if (!it->parent_stack) { + PyErr_NoMemory(); + return NULL; + } + + if (it->gettext) { + PyObject *text = JOIN_OBJ(child->text); + if (PyObject_IsTrue(text)) { + Py_INCREF(text); + return text; + } + } else if (it->sought_tag == Py_None || + PyObject_RichCompareBool(child->tag, + it->sought_tag, Py_EQ) == 1) { + Py_INCREF(child); + return (PyObject *)child; + } + else + continue; + } + else { + PyObject *tail = it->gettext ? JOIN_OBJ(cur_parent->tail) : Py_None; + ParentLocator *next = it->parent_stack->next; + Py_XDECREF(it->parent_stack->parent); + PyObject_Free(it->parent_stack); + it->parent_stack = next; + + /* Note that extra condition on it->parent_stack->parent here; + * this is because itertext() is supposed to only return *inner* + * text, not text following the element it began iteration with. + */ + if (it->parent_stack->parent && PyObject_IsTrue(tail)) { + Py_INCREF(tail); + return tail; + } + } + } + + return NULL; +} + + +static PyTypeObject ElementIter_Type = { + PyVarObject_HEAD_INIT(NULL, 0) + "_elementtree._element_iterator", /* tp_name */ + sizeof(ElementIterObject), /* tp_basicsize */ + 0, /* tp_itemsize */ + /* methods */ + (destructor)elementiter_dealloc, /* tp_dealloc */ + 0, /* tp_print */ + 0, /* tp_getattr */ + 0, /* tp_setattr */ + 0, /* tp_reserved */ + 0, /* tp_repr */ + 0, /* tp_as_number */ + 0, /* tp_as_sequence */ + 0, /* tp_as_mapping */ + 0, /* tp_hash */ + 0, /* tp_call */ + 0, /* tp_str */ + 0, /* tp_getattro */ + 0, /* tp_setattro */ + 0, /* tp_as_buffer */ + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC, /* tp_flags */ + 0, /* tp_doc */ + (traverseproc)elementiter_traverse, /* tp_traverse */ + 0, /* tp_clear */ + 0, /* tp_richcompare */ + 0, /* tp_weaklistoffset */ + PyObject_SelfIter, /* tp_iter */ + (iternextfunc)elementiter_next, /* tp_iternext */ + 0, /* tp_methods */ + 0, /* tp_members */ + 0, /* tp_getset */ + 0, /* tp_base */ + 0, /* tp_dict */ + 0, /* tp_descr_get */ + 0, /* tp_descr_set */ + 0, /* tp_dictoffset */ + 0, /* tp_init */ + 0, /* tp_alloc */ + 0, /* tp_new */ +}; + + +static PyObject * +create_elementiter(ElementObject *self, PyObject *tag, int gettext) +{ + ElementIterObject *it; + PyObject *star = NULL; + + it = PyObject_GC_New(ElementIterObject, &ElementIter_Type); + if (!it) + return NULL; + if (!(it->parent_stack = PyObject_Malloc(sizeof(ParentLocator)))) { + PyObject_GC_Del(it); + return NULL; + } + + it->parent_stack->parent = NULL; + it->parent_stack->child_index = 0; + it->parent_stack->next = NULL; + + if (PyUnicode_Check(tag)) + star = PyUnicode_FromString("*"); + else if (PyBytes_Check(tag)) + star = PyBytes_FromString("*"); + + if (star && PyObject_RichCompareBool(tag, star, Py_EQ) == 1) + tag = Py_None; + + Py_XDECREF(star); + it->sought_tag = tag; + it->root_done = 0; + it->gettext = gettext; + it->root_element = self; + + Py_INCREF(self); + Py_INCREF(tag); + + PyObject_GC_Track(it); + return (PyObject *)it; +} + + /* ==================================================================== */ /* the tree builder type */ @@ -3238,8 +3462,7 @@ static struct PyModuleDef _elementtreemodule = { PyMODINIT_FUNC PyInit__elementtree(void) { - PyObject *m, *g, *temp; - char* bootstrap; + PyObject *m, *temp; /* Initialize object types */ if (PyType_Ready(&TreeBuilder_Type) < 0) @@ -3255,44 +3478,6 @@ PyInit__elementtree(void) if (!m) return NULL; - /* The code below requires that the module gets already added - to sys.modules. */ - PyDict_SetItemString(PyImport_GetModuleDict(), - _elementtreemodule.m_name, - m); - - /* python glue code */ - - g = PyDict_New(); - if (!g) - return NULL; - - PyDict_SetItemString(g, "__builtins__", PyEval_GetBuiltins()); - - bootstrap = ( - "def iter(node, tag=None):\n" /* helper */ - " if tag == '*':\n" - " tag = None\n" - " if tag is None or node.tag == tag:\n" - " yield node\n" - " for node in node:\n" - " for node in iter(node, tag):\n" - " yield node\n" - - "def itertext(node):\n" /* helper */ - " if node.text:\n" - " yield node.text\n" - " for e in node:\n" - " for s in e.itertext():\n" - " yield s\n" - " if e.tail:\n" - " yield e.tail\n" - - ); - - if (!PyRun_String(bootstrap, Py_file_input, g, NULL)) - return NULL; - if (!(temp = PyImport_ImportModule("copy"))) return NULL; elementtree_deepcopy_obj = PyObject_GetAttrString(temp, "deepcopy"); @@ -3301,9 +3486,6 @@ PyInit__elementtree(void) if (!(elementpath_obj = PyImport_ImportModule("xml.etree.ElementPath"))) return NULL; - elementtree_iter_obj = PyDict_GetItemString(g, "iter"); - elementtree_itertext_obj = PyDict_GetItemString(g, "itertext"); - /* link against pyexpat */ expat_capi = PyCapsule_Import(PyExpat_CAPSULE_NAME, 0); if (expat_capi) {