From c6cb4cdd21c0c3a09b0617dbfaa7053d3bfa6def Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Wed, 24 Jul 2019 20:08:02 +0200 Subject: [PATCH] bpo-37399: Correctly attach tail text to the last element/comment/pi (GH-14856) * bpo-37399: Correctly attach tail text to the last element/comment/pi, even when comments or pis are discarded. Also fixes the insertion of PIs when "insert_pis=True" is configured for a TreeBuilder. --- Lib/test/test_xml_etree.py | 60 ++++++++++++++++++++++++++ Modules/_elementtree.c | 88 +++++++++++++++++++++++++++----------- 2 files changed, 124 insertions(+), 24 deletions(-) diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 61737493a90..b2492cda848 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -2954,6 +2954,66 @@ class TreeBuilderTest(unittest.TestCase): self.assertEqual(b.pi('target'), (len('target'), None)) self.assertEqual(b.pi('pitarget', ' text '), (len('pitarget'), ' text ')) + def test_late_tail(self): + # Issue #37399: The tail of an ignored comment could overwrite the text before it. + class TreeBuilderSubclass(ET.TreeBuilder): + pass + + xml = "texttail" + a = ET.fromstring(xml) + self.assertEqual(a.text, "texttail") + + parser = ET.XMLParser(target=TreeBuilderSubclass()) + parser.feed(xml) + a = parser.close() + self.assertEqual(a.text, "texttail") + + xml = "texttail" + a = ET.fromstring(xml) + self.assertEqual(a.text, "texttail") + + xml = "texttail" + parser = ET.XMLParser(target=TreeBuilderSubclass()) + parser.feed(xml) + a = parser.close() + self.assertEqual(a.text, "texttail") + + def test_late_tail_mix_pi_comments(self): + # Issue #37399: The tail of an ignored comment could overwrite the text before it. + # Test appending tails to comments/pis. + class TreeBuilderSubclass(ET.TreeBuilder): + pass + + xml = "text \ntail" + parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=True)) + parser.feed(xml) + a = parser.close() + self.assertEqual(a[0].text, ' comment ') + self.assertEqual(a[0].tail, '\ntail') + self.assertEqual(a.text, "text ") + + parser = ET.XMLParser(target=TreeBuilderSubclass(insert_comments=True)) + parser.feed(xml) + a = parser.close() + self.assertEqual(a[0].text, ' comment ') + self.assertEqual(a[0].tail, '\ntail') + self.assertEqual(a.text, "text ") + + xml = "text\ntail" + parser = ET.XMLParser(target=ET.TreeBuilder(insert_pis=True)) + parser.feed(xml) + a = parser.close() + self.assertEqual(a[0].text, 'pi data') + self.assertEqual(a[0].tail, 'tail') + self.assertEqual(a.text, "text\n") + + parser = ET.XMLParser(target=TreeBuilderSubclass(insert_pis=True)) + parser.feed(xml) + a = parser.close() + self.assertEqual(a[0].text, 'pi data') + self.assertEqual(a[0].tail, 'tail') + self.assertEqual(a.text, "text\n") + def test_treebuilder_elementfactory_none(self): parser = ET.XMLParser(target=ET.TreeBuilder(element_factory=None)) parser.feed(self.sample1) diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c index b9b50165bfd..830ce8635ea 100644 --- a/Modules/_elementtree.c +++ b/Modules/_elementtree.c @@ -2399,6 +2399,7 @@ typedef struct { PyObject *this; /* current node */ PyObject *last; /* most recently created node */ + PyObject *last_for_tail; /* most recently created node that takes a tail */ PyObject *data; /* data collector (string or list), or NULL */ @@ -2530,6 +2531,7 @@ treebuilder_gc_traverse(TreeBuilderObject *self, visitproc visit, void *arg) Py_VISIT(self->root); Py_VISIT(self->this); Py_VISIT(self->last); + Py_VISIT(self->last_for_tail); Py_VISIT(self->data); Py_VISIT(self->stack); Py_VISIT(self->pi_factory); @@ -2551,6 +2553,7 @@ treebuilder_gc_clear(TreeBuilderObject *self) Py_CLEAR(self->stack); Py_CLEAR(self->data); Py_CLEAR(self->last); + Py_CLEAR(self->last_for_tail); Py_CLEAR(self->this); Py_CLEAR(self->pi_factory); Py_CLEAR(self->comment_factory); @@ -2622,21 +2625,50 @@ _elementtree__set_factories_impl(PyObject *module, PyObject *comment_factory, } static int -treebuilder_set_element_text_or_tail(PyObject *element, PyObject **data, - PyObject **dest, _Py_Identifier *name) +treebuilder_extend_element_text_or_tail(PyObject *element, PyObject **data, + PyObject **dest, _Py_Identifier *name) { + /* Fast paths for the "almost always" cases. */ if (Element_CheckExact(element)) { - PyObject *tmp = JOIN_OBJ(*dest); - *dest = JOIN_SET(*data, PyList_CheckExact(*data)); - *data = NULL; - Py_DECREF(tmp); - return 0; + PyObject *dest_obj = JOIN_OBJ(*dest); + if (dest_obj == Py_None) { + *dest = JOIN_SET(*data, PyList_CheckExact(*data)); + *data = NULL; + Py_DECREF(dest_obj); + return 0; + } + else if (JOIN_GET(*dest)) { + if (PyList_SetSlice(dest_obj, PY_SSIZE_T_MAX, PY_SSIZE_T_MAX, *data) < 0) { + return -1; + } + Py_CLEAR(*data); + return 0; + } } - else { - PyObject *joined = list_join(*data); + + /* Fallback for the non-Element / non-trivial cases. */ + { int r; - if (joined == NULL) + PyObject* joined; + PyObject* previous = _PyObject_GetAttrId(element, name); + if (!previous) return -1; + joined = list_join(*data); + if (!joined) { + Py_DECREF(previous); + return -1; + } + if (previous != Py_None) { + PyObject *tmp = PyNumber_Add(previous, joined); + Py_DECREF(joined); + Py_DECREF(previous); + if (!tmp) + return -1; + joined = tmp; + } else { + Py_DECREF(previous); + } + r = _PyObject_SetAttrId(element, name, joined); Py_DECREF(joined); if (r < 0) @@ -2649,21 +2681,21 @@ treebuilder_set_element_text_or_tail(PyObject *element, PyObject **data, LOCAL(int) treebuilder_flush_data(TreeBuilderObject* self) { - PyObject *element = self->last; - if (!self->data) { return 0; } - if (self->this == element) { + if (!self->last_for_tail) { + PyObject *element = self->last; _Py_IDENTIFIER(text); - return treebuilder_set_element_text_or_tail( + return treebuilder_extend_element_text_or_tail( element, &self->data, &((ElementObject *) element)->text, &PyId_text); } else { + PyObject *element = self->last_for_tail; _Py_IDENTIFIER(tail); - return treebuilder_set_element_text_or_tail( + return treebuilder_extend_element_text_or_tail( element, &self->data, &((ElementObject *) element)->tail, &PyId_tail); } @@ -2739,6 +2771,7 @@ treebuilder_handle_start(TreeBuilderObject* self, PyObject* tag, } this = self->this; + Py_CLEAR(self->last_for_tail); if (this != Py_None) { if (treebuilder_add_subelement(this, node) < 0) @@ -2836,6 +2869,8 @@ treebuilder_handle_end(TreeBuilderObject* self, PyObject* tag) item = self->last; self->last = self->this; + Py_INCREF(self->last); + Py_XSETREF(self->last_for_tail, self->last); self->index--; self->this = PyList_GET_ITEM(self->stack, self->index); Py_INCREF(self->this); @@ -2851,7 +2886,7 @@ treebuilder_handle_end(TreeBuilderObject* self, PyObject* tag) LOCAL(PyObject*) treebuilder_handle_comment(TreeBuilderObject* self, PyObject* text) { - PyObject* comment = NULL; + PyObject* comment; PyObject* this; if (treebuilder_flush_data(self) < 0) { @@ -2867,6 +2902,8 @@ treebuilder_handle_comment(TreeBuilderObject* self, PyObject* text) if (self->insert_comments && this != Py_None) { if (treebuilder_add_subelement(this, comment) < 0) goto error; + Py_INCREF(comment); + Py_XSETREF(self->last_for_tail, comment); } } else { Py_INCREF(text); @@ -2888,7 +2925,7 @@ treebuilder_handle_comment(TreeBuilderObject* self, PyObject* text) LOCAL(PyObject*) treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text) { - PyObject* pi = NULL; + PyObject* pi; PyObject* this; PyObject* stack[2] = {target, text}; @@ -2906,6 +2943,8 @@ treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text) if (self->insert_pis && this != Py_None) { if (treebuilder_add_subelement(this, pi) < 0) goto error; + Py_INCREF(pi); + Py_XSETREF(self->last_for_tail, pi); } } else { pi = PyTuple_Pack(2, target, text); @@ -3495,8 +3534,8 @@ expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in) static void expat_comment_handler(XMLParserObject* self, const XML_Char* comment_in) { - PyObject* comment = NULL; - PyObject* res = NULL; + PyObject* comment; + PyObject* res; if (PyErr_Occurred()) return; @@ -3510,16 +3549,17 @@ expat_comment_handler(XMLParserObject* self, const XML_Char* comment_in) return; /* parser will look for errors */ res = treebuilder_handle_comment(target, comment); + Py_XDECREF(res); + Py_DECREF(comment); } else if (self->handle_comment) { comment = PyUnicode_DecodeUTF8(comment_in, strlen(comment_in), "strict"); if (!comment) return; res = _PyObject_CallOneArg(self->handle_comment, comment); + Py_XDECREF(res); + Py_DECREF(comment); } - - Py_XDECREF(res); - Py_DECREF(comment); } static void @@ -3587,7 +3627,7 @@ static void expat_pi_handler(XMLParserObject* self, const XML_Char* target_in, const XML_Char* data_in) { - PyObject* pi_target = NULL; + PyObject* pi_target; PyObject* data; PyObject* res; PyObject* stack[2]; @@ -3599,7 +3639,7 @@ expat_pi_handler(XMLParserObject* self, const XML_Char* target_in, /* shortcut */ TreeBuilderObject *target = (TreeBuilderObject*) self->target; - if (target->events_append && target->pi_event_obj) { + if ((target->events_append && target->pi_event_obj) || target->insert_pis) { pi_target = PyUnicode_DecodeUTF8(target_in, strlen(target_in), "strict"); if (!pi_target) goto error;