bpo-37399: Correctly attach tail text to the last element/comment/pi (GH-14856)

* bpo-37399: Correctly attach tail text to the last element/comment/pi, even when comments or pis are discarded.
Also fixes the insertion of PIs when "insert_pis=True" is configured for a TreeBuilder.
This commit is contained in:
Stefan Behnel 2019-07-24 20:08:02 +02:00 committed by GitHub
parent 151b91dfd2
commit c6cb4cdd21
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 124 additions and 24 deletions

View File

@ -2954,6 +2954,66 @@ class TreeBuilderTest(unittest.TestCase):
self.assertEqual(b.pi('target'), (len('target'), None))
self.assertEqual(b.pi('pitarget', ' text '), (len('pitarget'), ' text '))
def test_late_tail(self):
# Issue #37399: The tail of an ignored comment could overwrite the text before it.
class TreeBuilderSubclass(ET.TreeBuilder):
pass
xml = "<a>text<!-- comment -->tail</a>"
a = ET.fromstring(xml)
self.assertEqual(a.text, "texttail")
parser = ET.XMLParser(target=TreeBuilderSubclass())
parser.feed(xml)
a = parser.close()
self.assertEqual(a.text, "texttail")
xml = "<a>text<?pi data?>tail</a>"
a = ET.fromstring(xml)
self.assertEqual(a.text, "texttail")
xml = "<a>text<?pi data?>tail</a>"
parser = ET.XMLParser(target=TreeBuilderSubclass())
parser.feed(xml)
a = parser.close()
self.assertEqual(a.text, "texttail")
def test_late_tail_mix_pi_comments(self):
# Issue #37399: The tail of an ignored comment could overwrite the text before it.
# Test appending tails to comments/pis.
class TreeBuilderSubclass(ET.TreeBuilder):
pass
xml = "<a>text<?pi1?> <!-- comment -->\n<?pi2?>tail</a>"
parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=True))
parser.feed(xml)
a = parser.close()
self.assertEqual(a[0].text, ' comment ')
self.assertEqual(a[0].tail, '\ntail')
self.assertEqual(a.text, "text ")
parser = ET.XMLParser(target=TreeBuilderSubclass(insert_comments=True))
parser.feed(xml)
a = parser.close()
self.assertEqual(a[0].text, ' comment ')
self.assertEqual(a[0].tail, '\ntail')
self.assertEqual(a.text, "text ")
xml = "<a>text<!-- comment -->\n<?pi data?>tail</a>"
parser = ET.XMLParser(target=ET.TreeBuilder(insert_pis=True))
parser.feed(xml)
a = parser.close()
self.assertEqual(a[0].text, 'pi data')
self.assertEqual(a[0].tail, 'tail')
self.assertEqual(a.text, "text\n")
parser = ET.XMLParser(target=TreeBuilderSubclass(insert_pis=True))
parser.feed(xml)
a = parser.close()
self.assertEqual(a[0].text, 'pi data')
self.assertEqual(a[0].tail, 'tail')
self.assertEqual(a.text, "text\n")
def test_treebuilder_elementfactory_none(self):
parser = ET.XMLParser(target=ET.TreeBuilder(element_factory=None))
parser.feed(self.sample1)

View File

@ -2399,6 +2399,7 @@ typedef struct {
PyObject *this; /* current node */
PyObject *last; /* most recently created node */
PyObject *last_for_tail; /* most recently created node that takes a tail */
PyObject *data; /* data collector (string or list), or NULL */
@ -2530,6 +2531,7 @@ treebuilder_gc_traverse(TreeBuilderObject *self, visitproc visit, void *arg)
Py_VISIT(self->root);
Py_VISIT(self->this);
Py_VISIT(self->last);
Py_VISIT(self->last_for_tail);
Py_VISIT(self->data);
Py_VISIT(self->stack);
Py_VISIT(self->pi_factory);
@ -2551,6 +2553,7 @@ treebuilder_gc_clear(TreeBuilderObject *self)
Py_CLEAR(self->stack);
Py_CLEAR(self->data);
Py_CLEAR(self->last);
Py_CLEAR(self->last_for_tail);
Py_CLEAR(self->this);
Py_CLEAR(self->pi_factory);
Py_CLEAR(self->comment_factory);
@ -2622,21 +2625,50 @@ _elementtree__set_factories_impl(PyObject *module, PyObject *comment_factory,
}
static int
treebuilder_set_element_text_or_tail(PyObject *element, PyObject **data,
PyObject **dest, _Py_Identifier *name)
treebuilder_extend_element_text_or_tail(PyObject *element, PyObject **data,
PyObject **dest, _Py_Identifier *name)
{
/* Fast paths for the "almost always" cases. */
if (Element_CheckExact(element)) {
PyObject *tmp = JOIN_OBJ(*dest);
*dest = JOIN_SET(*data, PyList_CheckExact(*data));
*data = NULL;
Py_DECREF(tmp);
return 0;
PyObject *dest_obj = JOIN_OBJ(*dest);
if (dest_obj == Py_None) {
*dest = JOIN_SET(*data, PyList_CheckExact(*data));
*data = NULL;
Py_DECREF(dest_obj);
return 0;
}
else if (JOIN_GET(*dest)) {
if (PyList_SetSlice(dest_obj, PY_SSIZE_T_MAX, PY_SSIZE_T_MAX, *data) < 0) {
return -1;
}
Py_CLEAR(*data);
return 0;
}
}
else {
PyObject *joined = list_join(*data);
/* Fallback for the non-Element / non-trivial cases. */
{
int r;
if (joined == NULL)
PyObject* joined;
PyObject* previous = _PyObject_GetAttrId(element, name);
if (!previous)
return -1;
joined = list_join(*data);
if (!joined) {
Py_DECREF(previous);
return -1;
}
if (previous != Py_None) {
PyObject *tmp = PyNumber_Add(previous, joined);
Py_DECREF(joined);
Py_DECREF(previous);
if (!tmp)
return -1;
joined = tmp;
} else {
Py_DECREF(previous);
}
r = _PyObject_SetAttrId(element, name, joined);
Py_DECREF(joined);
if (r < 0)
@ -2649,21 +2681,21 @@ treebuilder_set_element_text_or_tail(PyObject *element, PyObject **data,
LOCAL(int)
treebuilder_flush_data(TreeBuilderObject* self)
{
PyObject *element = self->last;
if (!self->data) {
return 0;
}
if (self->this == element) {
if (!self->last_for_tail) {
PyObject *element = self->last;
_Py_IDENTIFIER(text);
return treebuilder_set_element_text_or_tail(
return treebuilder_extend_element_text_or_tail(
element, &self->data,
&((ElementObject *) element)->text, &PyId_text);
}
else {
PyObject *element = self->last_for_tail;
_Py_IDENTIFIER(tail);
return treebuilder_set_element_text_or_tail(
return treebuilder_extend_element_text_or_tail(
element, &self->data,
&((ElementObject *) element)->tail, &PyId_tail);
}
@ -2739,6 +2771,7 @@ treebuilder_handle_start(TreeBuilderObject* self, PyObject* tag,
}
this = self->this;
Py_CLEAR(self->last_for_tail);
if (this != Py_None) {
if (treebuilder_add_subelement(this, node) < 0)
@ -2836,6 +2869,8 @@ treebuilder_handle_end(TreeBuilderObject* self, PyObject* tag)
item = self->last;
self->last = self->this;
Py_INCREF(self->last);
Py_XSETREF(self->last_for_tail, self->last);
self->index--;
self->this = PyList_GET_ITEM(self->stack, self->index);
Py_INCREF(self->this);
@ -2851,7 +2886,7 @@ treebuilder_handle_end(TreeBuilderObject* self, PyObject* tag)
LOCAL(PyObject*)
treebuilder_handle_comment(TreeBuilderObject* self, PyObject* text)
{
PyObject* comment = NULL;
PyObject* comment;
PyObject* this;
if (treebuilder_flush_data(self) < 0) {
@ -2867,6 +2902,8 @@ treebuilder_handle_comment(TreeBuilderObject* self, PyObject* text)
if (self->insert_comments && this != Py_None) {
if (treebuilder_add_subelement(this, comment) < 0)
goto error;
Py_INCREF(comment);
Py_XSETREF(self->last_for_tail, comment);
}
} else {
Py_INCREF(text);
@ -2888,7 +2925,7 @@ treebuilder_handle_comment(TreeBuilderObject* self, PyObject* text)
LOCAL(PyObject*)
treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text)
{
PyObject* pi = NULL;
PyObject* pi;
PyObject* this;
PyObject* stack[2] = {target, text};
@ -2906,6 +2943,8 @@ treebuilder_handle_pi(TreeBuilderObject* self, PyObject* target, PyObject* text)
if (self->insert_pis && this != Py_None) {
if (treebuilder_add_subelement(this, pi) < 0)
goto error;
Py_INCREF(pi);
Py_XSETREF(self->last_for_tail, pi);
}
} else {
pi = PyTuple_Pack(2, target, text);
@ -3495,8 +3534,8 @@ expat_end_ns_handler(XMLParserObject* self, const XML_Char* prefix_in)
static void
expat_comment_handler(XMLParserObject* self, const XML_Char* comment_in)
{
PyObject* comment = NULL;
PyObject* res = NULL;
PyObject* comment;
PyObject* res;
if (PyErr_Occurred())
return;
@ -3510,16 +3549,17 @@ expat_comment_handler(XMLParserObject* self, const XML_Char* comment_in)
return; /* parser will look for errors */
res = treebuilder_handle_comment(target, comment);
Py_XDECREF(res);
Py_DECREF(comment);
} else if (self->handle_comment) {
comment = PyUnicode_DecodeUTF8(comment_in, strlen(comment_in), "strict");
if (!comment)
return;
res = _PyObject_CallOneArg(self->handle_comment, comment);
Py_XDECREF(res);
Py_DECREF(comment);
}
Py_XDECREF(res);
Py_DECREF(comment);
}
static void
@ -3587,7 +3627,7 @@ static void
expat_pi_handler(XMLParserObject* self, const XML_Char* target_in,
const XML_Char* data_in)
{
PyObject* pi_target = NULL;
PyObject* pi_target;
PyObject* data;
PyObject* res;
PyObject* stack[2];
@ -3599,7 +3639,7 @@ expat_pi_handler(XMLParserObject* self, const XML_Char* target_in,
/* shortcut */
TreeBuilderObject *target = (TreeBuilderObject*) self->target;
if (target->events_append && target->pi_event_obj) {
if ((target->events_append && target->pi_event_obj) || target->insert_pis) {
pi_target = PyUnicode_DecodeUTF8(target_in, strlen(target_in), "strict");
if (!pi_target)
goto error;