diff --git a/Include/pyexpat.h b/Include/pyexpat.h index 168b5b2ae1d..8a799746b35 100644 --- a/Include/pyexpat.h +++ b/Include/pyexpat.h @@ -45,6 +45,7 @@ struct PyExpat_CAPI void (*SetUserData)(XML_Parser parser, void *userData); void (*SetStartDoctypeDeclHandler)(XML_Parser parser, XML_StartDoctypeDeclHandler start); + enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding); /* always add new stuff to the end! */ }; diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 7e0f25a460a..746ca28282b 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -677,15 +677,18 @@ class ElementTreeTest(unittest.TestCase): elem = ET.fromstring("text") self.assertEqual(ET.tostring(elem), b'text') - def test_encoding(encoding): - def check(encoding): - ET.XML("" % encoding) - check("ascii") - check("us-ascii") - check("iso-8859-1") - check("iso-8859-15") - check("cp437") - check("mac-roman") + def test_encoding(self): + def check(encoding, body=''): + xml = ("%s" % + (encoding, body)) + self.assertEqual(ET.XML(xml.encode(encoding)).text, body) + self.assertEqual(ET.XML(xml).text, body) + check("ascii", 'a') + check("us-ascii", 'a') + check("iso-8859-1", '\xbd') + check("iso-8859-15", '\u20ac') + check("cp437", '\u221a') + check("mac-roman", '\u02da') def test_methods(self): # Test serialization methods. @@ -1842,11 +1845,13 @@ class TreeBuilderTest(unittest.TestCase): class XMLParserTest(unittest.TestCase): - sample1 = '22' - sample2 = ('' - 'text') + sample1 = b'22' + sample2 = (b'' + b'text') + sample3 = ('\n' + '$\xa3\u20ac\U0001017b') def _check_sample_element(self, e): self.assertEqual(e.tag, 'file') @@ -1882,12 +1887,21 @@ class XMLParserTest(unittest.TestCase): _doctype = (name, pubid, system) parser = MyParserWithDoctype() - parser.feed(self.sample2) + with self.assertWarns(DeprecationWarning): + parser.feed(self.sample2) parser.close() self.assertEqual(_doctype, ('html', '-//W3C//DTD XHTML 1.0 Transitional//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd')) + def test_parse_string(self): + parser = ET.XMLParser(target=ET.TreeBuilder()) + parser.feed(self.sample3) + e = parser.close() + self.assertEqual(e.tag, 'money') + self.assertEqual(e.attrib['value'], '$\xa3\u20ac\U0001017b') + self.assertEqual(e.text, '$\xa3\u20ac\U0001017b') + class NamespaceParseTest(unittest.TestCase): def test_find_with_namespace(self): @@ -2297,6 +2311,7 @@ def test_main(module=None): ElementFindTest, ElementIterTest, TreeBuilderTest, + XMLParserTest, BugsTest, ] diff --git a/Misc/NEWS b/Misc/NEWS index 465c0f529c0..15c6c9eca5c 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -24,6 +24,9 @@ Core and Builtins Library ------- +- Issue #16986: ElementTree now correctly parses a string input not only when + an internal XML encoding is UTF-8 or US-ASCII. + - Issue #17812: Fixed quadratic complexity of base64.b32encode(). - Issue #17980: Fix possible abuse of ssl.match_hostname() for denial of diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c index 029ff7b5c60..5078b8372f8 100644 --- a/Modules/_elementtree.c +++ b/Modules/_elementtree.c @@ -3330,7 +3330,7 @@ xmlparser_dealloc(XMLParserObject* self) } LOCAL(PyObject*) -expat_parse(XMLParserObject* self, char* data, int data_len, int final) +expat_parse(XMLParserObject* self, const char* data, int data_len, int final) { int ok; @@ -3376,16 +3376,37 @@ xmlparser_close(XMLParserObject* self, PyObject* args) } static PyObject* -xmlparser_feed(XMLParserObject* self, PyObject* args) +xmlparser_feed(XMLParserObject* self, PyObject* arg) { /* feed data to parser */ - char* data; - int data_len; - if (!PyArg_ParseTuple(args, "s#:feed", &data, &data_len)) - return NULL; - - return expat_parse(self, data, data_len, 0); + if (PyUnicode_Check(arg)) { + Py_ssize_t data_len; + const char *data = PyUnicode_AsUTF8AndSize(arg, &data_len); + if (data == NULL) + return NULL; + if (data_len > INT_MAX) { + PyErr_SetString(PyExc_OverflowError, "size does not fit in an int"); + return NULL; + } + /* Explicitly set UTF-8 encoding. Return code ignored. */ + (void)EXPAT(SetEncoding)(self->parser, "utf-8"); + return expat_parse(self, data, (int)data_len, 0); + } + else { + Py_buffer view; + PyObject *res; + if (PyObject_GetBuffer(arg, &view, PyBUF_SIMPLE) < 0) + return NULL; + if (view.len > INT_MAX) { + PyBuffer_Release(&view); + PyErr_SetString(PyExc_OverflowError, "size does not fit in an int"); + return NULL; + } + res = expat_parse(self, view.buf, (int)view.len, 0); + PyBuffer_Release(&view); + return res; + } } static PyObject* @@ -3570,7 +3591,7 @@ xmlparser_setevents(XMLParserObject *self, PyObject* args) } static PyMethodDef xmlparser_methods[] = { - {"feed", (PyCFunction) xmlparser_feed, METH_VARARGS}, + {"feed", (PyCFunction) xmlparser_feed, METH_O}, {"close", (PyCFunction) xmlparser_close, METH_VARARGS}, {"_parse", (PyCFunction) xmlparser_parse, METH_VARARGS}, {"_setevents", (PyCFunction) xmlparser_setevents, METH_VARARGS}, diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c index 022b0cbaf92..4750225449c 100644 --- a/Modules/pyexpat.c +++ b/Modules/pyexpat.c @@ -1937,6 +1937,7 @@ MODULE_INITFUNC(void) capi.SetUnknownEncodingHandler = XML_SetUnknownEncodingHandler; capi.SetUserData = XML_SetUserData; capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler; + capi.SetEncoding = XML_SetEncoding; /* export using capsule */ capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);