Issue #6233: ElementTree failed converting unicode characters to XML

entities when they couldn't be represented in the requested output
encoding.  Patch by Jerry Chen.
This commit is contained in:
Antoine Pitrou 2010-02-09 16:51:16 +00:00
parent 28a817e3ba
commit c77dd32be4
4 changed files with 25 additions and 6 deletions

View File

@ -210,6 +210,17 @@ def check_encoding(ET, encoding):
"""
ET.XML("<?xml version='1.0' encoding='%s'?><xml />" % encoding)
def check_issue6233():
    """
    Regression test for issue #6233: serializing a tree to an encoding that
    cannot represent a character must emit a numeric character reference.

    The document is parsed twice, once from a str and once from an
    iso-8859-1 encoded byte string.  Both contain U+00E3, which ASCII cannot
    encode, so the expected ASCII output carries it as ``&#227;``.

    >>> from xml.etree import ElementTree as ET
    >>> e = ET.XML("<?xml version='1.0' encoding='utf-8'?><body>t\xe3g</body>")
    >>> ET.tostring(e, 'ascii')
    b"<?xml version='1.0' encoding='ascii'?>\\n<body>t&#227;g</body>"
    >>> e = ET.XML("<?xml version='1.0' encoding='iso-8859-1'?><body>t\xe3g</body>".encode('iso-8859-1')) # create byte string with the right encoding
    >>> ET.tostring(e, 'ascii')
    b"<?xml version='1.0' encoding='ascii'?>\\n<body>t&#227;g</body>"
    """
#
# xinclude tests (samples from appendix C of the xinclude specification)

View File

@ -662,9 +662,9 @@ class ElementTree:
# write XML to file
tag = node.tag
if tag is Comment:
file.write(_encode("<!-- %s -->" % _escape_cdata(node.text), encoding))
file.write(b"<!-- " + _encode_cdata(node.text, encoding) + b" -->")
elif tag is ProcessingInstruction:
file.write(_encode("<?%s?>" % _escape_cdata(node.text), encoding))
file.write(b"<?" + _encode_cdata(node.text, encoding) + b"?>")
else:
items = list(node.items())
xmlns_items = [] # new namespaces in this scope
@ -696,7 +696,7 @@ class ElementTree:
if node.text or len(node):
file.write(_encode(">", encoding))
if node.text:
file.write(_encode(_escape_cdata(node.text), encoding))
file.write(_encode_cdata(node.text, encoding))
for n in node:
self._write(file, n, encoding, namespaces)
file.write(_encode("</" + tag + ">", encoding))
@ -705,7 +705,7 @@ class ElementTree:
for k, v in xmlns_items:
del namespaces[v]
if node.tail:
file.write(_encode(_escape_cdata(node.tail), encoding))
file.write(_encode_cdata(node.tail, encoding))
# --------------------------------------------------------------------
# helpers
@ -788,13 +788,16 @@ def _encode_entity(text, pattern=_escape):
# the following functions assume an ascii-compatible encoding
# (or "utf-16")
def _escape_cdata(text):
def _encode_cdata(text, encoding):
# escape character data
try:
text = text.replace("&", "&amp;")
text = text.replace("<", "&lt;")
text = text.replace(">", "&gt;")
return text
if encoding:
return text.encode(encoding, "xmlcharrefreplace")
else:
return text
except (TypeError, AttributeError):
_raise_serialization_error(text)

View File

@ -131,6 +131,7 @@ Greg Chapman
Brad Chapman
David Chaum
Nicolas Chauvat
Jerry Chen
Michael Chermside
Albert Chin-A-Young
Adal Chiriliuc

View File

@ -242,6 +242,10 @@ C-API
Library
-------
- Issue #6233: ElementTree failed converting unicode characters to XML
entities when they couldn't be represented in the requested output
encoding. Patch by Jerry Chen.
- Issue #6003: add an argument to ``zipfile.Zipfile.writestr`` to
specify the compression type.