Issue #13612: handle unknown encodings without a buffer overflow.
This affects pyexpat and _elementtree. PyExpat_CAPI now exposes a new function, DefaultUnknownEncodingHandler, which _elementtree installs in place of its own unknown-encoding handler. Based on a patch by Serhiy Storchaka.
This commit is contained in:
parent
6b5a38c728
commit
6dc32b34dd
|
@ -6,7 +6,7 @@
|
|||
#define PyExpat_CAPI_MAGIC "pyexpat.expat_CAPI 1.0"
|
||||
#define PyExpat_CAPSULE_NAME "pyexpat.expat_CAPI"
|
||||
|
||||
struct PyExpat_CAPI
|
||||
struct PyExpat_CAPI
|
||||
{
|
||||
char* magic; /* set to PyExpat_CAPI_MAGIC */
|
||||
int size; /* set to sizeof(struct PyExpat_CAPI) */
|
||||
|
@ -46,6 +46,8 @@ struct PyExpat_CAPI
|
|||
void (*SetStartDoctypeDeclHandler)(XML_Parser parser,
|
||||
XML_StartDoctypeDeclHandler start);
|
||||
enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding);
|
||||
int (*DefaultUnknownEncodingHandler)(
|
||||
void *encodingHandlerData, const XML_Char *name, XML_Encoding *info);
|
||||
/* always add new stuff to the end! */
|
||||
};
|
||||
|
||||
|
|
|
@ -690,6 +690,98 @@ class ElementTreeTest(unittest.TestCase):
|
|||
check("cp437", '\u221a')
|
||||
check("mac-roman", '\u02da')
|
||||
|
||||
def xml(encoding):
|
||||
return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
|
||||
def bxml(encoding):
|
||||
return xml(encoding).encode(encoding)
|
||||
supported_encodings = [
|
||||
'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
|
||||
'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
|
||||
'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
|
||||
'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
|
||||
'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852',
|
||||
'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
|
||||
'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1250',
|
||||
'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
|
||||
'cp1257', 'cp1258',
|
||||
'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
|
||||
'mac-roman', 'mac-turkish',
|
||||
'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
|
||||
'iso2022-jp-3', 'iso2022-jp-ext',
|
||||
'koi8-r', 'koi8-u',
|
||||
'hz', 'ptcp154',
|
||||
]
|
||||
for encoding in supported_encodings:
|
||||
self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
|
||||
|
||||
unsupported_ascii_compatible_encodings = [
|
||||
'big5', 'big5hkscs',
|
||||
'cp932', 'cp949', 'cp950',
|
||||
'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
|
||||
'gb2312', 'gbk', 'gb18030',
|
||||
'iso2022-kr', 'johab',
|
||||
'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
|
||||
'utf-7',
|
||||
]
|
||||
for encoding in unsupported_ascii_compatible_encodings:
|
||||
self.assertRaises(ValueError, ET.XML, bxml(encoding))
|
||||
|
||||
unsupported_ascii_incompatible_encodings = [
|
||||
'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
|
||||
'utf_32', 'utf_32_be', 'utf_32_le',
|
||||
]
|
||||
for encoding in unsupported_ascii_incompatible_encodings:
|
||||
self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
|
||||
|
||||
self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
|
||||
self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
|
||||
|
||||
def xml(encoding):
|
||||
return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
|
||||
def bxml(encoding):
|
||||
return xml(encoding).encode(encoding)
|
||||
supported_encodings = [
|
||||
'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
|
||||
'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
|
||||
'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
|
||||
'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
|
||||
'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852',
|
||||
'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
|
||||
'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1250',
|
||||
'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
|
||||
'cp1257', 'cp1258',
|
||||
'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
|
||||
'mac-roman', 'mac-turkish',
|
||||
'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
|
||||
'iso2022-jp-3', 'iso2022-jp-ext',
|
||||
'koi8-r', 'koi8-u',
|
||||
'hz', 'ptcp154',
|
||||
]
|
||||
for encoding in supported_encodings:
|
||||
self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
|
||||
|
||||
unsupported_ascii_compatible_encodings = [
|
||||
'big5', 'big5hkscs',
|
||||
'cp932', 'cp949', 'cp950',
|
||||
'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
|
||||
'gb2312', 'gbk', 'gb18030',
|
||||
'iso2022-kr', 'johab',
|
||||
'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
|
||||
'utf-7',
|
||||
]
|
||||
for encoding in unsupported_ascii_compatible_encodings:
|
||||
self.assertRaises(ValueError, ET.XML, bxml(encoding))
|
||||
|
||||
unsupported_ascii_incompatible_encodings = [
|
||||
'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
|
||||
'utf_32', 'utf_32_be', 'utf_32_le',
|
||||
]
|
||||
for encoding in unsupported_ascii_incompatible_encodings:
|
||||
self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
|
||||
|
||||
self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
|
||||
self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
|
||||
|
||||
def test_methods(self):
|
||||
# Test serialization methods.
|
||||
|
||||
|
|
|
@ -3136,47 +3136,6 @@ expat_pi_handler(XMLParserObject* self, const XML_Char* target_in,
|
|||
}
|
||||
}
|
||||
|
||||
static int
|
||||
expat_unknown_encoding_handler(XMLParserObject *self, const XML_Char *name,
|
||||
XML_Encoding *info)
|
||||
{
|
||||
PyObject* u;
|
||||
unsigned char s[256];
|
||||
int i;
|
||||
void *data;
|
||||
unsigned int kind;
|
||||
|
||||
memset(info, 0, sizeof(XML_Encoding));
|
||||
|
||||
for (i = 0; i < 256; i++)
|
||||
s[i] = i;
|
||||
|
||||
u = PyUnicode_Decode((char*) s, 256, name, "replace");
|
||||
if (!u)
|
||||
return XML_STATUS_ERROR;
|
||||
if (PyUnicode_READY(u))
|
||||
return XML_STATUS_ERROR;
|
||||
|
||||
if (PyUnicode_GET_LENGTH(u) != 256) {
|
||||
Py_DECREF(u);
|
||||
return XML_STATUS_ERROR;
|
||||
}
|
||||
|
||||
kind = PyUnicode_KIND(u);
|
||||
data = PyUnicode_DATA(u);
|
||||
for (i = 0; i < 256; i++) {
|
||||
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||
if (ch != Py_UNICODE_REPLACEMENT_CHARACTER)
|
||||
info->map[i] = ch;
|
||||
else
|
||||
info->map[i] = -1;
|
||||
}
|
||||
|
||||
Py_DECREF(u);
|
||||
|
||||
return XML_STATUS_OK;
|
||||
}
|
||||
|
||||
/* -------------------------------------------------------------------- */
|
||||
|
||||
static PyObject *
|
||||
|
@ -3278,7 +3237,7 @@ xmlparser_init(PyObject *self, PyObject *args, PyObject *kwds)
|
|||
);
|
||||
EXPAT(SetUnknownEncodingHandler)(
|
||||
self_xp->parser,
|
||||
(XML_UnknownEncodingHandler) expat_unknown_encoding_handler, NULL
|
||||
EXPAT(DefaultUnknownEncodingHandler), NULL
|
||||
);
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -1111,53 +1111,49 @@ static struct PyMethodDef xmlparse_methods[] = {
|
|||
Make it as simple as possible.
|
||||
*/
|
||||
|
||||
/* Table of the 256 byte values in ascending order, followed by one
   terminating zero byte. */
static char template_buffer[257];

/* Fill template_buffer so that entry n holds byte value n for n in
   0..255, then store the terminating zero in the last slot. */
static void
init_template_buffer(void)
{
    int pos = 0;

    while (pos < 256) {
        template_buffer[pos] = pos;
        pos++;
    }
    /* pos is 256 here: the slot just past the table proper. */
    template_buffer[pos] = 0;
}
|
||||
|
||||
static int
|
||||
PyUnknownEncodingHandler(void *encodingHandlerData,
|
||||
const XML_Char *name,
|
||||
XML_Encoding *info)
|
||||
{
|
||||
PyUnicodeObject *_u_string = NULL;
|
||||
int result = 0;
|
||||
static unsigned char template_buffer[256] = {0};
|
||||
PyObject* u;
|
||||
int i;
|
||||
int kind;
|
||||
void *data;
|
||||
unsigned int kind;
|
||||
|
||||
/* Yes, supports only 8bit encodings */
|
||||
_u_string = (PyUnicodeObject *)
|
||||
PyUnicode_Decode(template_buffer, 256, name, "replace");
|
||||
|
||||
if (_u_string == NULL || PyUnicode_READY(_u_string) == -1)
|
||||
return result;
|
||||
|
||||
kind = PyUnicode_KIND(_u_string);
|
||||
data = PyUnicode_DATA(_u_string);
|
||||
|
||||
for (i = 0; i < 256; i++) {
|
||||
/* Stupid to access directly, but fast */
|
||||
Py_UCS4 c = PyUnicode_READ(kind, data, i);
|
||||
if (c == Py_UNICODE_REPLACEMENT_CHARACTER)
|
||||
info->map[i] = -1;
|
||||
else
|
||||
info->map[i] = c;
|
||||
if (template_buffer[1] == 0) {
|
||||
for (i = 0; i < 256; i++)
|
||||
template_buffer[i] = i;
|
||||
}
|
||||
|
||||
u = PyUnicode_Decode((char*) template_buffer, 256, name, "replace");
|
||||
if (u == NULL || PyUnicode_READY(u))
|
||||
return XML_STATUS_ERROR;
|
||||
|
||||
if (PyUnicode_GET_LENGTH(u) != 256) {
|
||||
Py_DECREF(u);
|
||||
PyErr_SetString(PyExc_ValueError,
|
||||
"multi-byte encodings are not supported");
|
||||
return XML_STATUS_ERROR;
|
||||
}
|
||||
|
||||
kind = PyUnicode_KIND(u);
|
||||
data = PyUnicode_DATA(u);
|
||||
for (i = 0; i < 256; i++) {
|
||||
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||
if (ch != Py_UNICODE_REPLACEMENT_CHARACTER)
|
||||
info->map[i] = ch;
|
||||
else
|
||||
info->map[i] = -1;
|
||||
}
|
||||
|
||||
info->data = NULL;
|
||||
info->convert = NULL;
|
||||
info->release = NULL;
|
||||
result = 1;
|
||||
Py_DECREF(_u_string);
|
||||
return result;
|
||||
Py_DECREF(u);
|
||||
|
||||
return XML_STATUS_OK;
|
||||
}
|
||||
|
||||
|
||||
|
@ -1752,7 +1748,6 @@ MODULE_INITFUNC(void)
|
|||
Py_BuildValue("(iii)", info.major,
|
||||
info.minor, info.micro));
|
||||
}
|
||||
init_template_buffer();
|
||||
/* XXX When Expat supports some way of figuring out how it was
|
||||
compiled, this should check and set native_encoding
|
||||
appropriately.
|
||||
|
@ -1938,6 +1933,7 @@ MODULE_INITFUNC(void)
|
|||
capi.SetUserData = XML_SetUserData;
|
||||
capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler;
|
||||
capi.SetEncoding = XML_SetEncoding;
|
||||
capi.DefaultUnknownEncodingHandler = PyUnknownEncodingHandler;
|
||||
|
||||
/* export using capsule */
|
||||
capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);
|
||||
|
|
Loading…
Reference in New Issue