Share pyexpat's unknown-encoding handler with _elementtree.

pyexpat exports PyUnknownEncodingHandler through the PyExpat_CAPI struct
as DefaultUnknownEncodingHandler, and _elementtree forwards to it instead
of keeping its own copy.  The handler builds a 256-entry byte -> code
point map by decoding the bytes 0..255 with the named codec; if the
decoded string is not exactly 256 characters long it raises ValueError
("multi-byte encodings are not supported").

diff --git a/Include/pyexpat.h b/Include/pyexpat.h
--- a/Include/pyexpat.h
+++ b/Include/pyexpat.h
@@ -6,7 +6,7 @@
 #define PyExpat_CAPI_MAGIC "pyexpat.expat_CAPI 1.0"
 #define PyExpat_CAPSULE_NAME "pyexpat.expat_CAPI"
 
-struct PyExpat_CAPI 
+struct PyExpat_CAPI
 {
     char* magic; /* set to PyExpat_CAPI_MAGIC */
     int size; /* set to sizeof(struct PyExpat_CAPI) */
@@ -46,6 +46,8 @@
     void (*SetStartDoctypeDeclHandler)(XML_Parser parser,
                                        XML_StartDoctypeDeclHandler start);
     enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding);
+    int (*DefaultUnknownEncodingHandler)(
+        void *encodingHandlerData, const XML_Char *name, XML_Encoding *info);
     /* always add new stuff to the end! */
 };
 
diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py
--- a/Lib/test/test_xml_etree.py
+++ b/Lib/test/test_xml_etree.py
@@ -681,6 +681,52 @@
         check("cp437", '\u221a')
         check("mac-roman", '\u02da')
 
+        def xml(encoding):
+            return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
+        def bxml(encoding):
+            return xml(encoding).encode(encoding)
+        supported_encodings = [
+            'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
+            'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
+            'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
+            'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
+            'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852',
+            'cp855', 'cp856', 'cp857', 'cp858', 'cp860', 'cp861', 'cp862',
+            'cp863', 'cp865', 'cp866', 'cp869', 'cp874', 'cp1006', 'cp1250',
+            'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', 'cp1256',
+            'cp1257', 'cp1258',
+            'mac-cyrillic', 'mac-greek', 'mac-iceland', 'mac-latin2',
+            'mac-roman', 'mac-turkish',
+            'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004',
+            'iso2022-jp-3', 'iso2022-jp-ext',
+            'koi8-r', 'koi8-u',
+            'hz', 'ptcp154',
+        ]
+        for encoding in supported_encodings:
+            self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
+
+        unsupported_ascii_compatible_encodings = [
+            'big5', 'big5hkscs',
+            'cp932', 'cp949', 'cp950',
+            'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
+            'gb2312', 'gbk', 'gb18030',
+            'iso2022-kr', 'johab',
+            'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
+            'utf-7',
+        ]
+        for encoding in unsupported_ascii_compatible_encodings:
+            self.assertRaises(ValueError, ET.XML, bxml(encoding))
+
+        unsupported_ascii_incompatible_encodings = [
+            'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
+            'utf_32', 'utf_32_be', 'utf_32_le',
+        ]
+        for encoding in unsupported_ascii_incompatible_encodings:
+            self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
+
+        self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
+        self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
+
     def test_methods(self):
         # Test serialization methods.
 
diff --git a/Modules/_elementtree.c b/Modules/_elementtree.c
--- a/Modules/_elementtree.c
+++ b/Modules/_elementtree.c
@@ -3095,44 +3095,13 @@
 }
 
 static int
-expat_unknown_encoding_handler(XMLParserObject *self, const XML_Char *name,
+expat_unknown_encoding_handler(void *encodingHandlerData,
+                               const XML_Char *name,
                                XML_Encoding *info)
 {
-    PyObject* u;
-    unsigned char s[256];
-    int i;
-    void *data;
-    unsigned int kind;
-
-    memset(info, 0, sizeof(XML_Encoding));
-
-    for (i = 0; i < 256; i++)
-        s[i] = i;
-
-    u = PyUnicode_Decode((char*) s, 256, name, "replace");
-    if (!u)
-        return XML_STATUS_ERROR;
-    if (PyUnicode_READY(u))
-        return XML_STATUS_ERROR;
-
-    if (PyUnicode_GET_LENGTH(u) != 256) {
-        Py_DECREF(u);
-        return XML_STATUS_ERROR;
-    }
-
-    kind = PyUnicode_KIND(u);
-    data = PyUnicode_DATA(u);
-    for (i = 0; i < 256; i++) {
-        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
-        if (ch != Py_UNICODE_REPLACEMENT_CHARACTER)
-            info->map[i] = ch;
-        else
-            info->map[i] = -1;
-    }
-
-    Py_DECREF(u);
-
-    return XML_STATUS_OK;
+    return EXPAT(DefaultUnknownEncodingHandler)(encodingHandlerData,
+                                                name,
+                                                info);
 }
 
 /* -------------------------------------------------------------------- */
diff --git a/Modules/pyexpat.c b/Modules/pyexpat.c
--- a/Modules/pyexpat.c
+++ b/Modules/pyexpat.c
@@ -1111,53 +1111,54 @@
     Make it as simple as possible.
 */
 
-static char template_buffer[257];
-
-static void
-init_template_buffer(void)
-{
-    int i;
-    for (i = 0; i < 256; i++) {
-        template_buffer[i] = i;
-    }
-    template_buffer[256] = 0;
-}
-
 static int
 PyUnknownEncodingHandler(void *encodingHandlerData,
                          const XML_Char *name,
                          XML_Encoding *info)
 {
-    PyUnicodeObject *_u_string = NULL;
-    int result = 0;
+    static unsigned char template_buffer[256] = {0};
+    PyObject* u;
     int i;
-    int kind;
     void *data;
+    unsigned int kind;
 
-    /* Yes, supports only 8bit encodings */
-    _u_string = (PyUnicodeObject *)
-        PyUnicode_Decode(template_buffer, 256, name, "replace");
+    /* Fill the identity byte table lazily, on first use. */
+    if (template_buffer[1] == 0) {
+        for (i = 0; i < 256; i++)
+            template_buffer[i] = i;
+    }
 
-    if (_u_string == NULL || PyUnicode_READY(_u_string) == -1)
-        return result;
+    u = PyUnicode_Decode((char*) template_buffer, 256, name, "replace");
+    if (u == NULL || PyUnicode_READY(u)) {
+        /* u may be non-NULL here if only PyUnicode_READY() failed. */
+        Py_XDECREF(u);
+        return XML_STATUS_ERROR;
+    }
 
-    kind = PyUnicode_KIND(_u_string);
-    data = PyUnicode_DATA(_u_string);
+    /* Each byte must decode to exactly one character. */
+    if (PyUnicode_GET_LENGTH(u) != 256) {
+        Py_DECREF(u);
+        PyErr_SetString(PyExc_ValueError,
+                        "multi-byte encodings are not supported");
+        return XML_STATUS_ERROR;
+    }
+    kind = PyUnicode_KIND(u);
+    data = PyUnicode_DATA(u);
 
     for (i = 0; i < 256; i++) {
-        /* Stupid to access directly, but fast */
-        Py_UCS4 c = PyUnicode_READ(kind, data, i);
-        if (c == Py_UNICODE_REPLACEMENT_CHARACTER)
+        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
+        if (ch != Py_UNICODE_REPLACEMENT_CHARACTER)
+            info->map[i] = ch;
+        else
             info->map[i] = -1;
-        else
-            info->map[i] = c;
     }
+
     info->data = NULL;
     info->convert = NULL;
     info->release = NULL;
-    result = 1;
-    Py_DECREF(_u_string);
-    return result;
+    Py_DECREF(u);
+
+    return XML_STATUS_OK;
 }
 
 
@@ -1752,7 +1753,6 @@
                            Py_BuildValue("(iii)", info.major,
                                          info.minor, info.micro));
     }
-    init_template_buffer();
     /* XXX When Expat supports some way of figuring out how it was
        compiled, this should check and set native_encoding
        appropriately.
@@ -1938,6 +1938,7 @@
     capi.SetUserData = XML_SetUserData;
     capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler;
     capi.SetEncoding = XML_SetEncoding;
+    capi.DefaultUnknownEncodingHandler = PyUnknownEncodingHandler;
 
     /* export using capsule */
     capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);