diff -r 540a9c69c2ea Lib/test/test_xml_etree.py
--- a/Lib/test/test_xml_etree.py	Fri Sep 13 19:53:08 2013 +0200
+++ b/Lib/test/test_xml_etree.py	Sat Sep 14 21:44:09 2013 +0300
@@ -680,13 +680,18 @@
         check("iso-8859-15", '\u20ac')
         check("cp437", '\u221a')
         check("mac-roman", '\u02da')
+        check('shift-jis-2004', '\u203e\u3406\uff66')
+        check('euc-jis-2004', '\u3406\uff66')
 
         def xml(encoding):
             return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
         def bxml(encoding):
             return xml(encoding).encode(encoding)
         supported_encodings = [
-            'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
+            # expat built-in encodings
+            'iso-8859-1', 'us-ascii', 'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
+            # 8-bit Python encodings
+            'ascii', 'latin1',
             'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
             'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
             'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
@@ -701,32 +706,30 @@
             'iso2022-jp-3', 'iso2022-jp-ext',
             'koi8-r', 'koi8-u',
             'hz', 'ptcp154',
+            # multibyte Python encodings
+            'big5', 'big5hkscs',
+            'cp932', 'cp949', 'cp950',
+            'euc-jp', 'euc-jis-2004', 'euc-jisx0213',
+            'gb2312', 'gbk',
+            'johab',
+            'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
         ]
         for encoding in supported_encodings:
             self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
 
-        unsupported_ascii_compatible_encodings = [
-            'big5', 'big5hkscs',
-            'cp932', 'cp949', 'cp950',
-            'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
-            'gb2312', 'gbk', 'gb18030',
-            'iso2022-kr', 'johab',
-            'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
-            'utf-7',
+        unsupported_encodings = [
+            'utf_32', 'utf_32_be', 'utf_32_le',
+            'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
+            'euc-kr', 'gb18030', 'iso2022-kr', 'utf-7',
         ]
-        for encoding in unsupported_ascii_compatible_encodings:
-            self.assertRaises(ValueError, ET.XML, bxml(encoding))
-
-        unsupported_ascii_incompatible_encodings = [
-            'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
-            'utf_32', 'utf_32_be', 'utf_32_le',
-        ]
-        for encoding in unsupported_ascii_incompatible_encodings:
-            self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
+        for encoding in unsupported_encodings:
+            self.assertRaises((ET.ParseError, ValueError),
+                              ET.XML, bxml(encoding))
 
         self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
         self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
 
+
     def test_methods(self):
         # Test serialization methods.
 
diff -r 540a9c69c2ea Modules/pyexpat.c
--- a/Modules/pyexpat.c	Fri Sep 13 19:53:08 2013 +0200
+++ b/Modules/pyexpat.c	Sat Sep 14 21:44:09 2013 +0300
@@ -1128,15 +1128,171 @@
 
 /*
     pyexpat international encoding support.
-    Make it as simple as possible.
 */
 
+static PyObject *encodings_cache = NULL;
+
+/* Build, or fetch from encodings_cache, the decoding table for the
+   multi-byte encoding `name` and fill expat's first-byte `map`.
+
+   The table is a bytes object holding consecutive 256-entry int
+   sub-tables, one per byte position reached.  An entry >= 0 is a code
+   point, -1 marks an invalid byte, and a value < -1 links to the
+   sub-table for the next byte.  Links in the first sub-table also carry
+   the sequence length in their low 8 bits.
+
+   Returns a borrowed reference owned by encodings_cache, or NULL when
+   the encoding cannot be handled (an exception may or may not be set). */
+static PyObject *
+pyexpat_encoding_create(const XML_Char *name, int map[256])
+{
+    int max_len[256];
+    int c, k, allocated = 256, tabsize = 256;
+    PyObject *encoder = NULL, *data = NULL, *result = NULL;
+    int *table;
+    encoder = PyCodec_Encoder(name);
+    if (encoder == NULL)
+        goto onError;
+    data = PyDict_GetItem(encodings_cache, encoder);
+    if (data != NULL) {
+        Py_DECREF(encoder);
+        if (data == Py_None)
+            return NULL;
+        table = (int *)PyBytes_AS_STRING(data);
+        memcpy(map, table, 256 * sizeof(int));
+        for (k = 0; k < 256; k++) {
+            if (map[k] < -1)
+                map[k] = -((-map[k]) & 0xff);
+        }
+        return data;
+    }
+    data = PyBytes_FromStringAndSize(NULL, 256 * sizeof(int));
+    if (data == NULL)
+        goto onError;
+    table = (int *)PyBytes_AS_STRING(data);
+    memset(max_len, 0, sizeof(max_len));
+    for (k = 0; k < 256; k++)
+        table[k] = -1;
+    for (c = 0; c < 0x10000; c++) {
+        PyObject *u, *bytes;
+        const unsigned char *encoded;
+        Py_ssize_t n;
+        int i;
+        int *t;
+        unsigned char b;
+
+        u = PyUnicode_FromOrdinal(c);
+        if (u == NULL)
+            goto onError;
+        result = PyObject_CallFunctionObjArgs(encoder, u, NULL);
+        Py_DECREF(u);
+        if (result == NULL) {
+            /* Code points the codec cannot encode stay unmapped. */
+            if (!PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
+                goto onError;
+            PyErr_Clear();
+            continue;
+        }
+        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 2)
+            goto fail;
+        bytes = PyTuple_GET_ITEM(result, 0);
+        if (!PyBytes_Check(bytes))
+            goto fail;
+        n = PyBytes_GET_SIZE(bytes);
+        /* n == 0 must be rejected too: the walk below starts at i == 1
+           and would never reach i == n. */
+        if (n < 1 || n > 4)
+            goto fail;
+        encoded = (unsigned char *)PyBytes_AS_STRING(bytes);
+        b = encoded[0];
+        if (!max_len[b])
+            max_len[b] = n;
+        else if (max_len[b] != n)
+            goto fail;
+        t = table;
+        for (i = 1; i != n; b = encoded[i++]) {
+            assert(t[b] < 0);
+            if (t[b] == -1) {
+                t[b] = t - table - tabsize;
+                if (tabsize == allocated) {
+                    allocated += allocated;
+                    if (_PyBytes_Resize(&data, sizeof(int) * allocated) < 0)
+                        goto onError;
+                    table = (int *)PyBytes_AS_STRING(data);
+                }
+                t = table + tabsize;
+                for (k = 0; k < 256; k++)
+                    t[k] = -1;
+                tabsize += 256;
+            }
+            else
+                t -= t[b];
+        }
+        t[b] = c;
+        Py_CLEAR(result);
+    }
+    if (_PyBytes_Resize(&data, sizeof(int) * tabsize) < 0)
+        goto onError;
+    /* The resize may have moved the buffer; refresh the pointer. */
+    table = (int *)PyBytes_AS_STRING(data);
+    memcpy(map, table, 256 * sizeof(int));
+    for (k = 0; k < 256; k++) {
+        if (table[k] < -1) {
+            map[k] = -max_len[k];
+            table[k] -= max_len[k];
+        }
+    }
+    if (PyDict_SetItem(encodings_cache, encoder, data) < 0)
+        goto onError;
+    Py_DECREF(encoder);
+    /* The cache now keeps the table alive, so drop our own reference and
+       return a borrowed one, exactly like the cache-hit path. */
+    Py_DECREF(data);
+    return data;
+fail:
+    /* Cache the failure so the scan is not repeated.  PyDict_SetItem()
+       takes its own reference to Py_None. */
+    PyDict_SetItem(encodings_cache, encoder, Py_None);
+onError:
+    Py_XDECREF(data);
+    Py_XDECREF(encoder);
+    Py_XDECREF(result);
+    return NULL;
+}
+
+/* expat `convert` callback: follow the sub-table links for the bytes at
+   `s` and return the decoded code point, or -1 for an invalid sequence. */
+static int
+pyexpat_encoding_convert(void *data, const char *s)
+{
+    const int *t = (const int *)PyBytes_AS_STRING((PyObject *)data);
+    while (1) {
+        int c = t[(unsigned char)*s++];
+        if (c >= -1)
+            return c;
+        t += (-c) & ~0xff;
+    }
+}
+
+/* expat `release` callback: drop the reference held in info->data. */
+static void
+pyexpat_encoding_release(void *data)
+{
+    Py_DECREF(data);
+}
+
 static int
 PyUnknownEncodingHandler(void *encodingHandlerData,
                          const XML_Char *name,
                          XML_Encoding *info)
 {
     static unsigned char template_buffer[256] = {0};
+    /* ASCII characters that can appear in a well-formed XML document
+       except the characters $@\^`{}~ */
+    static const unsigned char compulsory_chars[] =
+        "\t\n\r !\"#%&'()*+,-./0123456789:;<=>?"
+        "ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_"
+        "abcdefghijklmnopqrstuvwxyz|\x7f";
     PyObject* u;
     int i;
     void *data;
@@ -1158,9 +1314,16 @@
 
     if (PyUnicode_GET_LENGTH(u) != 256) {
         Py_DECREF(u);
-        PyErr_SetString(PyExc_ValueError,
-                        "multi-byte encodings are not supported");
-        return XML_STATUS_ERROR;
+        info->data = pyexpat_encoding_create(name, info->map);
+        if (info->data == NULL) {
+            PyErr_Format(PyExc_ValueError,
+                         "multi-byte encoding '%s' is not supported", name);
+            return XML_STATUS_ERROR;
+        }
+        Py_INCREF(info->data);
+        info->convert = pyexpat_encoding_convert;
+        info->release = pyexpat_encoding_release;
+        return XML_STATUS_OK;
     }
 
     kind = PyUnicode_KIND(u);
@@ -1173,6 +1336,16 @@
             info->map[i] = -1;
     }
 
+    for (i = 0; i < sizeof(compulsory_chars) - 1; i++) {
+        unsigned char b = compulsory_chars[i];
+        if (PyUnicode_READ(kind, data, b) != b) {
+            Py_DECREF(u);
+            PyErr_Format(PyExc_ValueError,
+                         "encoding '%s' is not ASCII compatible", name);
+            return XML_STATUS_ERROR;
+        }
+    }
+
     info->data = NULL;
     info->convert = NULL;
     info->release = NULL;
@@ -1746,6 +1919,12 @@
     if (PyType_Ready(&Xmlparsetype) < 0)
         return NULL;
 
+    if (encodings_cache == NULL) {
+        encodings_cache = PyDict_New();
+        if (encodings_cache == NULL)
+            return NULL;
+    }
+
     /* Create the module and add the functions */
     m = PyModule_Create(&pyexpatmodule);
     if (m == NULL)