diff -r 540a9c69c2ea Lib/test/test_xml_etree.py
--- a/Lib/test/test_xml_etree.py Fri Sep 13 19:53:08 2013 +0200
+++ b/Lib/test/test_xml_etree.py Sat Sep 14 21:44:09 2013 +0300
@@ -680,13 +680,18 @@
check("iso-8859-15", '\u20ac')
check("cp437", '\u221a')
check("mac-roman", '\u02da')
+ check('shift-jis-2004', '\u203e\u3406\uff66')
+ check('euc-jis-2004', '\u3406\uff66')
def xml(encoding):
    # Minimal XML document whose declaration names *encoding*.  The
    # declaration is what makes the parser select that encoding, so the
    # template must contain the %s placeholder: an empty template raises
    # TypeError ("not all arguments converted") as soon as it is formatted.
    return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
def bxml(encoding):
    # Bytes form of the xml() document, encoded with the very encoding
    # that the document declares.
    document = xml(encoding)
    return document.encode(encoding)
supported_encodings = [
- 'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
+ # expat built-in encodings
+ 'iso-8859-1', 'us-ascii', 'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
+ # 8-bit Python encodings
+ 'ascii', 'latin1',
'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
@@ -701,32 +706,30 @@
'iso2022-jp-3', 'iso2022-jp-ext',
'koi8-r', 'koi8-u',
'hz', 'ptcp154',
+ # multibyte Python encodings
+ 'big5', 'big5hkscs',
+ 'cp932', 'cp949', 'cp950',
+ 'euc-jp', 'euc-jis-2004', 'euc-jisx0213',
+ 'gb2312', 'gbk',
+ 'johab',
+ 'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
]
for encoding in supported_encodings:
self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'')
- unsupported_ascii_compatible_encodings = [
- 'big5', 'big5hkscs',
- 'cp932', 'cp949', 'cp950',
- 'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
- 'gb2312', 'gbk', 'gb18030',
- 'iso2022-kr', 'johab',
- 'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
- 'utf-7',
+ unsupported_encodings = [
+ 'utf_32', 'utf_32_be', 'utf_32_le',
+ 'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
+ 'euc-kr', 'gb18030', 'iso2022-kr', 'utf-7',
]
- for encoding in unsupported_ascii_compatible_encodings:
- self.assertRaises(ValueError, ET.XML, bxml(encoding))
-
- unsupported_ascii_incompatible_encodings = [
- 'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
- 'utf_32', 'utf_32_be', 'utf_32_le',
- ]
- for encoding in unsupported_ascii_incompatible_encodings:
- self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
+ for encoding in unsupported_encodings:
+ self.assertRaises((ET.ParseError, ValueError),
+ ET.XML, bxml(encoding))
self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
+
def test_methods(self):
# Test serialization methods.
diff -r 540a9c69c2ea Modules/pyexpat.c
--- a/Modules/pyexpat.c Fri Sep 13 19:53:08 2013 +0200
+++ b/Modules/pyexpat.c Sat Sep 14 21:44:09 2013 +0300
@@ -1128,15 +1128,152 @@
/* pyexpat international encoding support.
- Make it as simple as possible.
*/
+/* Cache used by pyexpat_encoding_create(), keyed by the codec's encoder
+   object.  A value is either the bytes object holding the lookup tables
+   built for that encoding, or None for an encoding that was found to be
+   unusable.  The dict itself is created during module initialization. */
+static PyObject *encodings_cache = NULL;
+
+/* Build the lookup tables needed to decode the multi-byte encoding `name`,
+   or fetch them from encodings_cache.
+
+   Every code point below 0x10000 is encoded with the codec's encoder and the
+   resulting byte sequences (1 to 4 bytes) are stored in a trie of 256-entry
+   int tables kept inside a single bytes object.  Within a table an entry
+   >= 0 is a code point, -1 marks an unused byte and an entry < -1 is the
+   negated distance (in ints) to the table used for the next byte.  Entries
+   of the first table that lead to another table additionally carry the
+   sequence length in the low 8 bits of their magnitude.
+
+   map[] is filled in for the caller: the code point for single-byte
+   characters, -n for the lead byte of an n-byte sequence, -1 otherwise.
+
+   Returns a borrowed reference to the bytes object (encodings_cache owns
+   it), or NULL if the encoding cannot be handled.  NULL may be returned
+   with or without an exception set. */
+static PyObject *
+pyexpat_encoding_create(const XML_Char *name, int map[256])
+{
+    int max_len[256];
+    int c, k, allocated = 256, tabsize = 256;
+    PyObject *encoder = NULL, *data = NULL, *args = NULL, *result = NULL;
+    int *table;
+
+    encoder = PyCodec_Encoder(name);
+    if (encoder == NULL)
+        goto onError;
+
+    /* Borrowed reference; a None entry marks a known-unusable encoding. */
+    data = PyDict_GetItem(encodings_cache, encoder);
+    if (data != NULL) {
+        Py_DECREF(encoder);
+        if (data == Py_None)
+            return NULL;
+        table = (int *)PyBytes_AS_STRING(data);
+        memcpy(map, table, 256 * sizeof(int));
+        for (k = 0; k < 256; k++) {
+            /* Strip the table offset, keeping only -(sequence length). */
+            if (map[k] < -1)
+                map[k] = -((-map[k]) & 0xff);
+        }
+        return data;
+    }
+
+    data = PyBytes_FromStringAndSize(NULL, 256 * sizeof(int));
+    if (data == NULL)
+        goto onError;
+    table = (int *)PyBytes_AS_STRING(data);
+    /* A single argument tuple is reused for every encoder call. */
+    args = PyTuple_New(1);
+    if (args == NULL)
+        goto onError;
+    memset(max_len, 0, sizeof(max_len));
+    for (k = 0; k < 256; k++)
+        table[k] = -1;
+
+    for (c = 0; c < 0x10000; c++) {
+        PyObject *u, *bytes;
+        const unsigned char *encoded;
+        Py_ssize_t n;
+        int i;
+        int *t;
+        unsigned char b;
+
+        u = PyUnicode_FromOrdinal(c);
+        if (u == NULL)
+            goto onError;
+        PyTuple_SET_ITEM(args, 0, u);
+        result = PyObject_CallObject(encoder, args);
+        /* Take the item back right away so that the tuple never holds a
+           dangling pointer and can be released normally on every exit
+           path (including those where args was never allocated). */
+        PyTuple_SET_ITEM(args, 0, NULL);
+        Py_DECREF(u);
+        if (result == NULL) {
+            /* Unencodable characters are simply left out of the tables. */
+            if (!PyErr_ExceptionMatches(PyExc_UnicodeEncodeError))
+                goto onError;
+            PyErr_Clear();
+            continue;
+        }
+        if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 2)
+            goto fail;
+        bytes = PyTuple_GET_ITEM(result, 0);
+        if (!PyBytes_Check(bytes))
+            goto fail;
+        n = PyBytes_GET_SIZE(bytes);
+        /* An empty result has no lead byte to index the tables with. */
+        if (n < 1 || n > 4)
+            goto fail;
+        encoded = (const unsigned char *)PyBytes_AS_STRING(bytes);
+        b = encoded[0];
+        /* map[] can record only one sequence length per lead byte. */
+        if (!max_len[b])
+            max_len[b] = (int)n;
+        else if (max_len[b] != n)
+            goto fail;
+
+        /* Walk the trie along all bytes but the last one, appending a new
+           256-entry table wherever a branch does not exist yet. */
+        t = table;
+        for (i = 1; i < n; i++) {
+            assert(t[b] < 0);
+            if (t[b] == -1) {
+                /* Negated distance from this table to the new one. */
+                t[b] = (int)(t - table) - tabsize;
+                if (tabsize == allocated) {
+                    allocated += allocated;
+                    if (_PyBytes_Resize(&data, sizeof(int) * allocated) < 0)
+                        goto onError;
+                    table = (int *)PyBytes_AS_STRING(data);
+                }
+                t = table + tabsize;
+                for (k = 0; k < 256; k++)
+                    t[k] = -1;
+                tabsize += 256;
+            }
+            else
+                t -= t[b];
+            b = encoded[i];
+        }
+        /* The last byte selects the leaf holding the code point. */
+        t[b] = c;
+        Py_CLEAR(result);
+    }
+    Py_CLEAR(args);
+
+    /* Trim the buffer to the tables actually in use. */
+    if (_PyBytes_Resize(&data, sizeof(int) * tabsize) < 0)
+        goto onError;
+    /* The resize may have moved the buffer. */
+    table = (int *)PyBytes_AS_STRING(data);
+    memcpy(map, table, 256 * sizeof(int));
+    for (k = 0; k < 256; k++) {
+        if (table[k] < -1) {
+            map[k] = -max_len[k];
+            /* Offsets are multiples of 256, so the length fits in the low
+               8 bits without disturbing the offset. */
+            table[k] -= max_len[k];
+        }
+    }
+    if (PyDict_SetItem(encodings_cache, encoder, data) < 0)
+        goto onError;
+    Py_DECREF(encoder);
+    /* The cache now owns the tables; hand out a borrowed reference, exactly
+       as on the cache-hit path above. */
+    Py_DECREF(data);
+    return data;
+
+fail:
+    /* Remember that this encoding cannot be handled.  PyDict_SetItem() takes
+       its own reference to None; a failure to store it only costs a retry
+       on the next request. */
+    (void)PyDict_SetItem(encodings_cache, encoder, Py_None);
+onError:
+    Py_XDECREF(data);
+    Py_XDECREF(args);
+    Py_XDECREF(encoder);
+    Py_XDECREF(result);
+    return NULL;
+}
+
+
+/* Decode one multi-byte sequence starting at `s` using the tables stored in
+   the bytes object `data` (as built by pyexpat_encoding_create()).
+   Returns the table entry reached: a code point, or -1 for a sequence that
+   has no entry. */
+static int
+pyexpat_encoding_convert(void *data, const char *s)
+{
+    const unsigned char *p = (const unsigned char *)s;
+    const int *node = (const int *)PyBytes_AS_STRING((PyObject *)data);
+    int entry;
+
+    /* An entry below -1 points at the table for the next byte; its low
+       8 bits are masked off before it is used as an offset. */
+    for (entry = node[*p++]; entry < -1; entry = node[*p++])
+        node += (-entry) & ~0xff;
+    return entry;
+}
+
+
+/* Drop the reference to the tables object that was stored in
+   XML_Encoding.data alongside this callback. */
+static void
+pyexpat_encoding_release(void *data)
+{
+    PyObject *tables = (PyObject *)data;
+
+    Py_DECREF(tables);
+}
+
+
static int
PyUnknownEncodingHandler(void *encodingHandlerData,
const XML_Char *name,
XML_Encoding *info)
{
static unsigned char template_buffer[256] = {0};
+ /* ASCII characters that can appear in a well-formed XML document
+ except the characters $@\^`{}~ */
+ static const unsigned char compulsory_chars[] =
+ "\t\n\r !\"#%&'()*+,-./0123456789:;<=>?"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_"
+ "abcdefghijklmnopqrstuvwxyz|\x7f";
PyObject* u;
int i;
void *data;
@@ -1158,9 +1295,16 @@
if (PyUnicode_GET_LENGTH(u) != 256) {
Py_DECREF(u);
- PyErr_SetString(PyExc_ValueError,
- "multi-byte encodings are not supported");
- return XML_STATUS_ERROR;
+ info->data = pyexpat_encoding_create(name, info->map);
+ if (info->data == NULL) {
+ PyErr_Format(PyExc_ValueError,
+ "multi-byte encoding '%s' is not supported", name);
+ return XML_STATUS_ERROR;
+ }
+ Py_INCREF(info->data);
+ info->convert = pyexpat_encoding_convert;
+ info->release = pyexpat_encoding_release;
+ return XML_STATUS_OK;
}
kind = PyUnicode_KIND(u);
@@ -1173,6 +1317,16 @@
info->map[i] = -1;
}
+ for (i = 0; i < sizeof(compulsory_chars) - 1; i++) {
+ unsigned char b = compulsory_chars[i];
+ if (PyUnicode_READ(kind, data, b) != b) {
+ Py_DECREF(u);
+ PyErr_Format(PyExc_ValueError,
+ "encoding '%s' is not ASCII compatible", name);
+ return XML_STATUS_ERROR;
+ }
+ }
+
info->data = NULL;
info->convert = NULL;
info->release = NULL;
@@ -1746,6 +1900,12 @@
if (PyType_Ready(&Xmlparsetype) < 0)
return NULL;
+ if (encodings_cache == NULL) {
+ encodings_cache = PyDict_New();
+ if (encodings_cache == NULL)
+ return NULL;
+ }
+
/* Create the module and add the functions */
m = PyModule_Create(&pyexpatmodule);
if (m == NULL)