diff -r 37794a002517 Lib/test/test_xml_etree.py
--- a/Lib/test/test_xml_etree.py Sat May 25 11:33:13 2013 -0400
+++ b/Lib/test/test_xml_etree.py Sun May 26 13:30:07 2013 +0300
@@ -680,13 +680,18 @@
check("iso-8859-15", '\u20ac')
check("cp437", '\u221a')
check("mac-roman", '\u02da')
+ check('shift-jis-2004', '\u203e\u3406\uff66')
+ check('euc-jis-2004', '\u3406\uff66')
def xml(encoding):
    return "<?xml version='1.0' encoding='%s'?><xml />" % encoding
def bxml(encoding):
    return xml(encoding).encode(encoding)
supported_encodings = [
- 'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le',
+ # expat built-in encodings
+ 'iso-8859-1', 'us-ascii', 'utf-8', 'utf-16', 'utf-16be', 'utf-16le',
+ # 8-bit Python encodings
+ 'ascii', 'latin1',
'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5',
'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10',
'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16',
@@ -701,32 +706,30 @@
'iso2022-jp-3', 'iso2022-jp-ext',
'koi8-r', 'koi8-u',
'hz', 'ptcp154',
+ # multibyte Python encodings
+ 'big5', 'big5hkscs',
+ 'cp932', 'cp949', 'cp950',
+ 'euc-jp', 'euc-jis-2004', 'euc-jisx0213',
+ 'gb2312', 'gbk',
+ 'johab',
+ 'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
]
for encoding in supported_encodings:
    self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'<xml />')
- unsupported_ascii_compatible_encodings = [
- 'big5', 'big5hkscs',
- 'cp932', 'cp949', 'cp950',
- 'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
- 'gb2312', 'gbk', 'gb18030',
- 'iso2022-kr', 'johab',
- 'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
- 'utf-7',
+ unsupported_encodings = [
+ 'utf_32', 'utf_32_be', 'utf_32_le',
+ 'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
+ 'euc-kr', 'gb18030', 'iso2022-kr', 'utf-7',
]
- for encoding in unsupported_ascii_compatible_encodings:
- self.assertRaises(ValueError, ET.XML, bxml(encoding))
-
- unsupported_ascii_incompatible_encodings = [
- 'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140',
- 'utf_32', 'utf_32_be', 'utf_32_le',
- ]
- for encoding in unsupported_ascii_incompatible_encodings:
- self.assertRaises(ET.ParseError, ET.XML, bxml(encoding))
+ for encoding in unsupported_encodings:
+ self.assertRaises((ET.ParseError, ValueError),
+ ET.XML, bxml(encoding))
self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii'))
self.assertRaises(LookupError, ET.XML, xml('xxx').encode('ascii'))
+
def test_methods(self):
# Test serialization methods.
diff -r 37794a002517 Modules/pyexpat.c
--- a/Modules/pyexpat.c Sat May 25 11:33:13 2013 -0400
+++ b/Modules/pyexpat.c Sun May 26 13:30:07 2013 +0300
@@ -1111,17 +1111,64 @@
Make it as simple as possible.
*/
+/* built-in maps for multibyte encodings */
+
+struct pyexpat_encoding {
+ const char *name; /* codec name: compared with the requested encoding and passed to PyUnicode_Decode */
+ int map[256]; /* one entry per first byte; copied into XML_Encoding.map by the handler */
+};
+
+#include "pyexpat_encodings.h"
+
+static int
+pyexpat_encoding_convert(void *data, const char *s) /* XML_Encoding convert hook; data is a pyexpat_encodings entry */
+{
+ int c; /* result: the decoded code point, or -1 after an error */
+ const struct pyexpat_encoding *enc = (const struct pyexpat_encoding *)data;
+ PyObject* u = PyUnicode_Decode(s, -enc->map[(unsigned char)*s], /* decode -map[first byte] bytes starting at s */
+ enc->name, NULL);
+ if (u == NULL)
+ return -1; /* decoding failed; the Python exception is left set */
+ if (PyUnicode_GET_LENGTH(u) != 1) { /* the byte sequence must decode to exactly one character */
+ c = -1;
+ PyErr_Format(PyExc_ValueError,
+ "'%s' decoder returns wrong number of characters",
+ enc->name);
+ }
+ else
+ c = PyUnicode_READ_CHAR(u, 0);
+ Py_DECREF(u); /* u is released on both the error and the success path */
+ return c;
+}
+
static int
PyUnknownEncodingHandler(void *encodingHandlerData,
const XML_Char *name,
XML_Encoding *info)
{
static unsigned char template_buffer[256] = {0};
+ /* ASCII characters that can appear in a well-formed XML document
+ except the characters $@\^`{}~ */
+ static const unsigned char compulsory_chars[] =
+ "\t\n\r !\"#%&'()*+,-./0123456789:;<=>?"
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_"
+ "abcdefghijklmnopqrstuvwxyz|\x7f";
PyObject* u;
int i;
void *data;
unsigned int kind;
+    /* Built-in multibyte maps (PyOS_stricmp: strcasecmp is POSIX-only). */
+    for (i = 0; i < (int)Py_ARRAY_LENGTH(pyexpat_encodings); i++) {
+        if (PyOS_stricmp(name, pyexpat_encodings[i].name) == 0) {
+            memcpy(info->map, pyexpat_encodings[i].map, 256 * sizeof(int));
+            info->data = (void *)&(pyexpat_encodings[i]);  /* read back by the convert hook */
+            info->convert = pyexpat_encoding_convert;
+            info->release = NULL;  /* the entry is static: nothing to free */
+            return XML_STATUS_OK;
+        }
+    }
+
if (template_buffer[1] == 0) {
for (i = 0; i < 256; i++)
template_buffer[i] = i;
@@ -1133,8 +1180,8 @@
if (PyUnicode_GET_LENGTH(u) != 256) {
Py_DECREF(u);
- PyErr_SetString(PyExc_ValueError,
- "multi-byte encodings are not supported");
+ PyErr_Format(PyExc_ValueError,
+ "multi-byte encoding '%s' is not supported", name);
return XML_STATUS_ERROR;
}
@@ -1148,6 +1195,16 @@
info->map[i] = -1;
}
+ for (i = 0; i < sizeof(compulsory_chars) - 1; i++) {
+ unsigned char b = compulsory_chars[i];
+ if (PyUnicode_READ(kind, data, b) != b) {
+ Py_DECREF(u);
+ PyErr_Format(PyExc_ValueError,
+ "encoding '%s' is not ASCII compatible", name);
+ return XML_STATUS_ERROR;
+ }
+ }
+
info->data = NULL;
info->convert = NULL;
info->release = NULL;
diff -r 37794a002517 Modules/pyexpat_encodings.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Modules/pyexpat_encodings.h Sun May 26 13:30:07 2013 +0300
@@ -0,0 +1,480 @@
+/* this file was generated by Tools/unicode/genexpatencodings.py */
+
+static const struct pyexpat_encoding pyexpat_encodings[] = {
+ {"big5", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -1, -1, -1, -1, -1, -1,
+ }},
+ {"big5hkscs", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ -1, -1, -1, -1, -1, -1, -1, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -1,
+ }},
+ {"cp932", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ 128, -2, -2, -2, -2, -1, -1, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ 63728, 65377, 65378, 65379, 65380, 65381, 65382, 65383,
+ 65384, 65385, 65386, 65387, 65388, 65389, 65390, 65391,
+ 65392, 65393, 65394, 65395, 65396, 65397, 65398, 65399,
+ 65400, 65401, 65402, 65403, 65404, 65405, 65406, 65407,
+ 65408, 65409, 65410, 65411, 65412, 65413, 65414, 65415,
+ 65416, 65417, 65418, 65419, 65420, 65421, 65422, 65423,
+ 65424, 65425, 65426, 65427, 65428, 65429, 65430, 65431,
+ 65432, 65433, 65434, 65435, 65436, 65437, 65438, 65439,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -1, -1, -2, -2, -1,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -1, -1, -1, 63729, 63730, 63731,
+ }},
+ {"cp949", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -1, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -1, -1,
+ }},
+ {"cp950", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -1, -1, -1, -1, -1, -1,
+ }},
+ {"euc-jp", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -2, -3,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -1, -1, -1, -1, -1, -1, -1,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ }},
+ {"euc-jis-2004", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -2, -3,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -1,
+ }},
+ {"euc-jisx0213", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -2, -3,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -1,
+ }},
+ {"gb2312", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -1, -1, -1, -1, -1, -1,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ }},
+ {"gbk", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -1,
+ }},
+ {"johab", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ -1, -1, -1, -1, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -1, -1, -1, -1,
+ -1, -2, -2, -2, -2, -2, -2, -1,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -1, -1, -1, -1, -1, -1,
+ }},
+ {"shift-jis", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ -1, -2, -2, -2, -2, -1, -1, -1,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -1, 65377, 65378, 65379, 65380, 65381, 65382, 65383,
+ 65384, 65385, 65386, 65387, 65388, 65389, 65390, 65391,
+ 65392, 65393, 65394, 65395, 65396, 65397, 65398, 65399,
+ 65400, 65401, 65402, 65403, 65404, 65405, 65406, 65407,
+ 65408, 65409, 65410, 65411, 65412, 65413, 65414, 65415,
+ 65416, 65417, 65418, 65419, 65420, 65421, 65422, 65423,
+ 65424, 65425, 65426, 65427, 65428, 65429, 65430, 65431,
+ 65432, 65433, 65434, 65435, 65436, 65437, 65438, 65439,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1,
+ }},
+ {"shift-jis-2004", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 165, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 8254, 127,
+ -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -1, 65377, 65378, 65379, 65380, 65381, 65382, 65383,
+ 65384, 65385, 65386, 65387, 65388, 65389, 65390, 65391,
+ 65392, 65393, 65394, 65395, 65396, 65397, 65398, 65399,
+ 65400, 65401, 65402, 65403, 65404, 65405, 65406, 65407,
+ 65408, 65409, 65410, 65411, 65412, 65413, 65414, 65415,
+ 65416, 65417, 65418, 65419, 65420, 65421, 65422, 65423,
+ 65424, 65425, 65426, 65427, 65428, 65429, 65430, 65431,
+ 65432, 65433, 65434, 65435, 65436, 65437, 65438, 65439,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -1, -1, -1,
+ }},
+ {"shift-jisx0213", {
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 165, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 8254, 127,
+ -1, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -1, 65377, 65378, 65379, 65380, 65381, 65382, 65383,
+ 65384, 65385, 65386, 65387, 65388, 65389, 65390, 65391,
+ 65392, 65393, 65394, 65395, 65396, 65397, 65398, 65399,
+ 65400, 65401, 65402, 65403, 65404, 65405, 65406, 65407,
+ 65408, 65409, 65410, 65411, 65412, 65413, 65414, 65415,
+ 65416, 65417, 65418, 65419, 65420, 65421, 65422, 65423,
+ 65424, 65425, 65426, 65427, 65428, 65429, 65430, 65431,
+ 65432, 65433, 65434, 65435, 65436, 65437, 65438, 65439,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -2, -2, -2,
+ -2, -2, -2, -2, -2, -1, -1, -1,
+ }},
+};
diff -r 37794a002517 Tools/unicode/genexpatencodings.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Tools/unicode/genexpatencodings.py Sun May 26 13:30:07 2013 +0300
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# Print (to stdout) the C table of expat encoding maps that pyexpat uses for
+# multibyte encodings.  Codec names may be passed as command-line arguments.
+
+import sys
+
+MAXCODE = 0xFFFF  # last code point probed (the BMP); cf. sys.maxunicode
+MAXLEN = 4  # longest multibyte sequence that gets a table entry
+
+# ASCII characters that can appear in a well-formed XML document
+# except the characters $@\^`{}~
+compulsory_chars = (b'\t\n\r' +
+                    bytes(b for b in range(32, 128)
+                          if b not in b'$@\\^`{}~'))
+compulsory_str = compulsory_chars.decode('ascii')
+
+encodings = sys.argv[1:]
+if not encodings:
+    encodings = [
+        'big5', 'big5hkscs',
+        'cp932', 'cp949', 'cp950',
+        'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr',
+        'gb2312', 'gbk', 'gb18030',
+        'iso2022-kr', 'johab',
+        'shift-jis', 'shift-jis-2004', 'shift-jisx0213',
+        'utf-7',
+    ]
+
+print('''\
+/* this file was generated by Tools/unicode/genexpatencodings.py */
+
+static const struct pyexpat_encoding pyexpat_encodings[] = {''')
+
+for encoding in encodings:
+    try:
+        if (compulsory_str.encode(encoding) != compulsory_chars or
+            compulsory_chars.decode(encoding) != compulsory_str):
+            print('%s is not ASCII compatible' % encoding,
+                  file=sys.stderr)
+            continue
+    except UnicodeError:  # decode() can fail here as well as encode()
+        print('%s is not ASCII compatible' % encoding,
+              file=sys.stderr)
+        continue
+
+    tab = [None] * 256  # first code point seen for each leading byte
+    max_len = [-1] * 256  # longest encoding that starts with each byte
+    min_len = [MAXLEN + 1] * 256  # shortest encoding that starts with each byte
+    for c in range(MAXCODE + 1):
+        try:
+            d = chr(c).encode(encoding)
+        except UnicodeEncodeError:
+            continue
+        b = d[0]
+        if tab[b] is None:
+            tab[b] = c
+        max_len[b] = max(max_len[b], len(d))
+        min_len[b] = min(min_len[b], len(d))
+    if max(max_len) == 1:
+        print('%s is 8-bit encoding' % encoding, file=sys.stderr)
+        continue
+    if max(max_len) > MAXLEN:
+        print('%s is not supported' % encoding, file=sys.stderr)
+        continue
+    if any(n != m for n, m in zip(min_len, max_len) if m > 0):  # mixed lengths
+        print('%s is not supported' % encoding, file=sys.stderr)
+        continue
+    info_map = [c if n == 1 else -n if n > 1 else -1
+                for c, n in zip(tab, max_len)]  # -1 marks a byte never seen
+
+    print(' {"%s", {' % encoding.lower())
+    for i in range(0, 256, 8):
+        print(' %s,' % ', '.join(str(x) for x in info_map[i: i + 8]))
+    print(' }},')
+
+print('};')