diff -r 37794a002517 Lib/test/test_xml_etree.py --- a/Lib/test/test_xml_etree.py Sat May 25 11:33:13 2013 -0400 +++ b/Lib/test/test_xml_etree.py Sun May 26 10:09:14 2013 +0300 @@ -686,8 +686,11 @@ def bxml(encoding): return xml(encoding).encode(encoding) supported_encodings = [ + # expat built-in encodings 'ascii', 'utf-8', 'utf-8-sig', 'utf-16', 'utf-16be', 'utf-16le', - 'iso8859-1', 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5', + 'iso8859-1', + # 8-bit Python encodings + 'iso8859-2', 'iso8859-3', 'iso8859-4', 'iso8859-5', 'iso8859-6', 'iso8859-7', 'iso8859-8', 'iso8859-9', 'iso8859-10', 'iso8859-13', 'iso8859-14', 'iso8859-15', 'iso8859-16', 'cp437', 'cp720', 'cp737', 'cp775', 'cp850', 'cp852', @@ -701,28 +704,26 @@ 'iso2022-jp-3', 'iso2022-jp-ext', 'koi8-r', 'koi8-u', 'hz', 'ptcp154', - ] - for encoding in supported_encodings: - self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'') - - unsupported_ascii_compatible_encodings = [ + # multibyte Python encodings 'big5', 'big5hkscs', 'cp932', 'cp949', 'cp950', - 'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr', - 'gb2312', 'gbk', 'gb18030', - 'iso2022-kr', 'johab', + 'euc-jp', 'euc-jis-2004', 'euc-jisx0213', + 'gb2312', 'gbk', + 'johab', 'shift-jis', 'shift-jis-2004', 'shift-jisx0213', - 'utf-7', ] - for encoding in unsupported_ascii_compatible_encodings: - self.assertRaises(ValueError, ET.XML, bxml(encoding)) - - unsupported_ascii_incompatible_encodings = [ + # Every encoding listed above must be accepted by the parser. + for encoding in supported_encodings: + self.assertEqual(ET.tostring(ET.XML(bxml(encoding))), b'') + + unsupported_encodings = [ + 'utf_32', 'utf_32_be', 'utf_32_le', 'cp037', 'cp424', 'cp500', 'cp864', 'cp875', 'cp1026', 'cp1140', - 'utf_32', 'utf_32_be', 'utf_32_le', + 'euc-kr', 'gb18030', 'iso2022-kr', 'utf-7', ] - for encoding in unsupported_ascii_incompatible_encodings: - self.assertRaises(ET.ParseError, ET.XML, bxml(encoding)) + for encoding in unsupported_encodings: + self.assertRaises((ET.ParseError, ValueError), + ET.XML, bxml(encoding)) self.assertRaises(ValueError, ET.XML, xml('undefined').encode('ascii')) self.assertRaises(LookupError, 
ET.XML, xml('xxx').encode('ascii')) diff -r 37794a002517 Modules/pyexpat.c --- a/Modules/pyexpat.c Sat May 25 11:33:13 2013 -0400 +++ b/Modules/pyexpat.c Sun May 26 10:09:14 2013 +0300 @@ -1111,17 +1111,64 @@ Make it as simple as possible. */ +/* built-in maps for multibyte encodings */ + +struct pyexpat_encoding { + const char *name; + int map[256]; +}; + +#include "pyexpat_encodings.h" + +static int +pyexpat_encoding_convert(void *data, const char *s) +{ /* expat 'convert' callback: decode the multibyte sequence starting at s with the Python codec enc->name; returns its code point, or -1 with a Python exception set */ + int c; + const struct pyexpat_encoding *enc = (const struct pyexpat_encoding *)data; + PyObject* u = PyUnicode_Decode(s, -enc->map[(unsigned char)*s], + enc->name, NULL); /* map[] holds -n for the lead byte of an n-byte sequence, so negate it to obtain the byte length */ + if (u == NULL) + return -1; + if (PyUnicode_GET_LENGTH(u) != 1) { + c = -1; + PyErr_Format(PyExc_ValueError, + "'%s' decoder returns wrong number of characters", + enc->name); + } + else + c = PyUnicode_READ_CHAR(u, 0); + Py_DECREF(u); + return c; +} + static int PyUnknownEncodingHandler(void *encodingHandlerData, const XML_Char *name, XML_Encoding *info) { static unsigned char template_buffer[256] = {0}; + /* ASCII characters that can appear in a well-formed XML document + except the characters $@\^`{}~ */ + static const unsigned char compulsory_chars[] = + "\t\n\r !\"#%&'()*+,-./0123456789:;<=>?" 
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_" + "abcdefghijklmnopqrstuvwxyz|\x7f"; PyObject* u; int i; void *data; unsigned int kind; + for (i = 0; i < Py_ARRAY_LENGTH(pyexpat_encodings); i++) { /* encodings with a generated multibyte map are handled before the generic 8-bit path */ + if (PyOS_stricmp(name, pyexpat_encodings[i].name) == 0) { /* PyOS_stricmp: portable case-insensitive compare (strcasecmp is POSIX-only) */ + memcpy(info->map, pyexpat_encodings[i].map, 256 * sizeof(int)); + info->data = (void *)&(pyexpat_encodings[i]); + info->convert = pyexpat_encoding_convert; + info->release = NULL; + + return XML_STATUS_OK; + } + } + if (template_buffer[1] == 0) { for (i = 0; i < 256; i++) template_buffer[i] = i; @@ -1133,8 +1180,8 @@ if (PyUnicode_GET_LENGTH(u) != 256) { Py_DECREF(u); - PyErr_SetString(PyExc_ValueError, - "multi-byte encodings are not supported"); + PyErr_Format(PyExc_ValueError, + "multi-byte encoding '%s' is not supported", name); return XML_STATUS_ERROR; } @@ -1148,6 +1195,16 @@ info->map[i] = -1; } + for (i = 0; i < sizeof(compulsory_chars) - 1; i++) { /* reject codecs that remap any ASCII character XML markup relies on */ + unsigned char b = compulsory_chars[i]; + if (PyUnicode_READ(kind, data, b) != b) { + Py_DECREF(u); + PyErr_Format(PyExc_ValueError, + "encoding '%s' is not ASCII compatible", name); + return XML_STATUS_ERROR; + } + } + info->data = NULL; info->convert = NULL; info->release = NULL; diff -r 37794a002517 Modules/pyexpat_encodings.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Modules/pyexpat_encodings.h Sun May 26 10:09:14 2013 +0300 @@ -0,0 +1,480 @@ +/* this file was generated by Tools/unicode/genexpatencodings.py */ + +static const struct pyexpat_encoding pyexpat_encodings[] = { + {"big5", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 
113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -1, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -1, -1, -1, -1, -1, -1, + }}, + {"big5hkscs", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + -1, -1, -1, -1, -1, -1, -1, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -1, + }}, + {"cp932", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 
+ 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + 128, -2, -2, -2, -2, -1, -1, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + 63728, 65377, 65378, 65379, 65380, 65381, 65382, 65383, + 65384, 65385, 65386, 65387, 65388, 65389, 65390, 65391, + 65392, 65393, 65394, 65395, 65396, 65397, 65398, 65399, + 65400, 65401, 65402, 65403, 65404, 65405, 65406, 65407, + 65408, 65409, 65410, 65411, 65412, 65413, 65414, 65415, + 65416, 65417, 65418, 65419, 65420, 65421, 65422, 65423, + 65424, 65425, 65426, 65427, 65428, 65429, 65430, 65431, + 65432, 65433, 65434, 65435, 65436, 65437, 65438, 65439, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -1, -1, -2, -2, -1, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -1, -1, -1, 63729, 63730, 63731, + }}, + {"cp949", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + -1, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -1, -2, -2, -2, -2, -2, 
-2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -1, -1, + }}, + {"cp950", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -1, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -1, -1, -1, -1, -1, -1, + }}, + {"euc-jp", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -2, -3, + -1, -1, -1, 
-1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -2, -2, -2, -2, -2, -2, -2, + -2, -1, -1, -1, -1, -1, -1, -1, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + }}, + {"euc-jis-2004", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -2, -3, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -1, + }}, + {"euc-jisx0213", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 
91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -2, -3, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -1, + }}, + {"gb2312", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -1, -1, -1, -1, -1, -1, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -1, -1, -1, -1, -1, -1, -1, -1, + }}, + {"gbk", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, 
+ 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + -1, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -1, + }}, + {"johab", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + -1, -1, -1, -1, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -1, -1, -1, -1, + -1, -2, -2, -2, -2, -2, -2, -1, + -2, -2, -2, -2, -2, -2, -2, 
-2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -1, -1, -1, -1, -1, -1, + }}, + {"shift-jis", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 92, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 126, 127, + -1, -2, -2, -2, -2, -1, -1, -1, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -1, 65377, 65378, 65379, 65380, 65381, 65382, 65383, + 65384, 65385, 65386, 65387, 65388, 65389, 65390, 65391, + 65392, 65393, 65394, 65395, 65396, 65397, 65398, 65399, + 65400, 65401, 65402, 65403, 65404, 65405, 65406, 65407, + 65408, 65409, 65410, 65411, 65412, 65413, 65414, 65415, + 65416, 65417, 65418, 65419, 65420, 65421, 65422, 65423, + 65424, 65425, 65426, 65427, 65428, 65429, 65430, 65431, + 65432, 65433, 65434, 65435, 65436, 65437, 65438, 65439, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, + }}, + {"shift-jis-2004", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 165, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 
124, 125, 8254, 127, + -1, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -1, 65377, 65378, 65379, 65380, 65381, 65382, 65383, + 65384, 65385, 65386, 65387, 65388, 65389, 65390, 65391, + 65392, 65393, 65394, 65395, 65396, 65397, 65398, 65399, + 65400, 65401, 65402, 65403, 65404, 65405, 65406, 65407, + 65408, 65409, 65410, 65411, 65412, 65413, 65414, 65415, + 65416, 65417, 65418, 65419, 65420, 65421, 65422, 65423, + 65424, 65425, 65426, 65427, 65428, 65429, 65430, 65431, + 65432, 65433, 65434, 65435, 65436, 65437, 65438, 65439, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -1, -1, -1, + }}, + {"shift-jisx0213", { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, + 80, 81, 82, 83, 84, 85, 86, 87, + 88, 89, 90, 91, 165, 93, 94, 95, + 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 124, 125, 8254, 127, + -1, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -1, 65377, 65378, 65379, 65380, 65381, 65382, 65383, + 65384, 65385, 65386, 65387, 65388, 65389, 65390, 65391, + 65392, 65393, 65394, 65395, 65396, 65397, 65398, 65399, + 65400, 65401, 65402, 65403, 65404, 65405, 65406, 65407, + 65408, 65409, 65410, 65411, 65412, 65413, 65414, 65415, + 65416, 65417, 65418, 65419, 65420, 65421, 65422, 65423, + 65424, 65425, 65426, 65427, 65428, 65429, 65430, 65431, + 65432, 65433, 65434, 65435, 65436, 65437, 65438, 65439, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, 
-2, -2, -2, + -2, -2, -2, -2, -2, -2, -2, -2, + -2, -2, -2, -2, -2, -1, -1, -1, + }}, +}; diff -r 37794a002517 Tools/unicode/genexpatencodings.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Tools/unicode/genexpatencodings.py Sun May 26 10:09:14 2013 +0300 @@ -0,0 +1,76 @@ +#!/usr/bin/env python3 +# This script generates encoding maps for multibyte encoding support +# in pyexpat. + +import sys + +MAXCODE = 0xFFFF # sys.maxunicode +MAXLEN = 4 # longest multibyte sequence accepted for a generated map + +# ASCII characters that can appear in a well-formed XML document +# except the characters $@\^`{}~ +compulsory_chars = (b'\t\n\r' + + bytes(b for b in range(32, 128) + if b not in b'$@\\^`{}~')) +compulsory_str = compulsory_chars.decode('ascii') + +encodings = sys.argv[1:] +if not encodings: + encodings = [ + 'big5', 'big5hkscs', + 'cp932', 'cp949', 'cp950', + 'euc-jp', 'euc-jis-2004', 'euc-jisx0213', 'euc-kr', + 'gb2312', 'gbk', 'gb18030', + 'iso2022-kr', 'johab', + 'shift-jis', 'shift-jis-2004', 'shift-jisx0213', + 'utf-7', + ] + +print('''\ +/* this file was generated by Tools/unicode/genexpatencodings.py */ + +static const struct pyexpat_encoding pyexpat_encodings[] = {''') + +for encoding in encodings: + try: + if (compulsory_str.encode(encoding) != compulsory_chars or + compulsory_chars.decode(encoding) != compulsory_str): + print('%s is not ASCII compatible' % encoding, + file=sys.stderr) + continue + except UnicodeError: + print('%s is not ASCII compatible' % encoding, + file=sys.stderr) + continue + + tab = [None] * 256 + max_len = [-1] * 256 + min_len = [MAXLEN + 1] * 256 + for c in range(MAXCODE + 1): + try: + d = chr(c).encode(encoding) + except UnicodeEncodeError: + continue + b = d[0] + if tab[b] is None: + tab[b] = c + max_len[b] = max(max_len[b], len(d)) + min_len[b] = min(min_len[b], len(d)) + if max(max_len) == 1: + print('%s is 8-bit encoding' % encoding, file=sys.stderr) + continue + if max(max_len) > MAXLEN: + print('%s is not supported' % encoding, file=sys.stderr) + continue + if any(n != m 
for n, m in zip(min_len, max_len) if m > 0): + print('%s is not supported' % encoding, file=sys.stderr) + continue + info_map = [c if n == 1 else -n if n > 1 else -1 + for c, n in zip(tab, max_len)] + + print(' {"%s", {' % encoding.lower()) + for i in range(0, 256, 8): + print(' %s,' % ', '.join(str(x) for x in info_map[i: i + 8])) + print(' }},') + +print('};')