Support text (str) input in xml.sax.parse() and xml.sax.parseString().

Character streams and str data are accepted alongside byte streams and
bytes, and pyexpat's Parse() takes either a str or a buffer object.

NOTE(review): the XML declaration literal in xml_str() and the <money> root
element in ParseTest.data were restored after markup had been stripped from
this copy of the patch; confirm both against the original change.

diff -r f3e348ab08c6 Lib/test/test_sax.py
--- a/Lib/test/test_sax.py Wed Jan 16 23:25:41 2013 +0100
+++ b/Lib/test/test_sax.py Thu Jan 17 18:47:20 2013 +0200
@@ -13,8 +13,8 @@
 from xml.sax.expatreader import create_parser
 from xml.sax.handler import feature_namespaces
 from xml.sax.xmlreader import InputSource, AttributesImpl, AttributesNSImpl
-from io import StringIO
-from test.support import findfile, run_unittest
+from io import BytesIO, StringIO
+from test.support import findfile, run_unittest, TESTFN, unlink
 import unittest
 
 TEST_XMLFILE = findfile("test.xml", subdir="xmltestdata")
@@ -79,6 +79,129 @@
         self.assertEqual(attrs["attr"], "val")
         self.assertEqual(attrs.getQNameByName("attr"), "attr")
 
+
+def xml_str(doc, encoding=None):
+    if encoding is None:
+        return doc
+    return '<?xml version="1.0" encoding="%s"?>\n%s' % (encoding, doc)
+
+def xml_bytes(doc, encoding, decl_encoding=...):
+    if decl_encoding is ...:
+        decl_encoding = encoding
+    return xml_str(doc, decl_encoding).encode(encoding, 'xmlcharrefreplace')
+
+def make_xml_file(doc, encoding, decl_encoding=...):
+    if decl_encoding is ...:
+        decl_encoding = encoding
+    with open(TESTFN, 'w', encoding=encoding, errors='xmlcharrefreplace') as f:
+        f.write(xml_str(doc, decl_encoding))
+
+
+class ParseTest(unittest.TestCase):
+    data = '<money value="$\xa3\u20ac\U0001017b">$\xa3\u20ac\U0001017b</money>'
+
+    def tearDown(self):
+        unlink(TESTFN)
+
+    def check_parse(self, f):
+        from xml.sax import parse
+        result = StringIO()
+        parse(f, XMLGenerator(result, 'utf-8'))
+        self.assertEqual(result.getvalue(), xml_str(self.data, 'utf-8'))
+
+    def test_parse_text(self):
+        encodings = ('us-ascii', 'iso-8859-1', 'utf-8',
+                     'utf-16', 'utf-16le', 'utf-16be')
+        for encoding in encodings:
+            self.check_parse(StringIO(xml_str(self.data, encoding)))
+            make_xml_file(self.data, encoding)
+            with open(TESTFN, 'r', encoding=encoding) as f:
+                self.check_parse(f)
+            self.check_parse(StringIO(self.data))
+            make_xml_file(self.data, encoding, None)
+            with open(TESTFN, 'r', encoding=encoding) as f:
+                self.check_parse(f)
+
+    def test_parse_bytes(self):
+        # UTF-8 is default encoding, US-ASCII is compatible with UTF-8,
+        # UTF-16 is autodetected
+        encodings = ('us-ascii', 'utf-8', 'utf-16', 'utf-16le', 'utf-16be')
+        for encoding in encodings:
+            self.check_parse(BytesIO(xml_bytes(self.data, encoding)))
+            make_xml_file(self.data, encoding)
+            self.check_parse(TESTFN)
+            with open(TESTFN, 'rb') as f:
+                self.check_parse(f)
+            self.check_parse(BytesIO(xml_bytes(self.data, encoding, None)))
+            make_xml_file(self.data, encoding, None)
+            self.check_parse(TESTFN)
+            with open(TESTFN, 'rb') as f:
+                self.check_parse(f)
+        # accept UTF-8 with BOM
+        self.check_parse(BytesIO(xml_bytes(self.data, 'utf-8-sig', 'utf-8')))
+        make_xml_file(self.data, 'utf-8-sig', 'utf-8')
+        self.check_parse(TESTFN)
+        with open(TESTFN, 'rb') as f:
+            self.check_parse(f)
+        self.check_parse(BytesIO(xml_bytes(self.data, 'utf-8-sig', None)))
+        make_xml_file(self.data, 'utf-8-sig', None)
+        self.check_parse(TESTFN)
+        with open(TESTFN, 'rb') as f:
+            self.check_parse(f)
+        # accept data with declared encoding
+        self.check_parse(BytesIO(xml_bytes(self.data, 'iso-8859-1')))
+        make_xml_file(self.data, 'iso-8859-1')
+        self.check_parse(TESTFN)
+        with open(TESTFN, 'rb') as f:
+            self.check_parse(f)
+        # fail on non-UTF-8 incompatible data without declared encoding
+        with self.assertRaises(SAXException):
+            self.check_parse(BytesIO(xml_bytes(self.data, 'iso-8859-1', None)))
+        make_xml_file(self.data, 'iso-8859-1', None)
+        with self.assertRaises(SAXException):
+            self.check_parse(TESTFN)
+        with open(TESTFN, 'rb') as f:
+            with self.assertRaises(SAXException):
+                self.check_parse(f)
+
+    def test_parse_InputSource(self):
+        # accept data without declared but with explicitly specified encoding
+        make_xml_file(self.data, 'iso-8859-1', None)
+        with open(TESTFN, 'rb') as f:
+            input = InputSource()
+            input.setByteStream(f)
+            input.setEncoding('iso-8859-1')
+            self.check_parse(input)
+
+    def check_parseString(self, s):
+        from xml.sax import parseString
+        result = StringIO()
+        parseString(s, XMLGenerator(result, 'utf-8'))
+        self.assertEqual(result.getvalue(), xml_str(self.data, 'utf-8'))
+
+    def test_parseString_text(self):
+        encodings = ('us-ascii', 'iso-8859-1', 'utf-8',
+                     'utf-16', 'utf-16le', 'utf-16be')
+        for encoding in encodings:
+            self.check_parseString(xml_str(self.data, encoding))
+        self.check_parseString(self.data)
+
+    def test_parseString_bytes(self):
+        # UTF-8 is default encoding, US-ASCII is compatible with UTF-8,
+        # UTF-16 is autodetected
+        encodings = ('us-ascii', 'utf-8', 'utf-16', 'utf-16le', 'utf-16be')
+        for encoding in encodings:
+            self.check_parseString(xml_bytes(self.data, encoding))
+            self.check_parseString(xml_bytes(self.data, encoding, None))
+        # accept UTF-8 with BOM
+        self.check_parseString(xml_bytes(self.data, 'utf-8-sig', 'utf-8'))
+        self.check_parseString(xml_bytes(self.data, 'utf-8-sig', None))
+        # accept data with declared encoding
+        self.check_parseString(xml_bytes(self.data, 'iso-8859-1'))
+        # fail on non-UTF-8 incompatible data without declared encoding
+        with self.assertRaises(SAXException):
+            self.check_parseString(xml_bytes(self.data, 'iso-8859-1', None))
+
 class MakeParserTest(unittest.TestCase):
     def test_make_parser2(self):
         # Creating parsers several times in a row should succeed.
@@ -796,6 +919,7 @@
 
 def test_main():
     run_unittest(MakeParserTest,
+                 ParseTest,
                  SaxutilsTest,
                  XmlgenTest,
                  ExpatReaderTest,
diff -r f3e348ab08c6 Lib/xml/sax/__init__.py
--- a/Lib/xml/sax/__init__.py Wed Jan 16 23:25:41 2013 +0100
+++ b/Lib/xml/sax/__init__.py Thu Jan 17 18:47:20 2013 +0200
@@ -33,8 +33,6 @@
     parser.parse(source)
 
 def parseString(string, handler, errorHandler=ErrorHandler()):
-    from io import BytesIO
-
     if errorHandler is None:
         errorHandler = ErrorHandler()
     parser = make_parser()
@@ -42,7 +40,12 @@
     parser.setErrorHandler(errorHandler)
 
     inpsrc = InputSource()
-    inpsrc.setByteStream(BytesIO(string))
+    if isinstance(string, str):
+        from io import StringIO
+        inpsrc.setCharacterStream(StringIO(string))
+    else:
+        from io import BytesIO
+        inpsrc.setByteStream(BytesIO(string))
     parser.parse(inpsrc)
 
 # this is the parser list used by the make_parser function if no
diff -r f3e348ab08c6 Lib/xml/sax/expatreader.py
--- a/Lib/xml/sax/expatreader.py Wed Jan 16 23:25:41 2013 +0100
+++ b/Lib/xml/sax/expatreader.py Thu Jan 17 18:47:20 2013 +0200
@@ -219,9 +219,14 @@
         self._parsing = 0
         # break cycle created by expat handlers pointing to our methods
         self._parser = None
-        bs = self._source.getByteStream()
-        if bs is not None:
-            bs.close()
+        try:
+            file = self._source.getCharacterStream()
+            if file is not None:
+                file.close()
+        finally:
+            file = self._source.getByteStream()
+            if file is not None:
+                file.close()
 
     def _reset_cont_handler(self):
         self._parser.ProcessingInstructionHandler = \
diff -r f3e348ab08c6 Lib/xml/sax/saxutils.py
--- a/Lib/xml/sax/saxutils.py Wed Jan 16 23:25:41 2013 +0100
+++ b/Lib/xml/sax/saxutils.py Thu Jan 17 18:47:20 2013 +0200
@@ -311,11 +311,14 @@
     elif hasattr(source, "read"):
         f = source
         source = xmlreader.InputSource()
-        source.setByteStream(f)
+        if isinstance(f.read(0), str):
+            source.setCharacterStream(f)
+        else:
+            source.setByteStream(f)
         if hasattr(f, "name"):
             source.setSystemId(f.name)
 
-    if source.getByteStream() is None:
+    if source.getCharacterStream() is None and source.getByteStream() is None:
         sysid = source.getSystemId()
         basehead = os.path.dirname(os.path.normpath(base))
         sysidfilename = os.path.join(basehead, sysid)
diff -r f3e348ab08c6 Lib/xml/sax/xmlreader.py
--- a/Lib/xml/sax/xmlreader.py Wed Jan 16 23:25:41 2013 +0100
+++ b/Lib/xml/sax/xmlreader.py Thu Jan 17 18:47:20 2013 +0200
@@ -117,7 +117,9 @@
         source = saxutils.prepare_input_source(source)
 
         self.prepareParser(source)
-        file = source.getByteStream()
+        file = source.getCharacterStream()
+        if file is None:
+            file = source.getByteStream()
         buffer = file.read(self._bufsize)
         while buffer:
             self.feed(buffer)
diff -r f3e348ab08c6 Modules/pyexpat.c
--- a/Modules/pyexpat.c Wed Jan 16 23:25:41 2013 +0100
+++ b/Modules/pyexpat.c Thu Jan 17 18:47:20 2013 +0200
@@ -778,17 +778,55 @@
 "Parse(data[, isfinal])\n\
 Parse XML data. `isfinal' should be true at end of input.");
 
+/* XML_Parse() takes an int length, so large inputs are fed in chunks. */
+#define MAX_CHUNK_SIZE (1 << 20)
+
 static PyObject *
 xmlparse_Parse(xmlparseobject *self, PyObject *args)
 {
-    char *s;
-    int slen;
+    PyObject *data;
     int isFinal = 0;
+    const char *s;
+    Py_ssize_t slen;
 
-    if (!PyArg_ParseTuple(args, "s#|i:Parse", &s, &slen, &isFinal))
+    if (!PyArg_ParseTuple(args, "O|i:Parse", &data, &isFinal))
         return NULL;
 
-    return get_parse_result(self, XML_Parse(self->itself, s, slen, isFinal));
+    if (PyUnicode_Check(data)) {
+        s = PyUnicode_AsUTF8AndSize(data, &slen);
+        if (s == NULL)
+            return NULL;
+        /* Explicitly set UTF-8 encoding. Return code ignored. */
+        (void)XML_SetEncoding(self->itself, "utf-8");
+        while (slen > MAX_CHUNK_SIZE) {
+            if (!XML_Parse(self->itself, s, MAX_CHUNK_SIZE, 0))
+                return get_parse_result(self, 0);
+            s += MAX_CHUNK_SIZE;
+            slen -= MAX_CHUNK_SIZE;
+        }
+        return get_parse_result(self,
+                                XML_Parse(self->itself, s, (int)slen, isFinal));
+    }
+    else {
+        Py_buffer view;
+        int rc;
+
+        if (PyObject_GetBuffer(data, &view, PyBUF_SIMPLE) < 0)
+            return NULL;
+        s = view.buf;
+        slen = view.len;
+        while (slen > MAX_CHUNK_SIZE) {
+            if (!XML_Parse(self->itself, s, MAX_CHUNK_SIZE, 0)) {
+                PyBuffer_Release(&view);
+                return get_parse_result(self, 0);
+            }
+            s += MAX_CHUNK_SIZE;
+            slen -= MAX_CHUNK_SIZE;
+        }
+        rc = XML_Parse(self->itself, s, (int)slen, isFinal);
+        PyBuffer_Release(&view);
+        return get_parse_result(self, rc);
+    }
 }
 
 /* File reading copied from cPickle */