diff -r 5eefc85b8be8 Include/pyexpat.h --- a/Include/pyexpat.h Mon Feb 25 17:21:42 2013 +0200 +++ b/Include/pyexpat.h Mon Feb 25 17:38:48 2013 +0200 @@ -45,6 +45,7 @@ void (*SetUserData)(XML_Parser parser, void *userData); void (*SetStartDoctypeDeclHandler)(XML_Parser parser, XML_StartDoctypeDeclHandler start); + enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding); /* always add new stuff to the end! */ }; diff -r 5eefc85b8be8 Lib/test/test_xml_etree.py --- a/Lib/test/test_xml_etree.py Mon Feb 25 17:21:42 2013 +0200 +++ b/Lib/test/test_xml_etree.py Mon Feb 25 17:38:48 2013 +0200 @@ -1847,11 +1847,13 @@ class XMLParserTest(unittest.TestCase): - sample1 = '22' - sample2 = ('' - 'text') + sample1 = b'22' + sample2 = (b'' + b'text') + sample3 = ('\n' + '$\xa3\u20ac\U0001017b') def _check_sample_element(self, e): self.assertEqual(e.tag, 'file') @@ -1885,12 +1887,21 @@ _doctype = (name, pubid, system) parser = MyParserWithDoctype() - parser.feed(self.sample2) + with self.assertWarns(DeprecationWarning): + parser.feed(self.sample2) parser.close() self.assertEqual(_doctype, ('html', '-//W3C//DTD XHTML 1.0 Transitional//EN', 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd')) + def test_parse_string(self): + parser = ET.XMLParser(target=ET.TreeBuilder()) + parser.feed(self.sample3) + e = parser.close() + self.assertEqual(e.tag, 'money') + self.assertEqual(e.attrib['value'], '$\xa3\u20ac\U0001017b') + self.assertEqual(e.text, '$\xa3\u20ac\U0001017b') + class NamespaceParseTest(unittest.TestCase): def test_find_with_namespace(self): @@ -2312,6 +2323,7 @@ ElementFindTest, ElementIterTest, TreeBuilderTest, + XMLParserTest, BugsTest, ] diff -r 5eefc85b8be8 Modules/_elementtree.c --- a/Modules/_elementtree.c Mon Feb 25 17:21:42 2013 +0200 +++ b/Modules/_elementtree.c Mon Feb 25 17:38:48 2013 +0200 @@ -3327,13 +3327,22 @@ Py_TYPE(self)->tp_free((PyObject *)self); } +#define MAX_CHUNK_SIZE (1 << 20) + LOCAL(PyObject*) -expat_parse(XMLParserObject* self, char* data, int data_len, int final) +expat_parse(XMLParserObject* self, const char* data, Py_ssize_t data_len, int final) { int ok; + while (data_len > MAX_CHUNK_SIZE) { + ok = EXPAT(Parse)(self->parser, data, MAX_CHUNK_SIZE, 0); + if (!ok) + goto done; + data += MAX_CHUNK_SIZE; + data_len -= MAX_CHUNK_SIZE; + } ok = EXPAT(Parse)(self->parser, data, data_len, final); - +done: if (PyErr_Occurred()) return NULL; @@ -3374,16 +3383,28 @@ } static PyObject* -xmlparser_feed(XMLParserObject* self, PyObject* args) +xmlparser_feed(XMLParserObject* self, PyObject* arg) { /* feed data to parser */ - char* data; - int data_len; - if (!PyArg_ParseTuple(args, "s#:feed", &data, &data_len)) - return NULL; - - return expat_parse(self, data, data_len, 0); + if (PyUnicode_Check(arg)) { + Py_ssize_t data_len; + const char *data = PyUnicode_AsUTF8AndSize(arg, &data_len); + if (data == NULL) + return NULL; + /* Explicitly set UTF-8 encoding. Return code ignored. */ + (void)EXPAT(SetEncoding)(self->parser, "utf-8"); + return expat_parse(self, data, data_len, 0); + } + else { + Py_buffer view; + PyObject *res; + if (PyObject_GetBuffer(arg, &view, PyBUF_SIMPLE) < 0) + return NULL; + res = expat_parse(self, view.buf, view.len, 0); + PyBuffer_Release(&view); + return res; + } } static PyObject* @@ -3568,7 +3589,7 @@ } static PyMethodDef xmlparser_methods[] = { - {"feed", (PyCFunction) xmlparser_feed, METH_VARARGS}, + {"feed", (PyCFunction) xmlparser_feed, METH_O}, {"close", (PyCFunction) xmlparser_close, METH_VARARGS}, {"_parse", (PyCFunction) xmlparser_parse, METH_VARARGS}, {"_setevents", (PyCFunction) xmlparser_setevents, METH_VARARGS}, diff -r 5eefc85b8be8 Modules/pyexpat.c --- a/Modules/pyexpat.c Mon Feb 25 17:21:42 2013 +0200 +++ b/Modules/pyexpat.c Mon Feb 25 17:38:48 2013 +0200 @@ -1937,6 +1937,7 @@ capi.SetUnknownEncodingHandler = XML_SetUnknownEncodingHandler; capi.SetUserData = XML_SetUserData; capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler; + capi.SetEncoding = XML_SetEncoding; /* export using capsule */ capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);