Index: Lib/test/test_pyexpat.py =================================================================== --- Lib/test/test_pyexpat.py (revision 56475) +++ Lib/test/test_pyexpat.py (working copy) @@ -1,7 +1,7 @@ # XXX TypeErrors on calling handlers, or on bad return values from a # handler, are obscure and unhelpful. -import StringIO +from io import BytesIO import unittest import pyexpat @@ -20,11 +20,6 @@ [0, 0], ] - def test_returns_unicode(self): - for x, y in self.set_get_pairs: - self.parser.returns_unicode = x - self.assertEquals(self.parser.returns_unicode, y) - def test_ordered_attributes(self): for x, y in self.set_get_pairs: self.parser.ordered_attributes = x @@ -36,7 +31,7 @@ self.assertEquals(self.parser.specified_attributes, y) -data = '''\ +data = b'''\ @@ -130,22 +125,12 @@ 'ExternalEntityRefHandler' ] - def test_utf8(self): - - out = self.Outputter() - parser = expat.ParserCreate(namespace_separator='!') - for name in self.handler_names: - setattr(parser, name, getattr(out, name)) - parser.returns_unicode = 0 - parser.Parse(data, 1) - - # Verify output - op = out.out + def _verify_parse_output(self, op): self.assertEquals(op[0], 'PI: \'xml-stylesheet\' \'href="stylesheet.css"\'') self.assertEquals(op[1], "Comment: ' comment data '") self.assertEquals(op[2], "Notation declared: ('notation', None, 'notation.jpeg', None)") self.assertEquals(op[3], "Unparsed entity decl: ('unparsed_entity', None, 'entity.file', None, 'notation')") - self.assertEquals(op[4], "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\\xe1\\xbd\\x80'}") + self.assertEquals(op[4], "Start element: 'root' {'attr1': 'value1', 'attr2': 'value2\\u1f40'}") self.assertEquals(op[5], "NS decl: 'myns' 'http://www.python.org/namespace'") self.assertEquals(op[6], "Start element: 'http://www.python.org/namespace!subelement' {}") self.assertEquals(op[7], "Character data: 'Contents of subelements'") @@ -159,66 +144,32 @@ self.assertEquals(op[15], "External entity ref: (None, 'entity.file', None)") self.assertEquals(op[16], "End element: 'root'") + def test_unicode(self): # Try the parse again, this time producing Unicode output out = self.Outputter() parser = expat.ParserCreate(namespace_separator='!') - parser.returns_unicode = 1 for name in self.handler_names: setattr(parser, name, getattr(out, name)) parser.Parse(data, 1) op = out.out - self.assertEquals(op[0], 'PI: u\'xml-stylesheet\' u\'href="stylesheet.css"\'') - self.assertEquals(op[1], "Comment: u' comment data '") - self.assertEquals(op[2], "Notation declared: (u'notation', None, u'notation.jpeg', None)") - self.assertEquals(op[3], "Unparsed entity decl: (u'unparsed_entity', None, u'entity.file', None, u'notation')") - self.assertEquals(op[4], "Start element: u'root' {u'attr1': u'value1', u'attr2': u'value2\\u1f40'}") - self.assertEquals(op[5], "NS decl: u'myns' u'http://www.python.org/namespace'") - self.assertEquals(op[6], "Start element: u'http://www.python.org/namespace!subelement' {}") - self.assertEquals(op[7], "Character data: u'Contents of subelements'") - self.assertEquals(op[8], "End element: u'http://www.python.org/namespace!subelement'") - self.assertEquals(op[9], "End of NS decl: u'myns'") - self.assertEquals(op[10], "Start element: u'sub2' {}") - self.assertEquals(op[11], 'Start of CDATA section') - self.assertEquals(op[12], "Character data: u'contents of CDATA section'") - self.assertEquals(op[13], 'End of CDATA section') - self.assertEquals(op[14], "End element: u'sub2'") - self.assertEquals(op[15], "External entity ref: (None, u'entity.file', None)") - self.assertEquals(op[16], "End element: u'root'") + self._verify_parse_output(op) def test_parse_file(self): # Try parsing a file out = self.Outputter() parser = expat.ParserCreate(namespace_separator='!') - parser.returns_unicode = 1 for name in self.handler_names: setattr(parser, name, getattr(out, name)) - file = StringIO.StringIO(data) + file = BytesIO(data) parser.ParseFile(file) op = out.out - self.assertEquals(op[0], 'PI: u\'xml-stylesheet\' u\'href="stylesheet.css"\'') - self.assertEquals(op[1], "Comment: u' comment data '") - self.assertEquals(op[2], "Notation declared: (u'notation', None, u'notation.jpeg', None)") - self.assertEquals(op[3], "Unparsed entity decl: (u'unparsed_entity', None, u'entity.file', None, u'notation')") - self.assertEquals(op[4], "Start element: u'root' {u'attr1': u'value1', u'attr2': u'value2\\u1f40'}") - self.assertEquals(op[5], "NS decl: u'myns' u'http://www.python.org/namespace'") - self.assertEquals(op[6], "Start element: u'http://www.python.org/namespace!subelement' {}") - self.assertEquals(op[7], "Character data: u'Contents of subelements'") - self.assertEquals(op[8], "End element: u'http://www.python.org/namespace!subelement'") - self.assertEquals(op[9], "End of NS decl: u'myns'") - self.assertEquals(op[10], "Start element: u'sub2' {}") - self.assertEquals(op[11], 'Start of CDATA section') - self.assertEquals(op[12], "Character data: u'contents of CDATA section'") - self.assertEquals(op[13], 'End of CDATA section') - self.assertEquals(op[14], "End element: u'sub2'") - self.assertEquals(op[15], "External entity ref: (None, u'entity.file', None)") - self.assertEquals(op[16], "End element: u'root'") + self._verify_parse_output(op) - class NamespaceSeparatorTest(unittest.TestCase): def test_legal(self): # Tests that make sure we get errors when the namespace_separator value Index: Modules/pyexpat.c =================================================================== --- Modules/pyexpat.c (revision 56475) +++ Modules/pyexpat.c (working copy) @@ -62,8 +62,6 @@ PyObject_HEAD XML_Parser itself; - int returns_unicode; /* True if Unicode strings are returned; - if false, UTF-8 strings are returned */ int ordered_attributes; /* Return attributes as a list. */ int specified_attributes; /* Report only specified attributes. */ int in_callback; /* Is a callback active? */ @@ -185,35 +183,6 @@ return PyUnicode_DecodeUTF8((const char *)str, len, "strict"); } -/* Convert a string of XML_Chars into an 8-bit Python string. - Returns None if str is a null pointer. */ - -static PyObject * -conv_string_to_utf8(const XML_Char *str) -{ - /* XXX currently this code assumes that XML_Char is 8-bit, - and hence in UTF-8. */ - /* UTF-8 from Expat, UTF-8 desired */ - if (str == NULL) { - Py_INCREF(Py_None); - return Py_None; - } - return PyString_FromString(str); -} - -static PyObject * -conv_string_len_to_utf8(const XML_Char *str, int len) -{ - /* XXX currently this code assumes that XML_Char is 8-bit, - and hence in UTF-8. */ - /* UTF-8 from Expat, UTF-8 desired */ - if (str == NULL) { - Py_INCREF(Py_None); - return Py_None; - } - return PyString_FromStringAndSize((const char *)str, len); -} - /* Callback routines */ static void clear_handlers(xmlparseobject *self, int initial); @@ -411,14 +380,10 @@ return res; } -/* Python 2.0 and later versions, when built with Unicode support */ -#define STRING_CONV_FUNC (self->returns_unicode \ - ? conv_string_to_unicode : conv_string_to_utf8) - static PyObject* string_intern(xmlparseobject *self, const char* str) { - PyObject *result = STRING_CONV_FUNC(str); + PyObject *result = conv_string_to_unicode(str); PyObject *value; /* result can be NULL if the unicode conversion failed. */ if (!result) @@ -449,9 +414,7 @@ args = PyTuple_New(1); if (args == NULL) return -1; - temp = (self->returns_unicode - ? conv_string_len_to_unicode(buffer, len) - : conv_string_len_to_utf8(buffer, len)); + temp = (conv_string_len_to_unicode(buffer, len)); if (temp == NULL) { Py_DECREF(args); flag_error(self); @@ -556,7 +519,7 @@ Py_DECREF(container); return; } - v = STRING_CONV_FUNC((XML_Char *) atts[i+1]); + v = conv_string_to_unicode((XML_Char *) atts[i+1]); if (v == NULL) { flag_error(self); Py_DECREF(container); @@ -645,7 +608,7 @@ (void *userData, const XML_Char *target, const XML_Char *data), - ("(NO&)", string_intern(self, target), STRING_CONV_FUNC,data)) + ("(NO&)", string_intern(self, target), conv_string_to_unicode ,data)) VOID_HANDLER(UnparsedEntityDecl, (void *userData, @@ -671,9 +634,7 @@ const XML_Char *notationName), ("NiNNNNN", string_intern(self, entityName), is_parameter_entity, - (self->returns_unicode - ? conv_string_len_to_unicode(value, value_length) - : conv_string_len_to_utf8(value, value_length)), + (conv_string_len_to_unicode(value, value_length)), string_intern(self, base), string_intern(self, systemId), string_intern(self, publicId), string_intern(self, notationName))) @@ -684,7 +645,7 @@ const XML_Char *encoding, int standalone), ("(O&O&i)", - STRING_CONV_FUNC,version, STRING_CONV_FUNC,encoding, + conv_string_to_unicode ,version, conv_string_to_unicode ,encoding, standalone)) static PyObject * @@ -727,10 +688,7 @@ if (flush_character_buffer(self) < 0) goto finally; - modelobj = conv_content_model(model, - (self->returns_unicode - ? conv_string_to_unicode - : conv_string_to_utf8)); + modelobj = conv_content_model(model, (conv_string_to_unicode)); if (modelobj == NULL) { flag_error(self); goto finally; @@ -772,7 +730,7 @@ int isrequired), ("(NNO&O&i)", string_intern(self, elname), string_intern(self, attname), - STRING_CONV_FUNC,att_type, STRING_CONV_FUNC,dflt, + conv_string_to_unicode ,att_type, conv_string_to_unicode ,dflt, isrequired)) #if XML_COMBINED_VERSION >= 19504 @@ -808,7 +766,7 @@ VOID_HANDLER(Comment, (void *userData, const XML_Char *data), - ("(O&)", STRING_CONV_FUNC,data)) + ("(O&)", conv_string_to_unicode ,data)) VOID_HANDLER(StartCdataSection, (void *userData), @@ -820,15 +778,11 @@ VOID_HANDLER(Default, (void *userData, const XML_Char *s, int len), - ("(N)", (self->returns_unicode - ? conv_string_len_to_unicode(s,len) - : conv_string_len_to_utf8(s,len)))) + ("(N)", (conv_string_len_to_unicode(s,len)))) VOID_HANDLER(DefaultHandlerExpand, (void *userData, const XML_Char *s, int len), - ("(N)", (self->returns_unicode - ? conv_string_len_to_unicode(s,len) - : conv_string_len_to_utf8(s,len)))) + ("(N)", (conv_string_len_to_unicode(s,len)))) INT_HANDLER(NotStandalone, (void *userData), @@ -842,7 +796,7 @@ const XML_Char *publicId), int rc=0;, ("(O&NNN)", - STRING_CONV_FUNC,context, string_intern(self, base), + conv_string_to_unicode ,context, string_intern(self, base), string_intern(self, systemId), string_intern(self, publicId)), rc = PyInt_AsLong(rv);, rc, XML_GetUserData(parser)) @@ -924,13 +878,13 @@ goto finally; /* XXX what to do if it returns a Unicode string? */ - if (!PyString_Check(str)) { + if (!PyBytes_Check(str)) { PyErr_Format(PyExc_TypeError, - "read() did not return a string object (type=%.400s)", + "read() did not return a bytes object (type=%.400s)", str->ob_type->tp_name); goto finally; } - len = PyString_GET_SIZE(str); + len = PyBytes_GET_SIZE(str); if (len > buf_size) { PyErr_Format(PyExc_ValueError, "read() returned too much data: " @@ -938,7 +892,7 @@ buf_size, len); goto finally; } - memcpy(buf, PyString_AsString(str), len); + memcpy(buf, PyBytes_AsString(str), len); finally: Py_XDECREF(arg); Py_XDECREF(str); @@ -1044,7 +998,7 @@ = XML_GetInputContext(self->itself, &offset, &size); if (buffer != NULL) - return PyString_FromStringAndSize(buffer + offset, + return PyBytes_FromStringAndSize(buffer + offset, size - offset); else Py_RETURN_NONE; @@ -1098,7 +1052,6 @@ } else new_parser->buffer = NULL; - new_parser->returns_unicode = self->returns_unicode; new_parser->ordered_attributes = self->ordered_attributes; new_parser->specified_attributes = self->specified_attributes; new_parser->in_callback = 0; @@ -1283,8 +1236,6 @@ if (self == NULL) return NULL; - self->returns_unicode = 1; - self->buffer = NULL; self->buffer_size = CHARACTER_DATA_BUFFER_SIZE; self->buffer_used = 0; @@ -1436,8 +1387,6 @@ return get_pybool(self->ns_prefixes); if (strcmp(name, "ordered_attributes") == 0) return get_pybool(self->ordered_attributes); - if (strcmp(name, "returns_unicode") == 0) - return get_pybool((long) self->returns_unicode); if (strcmp(name, "specified_attributes") == 0) return get_pybool((long) self->specified_attributes); if (strcmp(name, "intern") == 0) { @@ -1482,7 +1431,6 @@ APPEND(rc, "buffer_used"); APPEND(rc, "namespace_prefixes"); APPEND(rc, "ordered_attributes"); - APPEND(rc, "returns_unicode"); APPEND(rc, "specified_attributes"); APPEND(rc, "intern"); @@ -1570,14 +1518,6 @@ self->ordered_attributes = 0; return 0; } - if (strcmp(name, "returns_unicode") == 0) { - if (PyObject_IsTrue(v)) { - self->returns_unicode = 1; - } - else - self->returns_unicode = 0; - return 0; - } if (strcmp(name, "specified_attributes") == 0) { if (PyObject_IsTrue(v)) self->specified_attributes = 1;