# HG changeset patch # Parent 84af71e8c051d80f6cd8b39ebe40d82f29f00f64 diff -r 84af71e8c051 Doc/library/xml.etree.elementtree.rst --- a/Doc/library/xml.etree.elementtree.rst Tue May 19 11:00:07 2015 +0300 +++ b/Doc/library/xml.etree.elementtree.rst Tue May 19 11:15:44 2015 +0000 @@ -15,9 +15,10 @@ .. warning:: - The :mod:`xml.etree.ElementTree` module is not secure against + Some parser APIs in this module are not secure against maliciously constructed data. If you need to parse untrusted or - unauthenticated data see :ref:`xml-vulnerabilities`. + unauthenticated data, use the :class:`XMLParser` class and specify + ``reject_entities=True``, or see :ref:`xml-vulnerabilities`. Tutorial -------- @@ -73,14 +74,14 @@ We can import this data by reading from a file:: import xml.etree.ElementTree as ET - tree = ET.parse('country_data.xml') + tree = ET.parse('country_data.xml', ET.XMLParser(reject_entities=True)) root = tree.getroot() Or directly from a string:: - root = ET.fromstring(country_data_as_string) + root = ET.XML(country_data_as_string, ET.XMLParser(reject_entities=True)) -:func:`fromstring` parses XML from a string directly into an :class:`Element`, +:func:`XML` parses XML from a string directly into an :class:`Element`, which is the root element of the parsed tree. Other parsing functions may create an :class:`ElementTree`. Check the documentation to be sure. @@ -376,7 +377,7 @@ import xml.etree.ElementTree as ET - root = ET.fromstring(countrydata) + root = ET.XML(countrydata, ET.XMLParser(reject_entities=True)) # Top-level elements root.findall(".") @@ -946,7 +947,7 @@ >>> from xml.etree.ElementTree import ElementTree >>> tree = ElementTree() - >>> tree.parse("index.xhtml") + >>> tree.parse("index.xhtml", XMLParser(reject_entities=True)) >>> p = tree.find("body/p") # Finds first occurrence of tag p in body >>> p @@ -1032,7 +1033,7 @@ ^^^^^^^^^^^^^^^^^ -.. class:: XMLParser(html=0, target=None, encoding=None) +.. 
class:: XMLParser(html=0, target=None, encoding=None, *, reject_entities=False) This class is the low-level building block of the module. It uses :mod:`xml.parsers.expat` for efficient, event-based parsing of XML. It can @@ -1043,10 +1044,17 @@ deprecated. If *encoding* [1]_ is given, the value overrides the encoding specified in the XML file. + If *reject_entities* is set to ``True``, a :exc:`ValueError` will be + raised for XML documents that contain XML entity declarations. This can + defeat XML entity expansion attacks such as Billion Laughs. + .. deprecated:: 3.4 The *html* argument. The remaining arguments should be passed via keywword to prepare for the removal of the *html* argument. + .. versionadded:: 3.5 + The *reject_entities* flag. + .. method:: close() Finishes feeding data to the parser. Returns the result of calling the @@ -1088,7 +1096,7 @@ ... return self.maxDepth ... >>> target = MaxDepth() - >>> parser = XMLParser(target=target) + >>> parser = XMLParser(reject_entities=True, target=target) >>> exampleXml = """ ... ... diff -r 84af71e8c051 Doc/library/xml.rst --- a/Doc/library/xml.rst Tue May 19 11:00:07 2015 +0300 +++ b/Doc/library/xml.rst Tue May 19 11:15:44 2015 +0000 @@ -48,7 +48,8 @@ XML vulnerabilities ------------------- -The XML processing modules are not secure against maliciously constructed data. +Most of the XML processing modules are not +secure against maliciously constructed data. An attacker can abuse XML features to carry out denial of service attacks, access local files, generate network connections to other machines, or circumvent firewalls. 
@@ -59,8 +60,8 @@ ========================= ======== ========= ========= ======== ========= kind sax etree minidom pulldom xmlrpc ========================= ======== ========= ========= ======== ========= -billion laughs **Yes** **Yes** **Yes** **Yes** **Yes** -quadratic blowup **Yes** **Yes** **Yes** **Yes** **Yes** +billion laughs **Yes** No (4) **Yes** **Yes** **Yes** +quadratic blowup **Yes** No (4) **Yes** **Yes** **Yes** external entity expansion **Yes** No (1) No (2) **Yes** No (3) DTD retrieval **Yes** No No **Yes** No decompression bomb No No No No **Yes** @@ -71,6 +72,9 @@ 2. :mod:`xml.dom.minidom` doesn't expand external entities and simply returns the unexpanded entity verbatim. 3. :mod:`xmlrpclib` doesn't expand external entities and omits them. +4. :mod:`~xml.etree.ElementTree` is not vulnerable to + entity expansion attacks if the :class:`~xml.etree.ElementTree.XMLParser` + class is used and ``reject_entities=True`` is enabled. billion laughs / exponential entity expansion diff -r 84af71e8c051 Include/pyexpat.h --- a/Include/pyexpat.h Tue May 19 11:00:07 2015 +0300 +++ b/Include/pyexpat.h Tue May 19 11:15:44 2015 +0000 @@ -48,6 +48,9 @@ enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding); int (*DefaultUnknownEncodingHandler)( void *encodingHandlerData, const XML_Char *name, XML_Encoding *info); + enum XML_Status (*StopParser)(XML_Parser parser, XML_Bool resumable); + void (*SetEntityDeclHandler)( + XML_Parser parser, XML_EntityDeclHandler handler); /* always add new stuff to the end! 
*/ }; diff -r 84af71e8c051 Lib/test/test_xml_etree.py --- a/Lib/test/test_xml_etree.py Tue May 19 11:00:07 2015 +0300 +++ b/Lib/test/test_xml_etree.py Tue May 19 11:15:44 2015 +0000 @@ -31,6 +31,7 @@ except UnicodeEncodeError: raise unittest.SkipTest("filename is not encodable to utf8") SIMPLE_NS_XMLFILE = findfile("simple-ns.xml", subdir="xmltestdata") +XMLBOMB_XMLFILE = findfile("xmlbomb.xml", subdir="xmltestdata") SAMPLE_XML = """\ @@ -79,6 +80,13 @@ """ +ENTITY_DECL_XML = """\ + +]> +&a; +""" + ENTITY_XML = """\ @@ -950,7 +958,7 @@ expected = '<%s>' % elem serialized = serialize(ET.XML('<%s />' % elem), method='html') self.assertEqual(serialized, expected) - serialized = serialize(ET.XML('<%s>' % (elem,elem)), + serialized = serialize(ET.XML('<%s>' % (elem, elem)), method='html') self.assertEqual(serialized, expected) @@ -2619,6 +2627,30 @@ # -------------------------------------------------------------------- +class XmlBombTest(unittest.TestCase): + + def test_xmlbomb(self): + ET.parse(XMLBOMB_XMLFILE) # File is fully parsed by default + + parser = ET.XMLParser(reject_entities=True) + with self.assertRaisesRegex(ValueError, + 'XML entity declaration found'): + ET.parse(XMLBOMB_XMLFILE, parser=parser) + + parser = ET.XMLParser(reject_entities=False) + ET.parse(XMLBOMB_XMLFILE, parser=parser) + + xml = ENTITY_DECL_XML + with self.assertRaisesRegex(ValueError, + "XML entity declaration found"): + ET.XML(xml, ET.XMLParser(reject_entities=True)) + + parser = ET.XMLParser(reject_entities=False) + e = ET.XML(xml, parser=parser) + self.assertEqual(e.text, "MARK") + +# -------------------------------------------------------------------- + class CleanContext(object): """Provide default namespace mapping and path cache.""" @@ -2689,6 +2721,7 @@ XMLParserTest, XMLPullParserTest, BugsTest, + XmlBombTest, ] # These tests will only run for the pure-Python version that doesn't import diff -r 84af71e8c051 Lib/test/xmltestdata/xmlbomb.xml --- /dev/null Thu Jan 01 00:00:00 1970 
+0000 +++ b/Lib/test/xmltestdata/xmlbomb.xml Tue May 19 11:15:44 2015 +0000 @@ -0,0 +1,6 @@ + + + +]> +&c; diff -r 84af71e8c051 Lib/xml/etree/ElementTree.py --- a/Lib/xml/etree/ElementTree.py Tue May 19 11:00:07 2015 +0300 +++ b/Lib/xml/etree/ElementTree.py Tue May 19 11:15:44 2015 +0000 @@ -1431,7 +1431,7 @@ """ self._flush() self._last = self._elem.pop() - assert self._last.tag == tag,\ + assert self._last.tag == tag, \ "end tag mismatch (expected %s, got %s)" % ( self._last.tag, tag) self._tail = 1 @@ -1442,15 +1442,17 @@ class XMLParser: """Element structure builder for XML source data based on the expat parser. - *html* are predefined HTML entities (not supported currently), + *html* are predefined HTML entities (deprecated and not supported), *target* is an optional target object which defaults to an instance of the standard TreeBuilder class, *encoding* is an optional encoding string which if given, overrides the encoding specified in the XML file: http://www.iana.org/assignments/character-sets + *reject_entities* can be set to true to reject XML entity declarations """ - def __init__(self, html=0, target=None, encoding=None): + def __init__(self, html=0, target=None, encoding=None, *, + reject_entities=False): try: from xml.parsers import expat except ImportError: @@ -1481,6 +1483,8 @@ parser.CommentHandler = target.comment if hasattr(target, 'pi'): parser.ProcessingInstructionHandler = target.pi + if reject_entities: + parser.EntityDeclHandler = self._entity_decl # Configure pyexpat: buffering, new-style attribute handling. 
parser.buffer_text = 1 parser.ordered_attributes = 1 @@ -1551,7 +1555,7 @@ attrib = {} if attr_list: for i in range(0, len(attr_list), 2): - attrib[fixname(attr_list[i])] = attr_list[i+1] + attrib[fixname(attr_list[i])] = attr_list[i + 1] return self.target.start(tag, attrib) def _end(self, tag): @@ -1609,6 +1613,9 @@ self.doctype(name, pubid, system[1:-1]) self._doctype = None + def _entity_decl(self, *pos, **kw): + raise ValueError("XML entity declaration found") + def doctype(self, name, pubid, system): """(Deprecated) Handle doctype declaration diff -r 84af71e8c051 Modules/_elementtree.c --- a/Modules/_elementtree.c Tue May 19 11:00:07 2015 +0300 +++ b/Modules/_elementtree.c Tue May 19 11:15:44 2015 +0000 @@ -3235,6 +3235,16 @@ } } +static void +expat_entity_decl_handler(XMLParserObject* self, const XML_Char* entityName, + int is_parameter_entity, const XML_Char* value, int value_length, + const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId, + const XML_Char *notationName) +{ + EXPAT(StopParser)(self->parser, XML_FALSE); + PyErr_SetString(PyExc_ValueError, "XML entity declaration found"); +} + /* -------------------------------------------------------------------- */ static PyObject * @@ -3257,13 +3267,16 @@ html: object = NULL target: object = NULL encoding: str(accept={str, NoneType}) = NULL + * + reject_entities: bool = False [clinic start generated code]*/ static int _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *html, - PyObject *target, const char *encoding) -/*[clinic end generated code: output=d6a16c63dda54441 input=155bc5695baafffd]*/ + PyObject *target, const char *encoding, + int reject_entities) +/*[clinic end generated code: output=c8d4e983698bd7f1 input=c3f5ccee75d4c912]*/ { self->entity = PyDict_New(); if (!self->entity) @@ -3339,6 +3352,12 @@ self->parser, EXPAT(DefaultUnknownEncodingHandler), NULL ); + if (reject_entities) { + EXPAT(SetEntityDeclHandler)( + self->parser, + (XML_EntityDeclHandler) 
expat_entity_decl_handler + ); + } return 0; } diff -r 84af71e8c051 Modules/clinic/_elementtree.c.h --- a/Modules/clinic/_elementtree.c.h Tue May 19 11:00:07 2015 +0300 +++ b/Modules/clinic/_elementtree.c.h Tue May 19 11:15:44 2015 +0000 @@ -565,21 +565,23 @@ static int _elementtree_XMLParser___init___impl(XMLParserObject *self, PyObject *html, - PyObject *target, const char *encoding); + PyObject *target, const char *encoding, + int reject_entities); static int _elementtree_XMLParser___init__(PyObject *self, PyObject *args, PyObject *kwargs) { int return_value = -1; - static char *_keywords[] = {"html", "target", "encoding", NULL}; + static char *_keywords[] = {"html", "target", "encoding", "reject_entities", NULL}; PyObject *html = NULL; PyObject *target = NULL; const char *encoding = NULL; + int reject_entities = 0; - if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OOz:XMLParser", _keywords, - &html, &target, &encoding)) + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|OOz$p:XMLParser", _keywords, + &html, &target, &encoding, &reject_entities)) goto exit; - return_value = _elementtree_XMLParser___init___impl((XMLParserObject *)self, html, target, encoding); + return_value = _elementtree_XMLParser___init___impl((XMLParserObject *)self, html, target, encoding, reject_entities); exit: return return_value; @@ -663,4 +665,4 @@ exit: return return_value; } -/*[clinic end generated code: output=119aed84c1545187 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=bb15774cb51e2d2e input=a9049054013a1b77]*/ diff -r 84af71e8c051 Modules/pyexpat.c --- a/Modules/pyexpat.c Tue May 19 11:00:07 2015 +0300 +++ b/Modules/pyexpat.c Tue May 19 11:15:44 2015 +0000 @@ -1879,6 +1879,8 @@ capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler; capi.SetEncoding = XML_SetEncoding; capi.DefaultUnknownEncodingHandler = PyUnknownEncodingHandler; + capi.StopParser = XML_StopParser; + capi.SetEntityDeclHandler = XML_SetEntityDeclHandler; /* export using capsule */ 
capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);