# HG changeset patch # Parent 84af71e8c051d80f6cd8b39ebe40d82f29f00f64 # Parent 2d69d0419879a8a14ffe53517bfbbb7c94fbc838 Issue #24238: Add forbid_entities flag to ElementTree.XMLParser diff -r 2d69d0419879 Doc/library/xml.etree.elementtree.rst --- a/Doc/library/xml.etree.elementtree.rst Sat Jun 04 05:06:34 2016 +0000 +++ b/Doc/library/xml.etree.elementtree.rst Sat Jun 04 06:39:05 2016 +0000 @@ -15,9 +15,10 @@ .. warning:: - The :mod:`xml.etree.ElementTree` module is not secure against + Some parser APIs in this module are not secure against maliciously constructed data. If you need to parse untrusted or - unauthenticated data see :ref:`xml-vulnerabilities`. + unauthenticated data, use the :class:`XMLParser` class and specify + ``forbid_entites=True``, or see :ref:`xml-vulnerabilities`. Tutorial -------- @@ -73,14 +74,14 @@ We can import this data by reading from a file:: import xml.etree.ElementTree as ET - tree = ET.parse('country_data.xml') + tree = ET.parse('country_data.xml', XMLParser(forbid_entities=True)) root = tree.getroot() Or directly from a string:: - root = ET.fromstring(country_data_as_string) + root = ET.XML(country_data_as_string, XMLParser(forbid_entities=True)) -:func:`fromstring` parses XML from a string directly into an :class:`Element`, +:func:`XML` parses XML from a string directly into an :class:`Element`, which is the root element of the parsed tree. Other parsing functions may create an :class:`ElementTree`. Check the documentation to be sure. @@ -376,7 +377,7 @@ import xml.etree.ElementTree as ET - root = ET.fromstring(countrydata) + root = ET.XML(countrydata, XMLParser(forbid_entities=True)) # Top-level elements root.findall(".") @@ -954,7 +955,7 @@ >>> from xml.etree.ElementTree import ElementTree >>> tree = ElementTree() - >>> tree.parse("index.xhtml") + >>> tree.parse("index.xhtml", XMLParser(forbid_entities=True)) >>> p = tree.find("body/p") # Finds first occurrence of tag p in body >>> p @@ -1040,7 +1041,8 @@ ^^^^^^^^^^^^^^^^^ -.. class:: XMLParser(html=0, target=None, encoding=None) +.. class:: XMLParser(html=0, target=None, encoding=None, *, \ + forbid_entities=False) This class is the low-level building block of the module. It uses :mod:`xml.parsers.expat` for efficient, event-based parsing of XML. It can @@ -1051,10 +1053,17 @@ now deprecated. If *encoding* [1]_ is given, the value overrides the encoding specified in the XML file. + If *forbid_entities* is set to ``True``, a :exc:`ValueError` will be + raised for XML documents that contain XML entity declarations. This can + defeat XML entity expansion attacks such as Billion Laughs. + .. deprecated:: 3.4 The *html* argument. The remaining arguments should be passed via keyword to prepare for the removal of the *html* argument. + .. versionadded:: 3.6 + The *forbid_entities* flag. + .. method:: close() Finishes feeding data to the parser. Returns the result of calling the @@ -1096,7 +1105,7 @@ ... return self.maxDepth ... >>> target = MaxDepth() - >>> parser = XMLParser(target=target) + >>> parser = XMLParser(forbid_entities=True, target=target) >>> exampleXml = """ ... ... diff -r 2d69d0419879 Doc/library/xml.rst --- a/Doc/library/xml.rst Sat Jun 04 05:06:34 2016 +0000 +++ b/Doc/library/xml.rst Sat Jun 04 06:39:05 2016 +0000 @@ -48,7 +48,8 @@ XML vulnerabilities ------------------- -The XML processing modules are not secure against maliciously constructed data. +Most of the XML processing modules are not +secure against maliciously constructed data. An attacker can abuse XML features to carry out denial of service attacks, access local files, generate network connections to other machines, or circumvent firewalls. @@ -59,8 +60,8 @@ ========================= ======== ========= ========= ======== ========= kind sax etree minidom pulldom xmlrpc ========================= ======== ========= ========= ======== ========= -billion laughs **Yes** **Yes** **Yes** **Yes** **Yes** -quadratic blowup **Yes** **Yes** **Yes** **Yes** **Yes** +billion laughs **Yes** No (4) **Yes** **Yes** **Yes** +quadratic blowup **Yes** No (4) **Yes** **Yes** **Yes** external entity expansion **Yes** No (1) No (2) **Yes** No (3) `DTD`_ retrieval **Yes** No No **Yes** No decompression bomb No No No No **Yes** @@ -71,6 +72,9 @@ 2. :mod:`xml.dom.minidom` doesn't expand external entities and simply returns the unexpanded entity verbatim. 3. :mod:`xmlrpclib` doesn't expand external entities and omits them. +4. :mod:`~xml.etree.ElementTree` is not vulnerable to + entity expansion attacks if the :class:`~xml.etree.ElementTree.XMLParser` + class is used and ``forbid_entities=True`` is enabled. billion laughs / exponential entity expansion diff -r 2d69d0419879 Doc/whatsnew/3.6.rst --- a/Doc/whatsnew/3.6.rst Sat Jun 04 05:06:34 2016 +0000 +++ b/Doc/whatsnew/3.6.rst Sat Jun 04 06:39:05 2016 +0000 @@ -428,6 +428,15 @@ (Contributed by Clement Rouault in :issue:`23026`.) +xml.etree.ElementTree +--------------------- + +:class:`~xml.etree.ElementTree.XMLParser` has a new *forbid_entities* +parameter. By default it is false, but when set it can defeat XML entity +expansion attacks such as Billion Laughs. (Contributed by Martin Panter +in :issue:`24238`.) + + zipfile ------- diff -r 2d69d0419879 Include/pyexpat.h --- a/Include/pyexpat.h Sat Jun 04 05:06:34 2016 +0000 +++ b/Include/pyexpat.h Sat Jun 04 06:39:05 2016 +0000 @@ -48,6 +48,9 @@ enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding); int (*DefaultUnknownEncodingHandler)( void *encodingHandlerData, const XML_Char *name, XML_Encoding *info); + enum XML_Status (*StopParser)(XML_Parser parser, XML_Bool resumable); + void (*SetEntityDeclHandler)( + XML_Parser parser, XML_EntityDeclHandler handler); /* always add new stuff to the end! */ }; diff -r 2d69d0419879 Lib/test/test_xml_etree.py --- a/Lib/test/test_xml_etree.py Sat Jun 04 05:06:34 2016 +0000 +++ b/Lib/test/test_xml_etree.py Sat Jun 04 06:39:05 2016 +0000 @@ -33,6 +33,7 @@ except UnicodeEncodeError: raise unittest.SkipTest("filename is not encodable to utf8") SIMPLE_NS_XMLFILE = findfile("simple-ns.xml", subdir="xmltestdata") +XMLBOMB_XMLFILE = findfile("xmlbomb.xml", subdir="xmltestdata") SAMPLE_XML = """\ @@ -81,6 +82,13 @@ """ +ENTITY_DECL_XML = """\ + +]> +&a; +""" + ENTITY_XML = """\ @@ -1002,7 +1010,7 @@ expected = '<%s>' % elem serialized = serialize(ET.XML('<%s />' % elem), method='html') self.assertEqual(serialized, expected) - serialized = serialize(ET.XML('<%s>' % (elem,elem)), + serialized = serialize(ET.XML('<%s>' % (elem, elem)), method='html') self.assertEqual(serialized, expected) @@ -2857,6 +2865,30 @@ # -------------------------------------------------------------------- +class XmlBombTest(unittest.TestCase): + + def test_xmlbomb(self): + ET.parse(XMLBOMB_XMLFILE) # File is fully parsed by default + + parser = ET.XMLParser(forbid_entities=True) + with self.assertRaisesRegex(ValueError, + 'XML entity declaration found'): + ET.parse(XMLBOMB_XMLFILE, parser=parser) + + parser = ET.XMLParser(forbid_entities=False) + ET.parse(XMLBOMB_XMLFILE, parser=parser) + + xml = ENTITY_DECL_XML + with self.assertRaisesRegex(ValueError, + "XML entity declaration found"): + ET.XML(xml, ET.XMLParser(forbid_entities=True)) + + parser = ET.XMLParser(forbid_entities=False) + e = ET.XML(xml, parser=parser) + self.assertEqual(e.text, "MARK") + +# -------------------------------------------------------------------- + class CleanContext(object): """Provide default namespace mapping and path cache.""" @@ -2927,6 +2959,7 @@ XMLParserTest, XMLPullParserTest, BugsTest, + XmlBombTest, ] # These tests will only run for the pure-Python version that doesn't import diff -r 2d69d0419879 Lib/test/xmltestdata/xmlbomb.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Lib/test/xmltestdata/xmlbomb.xml Sat Jun 04 06:39:05 2016 +0000 @@ -0,0 +1,6 @@ + + + +]> +&c; diff -r 2d69d0419879 Lib/xml/etree/ElementTree.py --- a/Lib/xml/etree/ElementTree.py Sat Jun 04 05:06:34 2016 +0000 +++ b/Lib/xml/etree/ElementTree.py Sat Jun 04 06:39:05 2016 +0000 @@ -1429,10 +1429,12 @@ standard TreeBuilder class, *encoding* is an optional encoding string which if given, overrides the encoding specified in the XML file: http://www.iana.org/assignments/character-sets + *forbid_entities* can be set to true to reject XML entity declarations """ - def __init__(self, html=0, target=None, encoding=None): + def __init__(self, html=0, target=None, encoding=None, *, + forbid_entities=False): try: from xml.parsers import expat except ImportError: @@ -1463,6 +1465,8 @@ parser.CommentHandler = target.comment if hasattr(target, 'pi'): parser.ProcessingInstructionHandler = target.pi + if forbid_entities: + parser.EntityDeclHandler = self._entity_decl # Configure pyexpat: buffering, new-style attribute handling. parser.buffer_text = 1 parser.ordered_attributes = 1 @@ -1591,6 +1595,9 @@ self.doctype(name, pubid, system[1:-1]) self._doctype = None + def _entity_decl(self, *pos, **kw): + raise ValueError("XML entity declaration found") + def doctype(self, name, pubid, system): """(Deprecated) Handle doctype declaration diff -r 2d69d0419879 Misc/NEWS --- a/Misc/NEWS Sat Jun 04 05:06:34 2016 +0000 +++ b/Misc/NEWS Sat Jun 04 06:39:05 2016 +0000 @@ -27,6 +27,10 @@ Library ------- +- Issue #24238: Added ElementTree.XMLParser(forbid_entities=...) parameter, + which can be used to defeat XML entity expansion attacks such as Billion + Laughs. + - Issue #26373: subprocess.Popen.communicate now correctly ignores BrokenPipeError when the child process dies before .communicate() is called in more/all circumstances. diff -r 2d69d0419879 Modules/_elementtree.c --- a/Modules/_elementtree.c Sat Jun 04 05:06:34 2016 +0000 +++ b/Modules/_elementtree.c Sat Jun 04 06:39:05 2016 +0000 @@ -3191,6 +3191,16 @@ } } +static void +expat_entity_decl_handler(XMLParserObject* self, const XML_Char* entityName, + int is_parameter_entity, const XML_Char* value, int value_length, + const XML_Char* base, const XML_Char* systemId, const XML_Char* publicId, + const XML_Char *notationName) +{ + EXPAT(StopParser)(self->parser, XML_FALSE); + PyErr_SetString(PyExc_ValueError, "XML entity declaration found"); +} + /* -------------------------------------------------------------------- */ static PyObject * @@ -3213,6 +3223,8 @@ html: object = NULL target: object = NULL encoding: str(accept={str, NoneType}) = NULL + * + forbid_entities: bool = False [clinic start generated code]*/ @@ -3295,6 +3307,12 @@ self->parser, EXPAT(DefaultUnknownEncodingHandler), NULL ); + if (forbid_entities) { + EXPAT(SetEntityDeclHandler)( + self->parser, + (XML_EntityDeclHandler) expat_entity_decl_handler + ); + } return 0; } diff -r 2d69d0419879 Modules/pyexpat.c --- a/Modules/pyexpat.c Sat Jun 04 05:06:34 2016 +0000 +++ b/Modules/pyexpat.c Sat Jun 04 06:39:05 2016 +0000 @@ -1880,6 +1880,8 @@ capi.SetStartDoctypeDeclHandler = XML_SetStartDoctypeDeclHandler; capi.SetEncoding = XML_SetEncoding; capi.DefaultUnknownEncodingHandler = PyUnknownEncodingHandler; + capi.StopParser = XML_StopParser; + capi.SetEntityDeclHandler = XML_SetEntityDeclHandler; /* export using capsule */ capi_object = PyCapsule_New(&capi, PyExpat_CAPSULE_NAME, NULL);