diff --git a/Doc/library/xml.etree.elementtree.rst b/Doc/library/xml.etree.elementtree.rst index 8563971..3271889 100644 --- a/Doc/library/xml.etree.elementtree.rst +++ b/Doc/library/xml.etree.elementtree.rst @@ -105,37 +105,42 @@ Children are nested, and we can access specific child nodes by index:: >>> root[0][1].text '2008' -Incremental parsing -^^^^^^^^^^^^^^^^^^^ - -It's possible to parse XML incrementally (i.e. not the whole document at once). -The most powerful tool for doing this is :class:`IncrementalParser`. It does -not require a blocking read to obtain the XML data, and is instead fed with -data incrementally with :meth:`IncrementalParser.data_received` calls. To get -the parsed XML elements, call :meth:`IncrementalParser.events`. Here's an -example:: - - >>> incparser = ET.IncrementalParser(['start', 'end']) - >>> incparser.data_received('<mytag>sometext') - >>> list(incparser.events()) - [('start', <Element 'mytag' at 0x7fba3f2a8688>)] - >>> incparser.data_received(' more text</mytag>') - >>> for event, elem in incparser.events(): - ... print(event) - ... print(elem.tag, 'text=', elem.text) - ... - end - mytag text= sometext more text +Pull API for asynchronous parsing +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Most parsing functions provided by this module require the whole document +to be read at once before returning any result. It is possible to use a +:class:`XMLParser` and feed data into it incrementally, but it's a push API that +calls methods on a callback target, which is too low-level and inconvenient for +most needs. Sometimes what the user really wants is to be able to parse XML +incrementally, without blocking operations, while enjoying the convenience of +fully constructed :class:`Element` objects. + +The most powerful tool for doing this is :class:`XMLPullParser`. It does not +require a blocking read to obtain the XML data, and is instead fed with data +incrementally with :meth:`XMLPullParser.feed` calls. To get the parsed XML +elements, call :meth:`XMLPullParser.read_events`. 
Here's an example:: + + >>> asyncparser = ET.XMLPullParser(['start', 'end']) + >>> asyncparser.feed('<mytag>sometext') + >>> list(asyncparser.read_events()) + [('start', <Element 'mytag' at 0x7fba3f2a8688>)] + >>> asyncparser.feed(' more text</mytag>') + >>> for event, elem in asyncparser.read_events(): + ... print(event) + ... print(elem.tag, 'text=', elem.text) + ... + end The obvious use case is applications that operate in an asynchronous fashion where the XML data is being received from a socket or read incrementally from some storage device. In such cases, blocking reads are unacceptable. -Because it's so flexible, :class:`IncrementalParser` can be inconvenient -to use for simpler use-cases. If you don't mind your application blocking on -reading XML data but would still like to have incremental parsing capabilities, -take a look at :func:`iterparse`. It can be useful when you're reading a large -XML document and don't want to hold it wholly in memory. +Because it's so flexible, :class:`XMLPullParser` can be inconvenient to use for +simpler use-cases. If you don't mind your application blocking on reading XML +data but would still like to have incremental parsing capabilities, take a look +at :func:`iterparse`. It can be useful when you're reading a large XML document +and don't want to hold it wholly in memory. Finding interesting elements ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -412,29 +417,32 @@ Functions Parses an XML section into an element tree incrementally, and reports what's going on to the user. *source* is a filename or :term:`file object` containing XML data. *events* is a sequence of events to report back. The - supported events are the strings ``"start"``, ``"end"``, ``"start-ns"`` - and ``"end-ns"`` (the "ns" events are used to get detailed namespace + supported events are the strings ``"start"``, ``"end"``, ``"start-ns"`` and + ``"end-ns"`` (the "ns" events are used to get detailed namespace information). If *events* is omitted, only ``"end"`` events are reported. 
*parser* is an optional parser instance. If not given, the standard - :class:`XMLParser` parser is used. *parser* can only use the default - :class:`TreeBuilder` as a target. Returns an :term:`iterator` providing - ``(event, elem)`` pairs. + :class:`XMLParser` parser is used. *parser* must be a subclass of + :class:`XMLParser` and can only use the default :class:`TreeBuilder` as a + target. Returns an :term:`iterator` providing ``(event, elem)`` pairs. Note that while :func:`iterparse` builds the tree incrementally, it issues blocking reads on *source* (or the file it names). As such, it's unsuitable for asynchronous applications where blocking reads can't be made. For fully - asynchronous parsing, see :class:`IncrementalParser`. + asynchronous parsing, see :class:`XMLPullParser`. .. note:: - :func:`iterparse` only guarantees that it has seen the ">" - character of a starting tag when it emits a "start" event, so the - attributes are defined, but the contents of the text and tail attributes - are undefined at that point. The same applies to the element children; - they may or may not be present. + :func:`iterparse` only guarantees that it has seen the ">" character of a + starting tag when it emits a "start" event, so the attributes are defined, + but the contents of the text and tail attributes are undefined at that + point. The same applies to the element children; they may or may not be + present. If you need a fully populated element, look for "end" events instead. + .. deprecated:: 3.4 + The *parser* argument. + .. function:: parse(source, parser=None) Parses an XML section into an element tree. *source* is a filename or file @@ -870,48 +878,6 @@ QName Objects :class:`QName` instances are opaque. -IncrementalParser Objects -^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. class:: IncrementalParser(events=None, parser=None) - - An incremental, event-driven parser suitable for non-blocking applications. - *events* is a sequence of events to report back. 
The supported events are - the strings ``"start"``, ``"end"``, ``"start-ns"`` and ``"end-ns"`` (the "ns" - events are used to get detailed namespace information). If *events* is - omitted, only ``"end"`` events are reported. *parser* is an optional - parser instance. If not given, the standard :class:`XMLParser` parser is - used. *parser* can only use the default :class:`TreeBuilder` as a target. - - .. method:: data_received(data) - - Feed the given bytes data to the incremental parser. - - .. method:: eof_received() - - Signal the incremental parser that the data stream is terminated. - - .. method:: events() - - Iterate over the events which have been encountered in the data fed - to the parser. This method yields ``(event, elem)`` pairs, where - *event* is a string representing the type of event (e.g. ``"end"``) - and *elem* is the encountered :class:`Element` object. Events - provided in a previous call to :meth:`events` will not be yielded - again. - - .. note:: - - :class:`IncrementalParser` only guarantees that it has seen the ">" - character of a starting tag when it emits a "start" event, so the - attributes are defined, but the contents of the text and tail attributes - are undefined at that point. The same applies to the element children; - they may or may not be present. - - If you need a fully populated element, look for "end" events instead. - - .. versionadded:: 3.4 - .. _elementtree-treebuilder-objects: @@ -972,13 +938,17 @@ XMLParser Objects .. class:: XMLParser(html=0, target=None, encoding=None) - :class:`Element` structure builder for XML source data, based on the expat - parser. *html* are predefined HTML entities. This flag is not supported by - the current implementation. *target* is the target object. If omitted, the - builder uses an instance of the standard :class:`TreeBuilder` class. - *encoding* [1]_ is optional. If given, the value overrides the encoding + This class is the low-level building block of the module. 
It uses + :mod:`xml.parsers.expat` for efficient, event-based parsing of XML. It can + be fed XML data incrementally with the :meth:`feed` method, and parsing events + are translated to a push API - by invoking callbacks on the *target* object. + If *target* is omitted, the standard :class:`TreeBuilder` is used. The + *html* argument was historically used for backwards compatibility and is now + deprecated. If *encoding* [1]_ is given, the value overrides the encoding specified in the XML file. + .. deprecated:: 3.4 + The *html* argument. .. method:: close() @@ -998,12 +968,12 @@ XMLParser Objects Feeds data to the parser. *data* is encoded data. - :meth:`XMLParser.feed` calls *target*\'s ``start()`` method - for each opening tag, its ``end()`` method for each closing tag, - and data is processed by method ``data()``. :meth:`XMLParser.close` - calls *target*\'s method ``close()``. - :class:`XMLParser` can be used not only for building a tree structure. - This is an example of counting the maximum depth of an XML file:: + :meth:`XMLParser.feed` calls *target*\'s ``start(tag, attrs_dict)`` method + for each opening tag, its ``end(tag)`` method for each closing tag, and data + is processed by method ``data(data)``. :meth:`XMLParser.close` calls + *target*\'s method ``close()``. :class:`XMLParser` can be used not only for + building a tree structure. This is an example of counting the maximum depth + of an XML file:: >>> from xml.etree.ElementTree import XMLParser >>> class MaxDepth: # The target object of the parser @@ -1037,6 +1007,51 @@ XMLParser Objects >>> parser.close() 4 + +.. _elementtree-xmlpullparser-objects: + +XMLPullParser Objects +^^^^^^^^^^^^^^^^^^^^^ + +.. class:: XMLPullParser(events=None) + + A pull parser suitable for nonblocking (asynchronous) applications. 
Its + input-side API is similar to that of :class:`XMLParser`, but instead of + pushing calls to a callback target, :class:`XMLPullParser` collects an + internal list of parsing events and lets the user read from it. *events* is a + sequence of events to report back. The supported events are the strings + ``"start"``, ``"end"``, ``"start-ns"`` and ``"end-ns"`` (the "ns" events are + used to get detailed namespace information). If *events* is omitted, only + ``"end"`` events are reported. + + .. method:: feed(data) + + Feed the given bytes data to the parser. + + .. method:: close() + + Signal the parser that the data stream is terminated. + + .. method:: read_events() + + Iterate over the events which have been encountered in the data fed to the + parser. This method yields ``(event, elem)`` pairs, where *event* is a + string representing the type of event (e.g. ``"end"``) and *elem* is the + encountered :class:`Element` object. Events provided in a previous call + to :meth:`read_events` will not be yielded again. + + .. note:: + + :class:`XMLPullParser` only guarantees that it has seen the ">" + character of a starting tag when it emits a "start" event, so the + attributes are defined, but the contents of the text and tail attributes + are undefined at that point. The same applies to the element children; + they may or may not be present. + + If you need a fully populated element, look for "end" events instead. + + .. versionadded:: 3.4 + Exceptions ^^^^^^^^^^ diff --git a/Doc/whatsnew/3.4.rst b/Doc/whatsnew/3.4.rst index d56b422..bab3085 100644 --- a/Doc/whatsnew/3.4.rst +++ b/Doc/whatsnew/3.4.rst @@ -369,10 +369,9 @@ xml.etree --------- Add an event-driven parser for non-blocking applications, -:class:`~xml.etree.ElementTree.IncrementalParser`. - -(Contributed by Antoine Pitrou in :issue:`17782`.) +:class:`~xml.etree.ElementTree.XMLPullParser`. +(Contributed by Antoine Pitrou in :issue:`17741`.) 
Other improvements ================== diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index dec25b5..8cc50a1 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -950,24 +950,24 @@ class ElementTreeTest(unittest.TestCase): self.assertEqual(serialized, expected) -class IncrementalParserTest(unittest.TestCase): +class XMLPullParserTest(unittest.TestCase): def _feed(self, parser, data, chunk_size=None): if chunk_size is None: - parser.data_received(data) + parser.feed(data) else: for i in range(0, len(data), chunk_size): - parser.data_received(data[i:i+chunk_size]) + parser.feed(data[i:i+chunk_size]) def assert_event_tags(self, parser, expected): - events = parser.events() + events = parser.read_events() self.assertEqual([(action, elem.tag) for action, elem in events], expected) def test_simple_xml(self): for chunk_size in (None, 1, 5): with self.subTest(chunk_size=chunk_size): - parser = ET.IncrementalParser() + parser = ET.XMLPullParser() self.assert_event_tags(parser, []) self._feed(parser, "\n", chunk_size) self.assert_event_tags(parser, []) @@ -985,14 +985,14 @@ class IncrementalParserTest(unittest.TestCase): ]) self._feed(parser, "\n", chunk_size) self.assert_event_tags(parser, [('end', 'root')]) - # Receiving EOF sets the `root` attribute + # Closing sets the `root` attribute self.assertIs(parser.root, None) - parser.eof_received() + parser.close() self.assertEqual(parser.root.tag, 'root') - def test_data_received_while_iterating(self): - parser = ET.IncrementalParser() - it = parser.events() + def test_feed_while_iterating(self): + parser = ET.XMLPullParser() + it = parser.read_events() self._feed(parser, "\n text\n") action, elem = next(it) self.assertEqual((action, elem.tag), ('end', 'element')) @@ -1003,7 +1003,7 @@ class IncrementalParserTest(unittest.TestCase): next(it) def test_simple_xml_with_ns(self): - parser = ET.IncrementalParser() + parser = ET.XMLPullParser() self.assert_event_tags(parser, []) 
self._feed(parser, "\n") self.assert_event_tags(parser, []) @@ -1021,32 +1021,32 @@ class IncrementalParserTest(unittest.TestCase): ]) self._feed(parser, "\n") self.assert_event_tags(parser, [('end', '{namespace}root')]) - # Receiving EOF sets the `root` attribute + # Closing sets the `root` attribute self.assertIs(parser.root, None) - parser.eof_received() + parser.close() self.assertEqual(parser.root.tag, '{namespace}root') def test_ns_events(self): - parser = ET.IncrementalParser(events=('start-ns', 'end-ns')) + parser = ET.XMLPullParser(events=('start-ns', 'end-ns')) self._feed(parser, "\n") self._feed(parser, "\n") self.assertEqual( - list(parser.events()), + list(parser.read_events()), [('start-ns', ('', 'namespace'))]) self._feed(parser, "text\n") self._feed(parser, "texttail\n") self._feed(parser, "\n") self._feed(parser, "\n") - self.assertEqual(list(parser.events()), [('end-ns', None)]) - parser.eof_received() + self.assertEqual(list(parser.read_events()), [('end-ns', None)]) + parser.close() def test_events(self): - parser = ET.IncrementalParser(events=()) + parser = ET.XMLPullParser(events=()) self._feed(parser, "\n") self.assert_event_tags(parser, []) - parser = ET.IncrementalParser(events=('start', 'end')) + parser = ET.XMLPullParser(events=('start', 'end')) self._feed(parser, "\n") self.assert_event_tags(parser, []) self._feed(parser, "\n") @@ -1064,12 +1064,12 @@ class IncrementalParserTest(unittest.TestCase): ('end', '{foo}element'), ]) self._feed(parser, "") - parser.eof_received() + parser.close() self.assertIs(parser.root, None) self.assert_event_tags(parser, [('end', 'root')]) self.assertEqual(parser.root.tag, 'root') - parser = ET.IncrementalParser(events=('start',)) + parser = ET.XMLPullParser(events=('start',)) self._feed(parser, "\n") self.assert_event_tags(parser, []) self._feed(parser, "\n") @@ -1085,13 +1085,13 @@ class IncrementalParserTest(unittest.TestCase): ('start', '{foo}empty-element'), ]) self._feed(parser, "") - 
parser.eof_received() + parser.close() self.assertEqual(parser.root.tag, 'root') def test_events_sequence(self): # Test that events can be some sequence that's not just a tuple or list eventset = {'end', 'start'} - parser = ET.IncrementalParser(events=eventset) + parser = ET.XMLPullParser(events=eventset) self._feed(parser, "bar") self.assert_event_tags(parser, [('start', 'foo'), ('end', 'foo')]) @@ -1103,14 +1103,14 @@ class IncrementalParserTest(unittest.TestCase): def __next__(self): return next(self.events) - parser = ET.IncrementalParser(events=DummyIter()) + parser = ET.XMLPullParser(events=DummyIter()) self._feed(parser, "bar") self.assert_event_tags(parser, [('start', 'foo'), ('end', 'foo')]) def test_unknown_event(self): with self.assertRaises(ValueError): - ET.IncrementalParser(events=('start', 'end', 'bogus')) + ET.XMLPullParser(events=('start', 'end', 'bogus')) # @@ -2546,7 +2546,6 @@ def test_main(module=None): ElementSlicingTest, BasicElementTest, ElementTreeTest, - IncrementalParserTest, IOTest, ParseErrorTest, XIncludeTest, @@ -2555,6 +2554,7 @@ def test_main(module=None): ElementIterTest, TreeBuilderTest, XMLParserTest, + XMLPullParserTest, BugsTest, ] diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py index 3e3b09c..6526b3e 100644 --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -1210,37 +1210,39 @@ def iterparse(source, events=None, parser=None): return _IterParseIterator(source, events, parser, close_source) -class IncrementalParser: +class XMLPullParser: + + def __init__(self, events=None, *, _parser=None): + # The _parser argument is for internal use only and must not be relied + # upon in user code. It will be removed in a future release. + # See http://bugs.python.org/issue17741 for more details. 
- def __init__(self, events=None, parser=None): # _elementtree.c expects a list, not a deque self._events_queue = [] self._index = 0 self.root = self._root = None - if not parser: - parser = XMLParser(target=TreeBuilder()) - self._parser = parser + self._parser = _parser or XMLParser(target=TreeBuilder()) # wire up the parser for event reporting if events is None: events = ("end",) self._parser._setevents(self._events_queue, events) - def data_received(self, data): + def feed(self, data): if self._parser is None: - raise ValueError("data_received() called after end of stream") + raise ValueError("feed() called after end of stream") if data: try: self._parser.feed(data) except SyntaxError as exc: self._events_queue.append(exc) - def eof_received(self): + def close(self): self._root = self._parser.close() self._parser = None if self._index >= len(self._events_queue): self.root = self._root - def events(self): + def read_events(self): events = self._events_queue while True: index = self._index @@ -1268,14 +1270,16 @@ class IncrementalParser: class _IterParseIterator: def __init__(self, source, events, parser, close_source=False): - self._parser = IncrementalParser(events, parser) + # Use the internal, undocumented _parser argument for now; When the + # parser argument of iterparse is removed, this can be killed. 
+ self._parser = XMLPullParser(events=events, _parser=parser) self._file = source self._close_file = close_source self.root = None def __next__(self): while 1: - for event in self._parser.events(): + for event in self._parser.read_events(): return event if self._parser._parser is None: self.root = self._parser.root @@ -1283,11 +1287,11 @@ class _IterParseIterator: self._file.close() raise StopIteration # load event buffer - data = self._file.read(16384) + data = self._file.read(16 * 1024) if data: - self._parser.data_received(data) + self._parser.feed(data) else: - self._parser.eof_received() + self._parser.close() def __iter__(self): return self @@ -1481,9 +1485,9 @@ class XMLParser: pass # unknown def _setevents(self, events_queue, events_to_report): - # Internal API for IncrementalParser + # Internal API for XMLPullParser # events_to_report: a list of events to report during parsing (same as - # the *events* of IncrementalParser's constructor. + # the *events* of XMLPullParser's constructor. # events_queue: a list of actual parsing events that will be populated # by the underlying parser. #