diff --git a/Doc/library/xml.etree.elementtree.rst b/Doc/library/xml.etree.elementtree.rst --- a/Doc/library/xml.etree.elementtree.rst +++ b/Doc/library/xml.etree.elementtree.rst @@ -397,6 +397,9 @@ Functions If you need a fully populated element, look for "end" events instead. + .. note:: + For non-blocking, event-driven parsing in which the application feeds data as it arrives, see :class:`IncrementalParser`. + .. function:: parse(source, parser=None) @@ -833,6 +836,48 @@ QName Objects :class:`QName` instances are opaque. +IncrementalParser Objects +^^^^^^^^^^^^^^^^^^^^^^^^^ + + +.. class:: IncrementalParser(events=None, parser=None) + + An incremental, event-driven parser suitable for non-blocking applications. + *events* is a list of events to report back. The supported events are the + strings ``"start"``, ``"end"``, ``"start-ns"`` and ``"end-ns"`` (the "ns" + events are used to get detailed namespace information). If *events* is + omitted, only ``"end"`` events are reported. *parser* is an optional + parser instance. If not given, the standard :class:`XMLParser` parser is + used. + + .. method:: data_received(data) + + Feed the given bytes data to the incremental parser. An XML syntax error is not raised by this call; it is queued and raised while iterating over :meth:`events`. + + .. method:: eof_received() + + Signal the incremental parser that the data stream is terminated. + + .. method:: events() + + Iterate over the events which have been encountered in the data fed + to the parser. This method yields ``(event, elem)`` pairs, where + *event* is a string representing the type of event (e.g. ``"end"``) + and *elem* is the encountered :class:`Element` object (for ``"start-ns"`` events it is a ``(prefix, uri)`` tuple, and for ``"end-ns"`` events it is ``None``). + + .. note:: + + :class:`IncrementalParser` only guarantees that it has seen the ">" + character of a starting tag when it emits a "start" event, so the + attributes are defined, but the contents of the text and tail attributes + are undefined at that point. The same applies to the element children; + they may or may not be present. + + If you need a fully populated element, look for "end" events instead. + + .. versionadded:: 3.4 + + .. 
_elementtree-treebuilder-objects: TreeBuilder Objects diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -903,6 +903,134 @@ class ElementTreeTest(unittest.TestCase) self.assertEqual(serialized, expected) +class IncrementalParserTest(unittest.TestCase): + + def _feed(self, parser, data, chunk_size=None): + if chunk_size is None: + parser.data_received(data) + else: + for i in range(0, len(data), chunk_size): + parser.data_received(data[i:i+chunk_size]) + + def assert_event_tags(self, parser, expected): + events = parser.events() + self.assertEqual([(action, elem.tag) for action, elem in events], + expected) + + def test_simple_xml(self): + for chunk_size in (None, 1, 5): + with self.subTest(chunk_size=chunk_size): + parser = ET.IncrementalParser() + self.assert_event_tags(parser, []) + self._feed(parser, "\n", chunk_size) + self.assert_event_tags(parser, []) + self._feed(parser, + "\n text\n", chunk_size) + self.assert_event_tags(parser, [('end', 'element')]) + self._feed(parser, "texttail\n", chunk_size) + self._feed(parser, "\n", chunk_size) + self.assert_event_tags(parser, [ + ('end', 'element'), + ('end', 'empty-element'), + ]) + self._feed(parser, "\n", chunk_size) + self.assert_event_tags(parser, [('end', 'root')]) + # Receiving EOF sets the `root` attribute + self.assertIs(parser.root, None) + parser.eof_received() + self.assertEqual(parser.root.tag, 'root') + + def test_data_received_while_iterating(self): + parser = ET.IncrementalParser() + it = parser.events() + self._feed(parser, "\n text\n") + action, elem = next(it) + self.assertEqual((action, elem.tag), ('end', 'element')) + self._feed(parser, "\n") + action, elem = next(it) + self.assertEqual((action, elem.tag), ('end', 'root')) + with self.assertRaises(StopIteration): + next(it) + + def test_simple_xml_with_ns(self): + parser = ET.IncrementalParser() + self.assert_event_tags(parser, []) + self._feed(parser, "\n") + 
self.assert_event_tags(parser, []) + self._feed(parser, "\n") + self.assert_event_tags(parser, []) + self._feed(parser, "text\n") + self.assert_event_tags(parser, [('end', '{namespace}element')]) + self._feed(parser, "texttail\n") + self._feed(parser, "\n") + self.assert_event_tags(parser, [ + ('end', '{namespace}element'), + ('end', '{namespace}empty-element'), + ]) + self._feed(parser, "\n") + self.assert_event_tags(parser, [('end', '{namespace}root')]) + # Receiving EOF sets the `root` attribute + self.assertIs(parser.root, None) + parser.eof_received() + self.assertEqual(parser.root.tag, '{namespace}root') + + def test_events(self): + parser = ET.IncrementalParser(events=()) + self._feed(parser, "\n") + self.assert_event_tags(parser, []) + + parser = ET.IncrementalParser(events=('start', 'end')) + self._feed(parser, "\n") + self.assert_event_tags(parser, []) + self._feed(parser, "\n") + self.assert_event_tags(parser, [('start', 'root')]) + self._feed(parser, "text\n") + self.assert_event_tags(parser, [('end', 'element')]) + self._feed(parser, + "texttail\n") + self.assert_event_tags(parser, [ + ('start', '{foo}element'), + ('start', '{foo}empty-element'), + ('end', '{foo}empty-element'), + ('end', '{foo}element'), + ]) + self._feed(parser, "") + parser.eof_received() + self.assertIs(parser.root, None) + self.assert_event_tags(parser, [('end', 'root')]) + self.assertEqual(parser.root.tag, 'root') + + parser = ET.IncrementalParser(events=('start',)) + self._feed(parser, "\n") + self.assert_event_tags(parser, []) + self._feed(parser, "\n") + self.assert_event_tags(parser, [('start', 'root')]) + self._feed(parser, "text\n") + self.assert_event_tags(parser, []) + self._feed(parser, + "texttail\n") + self.assert_event_tags(parser, [ + ('start', '{foo}element'), + ('start', '{foo}empty-element'), + ]) + self._feed(parser, "") + parser.eof_received() + self.assertEqual(parser.root.tag, 'root') + + def test_unknown_event(self): + with self.assertRaises(ValueError): + 
ET.IncrementalParser(events=('start', 'end', 'bogus')) + + # # xinclude tests (samples from appendix C of the xinclude specification) @@ -1406,6 +1534,7 @@ class BugsTest(unittest.TestCase): ET.register_namespace('test10777', 'http://myuri/') ET.register_namespace('test10777', 'http://myuri/') + # -------------------------------------------------------------------- @@ -2301,6 +2430,7 @@ def test_main(module=None): ElementSlicingTest, BasicElementTest, ElementTreeTest, + IncrementalParserTest, IOTest, ParseErrorTest, XIncludeTest, diff --git a/Lib/xml/etree/ElementTree.py b/Lib/xml/etree/ElementTree.py --- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -1216,84 +1216,85 @@ def iterparse(source, events=None, parse if not hasattr(source, "read"): source = open(source, "rb") close_source = True - if not parser: - parser = XMLParser(target=TreeBuilder()) return _IterParseIterator(source, events, parser, close_source) -class _IterParseIterator: + +class IncrementalParser: + + def __init__(self, events=None, parser=None): + # _elementtree.c expects a list, not a deque + self._events_queue = [] + self._index = 0 + self.root = self._root = None + if not parser: + parser = XMLParser(target=TreeBuilder()) + self._parser = parser + # wire up the parser for event reporting + if events is None: + events = ("end",) + self._parser._setevents(self._events_queue, events) + + def data_received(self, data): + if self._parser is None: + raise ValueError("data_received() called after end of stream") + if data: + try: + self._parser.feed(data) + except SyntaxError as exc: + self._events_queue.append(exc) + + def eof_received(self): + self._root = self._parser.close() + self._parser = None + if self._index >= len(self._events_queue): + self.root = self._root + + def events(self): + events = self._events_queue + while True: + index = self._index + try: + event = events[self._index] + # Avoid retaining references to past events + events[self._index] = None + except 
IndexError: + break + index += 1 + # Compact the list in a O(1) amortized fashion + if index * 2 >= len(events): + events[:index] = [] + self._index = 0 + else: + self._index = index + if isinstance(event, Exception): + raise event + else: + yield event + if self._parser is None: + self.root = self._root + + +class _IterParseIterator(IncrementalParser): def __init__(self, source, events, parser, close_source=False): + IncrementalParser.__init__(self, events, parser) self._file = source self._close_file = close_source - self._events = [] - self._index = 0 - self._error = None - self.root = self._root = None - self._parser = parser - # wire up the parser for event reporting - parser = self._parser._parser - append = self._events.append - if events is None: - events = ["end"] - for event in events: - if event == "start": - try: - parser.ordered_attributes = 1 - parser.specified_attributes = 1 - def handler(tag, attrib_in, event=event, append=append, - start=self._parser._start_list): - append((event, start(tag, attrib_in))) - parser.StartElementHandler = handler - except AttributeError: - def handler(tag, attrib_in, event=event, append=append, - start=self._parser._start): - append((event, start(tag, attrib_in))) - parser.StartElementHandler = handler - elif event == "end": - def handler(tag, event=event, append=append, - end=self._parser._end): - append((event, end(tag))) - parser.EndElementHandler = handler - elif event == "start-ns": - def handler(prefix, uri, event=event, append=append): - append((event, (prefix or "", uri or ""))) - parser.StartNamespaceDeclHandler = handler - elif event == "end-ns": - def handler(prefix, event=event, append=append): - append((event, None)) - parser.EndNamespaceDeclHandler = handler - else: - raise ValueError("unknown event %r" % event) def __next__(self): while 1: - try: - item = self._events[self._index] - self._index += 1 - return item - except IndexError: - pass - if self._error: - e = self._error - self._error = None - raise 
e + for event in self.events(): + return event if self._parser is None: - self.root = self._root if self._close_file: self._file.close() raise StopIteration # load event buffer - del self._events[:] - self._index = 0 data = self._file.read(16384) if data: - try: - self._parser.feed(data) - except SyntaxError as exc: - self._error = exc + self.data_received(data) else: - self._root = self._parser.close() - self._parser = None + self.eof_received() def __iter__(self): return self @@ -1498,6 +1499,40 @@ class XMLParser: except AttributeError: pass # unknown + def _setevents(self, event_list, events): + # Internal API for IncrementalParser + parser = self._parser + append = event_list.append + for event in events: + if event == "start": + try: + parser.ordered_attributes = 1 + parser.specified_attributes = 1 + def handler(tag, attrib_in, event=event, append=append, + start=self._start_list): + append((event, start(tag, attrib_in))) + parser.StartElementHandler = handler + except AttributeError: + def handler(tag, attrib_in, event=event, append=append, + start=self._start): + append((event, start(tag, attrib_in))) + parser.StartElementHandler = handler + elif event == "end": + def handler(tag, event=event, append=append, + end=self._end): + append((event, end(tag))) + parser.EndElementHandler = handler + elif event == "start-ns": + def handler(prefix, uri, event=event, append=append): + append((event, (prefix or "", uri or ""))) + parser.StartNamespaceDeclHandler = handler + elif event == "end-ns": + def handler(prefix, event=event, append=append): + append((event, None)) + parser.EndNamespaceDeclHandler = handler + else: + raise ValueError("unknown event %r" % event) + def _raiseerror(self, value): err = ParseError(value) err.code = value.code @@ -1635,7 +1670,7 @@ try: except ImportError: pass else: - # Overwrite 'ElementTree.parse' and 'iterparse' to use the C XMLParser + # Overwrite 'ElementTree.parse' to use the C XMLParser class ElementTree(ElementTree): __doc__ = 
ElementTree.__doc__ @@ -1661,56 +1696,6 @@ else: if close_source: source.close() - class iterparse: - __doc__ = iterparse.__doc__ - root = None - def __init__(self, source, events=None, parser=None): - self._close_file = False - if not hasattr(source, 'read'): - source = open(source, 'rb') - self._close_file = True - self._file = source - self._events = [] - self._index = 0 - self._error = None - self.root = self._root = None - if parser is None: - parser = XMLParser(target=TreeBuilder()) - self._parser = parser - self._parser._setevents(self._events, events) - - def __next__(self): - while True: - try: - item = self._events[self._index] - self._index += 1 - return item - except IndexError: - pass - if self._error: - e = self._error - self._error = None - raise e - if self._parser is None: - self.root = self._root - if self._close_file: - self._file.close() - raise StopIteration - # load event buffer - del self._events[:] - self._index = 0 - data = self._file.read(16384) - if data: - try: - self._parser.feed(data) - except SyntaxError as exc: - self._error = exc - else: - self._root = self._parser.close() - self._parser = None - - def __iter__(self): - return self # compatibility XMLTreeBuilder = XMLParser