# HG changeset patch # Parent 7b1da249ab6d2d7c84cac5ec787d2fa6ed58a1b2 diff -r 7b1da249ab6d Lib/test/test_xml_etree.py --- a/Lib/test/test_xml_etree.py Sun Aug 25 14:19:29 2013 +0200 +++ b/Lib/test/test_xml_etree.py Mon Aug 26 21:29:10 2013 +0200 @@ -950,24 +950,24 @@ self.assertEqual(serialized, expected) -class IncrementalParserTest(unittest.TestCase): +class ParserEventReadingTest(unittest.TestCase): def _feed(self, parser, data, chunk_size=None): if chunk_size is None: - parser.data_received(data) + parser.feed(data) else: for i in range(0, len(data), chunk_size): - parser.data_received(data[i:i+chunk_size]) + parser.feed(data[i:i+chunk_size]) def assert_event_tags(self, parser, expected): - events = parser.events() + events = parser.read_events() self.assertEqual([(action, elem.tag) for action, elem in events], expected) def test_simple_xml(self): for chunk_size in (None, 1, 5): with self.subTest(chunk_size=chunk_size): - parser = ET.IncrementalParser() + parser = ET.XMLParser(collect_events=('end',)) self.assert_event_tags(parser, []) self._feed(parser, "\n", chunk_size) self.assert_event_tags(parser, []) @@ -985,14 +985,12 @@ ]) self._feed(parser, "\n", chunk_size) self.assert_event_tags(parser, [('end', 'root')]) - # Receiving EOF sets the `root` attribute - self.assertIs(parser.root, None) - parser.eof_received() - self.assertEqual(parser.root.tag, 'root') + root = parser.close() + self.assertEqual(root.tag, 'root') def test_data_received_while_iterating(self): - parser = ET.IncrementalParser() - it = parser.events() + parser = ET.XMLParser(collect_events=('end',)) + it = parser.read_events() self._feed(parser, "\n text\n") action, elem = next(it) self.assertEqual((action, elem.tag), ('end', 'element')) @@ -1003,7 +1001,7 @@ next(it) def test_simple_xml_with_ns(self): - parser = ET.IncrementalParser() + parser = ET.XMLParser(collect_events=('end',)) self.assert_event_tags(parser, []) self._feed(parser, "\n") self.assert_event_tags(parser, []) @@ -1021,32 
+1019,30 @@ ]) self._feed(parser, "\n") self.assert_event_tags(parser, [('end', '{namespace}root')]) - # Receiving EOF sets the `root` attribute - self.assertIs(parser.root, None) - parser.eof_received() - self.assertEqual(parser.root.tag, '{namespace}root') + root = parser.close() + self.assertEqual(root.tag, '{namespace}root') def test_ns_events(self): - parser = ET.IncrementalParser(events=('start-ns', 'end-ns')) + parser = ET.XMLParser(collect_events=('start-ns', 'end-ns')) self._feed(parser, "\n") self._feed(parser, "\n") self.assertEqual( - list(parser.events()), + list(parser.read_events()), [('start-ns', ('', 'namespace'))]) self._feed(parser, "text\n") self._feed(parser, "texttail\n") self._feed(parser, "\n") self._feed(parser, "\n") - self.assertEqual(list(parser.events()), [('end-ns', None)]) - parser.eof_received() + self.assertEqual(list(parser.read_events()), [('end-ns', None)]) + parser.close() def test_events(self): - parser = ET.IncrementalParser(events=()) + parser = ET.XMLParser(collect_events=()) self._feed(parser, "\n") self.assert_event_tags(parser, []) - parser = ET.IncrementalParser(events=('start', 'end')) + parser = ET.XMLParser(collect_events=('start', 'end')) self._feed(parser, "\n") self.assert_event_tags(parser, []) self._feed(parser, "\n") @@ -1064,12 +1060,11 @@ ('end', '{foo}element'), ]) self._feed(parser, "") - parser.eof_received() - self.assertIs(parser.root, None) + root = parser.close() self.assert_event_tags(parser, [('end', 'root')]) - self.assertEqual(parser.root.tag, 'root') - - parser = ET.IncrementalParser(events=('start',)) + self.assertEqual(root.tag, 'root') + + parser = ET.XMLParser(collect_events=('start',)) self._feed(parser, "\n") self.assert_event_tags(parser, []) self._feed(parser, "\n") @@ -1085,13 +1080,13 @@ ('start', '{foo}empty-element'), ]) self._feed(parser, "") - parser.eof_received() - self.assertEqual(parser.root.tag, 'root') + root = parser.close() + self.assertEqual(root.tag, 'root') def 
test_events_sequence(self): # Test that events can be some sequence that's not just a tuple or list eventset = {'end', 'start'} - parser = ET.IncrementalParser(events=eventset) + parser = ET.XMLParser(collect_events=eventset) self._feed(parser, "bar") self.assert_event_tags(parser, [('start', 'foo'), ('end', 'foo')]) @@ -1103,14 +1098,14 @@ def __next__(self): return next(self.events) - parser = ET.IncrementalParser(events=DummyIter()) + parser = ET.XMLParser(collect_events=DummyIter()) self._feed(parser, "bar") self.assert_event_tags(parser, [('start', 'foo'), ('end', 'foo')]) def test_unknown_event(self): with self.assertRaises(ValueError): - ET.IncrementalParser(events=('start', 'end', 'bogus')) + ET.XMLParser(collect_events=('start', 'end', 'bogus')) # @@ -2546,7 +2541,7 @@ ElementSlicingTest, BasicElementTest, ElementTreeTest, - IncrementalParserTest, + ParserEventReadingTest, IOTest, ParseErrorTest, XIncludeTest, diff -r 7b1da249ab6d Lib/xml/etree/ElementTree.py --- a/Lib/xml/etree/ElementTree.py Sun Aug 25 14:19:29 2013 +0200 +++ b/Lib/xml/etree/ElementTree.py Mon Aug 26 21:29:10 2013 +0200 @@ -1207,87 +1207,57 @@ if not hasattr(source, "read"): source = open(source, "rb") close_source = True - return _IterParseIterator(source, events, parser, close_source) - - -class IncrementalParser: - - def __init__(self, events=None, parser=None): - # _elementtree.c expects a list, not a deque - self._events_queue = [] - self._index = 0 - self.root = self._root = None - if not parser: - parser = XMLParser(target=TreeBuilder()) - self._parser = parser - # wire up the parser for event reporting - if events is None: - events = ("end",) - self._parser._setevents(self._events_queue, events) - - def data_received(self, data): - if self._parser is None: - raise ValueError("data_received() called after end of stream") - if data: - try: - self._parser.feed(data) - except SyntaxError as exc: - self._events_queue.append(exc) - - def eof_received(self): - self._root = 
self._parser.close() - self._parser = None - if self._index >= len(self._events_queue): - self.root = self._root - - def events(self): - events = self._events_queue - while True: - index = self._index - try: - event = events[self._index] - # Avoid retaining references to past events - events[self._index] = None - except IndexError: - break - index += 1 - # Compact the list in a O(1) amortized fashion - if index * 2 >= len(events): - events[:index] = [] - self._index = 0 - else: - self._index = index - if isinstance(event, Exception): - raise event - else: - yield event - if self._parser is None: - self.root = self._root + if events is None: + events = ("end",) + if parser is None: + parser = XMLParser(target=TreeBuilder(), collect_events=events) + else: + # essentially, we assume isinstance(parser, XMLParser) + # reconfigure the parser to collect the events we want + parser._setevents(parser._events_queue, events) + return _IterParseIterator(source, parser, close_source) class _IterParseIterator: - def __init__(self, source, events, parser, close_source=False): - self._parser = IncrementalParser(events, parser) + def __init__(self, source, parser, close_source=False): + self._error = None + # it's a very unfortunate design that we must set self.root at the + # end. it severely limits the way this can be implemented. 
+ self.root = self._root = None + self._parser = parser self._file = source self._close_file = close_source - self.root = None def __next__(self): + if self._parser is None: + raise StopIteration + while 1: - for event in self._parser.events(): + # FIXME: this single-step for-loop has a huge overhead + for event in self._parser.read_events(): return event - if self._parser._parser is None: - self.root = self._parser.root - if self._close_file: - self._file.close() + + if self._error: + e = self._error + self._error = None + raise e + if self._file is None: + self._parser = None + self.root = self._root raise StopIteration # load event buffer data = self._file.read(16384) if data: - self._parser.data_received(data) + try: + self._parser.feed(data) + except SyntaxError as exc: + self._error = exc else: - self._parser.eof_received() + self._root = self._parser.close() + if self._close_file: + self._file.close() + self._file = None def __iter__(self): return self @@ -1438,7 +1408,7 @@ """ - def __init__(self, html=0, target=None, encoding=None): + def __init__(self, html=0, target=None, encoding=None, collect_events=None): try: from xml.parsers import expat except ImportError: @@ -1482,6 +1452,11 @@ parser.StartElementHandler = self._start_list except AttributeError: pass + # _elementtree.c expects a list, not a deque + self._events_queue = [] + self._events_index = 0 + if collect_events: + self._setevents(self._events_queue, collect_events) self._doctype = None self.entity = {} try: @@ -1651,6 +1626,38 @@ del self.parser, self._parser del self.target, self._target + def read_events(self): + # we avoid using yield here to make sure we free references early + return _EventsListIterator(self) + + +class _EventsListIterator: + # destructive event iterator -- for internal purposes only + def __init__(self, parser): + self._parser = parser + + def __iter__(self): + return self + + def __next__(self): + events = self._parser._events_queue + index = self._parser._events_index 
+ try: + event = events[index] + # Avoid retaining references to past events + events[index] = None + except IndexError: + pass + else: + index += 1 + # Compact the list in an O(1) amortized fashion + if index * 2 >= len(events): + del events[:index] + index = 0 + self._parser._events_index = index + return event + raise StopIteration + # Import the C accelerators try: