Index: Doc/library/xml.dom.pulldom.rst =================================================================== --- Doc/library/xml.dom.pulldom.rst (revision 83065) +++ Doc/library/xml.dom.pulldom.rst (working copy) @@ -1,65 +1,127 @@ -:mod:`xml.dom.pulldom` --- Support for building partial DOM trees -================================================================= +:mod:`xml.dom.pulldom` --- Iterative XML parsing with DOM nodes +=============================================================== .. module:: xml.dom.pulldom - :synopsis: Support for building partial DOM trees from SAX events. + :synopsis: Iterative parsing of XML, with optional DOM subtree creation. .. moduleauthor:: Paul Prescod +.. sectionauthor:: Mark Smith -:mod:`xml.dom.pulldom` allows building only selected portions of a Document -Object Model representation of a document from SAX events. +:mod:`xml.dom.pulldom` allows iteration through the nodes of an XML stream, +with the ability to expand any node into a partial DOM tree. This can be useful +for memory-efficient parsing of large XML files, with a simpler interface than +writing a SAX ContentHandler. +Obtaining each of the nodes in an XML stream is straightforward:: -.. class:: PullDOM(documentFactory=None) + >>> from xml.dom import pulldom + >>> nodes = pulldom.parseString(""" + ... <movie><title>The Meaning of Life</title></movie>""") + >>> for evt, node in nodes: + ... if evt == pulldom.START_ELEMENT: + ... print("Start tag: {0}".format(node.tagName)) + ... elif evt == pulldom.END_ELEMENT: + ... print("End tag: {0}".format(node.tagName)) + ... elif evt == pulldom.CHARACTERS: + ... print("Text: {0}".format(node.data)) + Start tag: movie + Start tag: title + Text: The Meaning of Life + End tag: title + End tag: movie - :class:`xml.sax.handler.ContentHandler` implementation that ... +By default, nodes know nothing about their children or parent. To fill out the +children of a node, call :meth:`DOMEventStream.expandNode` immediately after +obtaining the node. 
This allows creation of a partial DOM tree:: + >>> nodes = pulldom.parseString(""" + ... <movie><title>The Meaning of Life</title></movie>""") + >>> document = next(nodes)[1] # Document node + >>> print(next(nodes)[1].tagName) + movie + >>> node = next(nodes)[1] # title element + >>> print(len(node.childNodes)) + 0 + >>> nodes.expandNode(node) + >>> print(len(node.childNodes)) # Now childNodes has been populated + 1 + >>> # End of movie - title and everything under it have been consumed: + >>> print(next(nodes)[1].tagName) + movie + -.. class:: DOMEventStream(stream, parser, bufsize) +Basic Usage +----------- - ... +The following two functions provide convenient ways to create +:class:`DOMEventStream` instances. +.. function:: parse(stream_or_string, parser=None, bufsize=None) -.. class:: SAX2DOM(documentFactory=None) + The document, passed in as *stream_or_string*, can be a filename or a file + object. If provided, *parser* should be a SAX-compatible parser instance. + *bufsize* is the number of characters passed to the SAX parser each time it + needs to be fed; if not provided, a sensible default is used. - :class:`xml.sax.handler.ContentHandler` implementation that ... +.. function:: parseString(string, parser=None) + Parse an XML *string*. If provided, *parser* should be a SAX-compatible + parser instance. -.. function:: parse(stream_or_string, parser=None, bufsize=None) +Both functions return an instance of :class:`DOMEventStream` that can be used +for iterating through the nodes in the document. - ... +DOMEventStream Objects +---------------------- -.. function:: parseString(string, parser=None) +A DOMEventStream object is an iterator over the nodes of XML content. Each item +returned by the iterator is a 2-tuple containing an event-type and a DOM node +object representing the node. Create a DOMEventStream using :func:`parse` or +:func:`parseString`. - ... 
+The event-type is one of the following module attributes: +* START_DOCUMENT +* END_DOCUMENT +* START_ELEMENT +* END_ELEMENT +* CHARACTERS +* COMMENT +* PROCESSING_INSTRUCTION +* IGNORABLE_WHITESPACE -.. data:: default_bufsize +The DOM nodes obtained from the iterator know nothing about their parents, +children or siblings. Call :meth:`DOMEventStream.expandNode` to fill out a node +with its children. - Default value for the *bufsize* parameter to :func:`parse`. +.. method:: DOMEventStream.expandNode(node) - The value of this variable can be changed before calling :func:`parse` and - the new value will take effect. + Fill out the provided node with its children. This method must be called + immediately after obtaining a node from the DOMEventStream, otherwise + unexpected behavior will be observed. +.. method:: DOMEventStream.reset() -.. _domeventstream-objects: + Reset the state of the DOMEventStream. -DOMEventStream Objects ----------------------- +Other Classes +------------- -.. method:: DOMEventStream.getEvent() +.. class:: PullDOM - ... + PullDOM is used internally by DOMEventStream, and implements a + SAX ContentHandler with the purpose of converting SAX events into + DOM nodes. +.. class:: SAX2DOM -.. method:: DOMEventStream.expandNode(node) + SAX2DOM extends PullDOM and ensures that each node is added to its parent's + :attr:`childNodes` list as it is parsed. - ... +.. seealso:: -.. method:: DOMEventStream.reset() - - ... 
- + Module :mod:`xml.dom` -- contains the API for the DOM Nodes returned by + DOMEventStream Index: Lib/test/test_minidom.py =================================================================== --- Lib/test/test_minidom.py (revision 83128) +++ Lib/test/test_minidom.py (working copy) @@ -7,6 +7,7 @@ import xml.dom import xml.dom.minidom import xml.parsers.expat +from xml.dom import pulldom from xml.dom.minidom import parse, Node, Document, parseString from xml.dom.minidom import getDOMImplementation @@ -14,6 +15,13 @@ tstfile = findfile("test.xml", subdir="xmltestdata") +SMALL_SAMPLE = """ + + +Introduction to XSL +
+

A. Namespace

+""" # The tests of DocumentType importing use these helpers to construct # the documents to work with, since not all DOM builders actually @@ -1001,7 +1009,7 @@ doc.unlink() def testSAX2DOM(self): - from xml.dom import pulldom + sax2dom = pulldom.SAX2DOM() sax2dom.startDocument() @@ -1481,8 +1489,91 @@ doc.appendChild(doc.createComment("foo--bar")) self.assertRaises(ValueError, doc.toxml) +class PullDOMTestCase(unittest.TestCase): + def test_parse(self): + items = pulldom.parse(tstfile) + for item in items: + pass#print(item) + + def test_parse_string(self): + items = pulldom.parseString(SMALL_SAMPLE) + evt, node = next(items) + self.assertEqual(pulldom.START_DOCUMENT, evt) + evt, node = next(items) + self.assertEqual(pulldom.START_ELEMENT, evt) + self.assertEqual("html", node.tagName) + self.assertEqual(1, len(node.attributes)) + self.assertEqual(node.attributes.getNamedItem('xmlns:xdc').value, + "http://www.xml.com/books") + evt, node = next(items) + self.assertEqual(pulldom.CHARACTERS, evt) # Line break + evt, node = next(items) + # XXX - A comment should be reported here! 
+ # self.assertEqual(pulldom.COMMENT, evt) + # Line break after swallowed comment: + self.assertEqual(pulldom.CHARACTERS, evt) + evt, node = next(items) + self.assertEqual("title", node.tagName) + title_node = node + evt, node = next(items) + self.assertEqual(pulldom.CHARACTERS, evt) + self.assertEqual("Introduction to XSL", node.data) + evt, node = next(items) + self.assertEqual(pulldom.END_ELEMENT, evt) + self.assertEqual("title", node.tagName) + self.assertTrue(title_node is node) + evt, node = next(items) + self.assertEqual(pulldom.CHARACTERS, evt) + evt, node = next(items) + self.assertEqual(pulldom.START_ELEMENT, evt) + self.assertEqual("hr", node.tagName) + evt, node = next(items) + self.assertEqual(pulldom.END_ELEMENT, evt) + self.assertEqual("hr", node.tagName) + evt, node = next(items) + self.assertEqual(pulldom.CHARACTERS, evt) + evt, node = next(items) + self.assertEqual(pulldom.START_ELEMENT, evt) + self.assertEqual("p", node.tagName) + evt, node = next(items) + self.assertEqual(pulldom.START_ELEMENT, evt) + self.assertEqual("xdc:author", node.tagName) + evt, node = next(items) + self.assertEqual(pulldom.CHARACTERS, evt) + evt, node = next(items) + self.assertEqual(pulldom.END_ELEMENT, evt) + self.assertEqual("xdc:author", node.tagName) + evt, node = next(items) + self.assertEqual(pulldom.END_ELEMENT, evt) + evt, node = next(items) + self.assertEqual(pulldom.CHARACTERS, evt) + evt, node = next(items) + self.assertEqual(pulldom.END_ELEMENT, evt) + # XXX No END_DOCUMENT item is ever obtained: + #evt, node = next(items) + #self.assertEqual(pulldom.END_DOCUMENT, evt) + + def test_expand_item(self): + items = pulldom.parseString(SMALL_SAMPLE) + # Loop through the nodes until we get to a 'title' start tag: + for evt, item in items: + if evt == pulldom.START_ELEMENT and item.tagName == "title": + items.expandNode(item) + break + else: + self.fail("No title tag detected in SMALL_SAMPLE!") + # Loop until we get to the next start-element: + for evt, node in 
items: + if evt == pulldom.START_ELEMENT: break + self.assertEqual("hr", node.tagName, + "expandItem did not leave DOMEventStream in the correct state.") + # Attempt to expand a standalone element: + items.expandNode(node) + self.assertEqual(next(items)[0], pulldom.CHARACTERS) + self.assertEqual(next(items)[1].tagName, "p") + def test_main(): - run_unittest(MinidomTest) + run_unittest(MinidomTest, PullDOMTestCase) if __name__ == "__main__": test_main() Index: Lib/xml/dom/pulldom.py =================================================================== --- Lib/xml/dom/pulldom.py (revision 83128) +++ Lib/xml/dom/pulldom.py (working copy) @@ -1,3 +1,18 @@ +"""Support for building partial DOM trees. + +For simple stream-based processing of an XML document, this module provides two +functions, :func:`parse` and :func:`parseString`, which return a +DOMEventStream: an iterator over the nodes in the XML document. + +Each item returned is a 2-tuple, containing the type of event (one of: +START_ELEMENT, END_ELEMENT, COMMENT, START_DOCUMENT, END_DOCUMENT, +PROCESSING_INSTRUCTION, IGNORABLE_WHITESPACE, CHARACTERS), and the event's +DOM node. By default, the DOM node has no references to any other nodes, and +thus attempting to access its parent or child nodes will return empty values. +Call :meth:`DOMEventStream.expandNode` to expand a node to contain its +children from the document. +""" + import xml.sax import xml.sax.handler import types @@ -2,2 +17,4 @@ + +# Event type constants: START_ELEMENT = "START_ELEMENT" @@ -11,7 +28,13 @@ IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE" CHARACTERS = "CHARACTERS" + class PullDOM(xml.sax.ContentHandler): + """A ContentHandler that is used internally by :class:`DOMEventStream` to + provide an iterator over the nodes in an XML stream. This class would not + normally be used directly - its main responsibility is to produce DOM + nodes from SAX events. 
+ """ _locator = None document = None @@ -32,14 +55,19 @@ self.pending_events = [] def pop(self): + """A fallback for versions of Python where the list type doesn't + support the pop method.""" result = self.elementStack[-1] del self.elementStack[-1] return result def setDocumentLocator(self, locator): + """Called by the parser to give the application a locator for + locating the origin of document events.""" self._locator = locator def startPrefixMapping(self, prefix, uri): + """Handle a SAX 'startPrefixMapping' event.""" if not hasattr(self, '_xmlns_attrs'): self._xmlns_attrs = [] self._xmlns_attrs.append((prefix or 'xmlns', uri)) @@ -47,9 +75,11 @@ self._current_context[uri] = prefix or None def endPrefixMapping(self, prefix): + """Handle a SAX 'endPrefixMapping' event.""" self._current_context = self._ns_contexts.pop() def startElementNS(self, name, tagName , attrs): + """Handle a SAX 'startElementNS' event.""" # Retrieve xml namespace declaration attributes. xmlns_uri = 'http://www.w3.org/2000/xmlns/' xmlns_attrs = getattr(self, '_xmlns_attrs', None) @@ -107,10 +137,12 @@ self.push(node) def endElementNS(self, name, tagName): + """Handle a SAX 'endElementNS' event.""" self.lastEvent[1] = [(END_ELEMENT, self.pop()), None] self.lastEvent = self.lastEvent[1] def startElement(self, name, attrs): + """Handle a SAX 'startElement' event.""" if self.document: node = self.document.createElement(name) else: @@ -126,10 +158,12 @@ self.push(node) def endElement(self, name): + """Handle a SAX 'endElement' event.""" self.lastEvent[1] = [(END_ELEMENT, self.pop()), None] self.lastEvent = self.lastEvent[1] def comment(self, s): + """Handle a SAX 'comment' event.""" if self.document: node = self.document.createComment(s) self.lastEvent[1] = [(COMMENT, node), None] @@ -139,6 +173,7 @@ self.pending_events.append(event) def processingInstruction(self, target, data): + """Handle a SAX 'processingInstruction' event.""" if self.document: node = 
self.document.createProcessingInstruction(target, data) self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None] @@ -148,22 +183,26 @@ self.pending_events.append(event) def ignorableWhitespace(self, chars): + """Handle a SAX 'ignorableWhitespace' event.""" node = self.document.createTextNode(chars) self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None] self.lastEvent = self.lastEvent[1] def characters(self, chars): + """Handle a SAX 'characters' event.""" node = self.document.createTextNode(chars) self.lastEvent[1] = [(CHARACTERS, node), None] self.lastEvent = self.lastEvent[1] def startDocument(self): + """Handle a SAX 'startDocument' event.""" if self.documentFactory is None: import xml.dom.minidom self.documentFactory = xml.dom.minidom.Document.implementation def buildDocument(self, uri, tagname): - # Can't do that in startDocument, since we need the tagname + """Used internally to create a minidom document. Can't do that in + startDocument, since we need the tagname""" # XXX: obtain DocumentType node = self.documentFactory.createDocument(uri, tagname, None) self.document = node @@ -187,13 +226,15 @@ return node.firstChild def endDocument(self): + """Handle a SAX endDocument event.""" self.lastEvent[1] = [(END_DOCUMENT, self.document), None] self.pop() def clear(self): - "clear(): Explicitly release parsing structures" + "Explicitly release parsing structures" self.document = None + class ErrorHandler: def warning(self, exception): print(exception) @@ -202,7 +243,18 @@ def fatalError(self, exception): raise exception + class DOMEventStream: + """A streaming iterator for the contents of an XML document. + + The parse and parseString functions are simplified ways of creating + a DOMEventStream, so this class would not normally be instantiated + directly. + + A DOMEventStream can only be iterated through once; an attempt to + loop through a stream a second time will provide no further events. 
+ """ + def __init__(self, stream, parser, bufsize): self.stream = stream self.parser = parser @@ -212,18 +264,24 @@ self.reset() def reset(self): + """Reset the stream by installing a fresh PullDOM content handler + on the parser (with namespace support enabled).""" self.pulldom = PullDOM() # This content handler relies on namespace support self.parser.setFeature(xml.sax.handler.feature_namespaces, 1) self.parser.setContentHandler(self.pulldom) def __getitem__(self, pos): + """Obtain the next event in the stream. The index provided + is ignored. + """ rc = self.getEvent() if rc: return rc raise IndexError def __next__(self): + """Obtain the next event from the stream.""" rc = self.getEvent() if rc: return rc @@ -233,6 +291,15 @@ return self def expandNode(self, node): + """When called immediately after acquiring *node* from the + stream, will fill out *node* with its children from the + stream. + + Notes: + * *node* must be an element, or an exception will be raised. + * *node* must be the most recent item obtained from the stream + or unexpected behavior will be observed. + """ event = self.getEvent() parents = [node] while event: @@ -248,8 +315,9 @@ event = self.getEvent() def getEvent(self): - # use IncrementalParser interface, so we get the desired - # pull effect + """Obtain the next event from the XML stream. An event is + a 2-tuple, consisting of the event type, and a DOM node. 
+ """ if not self.pulldom.firstEvent[1]: self.pulldom.lastEvent = self.pulldom.firstEvent while not self.pulldom.firstEvent[1]: @@ -288,7 +356,10 @@ self.stream = None class SAX2DOM(PullDOM): - + """An extension of PullDOM that also ensures that each Node + correctly holds a reference to any child nodes that have + been emitted.""" + def startElementNS(self, name, tagName , attrs): PullDOM.startElementNS(self, name, tagName, attrs) curNode = self.elementStack[-1] @@ -322,7 +393,11 @@ default_bufsize = (2 ** 14) - 20 + def parse(stream_or_string, parser=None, bufsize=None): + """Create a stream-based iterator over the nodes contained in + *stream_or_string*, which should either be a file-like object, + or the path of an XML file to be parsed.""" if bufsize is None: bufsize = default_bufsize if isinstance(stream_or_string, str): @@ -333,7 +408,10 @@ parser = xml.sax.make_parser() return DOMEventStream(stream, parser, bufsize) + def parseString(string, parser=None): + """Create a stream-based iterator over the nodes contained in + *string*, which should be a string containing XML.""" try: from io import StringIO except ImportError: