diff -r cee04627bdd0 Lib/_markupbase.py --- a/Lib/_markupbase.py Wed Mar 13 11:09:08 2013 -0700 +++ b/Lib/_markupbase.py Wed Mar 13 20:23:29 2013 +0100 @@ -77,11 +77,11 @@ assert rawdata[i:j] == "": # the empty comment - return j + 1 + return j + 1, None if rawdata[j:j+1] in ("-", ""): # Start of comment followed by buffer boundary, # or just a buffer boundary. - return -1 + return -1, None # A simple, practical version could look like: ((name|stringlit) S*) + '>' n = len(rawdata) if rawdata[j:j+2] == '--': #comment @@ -96,27 +96,28 @@ else: #all other declaration elements decltype, j = self._scan_name(j, i) if j < 0: - return j + return j, None if decltype == "doctype": self._decl_otherchars = '' + token = None while j < n: c = rawdata[j] if c == ">": # end of declaration syntax data = rawdata[i+2:j] if decltype == "doctype": - self.handle_decl(data) + token = ('decl', data) else: # According to the HTML5 specs sections "8.2.4.44 Bogus # comment state" and "8.2.4.45 Markup declaration open # state", a comment token should be emitted. # Calling unknown_decl provides more flexibility though. - self.unknown_decl(data) - return j + 1 + token = ('unknown_decl', data) + return j + 1, token if c in "\"'": m = _declstringlit_match(rawdata, j) if not m: - return -1 # incomplete + return -1, None # incomplete j = m.end() elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": name, j = self._scan_name(j, i) @@ -138,8 +139,8 @@ self.error( "unexpected %r char in declaration" % rawdata[j]) if j < 0: - return j - return -1 # incomplete + return j, token + return -1, None # incomplete # Internal -- parse a marked section # Override this to handle MS-word extension syntax content @@ -148,7 +149,7 @@ assert rawdata[i:i+3] == ' ending match= _markedsectionclose.search(rawdata, i+3) @@ -158,11 +159,12 @@ else: self.error('unknown status keyword %r in marked section' % rawdata[i+3:j]) if not match: - return -1 + return -1, None + token = None if report: j = match.start(0) - self.unknown_decl(rawdata[i+3: j]) - return match.end(0) + token = ('unknown_decl', rawdata[i+3: j]) + return match.end(0), token # Internal -- parse comment, return length or -1 if not terminated def parse_comment(self, i, report=1): @@ -171,11 +173,12 @@ self.error('unexpected call to parse_comment()') match = _commentclose.search(rawdata, i+4) if not match: - return -1 + return -1, None + token = None if report: j = match.start(0) - self.handle_comment(rawdata[i+4: j]) - return match.end(0) + token = ('comment', rawdata[i+4: j]) + return match.end(0), token # Internal -- scan past the internal subset in a gtpos = rawdata.find('>', i+9) if gtpos == -1: - return -1 - self.handle_decl(rawdata[i+2:gtpos]) - return gtpos+1 + return -1, None + return gtpos+1, ('decl', rawdata[i+2:gtpos]) else: return self.parse_bogus_comment(i) @@ -299,10 +316,9 @@ 'parse_comment()') pos = rawdata.find('>', i+2) if pos == -1: - return -1 - if report: - self.handle_comment(rawdata[i+2:pos]) - return pos + 1 + return -1, None + token = None if not report else ('comment', rawdata[i+2:pos]) + return pos + 1, token # Internal -- parse processing instr, return end or -1 if not terminated def parse_pi(self, i): @@ -310,18 +326,17 @@ assert rawdata[i:i+2] == ' if not match: - return -1 + return -1, None j = match.start() - self.handle_pi(rawdata[i+2: j]) - j = match.end() - return j + k = match.end() + return k, ('pi', rawdata[i+2: j]) # Internal -- handle starttag, return end or -1 if not terminated def parse_starttag(self, i): self.__starttag_text = None endpos = self.check_for_whole_start_tag(i) if endpos < 0: - return endpos + return endpos, None rawdata = self.rawdata self.__starttag_text = rawdata[i:endpos] @@ -361,16 +376,15 @@ if self.strict: self.error("junk characters in start tag: %r" % (rawdata[k:endpos][:20],)) - self.handle_data(rawdata[i:endpos]) - return endpos + return endpos, ('data', rawdata[i:endpos]) if end.endswith('/>'): # XHTML-style empty tag: - self.handle_startendtag(tag, attrs) + token = ('startendtag', (tag, attrs)) else: - self.handle_starttag(tag, attrs) + token = ('starttag', (tag, attrs)) if tag in self.CDATA_CONTENT_ELEMENTS: self.set_cdata_mode(tag) - return endpos + return endpos, token # Internal -- check to see if we have a complete starttag; return end # or -1 if incomplete. @@ -422,13 +436,12 @@ assert rawdata[i:i+2] == " if not match: - return -1 + return -1, None gtpos = match.end() match = endtagfind.match(rawdata, i) # if not match: if self.cdata_elem is not None: - self.handle_data(rawdata[i:gtpos]) - return gtpos + return gtpos, ('data', rawdata[i:gtpos]) if self.strict: self.error("bad end tag: %r" % (rawdata[i:gtpos],)) # find the name: w3.org/TR/html5/tokenization.html#tag-name-state @@ -436,7 +449,7 @@ if not namematch: # w3.org/TR/html5/tokenization.html#end-tag-open-state if rawdata[i:i+3] == '': - return i+3 + return i+3, None else: return self.parse_bogus_comment(i) tagname = namematch.group().lower() @@ -445,18 +458,15 @@ # , but looking for > after tha name should cover # most of the cases and is much simpler gtpos = rawdata.find('>', namematch.end()) - self.handle_endtag(tagname) - return gtpos+1 + return gtpos+1, ('endtag', tagname) elem = match.group(1).lower() # script or style if self.cdata_elem is not None: if elem != self.cdata_elem: - self.handle_data(rawdata[i:gtpos]) - return gtpos + return gtpos, ('data', rawdata[i:gtpos]) - self.handle_endtag(elem.lower()) self.clear_cdata_mode() - return gtpos + return gtpos, ('endtag', elem.lower()) # Overridable -- finish processing of start+end tag: def handle_startendtag(self, tag, attrs):