diff -r cee04627bdd0 Lib/_markupbase.py
--- a/Lib/_markupbase.py Wed Mar 13 11:09:08 2013 -0700
+++ b/Lib/_markupbase.py Wed Mar 13 20:23:29 2013 +0100
@@ -77,11 +77,11 @@
assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
if rawdata[j:j+1] == ">":
# the empty comment <!>
- return j + 1
+ return j + 1, None
if rawdata[j:j+1] in ("-", ""):
# Start of comment followed by buffer boundary,
# or just a buffer boundary.
- return -1
+ return -1, None
# A simple, practical version could look like: ((name|stringlit) S*) + '>'
n = len(rawdata)
if rawdata[j:j+2] == '--': #comment
@@ -96,27 +96,28 @@
else: #all other declaration elements
decltype, j = self._scan_name(j, i)
if j < 0:
- return j
+ return j, None
if decltype == "doctype":
self._decl_otherchars = ''
+ token = None
while j < n:
c = rawdata[j]
if c == ">":
# end of declaration syntax
data = rawdata[i+2:j]
if decltype == "doctype":
- self.handle_decl(data)
+ token = ('decl', data)
else:
# According to the HTML5 specs sections "8.2.4.44 Bogus
# comment state" and "8.2.4.45 Markup declaration open
# state", a comment token should be emitted.
# Calling unknown_decl provides more flexibility though.
- self.unknown_decl(data)
- return j + 1
+ token = ('unknown_decl', data)
+ return j + 1, token
if c in "\"'":
m = _declstringlit_match(rawdata, j)
if not m:
- return -1 # incomplete
+ return -1, None # incomplete
j = m.end()
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
name, j = self._scan_name(j, i)
@@ -138,8 +139,8 @@
self.error(
"unexpected %r char in declaration" % rawdata[j])
if j < 0:
- return j
- return -1 # incomplete
+ return j, token
+ return -1, None # incomplete
# Internal -- parse a marked section
# Override this to handle MS-word extension syntax content
@@ -148,7 +149,7 @@
assert rawdata[i:i+3] == ' ending
match= _markedsectionclose.search(rawdata, i+3)
@@ -158,11 +159,12 @@
else:
self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
if not match:
- return -1
+ return -1, None
+ token = None
if report:
j = match.start(0)
- self.unknown_decl(rawdata[i+3: j])
- return match.end(0)
+ token = ('unknown_decl', rawdata[i+3: j])
+ return match.end(0), token
# Internal -- parse comment, return length or -1 if not terminated
def parse_comment(self, i, report=1):
@@ -171,11 +173,12 @@
self.error('unexpected call to parse_comment()')
match = _commentclose.search(rawdata, i+4)
if not match:
- return -1
+ return -1, None
+ token = None
if report:
j = match.start(0)
- self.handle_comment(rawdata[i+4: j])
- return match.end(0)
+ token = ('comment', rawdata[i+4: j])
+ return match.end(0), token
# Internal -- scan past the internal subset in a
gtpos = rawdata.find('>', i+9)
if gtpos == -1:
- return -1
- self.handle_decl(rawdata[i+2:gtpos])
- return gtpos+1
+ return -1, None
+ return gtpos+1, ('decl', rawdata[i+2:gtpos])
else:
return self.parse_bogus_comment(i)
@@ -299,10 +316,9 @@
'parse_comment()')
pos = rawdata.find('>', i+2)
if pos == -1:
- return -1
- if report:
- self.handle_comment(rawdata[i+2:pos])
- return pos + 1
+ return -1, None
+ token = None if not report else ('comment', rawdata[i+2:pos])
+ return pos + 1, token
# Internal -- parse processing instr, return end or -1 if not terminated
def parse_pi(self, i):
@@ -310,18 +326,17 @@
assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
match = piclose.search(rawdata, i+2) # >
if not match:
- return -1
+ return -1, None
j = match.start()
- self.handle_pi(rawdata[i+2: j])
- j = match.end()
- return j
+ k = match.end()
+ return k, ('pi', rawdata[i+2: j])
# Internal -- handle starttag, return end or -1 if not terminated
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
- return endpos
+ return endpos, None
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
@@ -361,16 +376,15 @@
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
- self.handle_data(rawdata[i:endpos])
- return endpos
+ return endpos, ('data', rawdata[i:endpos])
if end.endswith('/>'):
# XHTML-style empty tag:
- self.handle_startendtag(tag, attrs)
+ token = ('startendtag', (tag, attrs))
else:
- self.handle_starttag(tag, attrs)
+ token = ('starttag', (tag, attrs))
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
- return endpos
+ return endpos, token
# Internal -- check to see if we have a complete starttag; return end
# or -1 if incomplete.
@@ -422,13 +436,12 @@
assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
match = endendtag.search(rawdata, i+1) # >
if not match:
- return -1
+ return -1, None
gtpos = match.end()
match = endtagfind.match(rawdata, i) # + tag + >
if not match:
if self.cdata_elem is not None:
- self.handle_data(rawdata[i:gtpos])
- return gtpos
+ return gtpos, ('data', rawdata[i:gtpos])
if self.strict:
self.error("bad end tag: %r" % (rawdata[i:gtpos],))
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
@@ -436,7 +449,7 @@
if not namematch:
# w3.org/TR/html5/tokenization.html#end-tag-open-state
if rawdata[i:i+3] == '</>':
- return i+3
+ return i+3, None
else:
return self.parse_bogus_comment(i)
tagname = namematch.group().lower()
@@ -445,18 +458,15 @@
# </tag attr=">">, but looking for > after tha name should cover
# most of the cases and is much simpler
gtpos = rawdata.find('>', namematch.end())
- self.handle_endtag(tagname)
- return gtpos+1
+ return gtpos+1, ('endtag', tagname)
elem = match.group(1).lower() # script or style
if self.cdata_elem is not None:
if elem != self.cdata_elem:
- self.handle_data(rawdata[i:gtpos])
- return gtpos
+ return gtpos, ('data', rawdata[i:gtpos])
- self.handle_endtag(elem.lower())
self.clear_cdata_mode()
- return gtpos
+ return gtpos, ('endtag', elem.lower())
# Overridable -- finish processing of start+end tag:
def handle_startendtag(self, tag, attrs):