diff -ur _orig\HTMLParser.py .\HTMLParser.py
--- _orig\HTMLParser.py	Tue May 27 12:41:14 2008
+++ .\HTMLParser.py	Mon Aug 23 15:09:43 2010
@@ -30,7 +30,7 @@
 
 locatestarttagend = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
-  (?:\s+                             # whitespace before attribute name
+  (?:\s*                             # whitespace before attribute name
     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
       (?:\s*=\s*                     # value indicator
         (?:'[^']*'                   # LITA-enclosed value
@@ -86,15 +86,26 @@
 
     CDATA_CONTENT_ELEMENTS = ("script", "style")
 
-
-    def __init__(self):
-        """Initialize and reset this instance."""
+    warning_count = 0
+    warning_file = None
+
+    def __init__(self, tolerant=0, warning_file=None):
+        """Initialize and reset this instance.
+
+        tolerant: 0=strict parsing, 9=fully tolerant parsing mode that
+            does not stop on HTML format errors.
+        warning_file: output stream for warning messages about the HTML
+            format. By default (None) no such output is done.
+        """
+        self.tolerant = tolerant
+        self.warning_file = warning_file
         self.reset()
 
     def reset(self):
         """Reset this instance.  Loses all unprocessed data."""
         self.rawdata = ''
         self.lasttag = '???'
+        self.warning_count = 0
         self.interesting = interesting_normal
         markupbase.ParserBase.reset(self)
 
@@ -114,6 +125,19 @@
     def error(self, message):
         raise HTMLParseError(message, self.getpos())
 
+    def warning(self, message, i=-1, k=-1):
+        """Count a format warning and report it to warning_file, if set.
+
+        The report quotes rawdata[i:k] (empty with the default indexes)
+        together with the current line and offset from getpos().
+        """
+        self.warning_count += 1
+        if self.warning_file:
+            line, offset = self.getpos()
+            self.warning_file.write(
+                message + " ('%s', line=%s, offset=%s)\n" %
+                (self.rawdata[i:k], line, offset))
+
     __starttag_text = None
 
     def get_starttag_text(self):
@@ -161,8 +185,21 @@
                     break
                 if k < 0:
                     if end:
-                        self.error("EOF in middle of construct")
-                    break
+                        if self.tolerant:
+                            # best-effort recovery: pass the broken construct on as data
+                            k = rawdata.find('>', i + 1)
+                            if k < 0:
+                                k = rawdata.find('<', i + 1)
+                                if k < 0:
+                                    k = i + 1
+                            else:
+                                k += 1
+                            self.warning("EOF in middle of construct", i, k)
+                            self.handle_data(rawdata[i:k])
+                        else:
+                            self.error("EOF in middle of construct")
+                    else:
+                        break
                 i = self.updatepos(i, k)
             elif startswith("&#", i):
                 match = charref.match(rawdata, i)
@@ -190,7 +227,13 @@
                 if match:
                     # match.group() will contain at least 2 chars
                     if end and match.group() == rawdata[i:]:
-                        self.error("EOF in middle of entity or char ref")
+                        if self.tolerant:
+                            # k is not set on this path; the reference runs to the end (n)
+                            k = n
+                            self.warning("EOF in middle of entity or char ref", i, k)
+                            i = self.updatepos(i, i + 1)
+                        else:
+                            self.error("EOF in middle of entity or char ref")
                     # incomplete
                     break
                 elif (i + 1) < n:
@@ -259,6 +302,10 @@
                          - self.__starttag_text.rfind("\n")
             else:
                 offset = offset + len(self.__starttag_text)
+            if self.tolerant:
+                self.warning("junk characters in start tag", i, endpos)
+                self.handle_data(rawdata[i:endpos])
+                return endpos
             self.error("junk characters in start tag: %r"
                        % (rawdata[k:endpos][:20],))
         if end.endswith('/>'):
@@ -287,6 +334,12 @@
                     # buffer boundary
                     return -1
                 # else bogus input
+                if self.tolerant:
+                    self.warning("malformed empty start tag", i, j)
+                    if j > i:
+                        return j
+                    else:
+                        return i + 1
                 self.updatepos(i, j + 1)
                 self.error("malformed empty start tag")
             if next == "":
@@ -297,6 +350,12 @@
                 # end of input in or before attribute value, or we have the
                 # '/' from a '/>' ending
                 return -1
+            if self.tolerant:
+                self.warning("malformed start tag", i, j)
+                if j > i:
+                    return j
+                else:
+                    return i + 1
             self.updatepos(i, j)
             self.error("malformed start tag")
         raise AssertionError("we should not get here!")
@@ -311,6 +370,15 @@
         j = match.end()
         match = endtagfind.match(rawdata, i) # </ + tag + >
         if not match:
+            k = rawdata.find('<', i + 1, j)
+            if k > i:
+                j = k
+            if self.tolerant:
+                if j <= i:
+                    j = i + 1
+                self.warning("bad end tag", i, j)
+                self.handle_data(rawdata[i:j])
+                return j
             self.error("bad end tag: %r" % (rawdata[i:j],))
         tag = match.group(1)
         self.handle_endtag(tag.lower())
@@ -355,6 +423,9 @@
         pass
 
     def unknown_decl(self, data):
+        if self.tolerant:
+            self.warning("unknown declaration: %r" % (data,))
+            return
         self.error("unknown declaration: %r" % (data,))
 
     # Internal -- helper to remove special character quoting