Add a tolerant parsing mode to HTMLParser.

* New class attributes `tolerant` (default 9; any true value enables
  tolerance, 0 is strict) and `warning_count`; `__init__` accepts `tolerant`.
* New `warning(message, i, k)` hook that counts recoverable problems;
  subclasses may override it to log or collect them.
* When `tolerant` is true, malformed start tags, end tags, declarations and
  truncated constructs are reported through `warning()` instead of `error()`.
* Whitespace before an attribute name is made optional in `locatestarttagend`.

Note: the tolerant branch for a truncated entity/char ref calls `warning()`
(not `error()`, which always raises) so that tolerant mode really recovers.

--- HTMLParser.py.orig	2005-05-05 11:46:18.000000000 +0200
+++ HTMLParser.py	2006-05-23 16:17:38.000000000 +0200
@@ -30,7 +30,7 @@
 
 locatestarttagend = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
-  (?:\s+                             # whitespace before attribute name
+  (?:\s*                             # whitespace before attribute name
     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
       (?:\s*=\s*                     # value indicator
         (?:'[^']*'                   # LITA-enclosed value
@@ -86,9 +86,12 @@
 
     CDATA_CONTENT_ELEMENTS = ("script", "style")
 
-
-    def __init__(self):
+    tolerant=9 # tolerance level - 9 is fully tolerant, 0 is strict
+    warning_count=0
+
+    def __init__(self, tolerant=9):
         """Initialize and reset this instance."""
+        self.tolerant=tolerant
         self.reset()
 
     def reset(self):
@@ -111,9 +114,13 @@
         """Handle any buffered data."""
         self.goahead(1)
 
+    # XXX error() should allow to recover: def error(msg,i=-1,k=-1) => k/-1
     def error(self, message):
         raise HTMLParseError(message, self.getpos())
 
+    def warning(self, message, i=-1, k=-1):
+        self.warning_count += 1
+
     __starttag_text = None
 
     def get_starttag_text(self):
@@ -161,8 +168,15 @@
                     break
                 if k < 0:
                     if end:
-                        self.error("EOF in middle of construct")
-                    break
+                        if self.tolerant:
+                            # recover and leave '>' in data to indicate HTML junk
+                            k=rawdata.find('>', i + 1)
+                            self.warning("EOF in middle of construct",i,k)
+                        else:
+                            k=self.error("EOF in middle of construct")
+                        if k<=i: k=n
+                    else:
+                        break
                 i = self.updatepos(i, k)
             elif startswith("&#", i):
                 match = charref.match(rawdata, i)
@@ -190,7 +204,13 @@
                 if match:
                     # match.group() will contain at least 2 chars
                     if end and match.group() == rawdata[i:]:
-                        self.error("EOF in middle of entity or char ref")
+                        if self.tolerant:
+                            self.warning("malformed entity or char ref at pos=%s" % i, i, i + 1)
+                            i = self.updatepos(i, i + 1)
+                        else:
+                            k = self.error("EOF in middle of entity or char ref")
+                            if k<=i: k=n
+                            i = self.updatepos(i, k)
                     # incomplete
                     break
                 elif (i + 1) < n:
@@ -259,6 +279,10 @@
                          - self.__starttag_text.rfind("\n")
             else:
                 offset = offset + len(self.__starttag_text)
+            if self.tolerant:
+                self.warning("junk characters in start tag: %r"
+                             % rawdata[k:endpos][:20], i, endpos)
+                return endpos
             self.error("junk characters in start tag: %r"
                        % (rawdata[k:endpos][:20],))
         if end.endswith('/>'):
@@ -287,6 +311,9 @@
                     # buffer boundary
                     return -1
                 # else bogus input
+                if self.tolerant:
+                    self.warning("malformed empty start tag", i, j)
+                    return -1
                 self.updatepos(i, j + 1)
                 self.error("malformed empty start tag")
             if next == "":
@@ -297,6 +324,9 @@
                 # end of input in or before attribute value, or we have the
                 # '/' from a '/>' ending
                 return -1
+            if self.tolerant:
+                self.warning("malformed start tag", i, j)
+                return -1
             self.updatepos(i, j)
             self.error("malformed start tag")
         raise AssertionError("we should not get here!")
@@ -311,6 +341,9 @@
         j = match.end()
         match = endtagfind.match(rawdata, i) # </ + tag + >
         if not match:
+            if self.tolerant:
+                self.warning("bad end tag %r" % (rawdata[i:j],), i, j)
+                return -1
             self.error("bad end tag: %r" % (rawdata[i:j],))
         tag = match.group(1)
         self.handle_endtag(tag.lower())
@@ -355,6 +388,9 @@
         pass
 
     def unknown_decl(self, data):
+        if self.tolerant:
+            self.warning("unknown declaration: %r" % (data,) )
+            return
         self.error("unknown declaration: %r" % (data,))
 
     # Internal -- helper to remove special character quoting