diff -ur _orig\HTMLParser.py .\HTMLParser.py
--- _orig\HTMLParser.py Tue May 27 12:41:14 2008
+++ .\HTMLParser.py Mon Aug 23 15:09:43 2010
@@ -30,7 +30,7 @@
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
- (?:\s+ # whitespace before attribute name
+ (?:\s* # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
@@ -86,15 +86,26 @@
CDATA_CONTENT_ELEMENTS = ("script", "style")
-
- def __init__(self):
- """Initialize and reset this instance."""
+ warning_count = 0  # class-level default; reset() rebinds it per instance
+ warning_file = None  # class-level default; __init__ rebinds it per instance
+
+ def __init__(self, tolerant=0, warning_file=None):
+ """Initialize and reset this instance.
+
+ tolerant: 0 (the default) selects strict parsing; any true value
+ (e.g. 9) makes the parser recover from HTML format errors.
+ warning_file: stream whose write() receives warning messages about
+ the HTML format. By default (None) no such output is done.
+ """
+ self.tolerant = tolerant
+ self.warning_file = warning_file
self.reset()
def reset(self):
"""Reset this instance. Loses all unprocessed data."""
self.rawdata = ''
self.lasttag = '???'
+ self.warning_count = 0  # start a fresh tally of warning() calls
self.interesting = interesting_normal
markupbase.ParserBase.reset(self)
@@ -114,6 +125,14 @@
def error(self, message):
raise HTMLParseError(message, self.getpos())
+ def warning(self, message, i=-1, k=-1):
+ """Count a format problem; log it with rawdata[i:k] if enabled."""
+ self.warning_count += 1
+ if self.warning_file:
+ lineno, column = self.getpos()
+ self.warning_file.write(message + " ('%s', line=%s, offset=%s)\n"
+ % (self.rawdata[i:k], lineno, column))
+
__starttag_text = None
def get_starttag_text(self):
@@ -161,8 +180,21 @@
break
if k < 0:
if end:
- self.error("EOF in middle of construct")
- break
+ if self.tolerant:
+ # best-effort recovery: pass the broken construct on as plain data
+ k = rawdata.find('>', i + 1)
+ if k < 0:
+ k = rawdata.find('<', i + 1)
+ if k < 0:
+ k = i + 1
+ else:
+ k += 1
+ self.warning("EOF in middle of construct", i, k)
+ self.handle_data(rawdata[i:k])
+ else:
+ self.error("EOF in middle of construct")
+ else:
+ break
i = self.updatepos(i, k)
elif startswith("&#", i):
match = charref.match(rawdata, i)
@@ -190,7 +222,13 @@
if match:
# match.group() will contain at least 2 chars
if end and match.group() == rawdata[i:]:
- self.error("EOF in middle of entity or char ref")
+ if self.tolerant:
+ # set k before use: the bad text is all of rawdata[i:]
+ k = n
+ self.warning("EOF in middle of entity or char ref", i, k)
+ i = self.updatepos(i, i + 1)
+ else:
+ self.error("EOF in middle of entity or char ref")
# incomplete
break
elif (i + 1) < n:
@@ -259,6 +297,10 @@
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
+ if self.tolerant:
+ self.warning("junk characters in start tag", i, endpos)
+ self.handle_data(rawdata[i:endpos])
+ return endpos
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
if end.endswith('/>'):
@@ -287,6 +329,12 @@
# buffer boundary
return -1
# else bogus input
+ if self.tolerant:
+ self.warning("malformed empty start tag", i, j)
+ if j > i:
+ return j
+ else:
+ return i + 1
self.updatepos(i, j + 1)
self.error("malformed empty start tag")
if next == "":
@@ -297,6 +345,12 @@
# end of input in or before attribute value, or we have the
# '/' from a '/>' ending
return -1
+ if self.tolerant:
+ self.warning("malformed start tag", i, j)
+ if j > i:
+ return j
+ else:
+ return i + 1
self.updatepos(i, j)
self.error("malformed start tag")
raise AssertionError("we should not get here!")
@@ -311,6 +365,15 @@
j = match.end()
match = endtagfind.match(rawdata, i) # + tag + >
if not match:
+ k = rawdata.find('<', i + 1, j)
+ if k > i:
+ j = k
+ if self.tolerant:
+ if j <= i:
+ j = i + 1
+ self.warning("bad end tag", i, j)
+ self.handle_data(rawdata[i:j])
+ return j
self.error("bad end tag: %r" % (rawdata[i:j],))
tag = match.group(1)
self.handle_endtag(tag.lower())
@@ -355,6 +418,9 @@
pass
def unknown_decl(self, data):
+ if self.tolerant:
+ self.warning("unknown declaration: %r" % (data,) )
+ return  # tolerant mode: count/log the declaration instead of raising
self.error("unknown declaration: %r" % (data,))
# Internal -- helper to remove special character quoting