--- svn/HTMLParser.py 2010-09-30 18:49:20.000000000 +0200
+++ HTMLParser.py 2010-12-10 19:02:25.000000000 +0200
@@ -13,6 +13,14 @@
# Regular expressions used for parsing
+# See: http://www.w3.org/TR/html5/syntax.html#attributes-0
+## Attributes have a name and a value. Attribute names must consist of
+## one or more characters other than the space characters, U+0000 NULL,
+## U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('), U+003E GREATER-THAN
+## SIGN (>), U+002F SOLIDUS (/), and U+003D EQUALS SIGN (=) characters [...]
+space_chars = " \t\n\f\r"
+non_attr_chars = space_chars + "\0" + '"' + "'" + ">/="
+
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
incomplete = re.compile('&[a-zA-Z#]')
@@ -25,13 +33,13 @@
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
attrfind = re.compile(
- r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
+ r'\s*([^\s' '"' "'" r'>/=]+)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
- (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
+ (?:[^%s]+ # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
@@ -41,10 +49,20 @@
)
)*
\s* # trailing whitespace
-""", re.VERBOSE)
+""" % non_attr_chars, re.VERBOSE)
endendtag = re.compile('>')
endtagfind = re.compile('\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
+# See:
+# http://www.w3.org/TR/html5/syntax.html#end-tags
+# http://www.w3.org/TR/html5/common-microsyntaxes.html#space-character
+## The space characters, for the purposes of this specification, are
+## U+0020 SPACE, U+0009 CHARACTER TABULATION (tab), U+000A LINE FEED
+## (LF), U+000C FORM FEED (FF), and U+000D CARRIAGE RETURN (CR).
+pat_endtag = "/[%s]*>" % space_chars
+has_endtag = re.compile(pat_endtag)
+start_endtag = re.compile("^" + pat_endtag)
+
class HTMLParseError(Exception):
"""Exception raised for all parse errors."""
@@ -97,6 +115,7 @@
self.lasttag = '???'
self.interesting = interesting_normal
markupbase.ParserBase.reset(self)
+ self.cdata_tags = []
def feed(self, data):
"""Feed data to the parser.
@@ -254,7 +273,8 @@
k = m.end()
end = rawdata[k:endpos].strip()
- if end not in (">", "/>"):
+ is_endtag = start_endtag.match(end)
+ if end != ">" and is_endtag is None:
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
@@ -264,12 +284,13 @@
offset = offset + len(self.__starttag_text)
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
- if end.endswith('/>'):
+ if is_endtag:
# XHTML-style empty tag:
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
+ self.cdata_tags.append(tag)
self.set_cdata_mode()
return endpos
@@ -284,8 +305,9 @@
if next == ">":
return j + 1
if next == "/":
- if rawdata.startswith("/>", j):
- return j + 2
+ mhet = has_endtag.match(rawdata, j)
+ if mhet and mhet.start() == j: # has_endtag has no '^': with match(s, pos) a leading '^' anchors to the real start of s, not to pos
+ return mhet.end()
if rawdata.startswith("/", j):
# buffer boundary
return -1
@@ -316,8 +338,11 @@
if not match:
self.error("bad end tag: %r" % (rawdata[i:j],))
tag = match.group(1)
- self.handle_endtag(tag.lower())
- self.clear_cdata_mode()
+ tag = tag.lower()
+ self.handle_endtag(tag)
+ if len(self.cdata_tags) > 0 and self.cdata_tags[-1] == tag:
+ self.cdata_tags.pop()
+ self.clear_cdata_mode()
return j
# Overridable -- finish processing of start+end tag: