--- svn/HTMLParser.py 2010-09-30 18:49:20.000000000 +0200
+++ HTMLParser.py 2010-12-10 19:02:25.000000000 +0200
@@ -13,6 +13,14 @@
 
 # Regular expressions used for parsing
 
+# See: http://www.w3.org/TR/html5/syntax.html#attributes-0
+## Attributes have a name and a value. Attribute names must consist of
+## one or more characters other than the space characters, U+0000 NULL,
+## U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('), U+003E GREATER-THAN
+## SIGN (>), U+002F SOLIDUS (/), and U+003D EQUALS SIGN (=) characters,
+space_chars = " \t\n\f\r"
+non_attr_chars = space_chars + "\0" + '"' + "'" + ">/="
+
 interesting_normal = re.compile('[&<]')
 interesting_cdata = re.compile(r'<(/|\Z)')
 incomplete = re.compile('&[a-zA-Z#]')
@@ -25,13 +33,13 @@
 commentclose = re.compile(r'--\s*>')
 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
 attrfind = re.compile(
-    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
+    r'\s*([^\s' '"' "'" r'>/=]+)(\s*=\s*'
     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
 
 locatestarttagend = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
   (?:\s+                             # whitespace before attribute name
-    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
+    (?:[^%s]+                        # attribute name
       (?:\s*=\s*                     # value indicator
         (?:'[^']*'                   # LITA-enclosed value
           |\"[^\"]*\"                # LIT-enclosed value
@@ -41,10 +49,20 @@
      )
    )*
   \s*                                # trailing whitespace
-""", re.VERBOSE)
+""" % non_attr_chars, re.VERBOSE)
 endendtag = re.compile('>')
 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
+# See:
+#  http://www.w3.org/TR/html5/syntax.html#end-tags
+#  http://www.w3.org/TR/html5/common-microsyntaxes.html#space-character
+## The space characters, for the purposes of this specification, are
+## U+0020 SPACE, U+0009 CHARACTER TABULATION (tab), U+000A LINE FEED
+## (LF), U+000C FORM FEED (FF), and U+000D CARRIAGE RETURN (CR).
+pat_endtag = "/[%s]*>" % space_chars
+has_endtag = re.compile(pat_endtag)
+start_endtag = re.compile("^" + pat_endtag)
+
 
 
 class HTMLParseError(Exception):
     """Exception raised for all parse errors."""
@@ -97,6 +115,7 @@
         self.lasttag = '???'
         self.interesting = interesting_normal
         markupbase.ParserBase.reset(self)
+        self.cdata_tags = []
 
     def feed(self, data):
         """Feed data to the parser.
@@ -254,7 +273,8 @@
             k = m.end()
 
         end = rawdata[k:endpos].strip()
-        if end not in (">", "/>"):
+        is_endtag = start_endtag.match(end)
+        if end != ">" and is_endtag is None:
             lineno, offset = self.getpos()
             if "\n" in self.__starttag_text:
                 lineno = lineno + self.__starttag_text.count("\n")
@@ -264,12 +284,13 @@
                 offset = offset + len(self.__starttag_text)
             self.error("junk characters in start tag: %r"
                        % (rawdata[k:endpos][:20],))
-        if end.endswith('/>'):
+        if is_endtag:
             # XHTML-style empty tag: <span attr="value" />
             self.handle_startendtag(tag, attrs)
         else:
             self.handle_starttag(tag, attrs)
             if tag in self.CDATA_CONTENT_ELEMENTS:
+                self.cdata_tags.append(tag)
                 self.set_cdata_mode()
         return endpos
 
@@ -284,8 +305,9 @@
             if next == ">":
                 return j + 1
             if next == "/":
-                if rawdata.startswith("/>", j):
-                    return j + 2
+                mhet = has_endtag.match(rawdata, j)
+                if mhet and mhet.start() == j: # leading '^' ignores pos-shift
+                    return mhet.end()
                 if rawdata.startswith("/", j):
                     # buffer boundary
                     return -1
@@ -316,8 +338,11 @@
         if not match:
             self.error("bad end tag: %r" % (rawdata[i:j],))
         tag = match.group(1)
-        self.handle_endtag(tag.lower())
-        self.clear_cdata_mode()
+        tag = tag.lower()
+        self.handle_endtag(tag)
+        if len(self.cdata_tags) > 0 and self.cdata_tags[-1] == tag:
+            self.cdata_tags.pop()
+            self.clear_cdata_mode()
         return j
 
     # Overridable -- finish processing of start+end tag: <tag.../>