--- svn/HTMLParser.py	2010-09-30 18:49:20.000000000 +0200
+++ HTMLParser.py	2010-12-10 19:02:25.000000000 +0200
@@ -13,6 +13,14 @@
 
 # Regular expressions used for parsing
 
+# See: http://www.w3.org/TR/html5/syntax.html#attributes-0
+## Attributes have a name and a value. Attribute names must consist of
+## one or more characters other than the space characters, U+0000 NULL,
+## U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('), U+003E GREATER-THAN
+## SIGN (>), U+002F SOLIDUS (/), and U+003D EQUALS SIGN (=) characters [...]
+space_chars = " \t\n\f\r"
+non_attr_chars = space_chars + "\0" + '"' + "'" + ">/="
+
 interesting_normal = re.compile('[&<]')
 interesting_cdata = re.compile(r'<(/|\Z)')
 incomplete = re.compile('&[a-zA-Z#]')
@@ -25,13 +33,13 @@
 commentclose = re.compile(r'--\s*>')
 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
 attrfind = re.compile(
-    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
+    r'\s*([^\s' '"' "'" r'>/=]+)(\s*=\s*'
     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]*))?')
 
 locatestarttagend = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
   (?:\s+                             # whitespace before attribute name
-    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
+    (?:[^%s]+                        # attribute name
       (?:\s*=\s*                     # value indicator
         (?:'[^']*'                   # LITA-enclosed value
           |\"[^\"]*\"                # LIT-enclosed value
@@ -41,10 +49,20 @@
      )
    )*
   \s*                                # trailing whitespace
-""", re.VERBOSE)
+""" % non_attr_chars, re.VERBOSE)
 endendtag = re.compile('>')
 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 
+# See:
+# http://www.w3.org/TR/html5/syntax.html#end-tags
+# http://www.w3.org/TR/html5/common-microsyntaxes.html#space-character
+## The space characters, for the purposes of this specification, are
+## U+0020 SPACE, U+0009 CHARACTER TABULATION (tab), U+000A LINE FEED
+## (LF), U+000C FORM FEED (FF), and U+000D CARRIAGE RETURN (CR).
+pat_endtag = "/[%s]*>" % space_chars
+has_endtag = re.compile(pat_endtag)
+start_endtag = re.compile("^" + pat_endtag)
+
 
 class HTMLParseError(Exception):
     """Exception raised for all parse errors."""
@@ -97,6 +115,7 @@
         self.lasttag = '???'
         self.interesting = interesting_normal
         markupbase.ParserBase.reset(self)
+        self.cdata_tags = []
 
     def feed(self, data):
         """Feed data to the parser.
@@ -254,7 +273,8 @@
             k = m.end()
 
         end = rawdata[k:endpos].strip()
-        if end not in (">", "/>"):
+        is_endtag = start_endtag.match(end)
+        if end != ">" and is_endtag is None:
             lineno, offset = self.getpos()
             if "\n" in self.__starttag_text:
                 lineno = lineno + self.__starttag_text.count("\n")
@@ -264,12 +284,13 @@
                 offset = offset + len(self.__starttag_text)
             self.error("junk characters in start tag: %r"
                        % (rawdata[k:endpos][:20],))
-        if end.endswith('/>'):
+        if is_endtag:
             # XHTML-style empty tag: <span attr="value" />
             self.handle_startendtag(tag, attrs)
         else:
             self.handle_starttag(tag, attrs)
             if tag in self.CDATA_CONTENT_ELEMENTS:
+                self.cdata_tags.append(tag)
                 self.set_cdata_mode()
         return endpos
 
@@ -284,8 +305,9 @@
             if next == ">":
                 return j + 1
             if next == "/":
-                if rawdata.startswith("/>", j):
-                    return j + 2
+                mhet = has_endtag.match(rawdata, j)
+                if mhet and mhet.start() == j: # unanchored pattern on purpose: '^' would not match at pos j > 0
+                    return mhet.end()
                 if rawdata.startswith("/", j):
                     # buffer boundary
                     return -1
@@ -316,8 +338,11 @@
         if not match:
             self.error("bad end tag: %r" % (rawdata[i:j],))
         tag = match.group(1)
-        self.handle_endtag(tag.lower())
-        self.clear_cdata_mode()
+        tag = tag.lower()
+        self.handle_endtag(tag)
+        if len(self.cdata_tags) > 0 and self.cdata_tags[-1] == tag:
+            self.cdata_tags.pop()
+            self.clear_cdata_mode()
         return j
 
     # Overridable -- finish processing of start+end tag: <tag.../>