Index: Lib/sgmllib.py =================================================================== --- Lib/sgmllib.py (revision 46865) +++ Lib/sgmllib.py (working copy) @@ -29,7 +29,11 @@ shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/') shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/') piclose = re.compile('>') -endbracket = re.compile('[<>]') +endbracket = re.compile(r'/?[a-zA-Z][-_.:a-zA-Z0-9]*\s*(' + r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]' + r'[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?' + r')*\s*/?\s*(?=[<>])') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') attrfind = re.compile( r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' @@ -245,11 +249,10 @@ self.finish_shorttag(tag, data) self.__starttag_text = rawdata[start_pos:match.end(1) + 1] return k - # XXX The following should skip matching quotes (' or ") - match = endbracket.search(rawdata, i+1) + match = endbracket.match(rawdata, i+1) if not match: return -1 - j = match.start(0) + j = match.end(0) # Now parse the data between i+1 and j into a tag and attrs attrs = [] if rawdata[i:i+2] == '<>': @@ -311,10 +314,10 @@ # Internal -- parse endtag def parse_endtag(self, i): rawdata = self.rawdata - match = endbracket.search(rawdata, i+1) + match = endbracket.match(rawdata, i+1) if not match: return -1 - j = match.start(0) + j = match.end(0) tag = rawdata[i+2:j].strip().lower() if rawdata[j] == '>': j = j+1 Index: Lib/test/test_sgmllib.py =================================================================== --- Lib/test/test_sgmllib.py (revision 46865) +++ Lib/test/test_sgmllib.py (working copy) @@ -228,6 +228,13 @@ ("h", "Ǵ"), ("i", "x?a=b&c=d;"), ])]) + def test_attr_values_quoted_markup(self): + """Multi-line and markup in attribute values""" + self.check_events("""text""", + [("starttag", "a", [("title", "foo\n
bar")]), + ("data", "text"), + ("endtag", "a")]) + def test_attr_funky_names(self): self.check_events("""""", [ ("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),