Index: Lib/sgmllib.py =================================================================== --- Lib/sgmllib.py (revision 61302) +++ Lib/sgmllib.py (working copy) @@ -31,9 +31,14 @@ piclose = re.compile('>') endbracket = re.compile('[<>]') tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*') -attrfind = re.compile( + +attrfind_quotestart = re.compile( + r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)\s*=\s*[\'"]') +attrfind_completedquote = re.compile( + r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*(\'[^\']*\'|"[^"]*"))') +attrfind_unquoted = re.compile( r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') + r'([][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?') class SGMLParseError(RuntimeError): @@ -249,42 +254,72 @@ self.finish_shorttag(tag, data) self.__starttag_text = rawdata[start_pos:match.end(1) + 1] return k - # XXX The following should skip matching quotes (' or ") - # As a shortcut way to exit, this isn't so bad, but shouldn't - # be used to locate the actual end of the start tag since the - # < or > characters may be embedded in an attribute value. 
- match = endbracket.search(rawdata, i+1) - if not match: - return -1 - j = match.start(0) - # Now parse the data between i+1 and j into a tag and attrs - attrs = [] + + j = i + k = i+1 if rawdata[i:i+2] == '<>': # SGML shorthand: <> == - k = j + j = i+1 tag = self.lasttag else: + # Now parse the data after i into a tag and attrs match = tagfind.match(rawdata, i+1) if not match: self.error('unexpected call to parse_starttag') k = match.end(0) tag = rawdata[i+1:k].lower() - self.lasttag = tag - while k < j: - match = attrfind.match(rawdata, k) - if not match: break - attrname, rest, attrvalue = match.group(1, 2, 3) - if not rest: - attrvalue = attrname - else: - if (attrvalue[:1] == "'" == attrvalue[-1:] or - attrvalue[:1] == '"' == attrvalue[-1:]): - # strip quotes - attrvalue = attrvalue[1:-1] - attrvalue = self.entity_or_charref.sub( - self._convert_ref, attrvalue) - attrs.append((attrname.lower(), attrvalue)) - k = match.end(0) + + attrs = [] + while True: + # This is the loop for finding attributes... + + # First, we find a new endbracket location (j), + # if the old location is behind the point + # we've parsed up to (k) + if j < k: + match = endbracket.search(rawdata, k) + if not match: + return -1 + j = match.start(0) + + # To handle quoted strings, we first check if there is a + # completed quote + match = attrfind_completedquote.match(rawdata, k) + if not match: + # If not, we check if there was a quote started (but not + # finished, since we already checked for that) - if so, we + # have an incomplete expression, and return -1 + match = attrfind_quotestart.match(rawdata, k) + if match: + return -1 + + # Otherwise, we look for an unquoted (and possibly + # mal-formed) attribute + match = attrfind_unquoted.match(rawdata, k) + + if not match: + # If we can't find an attribute (and don't have open + # strings!), we've found all the attributes we can, so break + # out of the loop, and close the tag + break + + # Process the attribute we found... 
+ attrname, rest, attrvalue = match.group(1, 2, 3) + if not rest: + attrvalue = attrname + else: + if (attrvalue[:1] == "'" == attrvalue[-1:] or + attrvalue[:1] == '"' == attrvalue[-1:]): + # strip quotes + attrvalue = attrvalue[1:-1] + attrvalue = self.entity_or_charref.sub( + self._convert_ref, attrvalue) + attrs.append((attrname.lower(), attrvalue)) + k = match.end(0) + + # Close up the tag, do housekeeping + self.lasttag = tag + if rawdata[j] == '>': j = j+1 self.__starttag_text = rawdata[start_pos:j] Index: Lib/test/test_sgmllib.py =================================================================== --- Lib/test/test_sgmllib.py (revision 61302) +++ Lib/test/test_sgmllib.py (working copy) @@ -284,6 +284,13 @@ ('charref', 'convert', '42'), ('codepoint', 'convert', 42), ]) + + def test_attr_values_quoted_markup(self): + """Multi-line and markup in attribute values""" + self.check_events("""text""", + [("starttag", "a", [("title", "foo\n
bar")]), + ("data", "text"), + ("endtag", "a")]) def test_attr_funky_names(self): self.check_events("""""", [ @@ -373,16 +380,11 @@ if len(data) != CHUNK: break - # XXX These tests have been disabled by prefixing their names with - # an underscore. The first two exercise outstanding bugs in the - # sgmllib module, and the third exhibits questionable behavior - # that needs to be carefully considered before changing it. - - def _test_starttag_end_boundary(self): + def test_starttag_end_boundary(self): self.check_events("", [("starttag", "a", [("b", "<")])]) self.check_events("", [("starttag", "a", [("b", ">")])]) - def _test_buffer_artefacts(self): + def test_buffer_artefacts(self): output = [("starttag", "a", [("b", "<")])] self.check_events([""], output) self.check_events([""], output) @@ -412,6 +414,10 @@ self.check_events(["", ""], output) + # XXX This test has been disabled by prefixing its name with an + # underscore. It exhibits questionable behavior that needs to be + # carefully considered before changing it. + def _test_starttag_junk_chars(self): self.check_parse_error("<") self.check_parse_error("<>")