Index: Lib/sgmllib.py
===================================================================
--- Lib/sgmllib.py (revision 46865)
+++ Lib/sgmllib.py (working copy)
@@ -29,7 +29,11 @@
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile('>')
-endbracket = re.compile('[<>]')
+endbracket = re.compile(r'/?[a-zA-Z][-_.:a-zA-Z0-9]*\s*('
+ r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
+ r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]'
+ r'[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?'
+ r')*\s*/?\s*(?=[<>])')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@@ -245,11 +249,10 @@
self.finish_shorttag(tag, data)
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
return k
- # XXX The following should skip matching quotes (' or ")
- match = endbracket.search(rawdata, i+1)
+ match = endbracket.match(rawdata, i+1)
if not match:
return -1
- j = match.start(0)
+ j = match.end(0)
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
if rawdata[i:i+2] == '<>':
@@ -311,10 +314,10 @@
# Internal -- parse endtag
def parse_endtag(self, i):
rawdata = self.rawdata
- match = endbracket.search(rawdata, i+1)
+ match = endbracket.match(rawdata, i+1)
if not match:
return -1
- j = match.start(0)
+ j = match.end(0)
tag = rawdata[i+2:j].strip().lower()
if rawdata[j] == '>':
j = j+1
Index: Lib/test/test_sgmllib.py
===================================================================
--- Lib/test/test_sgmllib.py (revision 46865)
+++ Lib/test/test_sgmllib.py (working copy)
@@ -228,6 +228,13 @@
("h", "&#500;"),
("i", "x?a=b&c=d;"), ])])
+    def test_attr_values_quoted_markup(self):
+        """Multi-line and markup in attribute values"""
+        self.check_events("""<a title='foo\n<br>bar'>text</a>""",
+            [("starttag", "a", [("title", "foo\n<br>bar")]),
+             ("data", "text"),
+             ("endtag", "a")])
+
def test_attr_funky_names(self):
self.check_events("""<a a.b='v' c:d=v e-f=v>""", [
("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")]),