Index: Lib/html/parser.py
===================================================================
--- Lib/html/parser.py (revision 86342)
+++ Lib/html/parser.py (working copy)
@@ -362,6 +362,14 @@
 
     # Internal -- helper to remove special character quoting
     entitydefs = None
+    # Many web pages use numeric character references in the 128-159 range
+    # as if they were Windows code page 1252 bytes instead of the ISO-8859-1
+    # / Unicode code points they actually denote.  Map the common ones to
+    # the characters the author intended.
+    cp1252_to_unicode = {
+        145: 8216, 146: 8217, 147: 8220, 148: 8221, 149: 8226,
+        150: 8211, 151: 8212, 152: 732, 153: 8482,
+    }
     def unescape(self, s):
         if '&' not in s:
             return s
@@ -373,6 +381,7 @@
                     c = int(s[1:], 16)
                 else:
                     c = int(s)
+                c = HTMLParser.cp1252_to_unicode.get(c, c)
                 return chr(c)
             else:
                 # Cannot use name2codepoint directly, because HTMLParser
Index: Lib/test/test_htmlparser.py
===================================================================
--- Lib/test/test_htmlparser.py (revision 86342)
+++ Lib/test/test_htmlparser.py (working copy)
@@ -320,10 +320,19 @@
                 ("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])
                 ])
 
+    def test_unescaping(self):
+        # &#146; is a cp1252 byte value (right single quote), not U+0092.
+        parser = html.parser.HTMLParser()
+        escaped = "<p>There&#146;s the C&ocirc;te</p>"
+        correct_unescaped = "<p>There" + chr(8217) + "s the C" + chr(0x00f4) \
+            + "te</p>"
+        actual_unescaped = parser.unescape(escaped)
+        self.assertEqual(actual_unescaped, correct_unescaped)
+
 
 def test_main():
     support.run_unittest(HTMLParserTestCase)
 
 
 if __name__ == "__main__":
     test_main()