Index: Lib/html/parser.py
===================================================================
--- Lib/html/parser.py	(revision 63474)
+++ Lib/html/parser.py	(working copy)
@@ -10,6 +10,7 @@
 
 import markupbase
 import re
+import html.entities
 
 # Regular expressions used for parsing
 
@@ -45,7 +46,39 @@
 endendtag = re.compile('>')
 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
 
+# remove HTML escape sequences
+def unescape(s):
+    """convert &...; escape sequences into unicode characters"""
+
+    # many web pages accidentally use MS code page 1252 characters instead of iso-8859-1 or unicode characters
+    cp1252_to_unicode = {145:8216, 146:8217, 147:8220, 148:8221, 149:8226, 150:8211, 151:8212, 152:732, 153:8482}
+    if '&' not in s:
+        return s
+    def replaceEntities(s):
+        s = s.groups()[0]
+        if s[0] == "#":
+            s = s[1:]
+            if s[0] in ['x','X']:
+                c = int(s[1:], 16)
+            else:
+                c = int(s)
+            if c in cp1252_to_unicode:
+                c = cp1252_to_unicode[c]
+            return unichr(c)
+        else:
+            try:
+                return unichr(html.entities.name2codepoint[s])
+            except KeyError:
+                # HTMLParser also supports apos, which is not in HTML 4
+                if s == 'apos':
+                    return u"'"
+                else:
+                    return '&'+s+';'
+    return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
+                  replaceEntities, s)
+
+
 
 class HTMLParseError(Exception):
     """Exception raised for all parse errors."""
 
@@ -246,7 +279,7 @@
             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                  attrvalue[:1] == '"' == attrvalue[-1:]:
                 attrvalue = attrvalue[1:-1]
-                attrvalue = self.unescape(attrvalue)
+                attrvalue = unescape(attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = m.end()
 
@@ -357,32 +390,3 @@
     def unknown_decl(self, data):
         self.error("unknown declaration: %r" % (data,))
 
-    # Internal -- helper to remove special character quoting
-    entitydefs = None
-    def unescape(self, s):
-        if '&' not in s:
-            return s
-        def replaceEntities(s):
-            s = s.groups()[0]
-            if s[0] == "#":
-                s = s[1:]
-                if s[0] in ['x','X']:
-                    c = int(s[1:], 16)
-                else:
-                    c = int(s)
-                return unichr(c)
-            else:
-                # Cannot use name2codepoint directly, because HTMLParser
-                # supports apos, which is not part of HTML 4
-                import html.entities
-                if HTMLParser.entitydefs is None:
-                    entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
-                    for k, v in html.entities.name2codepoint.iteritems():
-                        entitydefs[k] = unichr(v)
-                try:
-                    return self.entitydefs[s]
-                except KeyError:
-                    return '&'+s+';'
-
-        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
-                      replaceEntities, s)
Index: Lib/test/test_htmlparser.py
===================================================================
--- Lib/test/test_htmlparser.py	(revision 63474)
+++ Lib/test/test_htmlparser.py	(working copy)
@@ -314,8 +314,18 @@
             ])
 
 
+class UnescapeTestCase(unittest.TestCase):
+    def test_unescaping(self):
+        # NOTE(review): entity spellings below are reconstructed from the expected code points (8217, 0xf4) -- confirm against the submitted patch
+        escaped = u"\n\nThere&#146;s the C&ocirc;te\n\n"
+        correct_unescaped = u"\n\nThere"+unichr(8217) + u"s the C" + unichr(0x00f4) + u"te\n\n"
+        actual_unescaped = html.parser.unescape(escaped)
+        if actual_unescaped != correct_unescaped:
+            self.fail ("failed to unescape properly")
+
 def test_main():
     test_support.run_unittest(HTMLParserTestCase)
+    test_support.run_unittest(UnescapeTestCase)
 
 
 if __name__ == "__main__":