diff -r cf70f030a744 Lib/HTMLParser.py --- a/Lib/HTMLParser.py Wed Jun 18 23:07:46 2014 -0400 +++ b/Lib/HTMLParser.py Mon Jun 23 21:53:39 2014 +0300 @@ -11,6 +11,11 @@ import markupbase import re +try: + _unichr = unichr +except NameError: + _unichr = chr + # Regular expressions used for parsing interesting_normal = re.compile('[&<]') @@ -456,8 +461,8 @@ c = int(s[1:], 16) else: c = int(s) - return unichr(c) - except ValueError: + return _unichr(c) + except ValueError as e: return '&#'+s+';' else: # Cannot use name2codepoint directly, because HTMLParser supports apos, @@ -466,10 +471,13 @@ if HTMLParser.entitydefs is None: entitydefs = HTMLParser.entitydefs = {'apos':u"'"} for k, v in htmlentitydefs.name2codepoint.iteritems(): - entitydefs[k] = unichr(v) + try: + entitydefs[k] = _unichr(v) + except ValueError: + pass try: return self.entitydefs[s] - except KeyError: + except KeyError as e: return '&'+s+';' return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s) diff -r cf70f030a744 Lib/test/test_htmlparser.py --- a/Lib/test/test_htmlparser.py Wed Jun 18 23:07:46 2014 -0400 +++ b/Lib/test/test_htmlparser.py Mon Jun 23 21:53:39 2014 +0300 @@ -4,6 +4,7 @@ import pprint import unittest from test import test_support +from test.test_support import have_unicode, requires_unicode, u class EventCollector(HTMLParser.HTMLParser): @@ -351,12 +352,13 @@ 'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'), '\n\n', 'foo = "";', - u'', # these two should be invalid according to the HTML 5 spec, # section 8.1.2.2 #'foo = ', #'foo = ', ] + if have_unicode: + contents.append(u(r'')) elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style'] for content in contents: for element in elements: @@ -434,20 +436,21 @@ "", [("starttag", "a", [("href", "mailto:xyz@example.com")])]) + @requires_unicode def test_attr_nonascii(self): # see issue 7311 self._run_check( - u"\u4e2d\u6587", + u(r"\u4e2d\u6587"), [("starttag", "img", [("src", "/foo/bar.png"), - ("alt", u"\u4e2d\u6587")])]) + ("alt", u(r"\u4e2d\u6587"))])]) self._run_check( - u"", - [("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"), - ("href", u"\u30c6\u30b9\u30c8.html")])]) + u(r""), + [("starttag", "a", [("title", u(r"\u30c6\u30b9\u30c8")), + ("href", u(r"\u30c6\u30b9\u30c8.html"))])]) self._run_check( - u'', - [("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"), - ("href", u"\u30c6\u30b9\u30c8.html")])]) + u(r''), + [("starttag", "a", [("title", u(r"\u30c6\u30b9\u30c8")), + ("href", u(r"\u30c6\u30b9\u30c8.html"))])]) def test_attr_entity_replacement(self): self._run_check( @@ -465,9 +468,10 @@ ("starttag", "c", [("\\", "/")])]) def test_entityrefs_in_attributes(self): + euro = unichr(0x20AC) if have_unicode else '€' self._run_check( "", - [("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])]) + [("starttag", "html", [("foo", u"%s&aa&unsupported;" % euro)])]) def test_entities_in_attribute_value(self): # see #1200313