diff -r cf70f030a744 Lib/HTMLParser.py
--- a/Lib/HTMLParser.py Wed Jun 18 23:07:46 2014 -0400
+++ b/Lib/HTMLParser.py Mon Jun 23 21:53:39 2014 +0300
@@ -11,6 +11,11 @@
import markupbase
import re
+try:
+ _unichr = unichr
+except NameError:
+ _unichr = chr
+
# Regular expressions used for parsing
interesting_normal = re.compile('[&<]')
@@ -456,8 +461,8 @@
c = int(s[1:], 16)
else:
c = int(s)
- return unichr(c)
+ return _unichr(c)
except ValueError:
return ''+s+';'
else:
# Cannot use name2codepoint directly, because HTMLParser supports apos,
@@ -466,10 +471,14 @@
if HTMLParser.entitydefs is None:
entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
for k, v in htmlentitydefs.name2codepoint.iteritems():
- entitydefs[k] = unichr(v)
+ try:
+ entitydefs[k] = _unichr(v)
+ except ValueError:
+ # Code point not representable when _unichr falls back to chr(); leave the entity unmapped.
+ pass
try:
return self.entitydefs[s]
except KeyError:
return '&'+s+';'
return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
diff -r cf70f030a744 Lib/test/test_htmlparser.py
--- a/Lib/test/test_htmlparser.py Wed Jun 18 23:07:46 2014 -0400
+++ b/Lib/test/test_htmlparser.py Mon Jun 23 21:53:39 2014 +0300
@@ -4,6 +4,7 @@
import pprint
import unittest
from test import test_support
+from test.test_support import have_unicode, requires_unicode, u
class EventCollector(HTMLParser.HTMLParser):
@@ -351,12 +352,13 @@
'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
'\n\n',
'foo = "";',
- u'',
# these two should be invalid according to the HTML 5 spec,
# section 8.1.2.2
#'foo = \nscript>',
#'foo = script>',
]
+ if have_unicode:
+ contents.append(u(r''))
elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
for content in contents:
for element in elements:
@@ -434,20 +436,21 @@
"",
[("starttag", "a", [("href", "mailto:xyz@example.com")])])
+ @requires_unicode
def test_attr_nonascii(self):
# see issue 7311
self._run_check(
- u"",
+ u(r""),
[("starttag", "img", [("src", "/foo/bar.png"),
- ("alt", u"\u4e2d\u6587")])])
+ ("alt", u(r"\u4e2d\u6587"))])])
self._run_check(
- u"",
- [("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
- ("href", u"\u30c6\u30b9\u30c8.html")])])
+ u(r""),
+ [("starttag", "a", [("title", u(r"\u30c6\u30b9\u30c8")),
+ ("href", u(r"\u30c6\u30b9\u30c8.html"))])])
self._run_check(
- u'',
- [("starttag", "a", [("title", u"\u30c6\u30b9\u30c8"),
- ("href", u"\u30c6\u30b9\u30c8.html")])])
+ u(r''),
+ [("starttag", "a", [("title", u(r"\u30c6\u30b9\u30c8")),
+ ("href", u(r"\u30c6\u30b9\u30c8.html"))])])
def test_attr_entity_replacement(self):
self._run_check(
@@ -465,9 +468,10 @@
("starttag", "c", [("\\", "/")])])
def test_entityrefs_in_attributes(self):
+ euro = unichr(0x20AC) if have_unicode else '&euro;'
self._run_check(
"",
- [("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])])
+ [("starttag", "html", [("foo", u"%s&aa&unsupported;" % euro)])])
def test_entities_in_attribute_value(self):
# see #1200313