diff --git a/Doc/library/html.entities.rst b/Doc/library/html.entities.rst --- a/Doc/library/html.entities.rst +++ b/Doc/library/html.entities.rst @@ -11,10 +11,6 @@ This module defines four dictionaries, :data:`html5`, :data:`name2codepoint`, :data:`codepoint2name`, and :data:`entitydefs`. -:data:`entitydefs` is used to provide the :attr:`entitydefs` -attribute of the :class:`html.parser.HTMLParser` class. The definition provided -here contains all the entities defined by XHTML 1.0 that can be handled using -simple textual substitution in the Latin-1 character set (ISO-8859-1). .. data:: html5 diff --git a/Lib/html/parser.py b/Lib/html/parser.py --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -500,7 +500,6 @@ self.error("unknown declaration: %r" % (data,)) # Internal -- helper to remove special character quoting - entitydefs = None def unescape(self, s): if '&' not in s: return s @@ -510,24 +509,21 @@ if s[0] == "#": s = s[1:] if s[0] in ['x','X']: - c = int(s[1:], 16) + c = int(s[1:].rstrip(';'), 16) else: - c = int(s) + c = int(s.rstrip(';')) return chr(c) except ValueError: - return '&#'+ s +';' + return '&#' + s else: - # Cannot use name2codepoint directly, because HTMLParser - # supports apos, which is not part of HTML 4 - import html.entities - if HTMLParser.entitydefs is None: - entitydefs = HTMLParser.entitydefs = {'apos':"'"} - for k, v in html.entities.name2codepoint.items(): - entitydefs[k] = chr(v) - try: - return self.entitydefs[s] - except KeyError: - return '&'+s+';' + from html.entities import html5 + if s in html5: + return html5[s] + for x in range(2, len(s)): + if s[:x] in html5: + return html5[s[:x]] + s[x:] + else: + return '&' + s - return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", + return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))", replaceEntities, s, flags=re.ASCII) diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -456,7 +456,7 @@ 
self._run_check('<form action="/xxx.php?a=1&amp;b=2&amp", '
'method="post">', [ ('starttag', 'form', - [('action', '/xxx.php?a=1&b=2&amp'), + [('action', '/xxx.php?a=1&b=2&'), (',', None), ('method', 'post')])]) def test_weird_chars_in_unquoted_attribute_values(self): @@ -541,6 +541,10 @@ self.assertEqual(p.unescape('&#0038;'),'&') # see #12888 self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050) + # see #15156 + self.assertEqual(p.unescape('&Eacuteric&Eacute;ric' + '&alphacentauri&alpha;centauri'), + 'ÉricÉric&alphacentauriαcentauri') def test_broken_comments(self): html = ('<! not really a comment >'