diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -16,14 +16,21 @@ This module defines a class :class:`HTMLParser` which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(strict=False) +.. class:: HTMLParser(strict=False, *, convert_charrefs=False) - Create a parser instance. If *strict* is ``False`` (the default), the parser - will accept and parse invalid markup. If *strict* is ``True`` the parser - will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when - it's not able to parse the markup. - The use of ``strict=True`` is discouraged and the *strict* argument is - deprecated. + Create a parser instance. + + If *convert_charrefs* is ``True`` (default: ``False``), all character + references (except the ones in ``script``/``style`` elements) are + automatically converted to the corresponding Unicode characters. + The use of ``convert_charrefs=True`` is encouraged and might become + the default in the future. + + If *strict* is ``False`` (the default), the parser will accept and parse + invalid markup. If *strict* is ``True`` the parser will raise an + :exc:`~html.parser.HTMLParseError` exception instead [#]_ when it's not + able to parse the markup. The use of ``strict=True`` is discouraged and + the *strict* argument is deprecated. An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are @@ -34,12 +41,15 @@ handler for elements which are closed implicitly by closing an outer element. .. versionchanged:: 3.2 - *strict* keyword added. + *strict* argument added. .. deprecated-removed:: 3.3 3.5 The *strict* argument and the strict mode have been deprecated. The parser is now able to accept and parse invalid markup too. + .. versionchanged:: 3.4 + *convert_charrefs* keyword argument added. 
+ An exception is defined as well: @@ -181,7 +191,8 @@ This method is called to process a named character reference of the form ``&name;`` (e.g. ``&gt;``), where *name* is a general entity reference - (e.g. ``'gt'``). + (e.g. ``'gt'``). This method is never called if *convert_charrefs* is + ``True``. .. method:: HTMLParser.handle_charref(name) @@ -189,7 +200,8 @@ This method is called to process decimal and hexadecimal numeric character references of the form ``&#NNN;`` and ``&#xNNN;``. For example, the decimal equivalent for ``&gt;`` is ``&#62;``, whereas the hexadecimal is ``&#x3E;``; - in this case the method will receive ``'62'`` or ``'x3E'``. + in this case the method will receive ``'62'`` or ``'x3E'``. This method + is never called if *convert_charrefs* is ``True``. .. method:: HTMLParser.handle_comment(data) @@ -324,7 +336,8 @@ Num ent : > Feeding incomplete chunks to :meth:`~HTMLParser.feed` works, but -:meth:`~HTMLParser.handle_data` might be called more than once:: +:meth:`~HTMLParser.handle_data` might be called more than once +(unless *convert_charrefs* is set to ``True``):: >>> for chunk in ['buff', 'ered ', 'text']: ... parser.feed(chunk) diff --git a/Lib/html/parser.py b/Lib/html/parser.py --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -112,18 +112,22 @@ self.handle_startendtag(); end tags by self.handle_endtag(). The data between tags is passed from the parser to the derived class by calling self.handle_data() with the data as argument (the data - may be split up in arbitrary chunks). Entity references are - passed by calling self.handle_entityref() with the entity - reference as the argument. Numeric character references are - passed to self.handle_charref() with the string containing the - reference as the argument. + may be split up in arbitrary chunks). 
If convert_charrefs is + True the character references are converted automatically to the + corresponding Unicode character (and self.handle_data() is no + longer split in chunks), otherwise they are passed by calling + self.handle_entityref() or self.handle_charref() with the string + containing respectively the named or numeric reference as the + argument. """ CDATA_CONTENT_ELEMENTS = ("script", "style") - def __init__(self, strict=_strict_sentinel): + def __init__(self, strict=_strict_sentinel, *, convert_charrefs=False): """Initialize and reset this instance. + If convert_charrefs is True (default: False), all character references + are automatically converted to the corresponding Unicode characters. If strict is set to False (the default) the parser will parse invalid markup, otherwise it will raise an error. Note that the strict mode and argument are deprecated. @@ -134,6 +138,7 @@ else: strict = False # default self.strict = strict + self.convert_charrefs = convert_charrefs self.reset() def reset(self): @@ -184,14 +189,26 @@ i = 0 n = len(rawdata) while i < n: - match = self.interesting.search(rawdata, i) # < or & - if match: - j = match.start() + if self.convert_charrefs and not self.cdata_elem: + try: + j = rawdata.index('<', i) + except ValueError: + if not end: + break # wait till we get all the text + j = n else: - if self.cdata_elem: - break - j = n - if i < j: self.handle_data(rawdata[i:j]) + match = self.interesting.search(rawdata, i) # < or & + if match: + j = match.start() + else: + if self.cdata_elem: + break + j = n + if i < j: + if self.convert_charrefs and not self.cdata_elem: + self.handle_data(unescape(rawdata[i:j])) + else: + self.handle_data(rawdata[i:j]) i = self.updatepos(i, j) if i == n: break startswith = rawdata.startswith @@ -226,9 +243,14 @@ k = i + 1 else: k += 1 - self.handle_data(rawdata[i:k]) + if self.convert_charrefs and not self.cdata_elem: + self.handle_data(unescape(rawdata[i:k])) + else: + self.handle_data(rawdata[i:k]) i = 
self.updatepos(i, k) elif startswith("&#", i): + assert self.convert_charrefs == False, ( + "we shouldn't get here with convert_charrefs == True") match = charref.match(rawdata, i) if match: name = match.group()[2:-1] @@ -244,6 +266,8 @@ i = self.updatepos(i, 2) break elif startswith('&', i): + assert self.convert_charrefs == False, ( + "we shouldn't get here with convert_charrefs == True") match = entityref.match(rawdata, i) if match: name = match.group(1) @@ -277,7 +301,10 @@ assert 0, "interesting.search() lied" # end while if end and i < n and not self.cdata_elem: - self.handle_data(rawdata[i:n]) + if self.convert_charrefs and not self.cdata_elem: + self.handle_data(unescape(rawdata[i:n])) + else: + self.handle_data(rawdata[i:n]) i = self.updatepos(i, n) self.rawdata = rawdata[i:] diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -70,6 +70,18 @@ self.append(("starttag_text", self.get_starttag_text())) +class EventCollectorCharrefs(EventCollector): + + def get_events(self): + return self.events + + def handle_charref(self, data): + self.fail('This should never be called with convert_charrefs=True') + + def handle_entityref(self, data): + self.fail('This should never be called with convert_charrefs=True') + + class TestCaseBase(unittest.TestCase): def get_collector(self): @@ -84,8 +96,9 @@ parser.close() events = parser.get_events() if events != expected_events: - self.fail("received events did not match expected events\n" - "Expected:\n" + pprint.pformat(expected_events) + + self.fail("received events did not match expected events" + + "\nSource:\n" + repr(source) + + "\nExpected:\n" + pprint.pformat(expected_events) + "\nReceived:\n" + pprint.pformat(events)) def _run_check_extra(self, source, events): @@ -363,6 +376,44 @@ ('comment', '[if lte IE 7]>pretty?a{0}z'.format(charref), + expected, collector=collector()) + # check charrefs at the beginning/end of the 
text/attributes + expected = [('data', '"'), + ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]), + ('data', '"'), ('endtag', 'a'), ('data', '"')] + for charref in charrefs: + self._run_check('{0}' + '{0}{0}'.format(charref), + expected, collector=collector()) + # check charrefs in {1}' + '{1}'.format(text, charref), + expected, collector=collector()) + # check truncated charrefs at the end of the file + html = '&quo &# &#x' + for x in range(1, len(html)): + self._run_check(html[:x], [('data', html[:x])], + collector=collector()) + # check a string with no charrefs + self._run_check('no charrefs here', [('data', 'no charrefs here')], + collector=collector()) + class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):