diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -11,25 +11,23 @@ **Source code:** :source:`Lib/html/parser.py` -------------- This module defines a class :class:`HTMLParser` which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(strict=False, *, convert_charrefs=False) +.. class:: HTMLParser(strict=False, *, convert_charrefs=True) Create a parser instance. - If *convert_charrefs* is ``True`` (default: ``False``), all character - references (except the ones in ``script``/``style`` elements) are - automatically converted to the corresponding Unicode characters. - The use of ``convert_charrefs=True`` is encouraged and will become - the default in Python 3.5. + If *convert_charrefs* is ``True`` (the default), all character references + (except the ones in ``script``/``style`` elements) are automatically + converted to the corresponding Unicode characters. If *strict* is ``False`` (the default), the parser will accept and parse invalid markup. If *strict* is ``True`` the parser will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when it's not able to parse the markup. The use of ``strict=True`` is discouraged and the *strict* argument is deprecated. An :class:`.HTMLParser` instance is fed HTML data and calls handler methods @@ -45,16 +43,19 @@ parsing text files formatted in HTML (Hy .. deprecated-removed:: 3.3 3.5 The *strict* argument and the strict mode have been deprecated. The parser is now able to accept and parse invalid markup too. .. versionchanged:: 3.4 *convert_charrefs* keyword argument added. + .. versionchanged:: 3.5 + The default value for argument *convert_charrefs* is now ``True``. + An exception is defined as well: .. exception:: HTMLParseError Exception raised by the :class:`HTMLParser` class when it encounters an error while parsing and *strict* is ``True``. This exception provides three attributes: :attr:`msg` is a brief message explaining the error, diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst --- a/Doc/whatsnew/3.5.rst +++ b/Doc/whatsnew/3.5.rst @@ -259,12 +259,15 @@ Changes in the Python API if it represented midnight in UTC. This behavior was considered obscure and error-prone and has been removed in Python 3.5. See :issue:`13936` for full details. * :meth:`ssl.SSLSocket.send()` now raises either :exc:`ssl.SSLWantReadError` or :exc:`ssl.SSLWantWriteError` on a non-blocking socket if the operation would block. Previously, it would return 0. See :issue:`20951`. +* The default value for argument *convert_charrefs* in class + :class:`html.parser.HTMLParser` is now ``True``. See :issue:`21047`. + Changes in the C API -------------------- * The :c:type:`PyMemAllocator` structure has a new ``calloc`` field. diff --git a/Lib/html/parser.py b/Lib/html/parser.py --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -118,37 +118,31 @@ class HTMLParser(_markupbase.ParserBase) longer split in chunks), otherwise they are passed by calling self.handle_entityref() or self.handle_charref() with the string containing respectively the named or numeric reference as the argument. """ CDATA_CONTENT_ELEMENTS = ("script", "style") - def __init__(self, strict=_default_sentinel, *, - convert_charrefs=_default_sentinel): + def __init__(self, strict=_default_sentinel, *, convert_charrefs=True): """Initialize and reset this instance. - If convert_charrefs is True (default: False), all character references + If convert_charrefs is True (the default), all character references are automatically converted to the corresponding Unicode characters. If strict is set to False (the default) the parser will parse invalid markup, otherwise it will raise an error. Note that the strict mode and argument are deprecated. """ if strict is not _default_sentinel: warnings.warn("The strict argument and mode are deprecated.", DeprecationWarning, stacklevel=2) else: strict = False # default self.strict = strict - if convert_charrefs is _default_sentinel: - convert_charrefs = False # default - warnings.warn("The value of convert_charrefs will become True in " - "3.5. You are encouraged to set the value explicitly.", - DeprecationWarning, stacklevel=2) self.convert_charrefs = convert_charrefs self.reset() def reset(self): """Reset this instance. Loses all unprocessed data.""" self.rawdata = '' self.lasttag = '???' self.interesting = interesting_normal diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -379,17 +379,18 @@ text '' '') expected = [('comment', "[if IE & !(lte IE 8)]>aren'tcondcomspretty?a{0}z'.format(charref), expected, collector=collector()) @@ -424,18 +425,16 @@ text class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): def get_collector(self): return EventCollector(convert_charrefs=False) def test_deprecation_warnings(self): with self.assertWarns(DeprecationWarning): - EventCollector() # convert_charrefs not passed explicitly - with self.assertWarns(DeprecationWarning): EventCollector(strict=True) with self.assertWarns(DeprecationWarning): EventCollector(strict=False) with self.assertRaises(html.parser.HTMLParseError): with self.assertWarns(DeprecationWarning): EventCollector().error('test') def test_tolerant_parsing(self): @@ -742,17 +741,16 @@ class AttributesStrictTestCase(TestCaseB [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])]) def test_entityrefs_in_attributes(self): self._run_check( "", [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])]) - class AttributesTolerantTestCase(AttributesStrictTestCase): def get_collector(self): return EventCollector(convert_charrefs=False) def test_attr_funky_names2(self): self._run_check( "",