diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst
--- a/Doc/library/html.parser.rst
+++ b/Doc/library/html.parser.rst
@@ -11,25 +11,23 @@
**Source code:** :source:`Lib/html/parser.py`
--------------
This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
-.. class:: HTMLParser(strict=False, *, convert_charrefs=False)
+.. class:: HTMLParser(strict=False, *, convert_charrefs=True)
Create a parser instance.
- If *convert_charrefs* is ``True`` (default: ``False``), all character
- references (except the ones in ``script``/``style`` elements) are
- automatically converted to the corresponding Unicode characters.
- The use of ``convert_charrefs=True`` is encouraged and will become
- the default in Python 3.5.
+ If *convert_charrefs* is ``True`` (the default), all character references
+ (except the ones in ``script``/``style`` elements) are automatically
+ converted to the corresponding Unicode characters.
If *strict* is ``False`` (the default), the parser will accept and parse
invalid markup. If *strict* is ``True`` the parser will raise an
:exc:`~html.parser.HTMLParseError` exception instead [#]_ when it's not
able to parse the markup. The use of ``strict=True`` is discouraged and
the *strict* argument is deprecated.
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
@@ -45,16 +43,19 @@ parsing text files formatted in HTML (Hy
.. deprecated-removed:: 3.3 3.5
The *strict* argument and the strict mode have been deprecated.
The parser is now able to accept and parse invalid markup too.
.. versionchanged:: 3.4
*convert_charrefs* keyword argument added.
+ .. versionchanged:: 3.5
+ The default value of the *convert_charrefs* argument is now ``True``.
+
An exception is defined as well:
.. exception:: HTMLParseError
Exception raised by the :class:`HTMLParser` class when it encounters an error
while parsing and *strict* is ``True``. This exception provides three
attributes: :attr:`msg` is a brief message explaining the error,
diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst
--- a/Doc/whatsnew/3.5.rst
+++ b/Doc/whatsnew/3.5.rst
@@ -259,12 +259,15 @@ Changes in the Python API
if it represented midnight in UTC. This behavior was considered obscure and
error-prone and has been removed in Python 3.5. See :issue:`13936` for full
details.
* :meth:`ssl.SSLSocket.send()` now raises either :exc:`ssl.SSLWantReadError`
or :exc:`ssl.SSLWantWriteError` on a non-blocking socket if the operation
would block. Previously, it would return 0. See :issue:`20951`.
+* The default value of the *convert_charrefs* argument of
+ :class:`html.parser.HTMLParser` is now ``True``. See :issue:`21047`.
+
Changes in the C API
--------------------
* The :c:type:`PyMemAllocator` structure has a new ``calloc`` field.
diff --git a/Lib/html/parser.py b/Lib/html/parser.py
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -118,37 +118,31 @@ class HTMLParser(_markupbase.ParserBase)
longer split in chunks), otherwise they are passed by calling
self.handle_entityref() or self.handle_charref() with the string
containing respectively the named or numeric reference as the
argument.
"""
CDATA_CONTENT_ELEMENTS = ("script", "style")
- def __init__(self, strict=_default_sentinel, *,
- convert_charrefs=_default_sentinel):
+ def __init__(self, strict=_default_sentinel, *, convert_charrefs=True):
"""Initialize and reset this instance.
- If convert_charrefs is True (default: False), all character references
+ If convert_charrefs is True (the default), all character references
are automatically converted to the corresponding Unicode characters.
If strict is set to False (the default) the parser will parse invalid
markup, otherwise it will raise an error. Note that the strict mode
and argument are deprecated.
"""
if strict is not _default_sentinel:
warnings.warn("The strict argument and mode are deprecated.",
DeprecationWarning, stacklevel=2)
else:
strict = False # default
self.strict = strict
- if convert_charrefs is _default_sentinel:
- convert_charrefs = False # default
- warnings.warn("The value of convert_charrefs will become True in "
- "3.5. You are encouraged to set the value explicitly.",
- DeprecationWarning, stacklevel=2)
self.convert_charrefs = convert_charrefs
self.reset()
def reset(self):
"""Reset this instance. Loses all unprocessed data."""
self.rawdata = ''
self.lasttag = '???'
self.interesting = interesting_normal
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -379,17 +379,18 @@ text
''
'')
expected = [('comment', "[if IE & !(lte IE 8)]>aren'tcondcomspretty?a{0}z'.format(charref),
expected, collector=collector())
@@ -424,18 +425,16 @@ text
class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
def get_collector(self):
return EventCollector(convert_charrefs=False)
def test_deprecation_warnings(self):
with self.assertWarns(DeprecationWarning):
- EventCollector() # convert_charrefs not passed explicitly
- with self.assertWarns(DeprecationWarning):
EventCollector(strict=True)
with self.assertWarns(DeprecationWarning):
EventCollector(strict=False)
with self.assertRaises(html.parser.HTMLParseError):
with self.assertWarns(DeprecationWarning):
EventCollector().error('test')
def test_tolerant_parsing(self):
@@ -742,17 +741,16 @@ class AttributesStrictTestCase(TestCaseB
[("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])])
def test_entityrefs_in_attributes(self):
self._run_check(
"",
[("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
-
class AttributesTolerantTestCase(AttributesStrictTestCase):
def get_collector(self):
return EventCollector(convert_charrefs=False)
def test_attr_funky_names2(self):
self._run_check(
"",