diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -16,14 +16,9 @@ This module defines a class :class:`HTMLParser` which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(strict=False) +.. class:: HTMLParser() - Create a parser instance. If *strict* is ``False`` (the default), the parser - will accept and parse invalid markup. If *strict* is ``True`` the parser - will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when - it's not able to parse the markup. - The use of ``strict=True`` is discouraged and the *strict* argument is - deprecated. + Create a parser instance able to parse invalid markup. An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are @@ -33,30 +28,6 @@ This parser does not check that end tags match start tags or call the end-tag handler for elements which are closed implicitly by closing an outer element. - .. versionchanged:: 3.2 - *strict* keyword added. - - .. deprecated-removed:: 3.3 3.5 - The *strict* argument and the strict mode have been deprecated. - The parser is now able to accept and parse invalid markup too. - -An exception is defined as well: - - -.. exception:: HTMLParseError - - Exception raised by the :class:`HTMLParser` class when it encounters an error - while parsing and *strict* is ``True``. This exception provides three - attributes: :attr:`msg` is a brief message explaining the error, - :attr:`lineno` is the number of the line on which the broken construct was - detected, and :attr:`offset` is the number of characters into the line at - which the construct starts. - - .. deprecated-removed:: 3.3 3.5 - This exception has been deprecated because it's never raised by the parser - (when the default non-strict mode is used). - - Example HTML Parser Application ------------------------------- @@ -234,8 +205,7 @@ The *data* parameter will be the entire contents of the declaration inside the ```` markup. It is sometimes useful to be overridden by a - derived class. The base class implementation raises an :exc:`HTMLParseError` - when *strict* is ``True``. + derived class. The base class implementation does nothing. .. _htmlparser-examples: @@ -345,9 +315,3 @@ Data : tag soup End tag : p End tag : a - -.. rubric:: Footnotes - -.. [#] For backward compatibility reasons *strict* mode does not raise - exceptions for all non-compliant HTML. That is, some invalid HTML - is tolerated even in *strict* mode. diff --git a/Lib/html/parser.py b/Lib/html/parser.py --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -25,36 +25,12 @@ starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') commentclose = re.compile(r'--\s*>') -# Note: -# 1) the strict attrfind isn't really strict, but we can't make it -# correctly strict without breaking backward compatibility; -# 2) if you change tagfind/attrfind remember to update locatestarttagend too; -# 3) if you change tagfind/attrfind and/or locatestarttagend the parser will -# explode, so don't do it. -tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*') # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') -attrfind = re.compile( - r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') attrfind_tolerant = re.compile( r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') -locatestarttagend = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value - ) - )? - ) - )* - \s* # trailing whitespace -""", re.VERBOSE) locatestarttagend_tolerant = re.compile(r""" <[a-zA-Z][^\t\n\r\f />\x00]* # tag name (?:[\s/]* # optional whitespace before attribute name @@ -76,25 +52,6 @@ endtagfind = re.compile('') -class HTMLParseError(Exception): - """Exception raised for all parse errors.""" - - def __init__(self, msg, position=(None, None)): - assert msg - self.msg = msg - self.lineno = position[0] - self.offset = position[1] - - def __str__(self): - result = self.msg - if self.lineno is not None: - result = result + ", at line %d" % self.lineno - if self.offset is not None: - result = result + ", column %d" % (self.offset + 1) - return result - - -_strict_sentinel = object() class HTMLParser(_markupbase.ParserBase): """Find tags and other markup and call handler functions. @@ -118,19 +75,8 @@ CDATA_CONTENT_ELEMENTS = ("script", "style") - def __init__(self, strict=_strict_sentinel): - """Initialize and reset this instance. - - If strict is set to False (the default) the parser will parse invalid - markup, otherwise it will raise an error. Note that the strict mode - and argument are deprecated. - """ - if strict is not _strict_sentinel: - warnings.warn("The strict argument and mode are deprecated.", - DeprecationWarning, stacklevel=2) - else: - strict = False # default - self.strict = strict + def __init__(self): + """Initialize and reset this instance.""" self.reset() def reset(self): @@ -154,11 +100,6 @@ """Handle any buffered data.""" self.goahead(1) - def error(self, message): - warnings.warn("The 'error' method is deprecated.", - DeprecationWarning, stacklevel=2) - raise HTMLParseError(message, self.getpos()) - __starttag_text = None def get_starttag_text(self): @@ -202,10 +143,7 @@ elif startswith("', i + 1) if k < 0: k = rawdata.find('<', i + 1) @@ -254,13 +190,10 @@ if match: # match.group() will contain at least 2 chars if end and match.group() == rawdata[i:]: - if self.strict: - self.error("EOF in middle of entity or char ref") - else: - k = match.end() - if k <= i: - k = n - i = self.updatepos(i, i + 1) + k = match.end() + if k <= i: + k = n + i = self.updatepos(i, i + 1) # incomplete break elif (i + 1) < n: @@ -336,18 +269,12 @@ # Now parse the data between i+1 and j into a tag and attrs attrs = [] - if self.strict: - match = tagfind.match(rawdata, i+1) - else: - match = tagfind_tolerant.match(rawdata, i+1) + match = tagfind_tolerant.match(rawdata, i+1) assert match, 'unexpected call to parse_starttag()' k = match.end() self.lasttag = tag = match.group(1).lower() while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) + m = attrfind_tolerant.match(rawdata, k) if not m: break attrname, rest, attrvalue = m.group(1, 2, 3) @@ -370,9 +297,6 @@ - self.__starttag_text.rfind("\n") else: offset = offset + len(self.__starttag_text) - if self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) self.handle_data(rawdata[i:endpos]) return endpos if end.endswith('/>'): @@ -388,10 +312,7 @@ # or -1 if incomplete. def check_for_whole_start_tag(self, i): rawdata = self.rawdata - if self.strict: - m = locatestarttagend.match(rawdata, i) - else: - m = locatestarttagend_tolerant.match(rawdata, i) + m = locatestarttagend_tolerant.match(rawdata, i) if m: j = m.end() next = rawdata[j:j+1] @@ -404,9 +325,6 @@ # buffer boundary return -1 # else bogus input - if self.strict: - self.updatepos(i, j + 1) - self.error("malformed empty start tag") if j > i: return j else: @@ -419,9 +337,6 @@ # end of input in or before attribute value, or we have the # '/' from a '/>' ending return -1 - if self.strict: - self.updatepos(i, j) - self.error("malformed start tag") if j > i: return j else: @@ -441,8 +356,6 @@ if self.cdata_elem is not None: self.handle_data(rawdata[i:gtpos]) return gtpos - if self.strict: - self.error("bad end tag: %r" % (rawdata[i:gtpos],)) # find the name: w3.org/TR/html5/tokenization.html#tag-name-state namematch = tagfind_tolerant.match(rawdata, i+2) if not namematch: @@ -508,8 +421,7 @@ pass def unknown_decl(self, data): - if self.strict: - self.error("unknown declaration: %r" % (data,)) + pass # Internal -- helper to remove special character quoting def unescape(self, s): diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -91,21 +91,11 @@ def _run_check_extra(self, source, events): self._run_check(source, events, EventCollectorExtra()) - def _parse_error(self, source): - def parse(source=source): - parser = self.get_collector() - parser.feed(source) - parser.close() - with self.assertRaises(html.parser.HTMLParseError): - with self.assertWarns(DeprecationWarning): - parse() - -class HTMLParserStrictTestCase(TestCaseBase): +class HTMLParserTestCase(TestCaseBase): def get_collector(self): - with support.check_warnings(("", DeprecationWarning), quite=False): - return EventCollector(strict=True) + return EventCollector() def test_processing_instruction_only(self): self._run_check("", [ @@ -181,9 +171,6 @@ ("data", "this < text > contains < bare>pointy< brackets"), ]) - def test_illegal_declarations(self): - self._parse_error('') - def test_starttag_end_boundary(self): self._run_check("""""", [("starttag", "a", [("b", "<")])]) self._run_check("""""", [("starttag", "a", [("b", ">")])]) @@ -218,25 +205,6 @@ self._run_check(["", ""], output) - def test_starttag_junk_chars(self): - self._parse_error("") - self._parse_error("") - self._parse_error("") - self._parse_error("") - self._parse_error("'") - self._parse_error("" % dtd, [('decl', 'DOCTYPE ' + dtd)]) - def test_declaration_junk_chars(self): - self._parse_error("") - def test_startendtag(self): self._run_check("

", [ ("startendtag", "p", []), @@ -363,20 +328,8 @@ ('comment', '[if lte IE 7]>pretty?te>>xt&a<\n' @@ -631,11 +584,10 @@ self._run_check(html, expected) -class AttributesStrictTestCase(TestCaseBase): +class AttributesTestCase(TestCaseBase): def get_collector(self): - with support.check_warnings(("", DeprecationWarning), quite=False): - return EventCollector(strict=True) + return EventCollector() def test_attr_syntax(self): output = [ @@ -691,13 +643,6 @@ "", [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])]) - - -class AttributesTolerantTestCase(AttributesStrictTestCase): - - def get_collector(self): - return EventCollector() - def test_attr_funky_names2(self): self._run_check( "",