#15114: the strict mode of HTMLParser and the HTMLParseError exception are deprecated now that the parser is able to parse invalid markup.
This commit is contained in:
parent
a4db02c7a3
commit
3861d8b271
|
@ -16,13 +16,14 @@
|
|||
This module defines a class :class:`HTMLParser` which serves as the basis for
|
||||
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
|
||||
|
||||
.. class:: HTMLParser(strict=True)
|
||||
.. class:: HTMLParser(strict=False)
|
||||
|
||||
Create a parser instance. If *strict* is ``True`` (the default), invalid
|
||||
HTML results in :exc:`~html.parser.HTMLParseError` exceptions [#]_. If
|
||||
*strict* is ``False``, the parser uses heuristics to make a best guess at
|
||||
the intention of any invalid HTML it encounters, similar to the way most
|
||||
browsers do. Using ``strict=False`` is advised.
|
||||
Create a parser instance. If *strict* is ``False`` (the default), the parser
|
||||
will accept and parse invalid markup. If *strict* is ``True`` the parser
|
||||
will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when
|
||||
it's not able to parse the markup.
|
||||
The use of ``strict=True`` is discouraged and the *strict* argument is
|
||||
deprecated.
|
||||
|
||||
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
|
||||
when start tags, end tags, text, comments, and other markup elements are
|
||||
|
@ -34,6 +35,10 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
|
|||
|
||||
.. versionchanged:: 3.2 *strict* keyword added
|
||||
|
||||
.. deprecated-removed:: 3.3 3.5
|
||||
The *strict* argument and the strict mode have been deprecated.
|
||||
The parser is now able to accept and parse invalid markup too.
|
||||
|
||||
An exception is defined as well:
|
||||
|
||||
|
||||
|
@ -46,6 +51,10 @@ An exception is defined as well:
|
|||
detected, and :attr:`offset` is the number of characters into the line at
|
||||
which the construct starts.
|
||||
|
||||
.. deprecated-removed:: 3.3 3.5
|
||||
This exception has been deprecated because it's never raised by the parser
|
||||
(when the default non-strict mode is used).
|
||||
|
||||
|
||||
Example HTML Parser Application
|
||||
-------------------------------
|
||||
|
|
|
@ -10,6 +10,7 @@
|
|||
|
||||
import _markupbase
|
||||
import re
|
||||
import warnings
|
||||
|
||||
# Regular expressions used for parsing
|
||||
|
||||
|
@ -113,14 +114,16 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
|
||||
CDATA_CONTENT_ELEMENTS = ("script", "style")
|
||||
|
||||
def __init__(self, strict=True):
|
||||
def __init__(self, strict=False):
|
||||
"""Initialize and reset this instance.
|
||||
|
||||
If strict is set to True (the default), errors are raised when invalid
|
||||
HTML is encountered. If set to False, an attempt is instead made to
|
||||
continue parsing, making "best guesses" about the intended meaning, in
|
||||
a fashion similar to what browsers typically do.
|
||||
If strict is set to False (the default) the parser will parse invalid
|
||||
markup, otherwise it will raise an error. Note that the strict mode
|
||||
is deprecated.
|
||||
"""
|
||||
if strict:
|
||||
warnings.warn("The strict mode is deprecated.",
|
||||
DeprecationWarning, stacklevel=2)
|
||||
self.strict = strict
|
||||
self.reset()
|
||||
|
||||
|
@ -271,8 +274,8 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
# See also parse_declaration in _markupbase
|
||||
def parse_html_declaration(self, i):
|
||||
rawdata = self.rawdata
|
||||
if rawdata[i:i+2] != '<!':
|
||||
self.error('unexpected call to parse_html_declaration()')
|
||||
assert rawdata[i:i+2] == '<!', ('unexpected call to '
|
||||
'parse_html_declaration()')
|
||||
if rawdata[i:i+4] == '<!--':
|
||||
# this case is actually already handled in goahead()
|
||||
return self.parse_comment(i)
|
||||
|
@ -292,8 +295,8 @@ class HTMLParser(_markupbase.ParserBase):
|
|||
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
|
||||
def parse_bogus_comment(self, i, report=1):
|
||||
rawdata = self.rawdata
|
||||
if rawdata[i:i+2] not in ('<!', '</'):
|
||||
self.error('unexpected call to parse_comment()')
|
||||
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
|
||||
'parse_comment()')
|
||||
pos = rawdata.find('>', i+2)
|
||||
if pos == -1:
|
||||
return -1
|
||||
|
|
|
@ -102,7 +102,8 @@ class TestCaseBase(unittest.TestCase):
|
|||
class HTMLParserStrictTestCase(TestCaseBase):
|
||||
|
||||
def get_collector(self):
|
||||
return EventCollector(strict=True)
|
||||
with support.check_warnings(("", DeprecationWarning), quiet=False):
|
||||
return EventCollector(strict=True)
|
||||
|
||||
def test_processing_instruction_only(self):
|
||||
self._run_check("<?processing instruction>", [
|
||||
|
@ -594,7 +595,8 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
|
|||
class AttributesStrictTestCase(TestCaseBase):
|
||||
|
||||
def get_collector(self):
|
||||
return EventCollector(strict=True)
|
||||
with support.check_warnings(("", DeprecationWarning), quiet=False):
|
||||
return EventCollector(strict=True)
|
||||
|
||||
def test_attr_syntax(self):
|
||||
output = [
|
||||
|
|
|
@ -43,6 +43,9 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #15114: the strict mode of HTMLParser and the HTMLParseError exception
|
||||
are deprecated now that the parser is able to parse invalid markup.
|
||||
|
||||
- Issue #3665: \u and \U escapes are now supported in unicode regular
|
||||
expressions. Patch by Serhiy Storchaka.
|
||||
|
||||
|
@ -78,7 +81,7 @@ Library
|
|||
- Issue #9527: datetime.astimezone() method will now supply a class
|
||||
timezone instance corresponding to the system local timezone when
|
||||
called with no arguments.
|
||||
|
||||
|
||||
- Issue #14653: email.utils.mktime_tz() no longer relies on system
|
||||
mktime() when timezone offset is supplied.
|
||||
|
||||
|
|
Loading…
Reference in New Issue