#15114: the strict mode of HTMLParser and the HTMLParseError exception are deprecated now that the parser is able to parse invalid markup.

This commit is contained in:
Ezio Melotti 2012-06-23 15:27:51 +02:00
parent a4db02c7a3
commit 3861d8b271
4 changed files with 35 additions and 18 deletions

View File

@ -16,13 +16,14 @@
This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. class:: HTMLParser(strict=True)
.. class:: HTMLParser(strict=False)
Create a parser instance. If *strict* is ``True`` (the default), invalid
HTML results in :exc:`~html.parser.HTMLParseError` exceptions [#]_. If
*strict* is ``False``, the parser uses heuristics to make a best guess at
the intention of any invalid HTML it encounters, similar to the way most
browsers do. Using ``strict=False`` is advised.
Create a parser instance. If *strict* is ``False`` (the default), the parser
will accept and parse invalid markup. If *strict* is ``True`` the parser
will raise an :exc:`~html.parser.HTMLParseError` exception instead [#]_ when
it's not able to parse the markup.
The use of ``strict=True`` is discouraged and the *strict* argument is
deprecated.
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
when start tags, end tags, text, comments, and other markup elements are
@ -34,6 +35,10 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. versionchanged:: 3.2 *strict* keyword added
.. deprecated-removed:: 3.3 3.5
The *strict* argument and the strict mode have been deprecated.
The parser is now able to accept and parse invalid markup too.
An exception is defined as well:
@ -46,6 +51,10 @@ An exception is defined as well:
detected, and :attr:`offset` is the number of characters into the line at
which the construct starts.
.. deprecated-removed:: 3.3 3.5
This exception has been deprecated because it's never raised by the parser
(when the default non-strict mode is used).
Example HTML Parser Application
-------------------------------

View File

@ -10,6 +10,7 @@
import _markupbase
import re
import warnings
# Regular expressions used for parsing
@ -113,14 +114,16 @@ class HTMLParser(_markupbase.ParserBase):
CDATA_CONTENT_ELEMENTS = ("script", "style")
def __init__(self, strict=True):
def __init__(self, strict=False):
"""Initialize and reset this instance.
If strict is set to True (the default), errors are raised when invalid
HTML is encountered. If set to False, an attempt is instead made to
continue parsing, making "best guesses" about the intended meaning, in
a fashion similar to what browsers typically do.
If strict is set to False (the default) the parser will parse invalid
markup, otherwise it will raise an error. Note that the strict mode
is deprecated.
"""
if strict:
warnings.warn("The strict mode is deprecated.",
DeprecationWarning, stacklevel=2)
self.strict = strict
self.reset()
@ -271,8 +274,8 @@ class HTMLParser(_markupbase.ParserBase):
# See also parse_declaration in _markupbase
def parse_html_declaration(self, i):
rawdata = self.rawdata
if rawdata[i:i+2] != '<!':
self.error('unexpected call to parse_html_declaration()')
assert rawdata[i:i+2] == '<!', ('unexpected call to '
'parse_html_declaration()')
if rawdata[i:i+4] == '<!--':
# this case is actually already handled in goahead()
return self.parse_comment(i)
@ -292,8 +295,8 @@ class HTMLParser(_markupbase.ParserBase):
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1):
rawdata = self.rawdata
if rawdata[i:i+2] not in ('<!', '</'):
self.error('unexpected call to parse_comment()')
assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
'parse_comment()')
pos = rawdata.find('>', i+2)
if pos == -1:
return -1

View File

@ -102,7 +102,8 @@ class TestCaseBase(unittest.TestCase):
class HTMLParserStrictTestCase(TestCaseBase):
def get_collector(self):
return EventCollector(strict=True)
with support.check_warnings(("", DeprecationWarning), quiet=False):
return EventCollector(strict=True)
def test_processing_instruction_only(self):
self._run_check("<?processing instruction>", [
@ -594,7 +595,8 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
class AttributesStrictTestCase(TestCaseBase):
def get_collector(self):
return EventCollector(strict=True)
with support.check_warnings(("", DeprecationWarning), quiet=False):
return EventCollector(strict=True)
def test_attr_syntax(self):
output = [

View File

@ -43,6 +43,9 @@ Core and Builtins
Library
-------
- Issue #15114: the strict mode of HTMLParser and the HTMLParseError exception
are deprecated now that the parser is able to parse invalid markup.
- Issue #3665: \u and \U escapes are now supported in unicode regular
expressions. Patch by Serhiy Storchaka.
@ -78,7 +81,7 @@ Library
- Issue #9527: datetime.astimezone() method will now supply a class
timezone instance corresponding to the system local timezone when
called with no arguments.
- Issue #14653: email.utils.mktime_tz() no longer relies on system
mktime() when timezone offset is supplied.