#15114: the strict mode and argument of HTMLParser, HTMLParser.error, and the HTMLParserError exception have been removed.

This commit is contained in:
Ezio Melotti 2014-08-02 14:10:30 +03:00
parent ffff1440d1
commit 73a4359eb0
4 changed files with 23 additions and 197 deletions

View File

@ -16,9 +16,9 @@
This module defines a class :class:`HTMLParser` which serves as the basis for
parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
.. class:: HTMLParser(strict=False, *, convert_charrefs=False)
.. class:: HTMLParser(*, convert_charrefs=False)
Create a parser instance.
Create a parser instance able to parse invalid markup.
If *convert_charrefs* is ``True`` (default: ``False``), all character
references (except the ones in ``script``/``style`` elements) are
@ -26,12 +26,6 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
The use of ``convert_charrefs=True`` is encouraged and will become
the default in Python 3.5.
If *strict* is ``False`` (the default), the parser will accept and parse
invalid markup. If *strict* is ``True`` the parser will raise an
:exc:`~html.parser.HTMLParseError` exception instead [#]_ when it's not
able to parse the markup. The use of ``strict=True`` is discouraged and
the *strict* argument is deprecated.
An :class:`.HTMLParser` instance is fed HTML data and calls handler methods
when start tags, end tags, text, comments, and other markup elements are
encountered. The user should subclass :class:`.HTMLParser` and override its
@ -40,32 +34,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML.
This parser does not check that end tags match start tags or call the end-tag
handler for elements which are closed implicitly by closing an outer element.
.. versionchanged:: 3.2
*strict* argument added.
.. deprecated-removed:: 3.3 3.5
The *strict* argument and the strict mode have been deprecated.
The parser is now able to accept and parse invalid markup too.
.. versionchanged:: 3.4
*convert_charrefs* keyword argument added.
An exception is defined as well:
.. exception:: HTMLParseError
Exception raised by the :class:`HTMLParser` class when it encounters an error
while parsing and *strict* is ``True``. This exception provides three
attributes: :attr:`msg` is a brief message explaining the error,
:attr:`lineno` is the number of the line on which the broken construct was
detected, and :attr:`offset` is the number of characters into the line at
which the construct starts.
.. deprecated-removed:: 3.3 3.5
This exception has been deprecated because it's never raised by the parser
(when the default non-strict mode is used).
Example HTML Parser Application
-------------------------------
@ -246,8 +217,7 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`):
The *data* parameter will be the entire contents of the declaration inside
the ``<![...]>`` markup. It is sometimes useful to be overridden by a
derived class. The base class implementation raises an :exc:`HTMLParseError`
when *strict* is ``True``.
derived class. The base class implementation does nothing.
.. _htmlparser-examples:
@ -358,9 +328,3 @@ Parsing invalid HTML (e.g. unquoted attributes) also works::
Data : tag soup
End tag : p
End tag : a
.. rubric:: Footnotes
.. [#] For backward compatibility reasons *strict* mode does not raise
exceptions for all non-compliant HTML. That is, some invalid HTML
is tolerated even in *strict* mode.

View File

@ -29,35 +29,15 @@ starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
# 1) the strict attrfind isn't really strict, but we can't make it
# correctly strict without breaking backward compatibility;
# 2) if you change tagfind/attrfind remember to update locatestarttagend too;
# 3) if you change tagfind/attrfind and/or locatestarttagend the parser will
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
# explode, so don't do it.
tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
(?:[\s/]* # optional whitespace before attribute name
@ -79,24 +59,6 @@ endendtag = re.compile('>')
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
class HTMLParseError(Exception):
"""Exception raised for all parse errors."""
def __init__(self, msg, position=(None, None)):
assert msg
self.msg = msg
self.lineno = position[0]
self.offset = position[1]
def __str__(self):
result = self.msg
if self.lineno is not None:
result = result + ", at line %d" % self.lineno
if self.offset is not None:
result = result + ", column %d" % (self.offset + 1)
return result
_default_sentinel = object()
class HTMLParser(_markupbase.ParserBase):
@ -123,22 +85,12 @@ class HTMLParser(_markupbase.ParserBase):
CDATA_CONTENT_ELEMENTS = ("script", "style")
def __init__(self, strict=_default_sentinel, *,
convert_charrefs=_default_sentinel):
def __init__(self, *, convert_charrefs=_default_sentinel):
"""Initialize and reset this instance.
If convert_charrefs is True (default: False), all character references
are automatically converted to the corresponding Unicode characters.
If strict is set to False (the default) the parser will parse invalid
markup, otherwise it will raise an error. Note that the strict mode
and argument are deprecated.
"""
if strict is not _default_sentinel:
warnings.warn("The strict argument and mode are deprecated.",
DeprecationWarning, stacklevel=2)
else:
strict = False # default
self.strict = strict
if convert_charrefs is _default_sentinel:
convert_charrefs = False # default
warnings.warn("The value of convert_charrefs will become True in "
@ -168,11 +120,6 @@ class HTMLParser(_markupbase.ParserBase):
"""Handle any buffered data."""
self.goahead(1)
def error(self, message):
warnings.warn("The 'error' method is deprecated.",
DeprecationWarning, stacklevel=2)
raise HTMLParseError(message, self.getpos())
__starttag_text = None
def get_starttag_text(self):
@ -227,10 +174,7 @@ class HTMLParser(_markupbase.ParserBase):
elif startswith("<?", i):
k = self.parse_pi(i)
elif startswith("<!", i):
if self.strict:
k = self.parse_declaration(i)
else:
k = self.parse_html_declaration(i)
k = self.parse_html_declaration(i)
elif (i + 1) < n:
self.handle_data("<")
k = i + 1
@ -239,8 +183,6 @@ class HTMLParser(_markupbase.ParserBase):
if k < 0:
if not end:
break
if self.strict:
self.error("EOF in middle of construct")
k = rawdata.find('>', i + 1)
if k < 0:
k = rawdata.find('<', i + 1)
@ -282,13 +224,10 @@ class HTMLParser(_markupbase.ParserBase):
if match:
# match.group() will contain at least 2 chars
if end and match.group() == rawdata[i:]:
if self.strict:
self.error("EOF in middle of entity or char ref")
else:
k = match.end()
if k <= i:
k = n
i = self.updatepos(i, i + 1)
k = match.end()
if k <= i:
k = n
i = self.updatepos(i, i + 1)
# incomplete
break
elif (i + 1) < n:
@ -367,18 +306,12 @@ class HTMLParser(_markupbase.ParserBase):
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
if self.strict:
match = tagfind.match(rawdata, i+1)
else:
match = tagfind_tolerant.match(rawdata, i+1)
match = tagfind_tolerant.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = match.group(1).lower()
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
@ -401,9 +334,6 @@ class HTMLParser(_markupbase.ParserBase):
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
@ -419,10 +349,7 @@ class HTMLParser(_markupbase.ParserBase):
# or -1 if incomplete.
def check_for_whole_start_tag(self, i):
rawdata = self.rawdata
if self.strict:
m = locatestarttagend.match(rawdata, i)
else:
m = locatestarttagend_tolerant.match(rawdata, i)
m = locatestarttagend_tolerant.match(rawdata, i)
if m:
j = m.end()
next = rawdata[j:j+1]
@ -435,9 +362,6 @@ class HTMLParser(_markupbase.ParserBase):
# buffer boundary
return -1
# else bogus input
if self.strict:
self.updatepos(i, j + 1)
self.error("malformed empty start tag")
if j > i:
return j
else:
@ -450,9 +374,6 @@ class HTMLParser(_markupbase.ParserBase):
# end of input in or before attribute value, or we have the
# '/' from a '/>' ending
return -1
if self.strict:
self.updatepos(i, j)
self.error("malformed start tag")
if j > i:
return j
else:
@ -472,8 +393,6 @@ class HTMLParser(_markupbase.ParserBase):
if self.cdata_elem is not None:
self.handle_data(rawdata[i:gtpos])
return gtpos
if self.strict:
self.error("bad end tag: %r" % (rawdata[i:gtpos],))
# find the name: w3.org/TR/html5/tokenization.html#tag-name-state
namematch = tagfind_tolerant.match(rawdata, i+2)
if not namematch:
@ -539,8 +458,7 @@ class HTMLParser(_markupbase.ParserBase):
pass
def unknown_decl(self, data):
if self.strict:
self.error("unknown declaration: %r" % (data,))
pass
# Internal -- helper to remove special character quoting
def unescape(self, s):

View File

@ -85,7 +85,7 @@ class EventCollectorCharrefs(EventCollector):
class TestCaseBase(unittest.TestCase):
def get_collector(self):
raise NotImplementedError
return EventCollector(convert_charrefs=False)
def _run_check(self, source, expected_events, collector=None):
if collector is None:
@ -105,21 +105,8 @@ class TestCaseBase(unittest.TestCase):
self._run_check(source, events,
EventCollectorExtra(convert_charrefs=False))
def _parse_error(self, source):
def parse(source=source):
parser = self.get_collector()
parser.feed(source)
parser.close()
with self.assertRaises(html.parser.HTMLParseError):
with self.assertWarns(DeprecationWarning):
parse()
class HTMLParserStrictTestCase(TestCaseBase):
def get_collector(self):
with support.check_warnings(("", DeprecationWarning), quite=False):
return EventCollector(strict=True, convert_charrefs=False)
class HTMLParserTestCase(TestCaseBase):
def test_processing_instruction_only(self):
self._run_check("<?processing instruction>", [
@ -201,9 +188,6 @@ text
("data", "this < text > contains < bare>pointy< brackets"),
])
def test_illegal_declarations(self):
self._parse_error('<!spacer type="block" height="25">')
def test_starttag_end_boundary(self):
self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])
@ -238,25 +222,6 @@ text
self._run_check(["<!--abc--", ">"], output)
self._run_check(["<!--abc-->", ""], output)
def test_starttag_junk_chars(self):
self._parse_error("</>")
self._parse_error("</$>")
self._parse_error("</")
self._parse_error("</a")
self._parse_error("<a<a>")
self._parse_error("</a<a>")
self._parse_error("<!")
self._parse_error("<a")
self._parse_error("<a foo='bar'")
self._parse_error("<a foo='bar")
self._parse_error("<a foo='>'")
self._parse_error("<a foo='>")
self._parse_error("<a$>")
self._parse_error("<a$b>")
self._parse_error("<a$b/>")
self._parse_error("<a$b >")
self._parse_error("<a$b />")
def test_valid_doctypes(self):
# from http://www.w3.org/QA/2002/04/valid-dtd-list.html
dtds = ['HTML', # HTML5 doctype
@ -281,9 +246,6 @@ text
self._run_check("<!DOCTYPE %s>" % dtd,
[('decl', 'DOCTYPE ' + dtd)])
def test_declaration_junk_chars(self):
self._parse_error("<!DOCTYPE foo $ >")
def test_startendtag(self):
self._run_check("<p/>", [
("startendtag", "p", []),
@ -421,23 +383,12 @@ text
self._run_check('no charrefs here', [('data', 'no charrefs here')],
collector=collector())
class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
def get_collector(self):
return EventCollector(convert_charrefs=False)
def test_deprecation_warnings(self):
with self.assertWarns(DeprecationWarning):
EventCollector() # convert_charrefs not passed explicitly
with self.assertWarns(DeprecationWarning):
EventCollector(strict=True)
with self.assertWarns(DeprecationWarning):
EventCollector(strict=False)
with self.assertRaises(html.parser.HTMLParseError):
with self.assertWarns(DeprecationWarning):
EventCollector().error('test')
# the remaining tests were for the "tolerant" parser (which is now
# the default), and check various kind of broken markup
def test_tolerant_parsing(self):
self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
'<img src="URL><//img></html</html>', [
@ -686,11 +637,7 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
self._run_check(html, expected)
class AttributesStrictTestCase(TestCaseBase):
def get_collector(self):
with support.check_warnings(("", DeprecationWarning), quite=False):
return EventCollector(strict=True, convert_charrefs=False)
class AttributesTestCase(TestCaseBase):
def test_attr_syntax(self):
output = [
@ -747,12 +694,6 @@ class AttributesStrictTestCase(TestCaseBase):
[("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
class AttributesTolerantTestCase(AttributesStrictTestCase):
def get_collector(self):
return EventCollector(convert_charrefs=False)
def test_attr_funky_names2(self):
self._run_check(
"<a $><b $=%><c \=/>",

View File

@ -121,6 +121,9 @@ Core and Builtins
Library
-------
- Issue #15114: the strict mode and argument of HTMLParser, HTMLParser.error,
and the HTMLParserError exception have been removed.
- Issue #22085: Dropped support of Tk 8.3 in Tkinter.
- Issue #21580: Now Tkinter correctly handles bytes arguments passed to Tk.