From 73a4359eb0eb624c588c5d52083ea4944f9787ea Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Sat, 2 Aug 2014 14:10:30 +0300 Subject: [PATCH] #15114: the strict mode and argument of HTMLParser, HTMLParser.error, and the HTMLParserError exception have been removed. --- Doc/library/html.parser.rst | 42 +------------- Lib/html/parser.py | 106 ++++-------------------------------- Lib/test/test_htmlparser.py | 69 ++--------------------- Misc/NEWS | 3 + 4 files changed, 23 insertions(+), 197 deletions(-) diff --git a/Doc/library/html.parser.rst b/Doc/library/html.parser.rst index 44b7d6ea6d2..67ae139eb06 100644 --- a/Doc/library/html.parser.rst +++ b/Doc/library/html.parser.rst @@ -16,9 +16,9 @@ This module defines a class :class:`HTMLParser` which serves as the basis for parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. -.. class:: HTMLParser(strict=False, *, convert_charrefs=False) +.. class:: HTMLParser(*, convert_charrefs=False) - Create a parser instance. + Create a parser instance able to parse invalid markup. If *convert_charrefs* is ``True`` (default: ``False``), all character references (except the ones in ``script``/``style`` elements) are @@ -26,12 +26,6 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. The use of ``convert_charrefs=True`` is encouraged and will become the default in Python 3.5. - If *strict* is ``False`` (the default), the parser will accept and parse - invalid markup. If *strict* is ``True`` the parser will raise an - :exc:`~html.parser.HTMLParseError` exception instead [#]_ when it's not - able to parse the markup. The use of ``strict=True`` is discouraged and - the *strict* argument is deprecated. - An :class:`.HTMLParser` instance is fed HTML data and calls handler methods when start tags, end tags, text, comments, and other markup elements are encountered. 
The user should subclass :class:`.HTMLParser` and override its @@ -40,32 +34,9 @@ parsing text files formatted in HTML (HyperText Mark-up Language) and XHTML. This parser does not check that end tags match start tags or call the end-tag handler for elements which are closed implicitly by closing an outer element. - .. versionchanged:: 3.2 - *strict* argument added. - - .. deprecated-removed:: 3.3 3.5 - The *strict* argument and the strict mode have been deprecated. - The parser is now able to accept and parse invalid markup too. - .. versionchanged:: 3.4 *convert_charrefs* keyword argument added. -An exception is defined as well: - - -.. exception:: HTMLParseError - - Exception raised by the :class:`HTMLParser` class when it encounters an error - while parsing and *strict* is ``True``. This exception provides three - attributes: :attr:`msg` is a brief message explaining the error, - :attr:`lineno` is the number of the line on which the broken construct was - detected, and :attr:`offset` is the number of characters into the line at - which the construct starts. - - .. deprecated-removed:: 3.3 3.5 - This exception has been deprecated because it's never raised by the parser - (when the default non-strict mode is used). - Example HTML Parser Application ------------------------------- @@ -246,8 +217,7 @@ implementations do nothing (except for :meth:`~HTMLParser.handle_startendtag`): The *data* parameter will be the entire contents of the declaration inside the ```` markup. It is sometimes useful to be overridden by a - derived class. The base class implementation raises an :exc:`HTMLParseError` - when *strict* is ``True``. + derived class. The base class implementation does nothing. .. _htmlparser-examples: @@ -358,9 +328,3 @@ Parsing invalid HTML (e.g. unquoted attributes) also works:: Data : tag soup End tag : p End tag : a - -.. rubric:: Footnotes - -.. [#] For backward compatibility reasons *strict* mode does not raise - exceptions for all non-compliant HTML. 
That is, some invalid HTML - is tolerated even in *strict* mode. diff --git a/Lib/html/parser.py b/Lib/html/parser.py index a650d5eeded..5a4f9e17551 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -29,35 +29,15 @@ starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') commentclose = re.compile(r'--\s*>') # Note: -# 1) the strict attrfind isn't really strict, but we can't make it -# correctly strict without breaking backward compatibility; -# 2) if you change tagfind/attrfind remember to update locatestarttagend too; -# 3) if you change tagfind/attrfind and/or locatestarttagend the parser will +# 1) if you change tagfind/attrfind remember to update locatestarttagend too; +# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will # explode, so don't do it. -tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*') # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') -attrfind = re.compile( - r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') attrfind_tolerant = re.compile( r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') -locatestarttagend = re.compile(r""" - <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator - (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value - ) - )? 
- ) - )* - \s* # trailing whitespace -""", re.VERBOSE) locatestarttagend_tolerant = re.compile(r""" <[a-zA-Z][^\t\n\r\f />\x00]* # tag name (?:[\s/]* # optional whitespace before attribute name @@ -79,24 +59,6 @@ endendtag = re.compile('>') endtagfind = re.compile('') -class HTMLParseError(Exception): - """Exception raised for all parse errors.""" - - def __init__(self, msg, position=(None, None)): - assert msg - self.msg = msg - self.lineno = position[0] - self.offset = position[1] - - def __str__(self): - result = self.msg - if self.lineno is not None: - result = result + ", at line %d" % self.lineno - if self.offset is not None: - result = result + ", column %d" % (self.offset + 1) - return result - - _default_sentinel = object() class HTMLParser(_markupbase.ParserBase): @@ -123,22 +85,12 @@ class HTMLParser(_markupbase.ParserBase): CDATA_CONTENT_ELEMENTS = ("script", "style") - def __init__(self, strict=_default_sentinel, *, - convert_charrefs=_default_sentinel): + def __init__(self, *, convert_charrefs=_default_sentinel): """Initialize and reset this instance. If convert_charrefs is True (default: False), all character references are automatically converted to the corresponding Unicode characters. - If strict is set to False (the default) the parser will parse invalid - markup, otherwise it will raise an error. Note that the strict mode - and argument are deprecated. 
""" - if strict is not _default_sentinel: - warnings.warn("The strict argument and mode are deprecated.", - DeprecationWarning, stacklevel=2) - else: - strict = False # default - self.strict = strict if convert_charrefs is _default_sentinel: convert_charrefs = False # default warnings.warn("The value of convert_charrefs will become True in " @@ -168,11 +120,6 @@ class HTMLParser(_markupbase.ParserBase): """Handle any buffered data.""" self.goahead(1) - def error(self, message): - warnings.warn("The 'error' method is deprecated.", - DeprecationWarning, stacklevel=2) - raise HTMLParseError(message, self.getpos()) - __starttag_text = None def get_starttag_text(self): @@ -227,10 +174,7 @@ class HTMLParser(_markupbase.ParserBase): elif startswith("', i + 1) if k < 0: k = rawdata.find('<', i + 1) @@ -282,13 +224,10 @@ class HTMLParser(_markupbase.ParserBase): if match: # match.group() will contain at least 2 chars if end and match.group() == rawdata[i:]: - if self.strict: - self.error("EOF in middle of entity or char ref") - else: - k = match.end() - if k <= i: - k = n - i = self.updatepos(i, i + 1) + k = match.end() + if k <= i: + k = n + i = self.updatepos(i, i + 1) # incomplete break elif (i + 1) < n: @@ -367,18 +306,12 @@ class HTMLParser(_markupbase.ParserBase): # Now parse the data between i+1 and j into a tag and attrs attrs = [] - if self.strict: - match = tagfind.match(rawdata, i+1) - else: - match = tagfind_tolerant.match(rawdata, i+1) + match = tagfind_tolerant.match(rawdata, i+1) assert match, 'unexpected call to parse_starttag()' k = match.end() self.lasttag = tag = match.group(1).lower() while k < endpos: - if self.strict: - m = attrfind.match(rawdata, k) - else: - m = attrfind_tolerant.match(rawdata, k) + m = attrfind_tolerant.match(rawdata, k) if not m: break attrname, rest, attrvalue = m.group(1, 2, 3) @@ -401,9 +334,6 @@ class HTMLParser(_markupbase.ParserBase): - self.__starttag_text.rfind("\n") else: offset = offset + len(self.__starttag_text) - if 
self.strict: - self.error("junk characters in start tag: %r" - % (rawdata[k:endpos][:20],)) self.handle_data(rawdata[i:endpos]) return endpos if end.endswith('/>'): @@ -419,10 +349,7 @@ class HTMLParser(_markupbase.ParserBase): # or -1 if incomplete. def check_for_whole_start_tag(self, i): rawdata = self.rawdata - if self.strict: - m = locatestarttagend.match(rawdata, i) - else: - m = locatestarttagend_tolerant.match(rawdata, i) + m = locatestarttagend_tolerant.match(rawdata, i) if m: j = m.end() next = rawdata[j:j+1] @@ -435,9 +362,6 @@ class HTMLParser(_markupbase.ParserBase): # buffer boundary return -1 # else bogus input - if self.strict: - self.updatepos(i, j + 1) - self.error("malformed empty start tag") if j > i: return j else: @@ -450,9 +374,6 @@ class HTMLParser(_markupbase.ParserBase): # end of input in or before attribute value, or we have the # '/' from a '/>' ending return -1 - if self.strict: - self.updatepos(i, j) - self.error("malformed start tag") if j > i: return j else: @@ -472,8 +393,6 @@ class HTMLParser(_markupbase.ParserBase): if self.cdata_elem is not None: self.handle_data(rawdata[i:gtpos]) return gtpos - if self.strict: - self.error("bad end tag: %r" % (rawdata[i:gtpos],)) # find the name: w3.org/TR/html5/tokenization.html#tag-name-state namematch = tagfind_tolerant.match(rawdata, i+2) if not namematch: @@ -539,8 +458,7 @@ class HTMLParser(_markupbase.ParserBase): pass def unknown_decl(self, data): - if self.strict: - self.error("unknown declaration: %r" % (data,)) + pass # Internal -- helper to remove special character quoting def unescape(self, s): diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 2d771a2a974..1aa1508039e 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -85,7 +85,7 @@ class EventCollectorCharrefs(EventCollector): class TestCaseBase(unittest.TestCase): def get_collector(self): - raise NotImplementedError + return EventCollector(convert_charrefs=False) def 
_run_check(self, source, expected_events, collector=None): if collector is None: @@ -105,21 +105,8 @@ class TestCaseBase(unittest.TestCase): self._run_check(source, events, EventCollectorExtra(convert_charrefs=False)) - def _parse_error(self, source): - def parse(source=source): - parser = self.get_collector() - parser.feed(source) - parser.close() - with self.assertRaises(html.parser.HTMLParseError): - with self.assertWarns(DeprecationWarning): - parse() - -class HTMLParserStrictTestCase(TestCaseBase): - - def get_collector(self): - with support.check_warnings(("", DeprecationWarning), quite=False): - return EventCollector(strict=True, convert_charrefs=False) +class HTMLParserTestCase(TestCaseBase): def test_processing_instruction_only(self): self._run_check("", [ @@ -201,9 +188,6 @@ text ("data", "this < text > contains < bare>pointy< brackets"), ]) - def test_illegal_declarations(self): - self._parse_error('') - def test_starttag_end_boundary(self): self._run_check("""""", [("starttag", "a", [("b", "<")])]) self._run_check("""""", [("starttag", "a", [("b", ">")])]) @@ -238,25 +222,6 @@ text self._run_check(["", ""], output) - def test_starttag_junk_chars(self): - self._parse_error("") - self._parse_error("") - self._parse_error("") - self._parse_error("") - self._parse_error("'") - self._parse_error("" % dtd, [('decl', 'DOCTYPE ' + dtd)]) - def test_declaration_junk_chars(self): - self._parse_error("") - def test_startendtag(self): self._run_check("

", [ ("startendtag", "p", []), @@ -421,23 +383,12 @@ text self._run_check('no charrefs here', [('data', 'no charrefs here')], collector=collector()) - -class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): - - def get_collector(self): - return EventCollector(convert_charrefs=False) - def test_deprecation_warnings(self): with self.assertWarns(DeprecationWarning): EventCollector() # convert_charrefs not passed explicitly - with self.assertWarns(DeprecationWarning): - EventCollector(strict=True) - with self.assertWarns(DeprecationWarning): - EventCollector(strict=False) - with self.assertRaises(html.parser.HTMLParseError): - with self.assertWarns(DeprecationWarning): - EventCollector().error('test') + # the remaining tests were for the "tolerant" parser (which is now + # the default), and check various kinds of broken markup def test_tolerant_parsing(self): self._run_check('te>>xt&a<\n' '", diff --git a/Misc/NEWS b/Misc/NEWS index b7d3fcdb57d..5174f0552a3 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -121,6 +121,9 @@ Core and Builtins Library ------- +- Issue #15114: the strict mode and argument of HTMLParser, HTMLParser.error, + and the HTMLParseError exception have been removed. + - Issue #22085: Dropped support of Tk 8.3 in Tkinter. - Issue #21580: Now Tkinter correctly handles bytes arguments passed to Tk.