From 0f1571ce7fb7da0e2ad75f941b29f2d19717e012 Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Mon, 14 Nov 2011 18:04:05 +0200 Subject: [PATCH] #1745761, #755670, #13357, #12629, #1200313: improve attribute handling in HTMLParser. --- Lib/HTMLParser.py | 20 +++++----- Lib/test/test_htmlparser.py | 76 +++++++++++++++++++++++++++++++++++-- Misc/NEWS | 3 ++ 3 files changed, 87 insertions(+), 12 deletions(-) diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 94ebc7f8dc4..cd353f8ca03 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -24,22 +24,23 @@ starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') commentclose = re.compile(r'--\s*>') tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') + attrfind = re.compile( - r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' - r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') + r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name (?:\s+ # whitespace before attribute name - (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name - (?:\s*=\s* # value indicator + (?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name + (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value - |\"[^\"]*\" # LIT-enclosed value - |[^'\">\s]+ # bare value + |"[^"]*" # LIT-enclosed value + |(?!['"])[^>\s]* # bare value ) - )? - ) - )* + )?\s* + )* + )? \s* # trailing whitespace """, re.VERBOSE) endendtag = re.compile('>') @@ -254,6 +255,7 @@ class HTMLParser(markupbase.ParserBase): elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] + if attrvalue: attrvalue = self.unescape(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 192a3419354..b84e7dc935f 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -226,13 +226,11 @@ DOCTYPE html [ self._parse_error("") self._parse_error("") self._parse_error("") self._parse_error("'") self._parse_error("", [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])]) - + self._run_check( + "", + [("starttag", "a", [("$", None)]), + ("starttag", "b", [("$", "%")]), + ("starttag", "c", [("\\", "/")])]) def test_entityrefs_in_attributes(self): self._run_check( "", [("starttag", "html", [("foo", u"\u20AC&aa&unsupported;")])]) + def test_entities_in_attribute_value(self): + # see #1200313 + for entity in ['&', '&', '&', '&']: + self._run_check('' % entity, + [("starttag", "a", [("href", "&")])]) + self._run_check("" % entity, + [("starttag", "a", [("href", "&")])]) + self._run_check("" % entity, + [("starttag", "a", [("href", "&")])]) + + def test_malformed_attributes(self): + # see #13357 + html = ( + "test - bad1" + "test - bad2" + "test - bad3" + "test - bad4" + ) + expected = [ + ('starttag', 'a', [('href', "test'style='color:red;bad1'")]), + ('data', 'test - bad1'), ('endtag', 'a'), + ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]), + ('data', 'test - bad2'), ('endtag', 'a'), + ('starttag', 'a', [('href', u"test'\xa0style='color:red;bad3'")]), + ('data', 'test - bad3'), ('endtag', 'a'), + ('starttag', 'a', [('href', u"test'\xa0style='color:red;bad4'")]), + ('data', 'test - bad4'), ('endtag', 'a') + ] + self._run_check(html, expected) + + def test_malformed_adjacent_attributes(self): + # see #12629 + self._run_check('', + [('starttag', 'x', []), + ('startendtag', 'y', [('z', ''), ('o""', None)]), + ('endtag', 'x')]) + self._run_check('', + [('starttag', 'x', []), + ('startendtag', 'y', [('z', ''), ('""', None)]), + ('endtag', 'x')]) + + # see #755670 for the following 3 tests + def test_adjacent_attributes(self): + self._run_check('', + [("starttag", "a", + [("width", "100%"), ("cellspacing","0")])]) + + self._run_check('', + [("starttag", "a", + [("id", "foo"), ("class","bar")])]) + + def test_missing_attribute_value(self): + self._run_check('', + [("starttag", "a", [("v", "")])]) + + def test_javascript_attribute_value(self): + self._run_check("", + [("starttag", "a", + [("href", "javascript:popup('/popup/help.html')")])]) + + def test_end_tag_in_attribute_value(self): + # see #1745761 + self._run_check("spam", + [("starttag", "a", + [("href", "http://www.example.org/\">;")]), + ("data", "spam"), ("endtag", "a")]) + + def test_main(): test_support.run_unittest(HTMLParserTestCase, AttributesTestCase) diff --git a/Misc/NEWS b/Misc/NEWS index 259d08a5f55..c12d53c604e 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -76,6 +76,9 @@ Core and Builtins Library ------- +- Issues #1745761, #755670, #13357, #12629, #1200313: HTMLParser now correctly + handles non-valid attributes, including adjacent and unquoted attributes. + - Issue #13193: Fix distutils.filelist.FileList under Windows. The "recursive-include" directive now recognizes both legal path separators.