diff --git a/Lib/html/parser.py b/Lib/html/parser.py index 60830779816..9e49effca1f 100644 --- a/Lib/html/parser.py +++ b/Lib/html/parser.py @@ -46,7 +46,7 @@ locatestarttagend_tolerant = re.compile(r""" |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) - (?:\s*,)* # possibly followed by a comma + \s* # possibly followed by a space )?(?:\s|/(?!>))* )* )? diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index a2bfb39d16a..12917755a56 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -452,42 +452,6 @@ text self._run_check('', [('comment', 'spacer type="block" height="25"')]) - def test_with_unquoted_attributes(self): - # see #12008 - html = ("" - "" - "
" - "- software-and-i" - "- library
") - expected = [ - ('starttag', 'html', []), - ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]), - ('starttag', 'table', - [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]), - ('starttag', 'tr', []), - ('starttag', 'td', [('align', 'left')]), - ('starttag', 'font', [('size', '-1')]), - ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]), - ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'), - ('endtag', 'span'), ('endtag', 'a'), - ('data', '- '), ('starttag', 'a', [('href', '/1/')]), - ('starttag', 'span', [('class', 'en')]), ('data', ' library'), - ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table') - ] - self._run_check(html, expected) - - def test_comma_between_attributes(self): - self._run_check('
', [ - ('starttag', 'form', - [('action', '/xxx.php?a=1&b=2&'), - (',', None), ('method', 'post')])]) - - def test_weird_chars_in_unquoted_attribute_values(self): - self._run_check('', [ - ('starttag', 'form', - [('action', 'bogus|&#()value')])]) - def test_invalid_end_tags(self): # A collection of broken end tags.
is used as separator. # see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state @@ -766,6 +730,62 @@ class AttributesTestCase(TestCaseBase): [("href", "http://www.example.org/\">;")]), ("data", "spam"), ("endtag", "a")]) + def test_with_unquoted_attributes(self): + # see #12008 + html = ("" + "" + "
" + "- software-and-i" + "- library
") + expected = [ + ('starttag', 'html', []), + ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]), + ('starttag', 'table', + [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]), + ('starttag', 'tr', []), + ('starttag', 'td', [('align', 'left')]), + ('starttag', 'font', [('size', '-1')]), + ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]), + ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'), + ('endtag', 'span'), ('endtag', 'a'), + ('data', '- '), ('starttag', 'a', [('href', '/1/')]), + ('starttag', 'span', [('class', 'en')]), ('data', ' library'), + ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table') + ] + self._run_check(html, expected) + + def test_comma_between_attributes(self): + # see bpo 41478 + # HTMLParser preserves duplicate attributes, leaving the task of + # removing duplicate attributes to a conformant html tree builder + html = ('
' # between attrs (unquoted) + '
' # between attrs (quoted) + '
' # after values (unquoted) + '
' # after values (quoted) + '
' # one comma values (quoted) + '
' # before values (unquoted) + '
' # before values (quoted) + '
' # before names + '
' # after names + ) + expected = [ + ('starttag', 'div', [('class', 'bar,baz=asd'),]), + ('starttag', 'div', [('class', 'bar'), (',baz', 'asd')]), + ('starttag', 'div', [('class', 'bar,'), ('baz', 'asd,')]), + ('starttag', 'div', [('class', 'bar'), (',', None), + ('baz', 'asd'), (',', None)]), + ('starttag', 'div', [('class', 'bar'), (',', None)]), + ('starttag', 'div', [('class', ',bar'), ('baz', ',asd')]), + ('starttag', 'div', [('class', ',"bar"'), ('baz', ',"asd"')]), + ('starttag', 'div', [(',class', 'bar'), (',baz', 'asd')]), + ('starttag', 'div', [('class,', 'bar'), ('baz,', 'asd')]), + ] + self._run_check(html, expected) + + def test_weird_chars_in_unquoted_attribute_values(self): + self._run_check('', [ + ('starttag', 'form', + [('action', 'bogus|&#()value')])]) if __name__ == "__main__": unittest.main() diff --git a/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst b/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst new file mode 100644 index 00000000000..52efa3ac3d4 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2021-01-05-21-26-29.bpo-41748.KdC0w3.rst @@ -0,0 +1,2 @@ +Fix HTMLParser parsing rules for element attributes containing +commas with spaces. Patch by Karl Dubost. \ No newline at end of file