From 36b7361fe76733b3a4944ef92b49bcea4584b740 Mon Sep 17 00:00:00 2001 From: Ezio Melotti Date: Tue, 21 Feb 2012 09:22:16 +0200 Subject: [PATCH] HTMLParser is now able to handle slashes in the start tag. --- Lib/HTMLParser.py | 10 +++++----- Lib/test/test_htmlparser.py | 21 +++++++++++++++++++++ Misc/NEWS | 2 ++ 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/Lib/HTMLParser.py b/Lib/HTMLParser.py index 5081a62562c..d4e14d43876 100644 --- a/Lib/HTMLParser.py +++ b/Lib/HTMLParser.py @@ -28,19 +28,19 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*') attrfind = re.compile( - r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' - r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') + r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' + r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') locatestarttagend = re.compile(r""" <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name - (?:\s+ # whitespace before attribute name - (?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name + (?:[\s/]* # optional whitespace before attribute name + (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name (?:\s*=+\s* # value indicator (?:'[^']*' # LITA-enclosed value |"[^"]*" # LIT-enclosed value |(?!['"])[^>\s]* # bare value ) - )?\s* + )?(?:\s|/(?!>))* )* )? 
\s* # trailing whitespace diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py index 8136bca3e28..41f43408d83 100644 --- a/Lib/test/test_htmlparser.py +++ b/Lib/test/test_htmlparser.py @@ -240,6 +240,27 @@ text self._run_check("<!DOCTYPE %s>" % dtd, [('decl', 'DOCTYPE ' + dtd)]) + def test_slashes_in_starttag(self): + self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])]) + html = ('<img width=902 height=250px ' + 'src="/sites/default/files/images/homepage/foo.jpg" ' + '/*what am I doing here*/ />') + expected = [( + 'startendtag', 'img', + [('width', '902'), ('height', '250px'), + ('src', '/sites/default/files/images/homepage/foo.jpg'), + ('*what', None), ('am', None), ('i', None), + ('doing', None), ('here*', None)] + )] + self._run_check(html, expected) + html = ('<a / /foo/ / /=/ / /bar/ / />' + '<a / /foo/ / /=/ / /bar/ / >') + expected = [ + ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]), + ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)]) + ] + self._run_check(html, expected) + def test_declaration_junk_chars(self): self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')]) diff --git a/Misc/NEWS b/Misc/NEWS index 1c049876abb..45f22b1447d 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -98,6 +98,8 @@ Core and Builtins Library ------- +- HTMLParser is now able to handle slashes in the start tag. + - Issue #14001: CVE-2012-0845: xmlrpc: Fix an endless loop in SimpleXMLRPCServer upon malformed POST request.