diff --git a/Lib/html/parser.py b/Lib/html/parser.py
index aa31fbc5b1b..2bfd187a892 100644
--- a/Lib/html/parser.py
+++ b/Lib/html/parser.py
@@ -26,14 +26,18 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
-# Note, the strict one of this pair isn't really strict, but we can't
-# make it correctly strict without breaking backward compatibility.
+# Note:
+# 1) the strict attrfind isn't really strict, but we can't make it
+# correctly strict without breaking backward compatibility;
+# 2) if you change attrfind remember to update locatestarttagend too;
+# 3) if you change attrfind and/or locatestarttagend the parser will
+# explode, so don't do it.
attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
- r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
- r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
+ r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
+ r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
@@ -50,15 +54,15 @@ locatestarttagend = re.compile(r"""
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
- (?:\s* # optional whitespace before attribute name
- (?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name
+ (?:[\s/]* # optional whitespace or slashes before attribute name
+ (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
(?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|"[^"]*" # LIT-enclosed value
|(?!['"])[^>\s]* # bare value
)
(?:\s*,)* # possibly followed by a comma
- )?\s*
+ )?(?:\s|/(?!>))*
)*
)?
\s* # trailing whitespace
diff --git a/Lib/test/test_htmlparser.py b/Lib/test/test_htmlparser.py
index e2b09a96c03..3e2a59064e5 100644
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -389,6 +389,27 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
self._run_check("', [('startendtag', 'a', [('foo', 'var')])])
+ html = ('')
+ expected = [(
+ 'startendtag', 'img',
+ [('width', '902'), ('height', '250px'),
+ ('src', '/sites/default/files/images/homepage/foo.jpg'),
+ ('*what', None), ('am', None), ('i', None),
+ ('doing', None), ('here*', None)]
+ )]
+ self._run_check(html, expected)
+ html = (''
+ '')
+ expected = [
+ ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
+ ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
+ ]
+ self._run_check(html, expected)
+
def test_declaration_junk_chars(self):
self._run_check("", [('decl', 'DOCTYPE foo $ ')])
diff --git a/Misc/NEWS b/Misc/NEWS
index 0e3595a2212..45744555cec 100644
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -121,6 +121,8 @@ Core and Builtins
Library
-------
+- HTMLParser is now able to handle slashes in the start tag.
+
- Issue #14001: CVE-2012-0845: xmlrpc: Fix an endless loop in
SimpleXMLRPCServer upon malformed POST request.