HTMLParser is now able to handle slashes in the start tag.

This commit is contained in:
Ezio Melotti 2012-02-21 09:25:00 +02:00
parent 178e5ea305
commit 29877e8e04
3 changed files with 34 additions and 7 deletions

View File

@ -26,14 +26,18 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*') tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
# Note, the strict one of this pair isn't really strict, but we can't # Note:
# make it correctly strict without breaking backward compatibility. # 1) the strict attrfind isn't really strict, but we can't make it
# correctly strict without breaking backward compatibility;
# 2) if you change attrfind remember to update locatestarttagend too;
# 3) if you change attrfind and/or locatestarttagend the parser will
# explode, so don't do it.
attrfind = re.compile( attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile( attrfind_tolerant = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r""" locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name (?:\s+ # whitespace before attribute name
@ -50,15 +54,15 @@ locatestarttagend = re.compile(r"""
""", re.VERBOSE) """, re.VERBOSE)
locatestarttagend_tolerant = re.compile(r""" locatestarttagend_tolerant = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s* # optional whitespace before attribute name (?:[\s/]* # optional whitespace or slashes before attribute name
(?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
(?:\s*=+\s* # value indicator (?:\s*=+\s* # value indicator
(?:'[^']*' # LITA-enclosed value (?:'[^']*' # LITA-enclosed value
|"[^"]*" # LIT-enclosed value |"[^"]*" # LIT-enclosed value
|(?!['"])[^>\s]* # bare value |(?!['"])[^>\s]* # bare value
) )
(?:\s*,)* # possibly followed by a comma (?:\s*,)* # possibly followed by a comma
)?\s* )?(?:\s|/(?!>))*
)* )*
)? )?
\s* # trailing whitespace \s* # trailing whitespace

View File

@ -389,6 +389,27 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
self._run_check("<a foo='>'", [('data', "<a foo='>'")]) self._run_check("<a foo='>'", [('data', "<a foo='>'")])
self._run_check("<a foo='>", [('data', "<a foo='>")]) self._run_check("<a foo='>", [('data', "<a foo='>")])
def test_slashes_in_starttag(self):
self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
html = ('<img width=902 height=250px '
'src="/sites/default/files/images/homepage/foo.jpg" '
'/*what am I doing here*/ />')
expected = [(
'startendtag', 'img',
[('width', '902'), ('height', '250px'),
('src', '/sites/default/files/images/homepage/foo.jpg'),
('*what', None), ('am', None), ('i', None),
('doing', None), ('here*', None)]
)]
self._run_check(html, expected)
html = ('<a / /foo/ / /=/ / /bar/ / />'
'<a / /foo/ / /=/ / /bar/ / >')
expected = [
('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
]
self._run_check(html, expected)
def test_declaration_junk_chars(self): def test_declaration_junk_chars(self):
self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')]) self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])

View File

@ -121,6 +121,8 @@ Core and Builtins
Library Library
------- -------
- HTMLParser is now able to handle slashes in the start tag.
- Issue #14001: CVE-2012-0845: xmlrpc: Fix an endless loop in - Issue #14001: CVE-2012-0845: xmlrpc: Fix an endless loop in
SimpleXMLRPCServer upon malformed POST request. SimpleXMLRPCServer upon malformed POST request.