HTMLParser is now able to handle slashes in the start tag.
This commit is contained in:
parent
178e5ea305
commit
29877e8e04
|
@ -26,14 +26,18 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
|
||||||
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
|
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
|
||||||
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
|
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
|
||||||
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
|
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
|
||||||
# Note, the strict one of this pair isn't really strict, but we can't
|
# Note:
|
||||||
# make it correctly strict without breaking backward compatibility.
|
# 1) the strict attrfind isn't really strict, but we can't make it
|
||||||
|
# correctly strict without breaking backward compatibility;
|
||||||
|
# 2) if you change attrfind remember to update locatestarttagend too;
|
||||||
|
# 3) if you change attrfind and/or locatestarttagend the parser will
|
||||||
|
# explode, so don't do it.
|
||||||
attrfind = re.compile(
|
attrfind = re.compile(
|
||||||
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
|
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
|
||||||
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
|
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
|
||||||
attrfind_tolerant = re.compile(
|
attrfind_tolerant = re.compile(
|
||||||
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
|
r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
|
||||||
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
|
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
|
||||||
locatestarttagend = re.compile(r"""
|
locatestarttagend = re.compile(r"""
|
||||||
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
||||||
(?:\s+ # whitespace before attribute name
|
(?:\s+ # whitespace before attribute name
|
||||||
|
@ -50,15 +54,15 @@ locatestarttagend = re.compile(r"""
|
||||||
""", re.VERBOSE)
|
""", re.VERBOSE)
|
||||||
locatestarttagend_tolerant = re.compile(r"""
|
locatestarttagend_tolerant = re.compile(r"""
|
||||||
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
||||||
(?:\s* # optional whitespace before attribute name
|
(?:[\s/]* # optional whitespace or slashes before attribute name
|
||||||
(?:(?<=['"\s])[^\s/>][^\s/=>]* # attribute name
|
(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
|
||||||
(?:\s*=+\s* # value indicator
|
(?:\s*=+\s* # value indicator
|
||||||
(?:'[^']*' # LITA-enclosed value
|
(?:'[^']*' # LITA-enclosed value
|
||||||
|"[^"]*" # LIT-enclosed value
|
|"[^"]*" # LIT-enclosed value
|
||||||
|(?!['"])[^>\s]* # bare value
|
|(?!['"])[^>\s]* # bare value
|
||||||
)
|
)
|
||||||
(?:\s*,)* # possibly followed by a comma
|
(?:\s*,)* # possibly followed by a comma
|
||||||
)?\s*
|
)?(?:\s|/(?!>))*
|
||||||
)*
|
)*
|
||||||
)?
|
)?
|
||||||
\s* # trailing whitespace
|
\s* # trailing whitespace
|
||||||
|
|
|
@ -389,6 +389,27 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
|
||||||
self._run_check("<a foo='>'", [('data', "<a foo='>'")])
|
self._run_check("<a foo='>'", [('data', "<a foo='>'")])
|
||||||
self._run_check("<a foo='>", [('data', "<a foo='>")])
|
self._run_check("<a foo='>", [('data', "<a foo='>")])
|
||||||
|
|
||||||
|
def test_slashes_in_starttag(self):
|
||||||
|
self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
|
||||||
|
html = ('<img width=902 height=250px '
|
||||||
|
'src="/sites/default/files/images/homepage/foo.jpg" '
|
||||||
|
'/*what am I doing here*/ />')
|
||||||
|
expected = [(
|
||||||
|
'startendtag', 'img',
|
||||||
|
[('width', '902'), ('height', '250px'),
|
||||||
|
('src', '/sites/default/files/images/homepage/foo.jpg'),
|
||||||
|
('*what', None), ('am', None), ('i', None),
|
||||||
|
('doing', None), ('here*', None)]
|
||||||
|
)]
|
||||||
|
self._run_check(html, expected)
|
||||||
|
html = ('<a / /foo/ / /=/ / /bar/ / />'
|
||||||
|
'<a / /foo/ / /=/ / /bar/ / >')
|
||||||
|
expected = [
|
||||||
|
('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
|
||||||
|
('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
|
||||||
|
]
|
||||||
|
self._run_check(html, expected)
|
||||||
|
|
||||||
def test_declaration_junk_chars(self):
|
def test_declaration_junk_chars(self):
|
||||||
self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
|
self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
|
||||||
|
|
||||||
|
|
|
@ -121,6 +121,8 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- HTMLParser in tolerant (non-strict) mode is now able to handle stray slashes in the start tag.
|
||||||
|
|
||||||
- Issue #14001: CVE-2012-0845: xmlrpc: Fix an endless loop in
|
- Issue #14001: CVE-2012-0845: xmlrpc: Fix an endless loop in
|
||||||
SimpleXMLRPCServer upon malformed POST request.
|
SimpleXMLRPCServer upon malformed POST request.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue