HTMLParser is now able to handle slashes in the start tag.

2012-02-21 09:22:16 +02:00 · 2012-02-21 09:22:16 +02:00 · 36b7361fe7
parent 9be6c3ddf0
commit 36b7361fe7
3 changed files with 28 additions and 5 deletions
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -28,19 +28,19 @@ tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
 tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
 attrfind = re.compile(
-    r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
+    r'[\s/]*((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
-    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
+    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
 locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
-  (?:\s+                             # whitespace before attribute name
+  (?:[\s/]*                          # optional whitespace before attribute name
-    (?:(?<=['"\s])[^\s/>][^\s/=>]*   # attribute name
+    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
-       )?\s*
+       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -240,6 +240,27 @@ text
            self._run_check("<!DOCTYPE %s>" % dtd,
                            [('decl', 'DOCTYPE ' + dtd)])
    def test_slashes_in_starttag(self):
        # Check that slashes appearing inside a start tag do not derail
        # attribute parsing.  Each case feeds markup to _run_check together
        # with the list of parser events expected for it.

        # Case 1: "/>" placed directly after a quoted attribute value (no
        # separating whitespace) is expected to yield a single
        # 'startendtag' event carrying that attribute.
        self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
        # Case 2: comment-like junk ("/*what am I doing here*/") sits
        # between the last real attribute and the closing "/>".
        html = ('<img width=902 height=250px '
                'src="/sites/default/files/images/homepage/foo.jpg" '
                '/*what am I doing here*/ />')
        # Slashes inside the quoted src value are kept verbatim.  Each junk
        # word is expected as an attribute with value None: the slash before
        # "*what" and the one after "here*" are dropped, and "I" is
        # expected lower-cased as 'i'.
        expected = [(
            'startendtag', 'img',
            [('width', '902'), ('height', '250px'),
             ('src', '/sites/default/files/images/homepage/foo.jpg'),
             ('*what', None), ('am', None), ('i', None),
             ('doing', None), ('here*', None)]
        )]
        self._run_check(html, expected)
        # Case 3: stray slashes scattered between and around the attribute
        # names.  The same tag body appears twice, once terminated by "/>"
        # and once by ">".
        html = ('<a / /foo/ / /=/ / /bar/ / />'
                '<a / /foo/ / /=/ / /bar/ / >')
        # The slashes are expected to be skipped and the isolated "=" to be
        # reported as an attribute name with value None.  The "/>" ending
        # gives 'startendtag', the ">" ending gives 'starttag'.
        expected = [
            ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
            ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
        ]
        self._run_check(html, expected)
    def test_declaration_junk_chars(self):
        self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -98,6 +98,8 @@ Core and Builtins
 Library
 -------
 - HTMLParser is now able to handle slashes in the start tag.
 - Issue #14001: CVE-2012-0845: xmlrpc: Fix an endless loop in
  SimpleXMLRPCServer upon malformed POST request.