#13273: fix a bug that prevented HTMLParser from properly detecting some tags when strict=False.

This commit is contained in:
Ezio Melotti 2011-10-28 13:21:09 +03:00
parent 0b85cd0680
commit f50ffa94ab
3 changed files with 38 additions and 3 deletions

View File

@ -30,7 +30,7 @@ attrfind = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?') r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile( attrfind_tolerant = re.compile(
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*' r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?') r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
locatestarttagend = re.compile(r""" locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
@ -277,12 +277,11 @@ class HTMLParser(_markupbase.ParserBase):
assert match, 'unexpected call to parse_starttag()' assert match, 'unexpected call to parse_starttag()'
k = match.end() k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower() self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos: while k < endpos:
if self.strict: if self.strict:
m = attrfind.match(rawdata, k) m = attrfind.match(rawdata, k)
else: else:
m = attrfind_tolerant.search(rawdata, k) m = attrfind_tolerant.match(rawdata, k)
if not m: if not m:
break break
attrname, rest, attrvalue = m.group(1, 2, 3) attrname, rest, attrvalue = m.group(1, 2, 3)

View File

@ -373,6 +373,39 @@ class HTMLParserTolerantTestCase(TestCaseBase):
[('action', 'bogus|&#()value')])], [('action', 'bogus|&#()value')])],
collector = self.collector) collector = self.collector)
def test_issue13273(self):
    # Issue #13273: every tag that follows the empty ``style=""``
    # attribute of the opening <div> has to be reported as its own event.
    markup = (
        '<div style="" ><b>The <a href="some_url">rain</a> '
        '<br /> in <span>Spain</span></b></div>'
    )
    # Events in document order; <br /> is expected as a 'startendtag'.
    events = [
        ('starttag', 'div', [('style', '')]),
        ('starttag', 'b', []),
        ('data', 'The '),
        ('starttag', 'a', [('href', 'some_url')]),
        ('data', 'rain'),
        ('endtag', 'a'),
        ('data', ' '),
        ('startendtag', 'br', []),
        ('data', ' in '),
        ('starttag', 'span', []),
        ('data', 'Spain'),
        ('endtag', 'span'),
        ('endtag', 'b'),
        ('endtag', 'div'),
    ]
    self._run_check(markup, events, collector=self.collector)
def test_issue13273_2(self):
    # Issue #13273, second case: a stray comma between two attributes of
    # the opening <div>; both attributes are still expected in the event.
    markup = '<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>'
    # The input stops after </a>, so no closing events for <b> or <div>
    # are expected.
    events = [
        ('starttag', 'div', [('style', ''), ('foo', 'bar')]),
        ('starttag', 'b', []),
        ('data', 'The '),
        ('starttag', 'a', [('href', 'some_url')]),
        ('data', 'rain'),
        ('endtag', 'a'),
    ]
    self._run_check(markup, events, collector=self.collector)
def test_unescape_function(self): def test_unescape_function(self):
p = html.parser.HTMLParser() p = html.parser.HTMLParser()
self.assertEqual(p.unescape('&#bad;'),'&#bad;') self.assertEqual(p.unescape('&#bad;'),'&#bad;')

View File

@ -61,6 +61,9 @@ Core and Builtins
Library Library
------- -------
- Issue #13273: fix a bug that prevented HTMLParser from properly detecting
some tags when strict=False.
- Issue #10332: multiprocessing: fix a race condition when a Pool is closed - Issue #10332: multiprocessing: fix a race condition when a Pool is closed
before all tasks have completed. before all tasks have completed.