#13273: fix a bug that prevented HTMLParser from properly detecting some tags when strict=False.
This commit is contained in:
parent
0b85cd0680
commit
f50ffa94ab
|
@ -30,7 +30,7 @@ attrfind = re.compile(
|
||||||
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
|
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
|
||||||
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
|
r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
|
||||||
attrfind_tolerant = re.compile(
|
attrfind_tolerant = re.compile(
|
||||||
r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
|
r',?\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
|
||||||
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
|
r'(\'[^\']*\'|"[^"]*"|[^>\s]*))?')
|
||||||
locatestarttagend = re.compile(r"""
|
locatestarttagend = re.compile(r"""
|
||||||
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
||||||
|
@ -277,12 +277,11 @@ class HTMLParser(_markupbase.ParserBase):
|
||||||
assert match, 'unexpected call to parse_starttag()'
|
assert match, 'unexpected call to parse_starttag()'
|
||||||
k = match.end()
|
k = match.end()
|
||||||
self.lasttag = tag = rawdata[i+1:k].lower()
|
self.lasttag = tag = rawdata[i+1:k].lower()
|
||||||
|
|
||||||
while k < endpos:
|
while k < endpos:
|
||||||
if self.strict:
|
if self.strict:
|
||||||
m = attrfind.match(rawdata, k)
|
m = attrfind.match(rawdata, k)
|
||||||
else:
|
else:
|
||||||
m = attrfind_tolerant.search(rawdata, k)
|
m = attrfind_tolerant.match(rawdata, k)
|
||||||
if not m:
|
if not m:
|
||||||
break
|
break
|
||||||
attrname, rest, attrvalue = m.group(1, 2, 3)
|
attrname, rest, attrvalue = m.group(1, 2, 3)
|
||||||
|
|
|
@ -373,6 +373,39 @@ class HTMLParserTolerantTestCase(TestCaseBase):
|
||||||
[('action', 'bogus|&#()value')])],
|
[('action', 'bogus|&#()value')])],
|
||||||
collector = self.collector)
|
collector = self.collector)
|
||||||
|
|
||||||
|
def test_issue13273(self):
|
||||||
|
html = ('<div style="" ><b>The <a href="some_url">rain</a> '
|
||||||
|
'<br /> in <span>Spain</span></b></div>')
|
||||||
|
expected = [
|
||||||
|
('starttag', 'div', [('style', '')]),
|
||||||
|
('starttag', 'b', []),
|
||||||
|
('data', 'The '),
|
||||||
|
('starttag', 'a', [('href', 'some_url')]),
|
||||||
|
('data', 'rain'),
|
||||||
|
('endtag', 'a'),
|
||||||
|
('data', ' '),
|
||||||
|
('startendtag', 'br', []),
|
||||||
|
('data', ' in '),
|
||||||
|
('starttag', 'span', []),
|
||||||
|
('data', 'Spain'),
|
||||||
|
('endtag', 'span'),
|
||||||
|
('endtag', 'b'),
|
||||||
|
('endtag', 'div')
|
||||||
|
]
|
||||||
|
self._run_check(html, expected, collector=self.collector)
|
||||||
|
|
||||||
|
def test_issue13273_2(self):
|
||||||
|
html = '<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>'
|
||||||
|
expected = [
|
||||||
|
('starttag', 'div', [('style', ''), ('foo', 'bar')]),
|
||||||
|
('starttag', 'b', []),
|
||||||
|
('data', 'The '),
|
||||||
|
('starttag', 'a', [('href', 'some_url')]),
|
||||||
|
('data', 'rain'),
|
||||||
|
('endtag', 'a'),
|
||||||
|
]
|
||||||
|
self._run_check(html, expected, collector=self.collector)
|
||||||
|
|
||||||
def test_unescape_function(self):
|
def test_unescape_function(self):
|
||||||
p = html.parser.HTMLParser()
|
p = html.parser.HTMLParser()
|
||||||
self.assertEqual(p.unescape('&#bad;'),'&#bad;')
|
self.assertEqual(p.unescape('&#bad;'),'&#bad;')
|
||||||
|
|
|
@ -61,6 +61,9 @@ Core and Builtins
|
||||||
Library
|
Library
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
- Issue #13273: fix a bug that prevented HTMLParser from properly detecting some
|
||||||
|
tags when strict=False.
|
||||||
|
|
||||||
- Issue #10332: multiprocessing: fix a race condition when a Pool is closed
|
- Issue #10332: multiprocessing: fix a race condition when a Pool is closed
|
||||||
before all tasks have completed.
|
before all tasks have completed.
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue