#13960: merge with 3.2.

This commit is contained in:
Ezio Melotti 2012-02-10 10:50:49 +02:00
commit 176630ec19
3 changed files with 58 additions and 2 deletions

View File

@ -184,7 +184,17 @@ class HTMLParser(_markupbase.ParserBase):
elif startswith("<?", i): elif startswith("<?", i):
k = self.parse_pi(i) k = self.parse_pi(i)
elif startswith("<!", i): elif startswith("<!", i):
k = self.parse_declaration(i) # this might fail with things like <! not a comment > or
# <! -- space before '--' -->. When strict is True an
# error is raised, when it's False they will be considered
# as bogus comments and parsed (see parse_bogus_comment).
if self.strict:
k = self.parse_declaration(i)
else:
try:
k = self.parse_declaration(i)
except HTMLParseError:
k = self.parse_bogus_comment(i)
elif (i + 1) < n: elif (i + 1) < n:
self.handle_data("<") self.handle_data("<")
k = i + 1 k = i + 1
@ -256,6 +266,19 @@ class HTMLParser(_markupbase.ParserBase):
i = self.updatepos(i, n) i = self.updatepos(i, n)
self.rawdata = rawdata[i:] self.rawdata = rawdata[i:]
# Internal -- parse bogus comment, return end or -1 if not terminated
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1):
    """Consume a bogus comment that starts at index *i* of ``self.rawdata``.

    Everything between the leading ``<!`` and the next ``>`` is treated as
    the comment text.

    Parameters:
        i: index in ``self.rawdata`` where the ``<!`` is expected.
        report: when true, the comment text is passed to
            ``self.handle_comment()``; when false nothing is reported.

    Returns:
        The index just past the closing ``>``, or -1 when no ``>`` is found
        after the opening ``<!``.

    ``self.error()`` is called when the data at *i* does not start
    with ``<!``.
    """
    rawdata = self.rawdata
    if rawdata[i:i+2] != '<!':
        # Name this method in the diagnostic (the message used to point at
        # parse_comment(), which is a different routine).
        self.error('unexpected call to parse_bogus_comment()')
    # Search starts after the '<!' prefix.
    pos = rawdata.find('>', i+2)
    if pos == -1:
        return -1
    if report:
        self.handle_comment(rawdata[i+2:pos])
    return pos + 1
# Internal -- parse processing instr, return end or -1 if not terminated # Internal -- parse processing instr, return end or -1 if not terminated
def parse_pi(self, i): def parse_pi(self, i):
rawdata = self.rawdata rawdata = self.rawdata

View File

@ -323,6 +323,23 @@ DOCTYPE html [
("endtag", element_lower)], ("endtag", element_lower)],
collector=Collector()) collector=Collector())
def test_comments(self):
    """Run a sequence of '<!-- ... -->' comments through self._run_check().

    The markup pieces are concatenated into one document and one
    ('comment', text) event is expected per piece, in order.
    """
    # Each entry pairs a piece of markup with the comment text expected
    # to be reported for it.
    cases = (
        ("<!-- I'm a valid comment -->", " I'm a valid comment "),
        ('<!--me too!-->', 'me too!'),
        ('<!------>', '--'),
        ('<!---->', ''),
        ('<!----I have many hyphens---->', '--I have many hyphens--'),
        ('<!-- I have a > in the middle -->',
         ' I have a > in the middle '),
        ('<!-- and I have -- in the middle! -->',
         ' and I have -- in the middle! '),
    )
    markup = ''.join(piece for piece, _ in cases)
    events = [('comment', text) for _, text in cases]
    self._run_check(markup, events)
def test_condcoms(self): def test_condcoms(self):
html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->' html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
'<!--[if IE 8]>condcoms<![endif]-->' '<!--[if IE 8]>condcoms<![endif]-->'
@ -426,6 +443,19 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
# see #12888 # see #12888
self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050) self.assertEqual(p.unescape('&#123; ' * 1050), '{ ' * 1050)
def test_broken_comments(self):
    """Run malformed '<!...>' constructs through self._run_check().

    The markup pieces are concatenated into one document and one
    ('comment', text) event is expected per piece, in order.
    """
    # Each entry pairs a piece of markup with the comment text expected
    # to be reported for it.
    cases = (
        ('<! not really a comment >', ' not really a comment '),
        ('<! not a comment either -->', ' not a comment either --'),
        ('<! -- close enough -->', ' -- close enough --'),
        ('<!!! another bogus comment !!!>', '!! another bogus comment !!!'),
    )
    markup = ''.join(piece for piece, _ in cases)
    events = [('comment', text) for _, text in cases]
    self._run_check(markup, events)
def test_broken_condcoms(self): def test_broken_condcoms(self):
# these condcoms are missing the '--' after '<!' and before the '>' # these condcoms are missing the '--' after '<!' and before the '>'
html = ('<![if !(IE)]>broken condcom<![endif]>' html = ('<![if !(IE)]>broken condcom<![endif]>'

View File

@ -466,6 +466,9 @@ Core and Builtins
Library Library
------- -------
- Issue #13960: HTMLParser is now able to handle broken comments when
strict=False.
- Issue #13921: Undocument and clean up sqlite3.OptimizedUnicode, - Issue #13921: Undocument and clean up sqlite3.OptimizedUnicode,
which is obsolete in Python 3.x. It's now aliased to str for which is obsolete in Python 3.x. It's now aliased to str for
backwards compatibility. backwards compatibility.
@ -498,7 +501,7 @@ Library
- Issue #10881: Fix test_site failure with OS X framework builds. - Issue #10881: Fix test_site failure with OS X framework builds.
- Issue #964437 Make IDLE help window non-modal. - Issue #964437: Make IDLE help window non-modal.
Patch by Guilherme Polo and Roger Serwy. Patch by Guilherme Polo and Roger Serwy.
- Issue #13734: Add os.fwalk(), a directory walking function yielding file - Issue #13734: Add os.fwalk(), a directory walking function yielding file