#13993: merge with 3.2.

This commit is contained in:
Ezio Melotti 2012-02-13 11:42:29 +02:00
commit d1c7b1afe8
3 changed files with 71 additions and 17 deletions

View File

@ -23,6 +23,9 @@ starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>') piclose = re.compile('>')
commentclose = re.compile(r'--\s*>') commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
# Note, the strict one of this pair isn't really strict, but we can't # Note, the strict one of this pair isn't really strict, but we can't
# make it correctly strict without breaking backward compatibility. # make it correctly strict without breaking backward compatibility.
attrfind = re.compile( attrfind = re.compile(
@ -270,7 +273,7 @@ class HTMLParser(_markupbase.ParserBase):
# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
def parse_bogus_comment(self, i, report=1): def parse_bogus_comment(self, i, report=1):
rawdata = self.rawdata rawdata = self.rawdata
if rawdata[i:i+2] != '<!': if rawdata[i:i+2] not in ('<!', '</'):
self.error('unexpected call to parse_comment()') self.error('unexpected call to parse_comment()')
pos = rawdata.find('>', i+2) pos = rawdata.find('>', i+2)
if pos == -1: if pos == -1:
@ -398,31 +401,40 @@ class HTMLParser(_markupbase.ParserBase):
match = endendtag.search(rawdata, i+1) # > match = endendtag.search(rawdata, i+1) # >
if not match: if not match:
return -1 return -1
j = match.end() gtpos = match.end()
match = endtagfind.match(rawdata, i) # </ + tag + > match = endtagfind.match(rawdata, i) # </ + tag + >
if not match: if not match:
if self.cdata_elem is not None: if self.cdata_elem is not None:
self.handle_data(rawdata[i:j]) self.handle_data(rawdata[i:gtpos])
return j return gtpos
if self.strict: if self.strict:
self.error("bad end tag: %r" % (rawdata[i:j],)) self.error("bad end tag: %r" % (rawdata[i:gtpos],))
k = rawdata.find('<', i + 1, j) # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
if k > i: namematch = tagfind_tolerant.match(rawdata, i+2)
j = k if not namematch:
if j <= i: # w3.org/TR/html5/tokenization.html#end-tag-open-state
j = i + 1 if rawdata[i:i+3] == '</>':
self.handle_data(rawdata[i:j]) return i+3
return j else:
return self.parse_bogus_comment(i)
tagname = namematch.group().lower()
# consume and ignore other stuff between the name and the >
# Note: this is not 100% correct, since we might have things like
# </tag attr=">">, but looking for > after the name should cover
# most of the cases and is much simpler
gtpos = rawdata.find('>', namematch.end())
self.handle_endtag(tagname)
return gtpos+1
elem = match.group(1).lower() # script or style elem = match.group(1).lower() # script or style
if self.cdata_elem is not None: if self.cdata_elem is not None:
if elem != self.cdata_elem: if elem != self.cdata_elem:
self.handle_data(rawdata[i:j]) self.handle_data(rawdata[i:gtpos])
return j return gtpos
self.handle_endtag(elem.lower()) self.handle_endtag(elem.lower())
self.clear_cdata_mode() self.clear_cdata_mode()
return j return gtpos
# Overridable -- finish processing of start+end tag: <tag.../> # Overridable -- finish processing of start+end tag: <tag.../>
def handle_startendtag(self, tag, attrs): def handle_startendtag(self, tag, attrs):

View File

@ -364,8 +364,9 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
('data', '<<bc'), ('data', '<<bc'),
('endtag', 'a'), ('endtag', 'a'),
('endtag', 'html'), ('endtag', 'html'),
('data', '\n<img src="URL><//img></html'), ('data', '\n<img src="URL>'),
('endtag', 'html')]) ('comment', '/img'),
('endtag', 'html<')])
def test_with_unquoted_attributes(self): def test_with_unquoted_attributes(self):
# see #12008 # see #12008
@ -403,6 +404,44 @@ class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):
('starttag', 'form', ('starttag', 'form',
[('action', 'bogus|&#()value')])]) [('action', 'bogus|&#()value')])])
def test_invalid_end_tags(self):
# A collection of broken end tags. <br> is used as separator.
# see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
# and #13993
html = ('<br></label</p><br></div end tmAd-leaderBoard><br></<h4><br>'
'</li class="unit"><br></li\r\n\t\t\t\t\t\t</ul><br></><br>')
expected = [('starttag', 'br', []),
# < is part of the name, / is discarded, p is an attribute
('endtag', 'label<'),
('starttag', 'br', []),
# text and attributes are discarded
('endtag', 'div'),
('starttag', 'br', []),
# comment because the first char after </ is not a-zA-Z
('comment', '<h4'),
('starttag', 'br', []),
# attributes are discarded
('endtag', 'li'),
('starttag', 'br', []),
# everything till ul (included) is discarded
('endtag', 'li'),
('starttag', 'br', []),
# </> is ignored
('starttag', 'br', [])]
self._run_check(html, expected)
def test_broken_invalid_end_tag(self):
# This is technically wrong (the "> shouldn't be included in the 'data')
# but is probably not worth fixing (in addition to all the cases of
# the previous test, it would require a full attribute parsing).
# see #13993
html = '<b>This</b attr=">"> confuses the parser'
expected = [('starttag', 'b', []),
('data', 'This'),
('endtag', 'b'),
('data', '"> confuses the parser')]
self._run_check(html, expected)
def test_correct_detection_of_start_tags(self): def test_correct_detection_of_start_tags(self):
# see #13273 # see #13273
html = ('<div style="" ><b>The <a href="some_url">rain</a> ' html = ('<div style="" ><b>The <a href="some_url">rain</a> '

View File

@ -466,6 +466,9 @@ Core and Builtins
Library Library
------- -------
- Issue #13993: HTMLParser is now able to handle broken end tags when
strict=False.
- Issue #13930: lib2to3 now supports writing converted output files to another - Issue #13930: lib2to3 now supports writing converted output files to another
directory tree as well as copying unchanged files and altering the file directory tree as well as copying unchanged files and altering the file
suffix. suffix.