mirror of https://github.com/python/cpython
#13358: HTMLParser now calls handle_data only once for each CDATA.
This commit is contained in:
parent
93bbb6a9a6
commit
00dc60beee
|
@ -14,7 +14,6 @@ import re
|
|||
# Regular expressions used for parsing
|
||||
|
||||
interesting_normal = re.compile('[&<]')
|
||||
interesting_cdata = re.compile(r'<(/|\Z)')
|
||||
incomplete = re.compile('&[a-zA-Z#]')
|
||||
|
||||
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
|
||||
|
@ -125,8 +124,8 @@ class HTMLParser(markupbase.ParserBase):
|
|||
return self.__starttag_text
|
||||
|
||||
def set_cdata_mode(self, elem):
    """Put the parser into CDATA mode for the element *elem*.

    Stores the lower-cased element name in ``self.cdata_elem`` and sets
    ``self.interesting`` to a case-insensitive pattern that matches only
    the end tag of that element, with optional whitespace around the name.
    """
    self.cdata_elem = elem.lower()
    # Build the pattern from the stored (lower-cased) name; re.I makes the
    # match accept any casing of the end tag.
    self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
|
||||
|
||||
def clear_cdata_mode(self):
|
||||
self.interesting = interesting_normal
|
||||
|
@ -144,6 +143,8 @@ class HTMLParser(markupbase.ParserBase):
|
|||
if match:
|
||||
j = match.start()
|
||||
else:
|
||||
if self.cdata_elem:
|
||||
break
|
||||
j = n
|
||||
if i < j: self.handle_data(rawdata[i:j])
|
||||
i = self.updatepos(i, j)
|
||||
|
@ -212,7 +213,7 @@ class HTMLParser(markupbase.ParserBase):
|
|||
else:
|
||||
assert 0, "interesting.search() lied"
|
||||
# end while
|
||||
if end and i < n:
|
||||
if end and i < n and not self.cdata_elem:
|
||||
self.handle_data(rawdata[i:n])
|
||||
i = self.updatepos(i, n)
|
||||
self.rawdata = rawdata[i:]
|
||||
|
|
|
@ -286,6 +286,27 @@ DOCTYPE html [
|
|||
("data", content),
|
||||
("endtag", element_lower)])
|
||||
|
||||
def test_cdata_with_closing_tags(self):
    # see issue #13358
    # make sure that HTMLParser calls handle_data only once for each CDATA.
    # The normal event collector normalizes the events in get_events,
    # so we override it to return the original list of events.
    class Collector(EventCollector):
        def get_events(self):
            return self.events

    # The entity-like sequences are spelled out literally ("&not-an-entity-ref;"
    # and "&amp;"): inside CDATA they must be reported verbatim as data, so
    # the expected "data" event below is this exact string.
    content = ("<!-- not a comment --> &not-an-entity-ref;\n"
               '<a href="" /> </p><p> &amp; <span></span></style>\n'
               "'</script' + '>' </html> </head> </scripter>!")
    # Each variant puts whitespace around the name in the closing tag.
    for element in [' script', 'script ', ' script ',
                    '\nscript', 'script\n', '\nscript\n']:
        s = u'<script>{content}</{element}>'.format(element=element,
                                                    content=content)
        self._run_check(s, [("starttag", "script", []),
                            ("data", content),
                            ("endtag", "script")],
                        collector=Collector)
|
||||
|
||||
def test_malformatted_charref(self):
|
||||
self._run_check("<p>&#bad;</p>", [
|
||||
("starttag", "p", []),
|
||||
|
|
|
@ -79,6 +79,8 @@ Core and Builtins
|
|||
Library
|
||||
-------
|
||||
|
||||
- Issue #13358: HTMLParser now calls handle_data only once for each CDATA.
|
||||
|
||||
- Issue #4147: minidom's toprettyxml no longer adds whitespace around a text
|
||||
node when it is the only child of an element. Initial patch by Dan
|
||||
Kenigsberg.
|
||||
|
|
Loading…
Reference in New Issue