#13358: merge with 3.2.

This commit is contained in:
Ezio Melotti 2011-11-18 18:02:59 +02:00
commit 304261e85d
3 changed files with 26 additions and 3 deletions

View File

@ -14,7 +14,6 @@ import re
# Regular expressions used for parsing # Regular expressions used for parsing
interesting_normal = re.compile('[&<]') interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
incomplete = re.compile('&[a-zA-Z#]') incomplete = re.compile('&[a-zA-Z#]')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
@ -149,8 +148,8 @@ class HTMLParser(_markupbase.ParserBase):
return self.__starttag_text return self.__starttag_text
def set_cdata_mode(self, elem): def set_cdata_mode(self, elem):
self.interesting = interesting_cdata
self.cdata_elem = elem.lower() self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
def clear_cdata_mode(self): def clear_cdata_mode(self):
self.interesting = interesting_normal self.interesting = interesting_normal
@ -168,6 +167,8 @@ class HTMLParser(_markupbase.ParserBase):
if match: if match:
j = match.start() j = match.start()
else: else:
if self.cdata_elem:
break
j = n j = n
if i < j: self.handle_data(rawdata[i:j]) if i < j: self.handle_data(rawdata[i:j])
i = self.updatepos(i, j) i = self.updatepos(i, j)
@ -250,7 +251,7 @@ class HTMLParser(_markupbase.ParserBase):
else: else:
assert 0, "interesting.search() lied" assert 0, "interesting.search() lied"
# end while # end while
if end and i < n: if end and i < n and not self.cdata_elem:
self.handle_data(rawdata[i:n]) self.handle_data(rawdata[i:n])
i = self.updatepos(i, n) i = self.updatepos(i, n)
self.rawdata = rawdata[i:] self.rawdata = rawdata[i:]

View File

@ -301,7 +301,27 @@ DOCTYPE html [
("data", content), ("data", content),
("endtag", element_lower)]) ("endtag", element_lower)])
def test_cdata_with_closing_tags(self):
    # Regression test for issue #13358: the whole content of a CDATA
    # element must be delivered through a single handle_data call, even
    # when it contains text that looks like comments, entity references
    # or closing tags of other elements.
    #
    # The stock event collector normalizes the events in get_events, so
    # the subclass below returns the recorded list untouched.
    class Collector(EventCollector):
        def get_events(self):
            return self.events

    cdata_text = """<!-- not a comment --> &not-an-entity-ref;
<a href="" /> </p><p> <span></span></style>
'</script' + '>'"""

    # Spellings of the closing tag name with whitespace before and/or
    # after it.
    closing_names = (
        ' script', 'script ', ' script ',
        '\nscript', 'script\n', '\nscript\n',
    )
    for closing_name in closing_names:
        expected_name = closing_name.lower().strip()
        markup = '<script>{content}</{element}>'.format(
            content=cdata_text, element=closing_name)
        expected_events = [
            ("starttag", expected_name, []),
            ("data", cdata_text),
            ("endtag", expected_name),
        ]
        self._run_check(markup, expected_events, collector=Collector())
class HTMLParserTolerantTestCase(HTMLParserStrictTestCase): class HTMLParserTolerantTestCase(HTMLParserStrictTestCase):

View File

@ -377,6 +377,8 @@ Core and Builtins
Library Library
------- -------
- Issue #13358: HTMLParser now calls handle_data only once for the content of each CDATA element.
- Issue #4147: minidom's toprettyxml no longer adds whitespace around a text - Issue #4147: minidom's toprettyxml no longer adds whitespace around a text
node when it is the only child of an element. Initial patch by Dan node when it is the only child of an element. Initial patch by Dan
Kenigsberg. Kenigsberg.