mirror of https://github.com/python/cpython
Deal more appropriately with bare ampersands and pointy brackets; this
module has to deal with "classic" HTML-as-deployed as well as XHTML, so we cannot be as strict as XHTML allows. This closes SF bug #453059, but uses a different fix than suggested in the bug comments.
This commit is contained in:
parent
18da1e1e7f
commit
029acfb922
|
@ -15,7 +15,8 @@ import string
|
|||
|
||||
interesting_normal = re.compile('[&<]')
|
||||
interesting_cdata = re.compile(r'<(/|\Z)')
|
||||
incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*|#[0-9]*)?')
|
||||
incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*'
|
||||
'|#([0-9]*|[xX][0-9a-fA-F]*))?')
|
||||
|
||||
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
|
||||
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
|
||||
|
@ -185,11 +186,8 @@ class HTMLParser:
|
|||
elif declopen.match(rawdata, i): # <!
|
||||
k = self.parse_declaration(i)
|
||||
else:
|
||||
if i < n-1:
|
||||
raise HTMLParseError(
|
||||
"invalid '<' construct: %s" % `rawdata[i:i+2]`,
|
||||
self.getpos())
|
||||
k = -1
|
||||
self.handle_data("<")
|
||||
k = i + 1
|
||||
if k < 0:
|
||||
if end:
|
||||
raise HTMLParseError("EOF in middle of construct",
|
||||
|
@ -203,7 +201,7 @@ class HTMLParser:
|
|||
self.handle_charref(name)
|
||||
k = match.end()
|
||||
if rawdata[k-1] != ';':
|
||||
k = k-1
|
||||
k = k - 1
|
||||
i = self.updatepos(i, k)
|
||||
continue
|
||||
match = entityref.match(rawdata, i)
|
||||
|
@ -212,17 +210,19 @@ class HTMLParser:
|
|||
self.handle_entityref(name)
|
||||
k = match.end()
|
||||
if rawdata[k-1] != ';':
|
||||
k = k-1
|
||||
k = k - 1
|
||||
i = self.updatepos(i, k)
|
||||
continue
|
||||
if incomplete.match(rawdata, i):
|
||||
if end:
|
||||
match = incomplete.match(rawdata, i)
|
||||
if match:
|
||||
rest = rawdata[i:]
|
||||
if end and rest != "&" and match.group() == rest:
|
||||
raise HTMLParseError(
|
||||
"EOF in middle of entity or char ref",
|
||||
self.getpos())
|
||||
return -1 # incomplete
|
||||
raise HTMLParseError("'&' not part of entity or char ref",
|
||||
self.getpos())
|
||||
self.handle_data("&")
|
||||
i = self.updatepos(i, i + 1)
|
||||
else:
|
||||
assert 0, "interesting.search() lied"
|
||||
# end while
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
"""Tests for HTMLParser.py."""
|
||||
|
||||
import HTMLParser
|
||||
import pprint
|
||||
import sys
|
||||
import test_support
|
||||
import unittest
|
||||
|
@ -83,9 +84,10 @@ class TestCaseBase(unittest.TestCase):
|
|||
for c in self.epilogue:
|
||||
parser.feed(c)
|
||||
parser.close()
|
||||
self.assert_(parser.get_events() ==
|
||||
self.initial_events + events + self.final_events,
|
||||
parser.get_events())
|
||||
events = parser.get_events()
|
||||
self.assertEqual(events,
|
||||
self.initial_events + events + self.final_events,
|
||||
"got events:\n" + pprint.pformat(events))
|
||||
|
||||
def _run_check_extra(self, source, events):
|
||||
self._run_check(source, events, EventCollectorExtra)
|
||||
|
@ -137,6 +139,18 @@ text
|
|||
("data", "\n"),
|
||||
])
|
||||
|
||||
def test_doctype_decl(self):
|
||||
inside = """\
|
||||
DOCTYPE html [
|
||||
<!ELEMENT html - O EMPTY>
|
||||
<!ATTLIST html
|
||||
version CDATA #IMPLIED '4.0'>
|
||||
<!-- comment -->
|
||||
]"""
|
||||
self._run_check("<!%s>" % inside, [
|
||||
("decl", inside),
|
||||
])
|
||||
|
||||
def test_bad_nesting(self):
|
||||
# Strangely, this *is* supposed to test that overlapping
|
||||
# elements are allowed. HTMLParser is more geared toward
|
||||
|
@ -148,6 +162,16 @@ text
|
|||
("endtag", "b"),
|
||||
])
|
||||
|
||||
def test_bare_ampersands(self):
|
||||
self._run_check("this text & contains & ampersands &", [
|
||||
("data", "this text & contains & ampersands &"),
|
||||
])
|
||||
|
||||
def test_bare_pointy_brackets(self):
|
||||
self._run_check("this < text > contains < bare>pointy< brackets", [
|
||||
("data", "this < text > contains < bare>pointy< brackets"),
|
||||
])
|
||||
|
||||
def test_attr_syntax(self):
|
||||
output = [
|
||||
("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
|
||||
|
@ -199,16 +223,12 @@ text
|
|||
self._run_check(["<a b='>'", ">"], output)
|
||||
|
||||
def test_starttag_junk_chars(self):
|
||||
self._parse_error("<")
|
||||
self._parse_error("<>")
|
||||
self._parse_error("</>")
|
||||
self._parse_error("</$>")
|
||||
self._parse_error("</")
|
||||
self._parse_error("</a")
|
||||
self._parse_error("<a<a>")
|
||||
self._parse_error("</a<a>")
|
||||
self._parse_error("<$")
|
||||
self._parse_error("<$>")
|
||||
self._parse_error("<!")
|
||||
self._parse_error("<a $>")
|
||||
self._parse_error("<a")
|
||||
|
|
Loading…
Reference in New Issue