In CDATA mode, make sure entity-reference syntax is not interpreted;
entity references are not allowed in that mode. Do a better job of scanning <!DOCTYPE ...> declarations; based on the code in HTMLParser.py.
This commit is contained in:
parent
e16c7aee4b
commit
fb38c76e0f
|
@ -5,7 +5,8 @@
|
|||
# XXX There should be a way to distinguish between PCDATA (parsed
|
||||
# character data -- the normal case), RCDATA (replaceable character
|
||||
# data -- only char and entity references and end tags are special)
|
||||
# and CDATA (character data -- only end tags are special).
|
||||
# and CDATA (character data -- only end tags are special). RCDATA is
|
||||
# not supported at all.
|
||||
|
||||
|
||||
import re
|
||||
|
@ -34,6 +35,9 @@ endbracket = re.compile('[<>]')
|
|||
special = re.compile('<![^<>]*>')
|
||||
commentopen = re.compile('<!--')
|
||||
commentclose = re.compile(r'--\s*>')
|
||||
declopen = re.compile('<!')
|
||||
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
|
||||
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
|
||||
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
|
||||
attrfind = re.compile(
|
||||
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
|
||||
|
@ -160,6 +164,10 @@ class SGMLParser:
|
|||
i = k
|
||||
continue
|
||||
elif rawdata[i] == '&':
|
||||
if self.literal:
|
||||
self.handle_data(rawdata[i])
|
||||
i = i+1
|
||||
continue
|
||||
match = charref.match(rawdata, i)
|
||||
if match:
|
||||
name = match.group(1)
|
||||
|
@ -210,11 +218,20 @@ class SGMLParser:
|
|||
|
||||
# Internal -- parse declaration.
|
||||
def parse_declaration(self, i):
|
||||
# This is some sort of declaration; in "HTML as
|
||||
# deployed," this should only be the document type
|
||||
# declaration ("<!DOCTYPE html...>").
|
||||
rawdata = self.rawdata
|
||||
j = i + 2
|
||||
assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
|
||||
if rawdata[j:j+1] in ("-", ""):
|
||||
# Start of comment followed by buffer boundary,
|
||||
# or just a buffer boundary.
|
||||
return -1
|
||||
# in practice, this should look like: ((name|stringlit) S*)+ '>'
|
||||
n = len(rawdata)
|
||||
while j < n:
|
||||
c = rawdata[j:j+1]
|
||||
c = rawdata[j]
|
||||
if c == ">":
|
||||
# end of declaration syntax
|
||||
self.handle_decl(rawdata[i+2:j])
|
||||
|
@ -222,15 +239,16 @@ class SGMLParser:
|
|||
if c in "\"'":
|
||||
m = declstringlit.match(rawdata, j)
|
||||
if not m:
|
||||
# incomplete or an error?
|
||||
return -1
|
||||
return -1 # incomplete
|
||||
j = m.end()
|
||||
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
||||
m = declname.match(rawdata, j)
|
||||
if not m:
|
||||
return -1 # incomplete
|
||||
j = m.end()
|
||||
else:
|
||||
m = decldata.match(rawdata, j)
|
||||
if not m:
|
||||
# incomplete or an error?
|
||||
return -1
|
||||
j = m.end()
|
||||
raise SGMLParseError(
|
||||
"unexpected char in declaration: %s" % `rawdata[j]`)
|
||||
# end of buffer between tokens
|
||||
return -1
|
||||
|
||||
|
|
Loading…
Reference in New Issue