In CDATA mode, make sure entity-reference syntax is not interpreted;

entity references are not allowed in that mode.

Do a better job of scanning <!DOCTYPE ...> declarations; based on the
code in HTMLParser.py.
This commit is contained in:
Fred Drake 2001-07-16 18:30:35 +00:00
parent e16c7aee4b
commit fb38c76e0f
1 changed files with 27 additions and 9 deletions

View File

@ -5,7 +5,8 @@
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
# and CDATA (character data -- only end tags are special). RCDATA is
# not supported at all.
import re
@ -34,6 +35,9 @@ endbracket = re.compile('[<>]')
special = re.compile('<![^<>]*>')
commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
declopen = re.compile('<!')
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@ -160,6 +164,10 @@ class SGMLParser:
i = k
continue
elif rawdata[i] == '&':
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue
match = charref.match(rawdata, i)
if match:
name = match.group(1)
@ -210,11 +218,20 @@ class SGMLParser:
# Internal -- parse declaration.
def parse_declaration(self, i):
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
rawdata = self.rawdata
j = i + 2
assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
if rawdata[j:j+1] in ("-", ""):
# Start of comment followed by buffer boundary,
# or just a buffer boundary.
return -1
# in practice, this should look like: ((name|stringlit) S*)+ '>'
n = len(rawdata)
while j < n:
c = rawdata[j:j+1]
c = rawdata[j]
if c == ">":
# end of declaration syntax
self.handle_decl(rawdata[i+2:j])
@ -222,15 +239,16 @@ class SGMLParser:
if c in "\"'":
m = declstringlit.match(rawdata, j)
if not m:
# incomplete or an error?
return -1
return -1 # incomplete
j = m.end()
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
m = declname.match(rawdata, j)
if not m:
return -1 # incomplete
j = m.end()
else:
m = decldata.match(rawdata, j)
if not m:
# incomplete or an error?
return -1
j = m.end()
raise SGMLParseError(
"unexpected char in declaration: %s" % `rawdata[j]`)
# end of buffer between tokens
return -1