mirror of https://github.com/python/cpython
Change RuntimeError to SGMLParseError, which subclasses RuntimeError
for backward compatibility. Add support for SGML declaration syntax (<!....>) to some reasonable degree. This does not support everything allowed in SGML, but should work with "real" HTML (internal subset in a DOCTYPE is not handled). The content of the declaration is passed to the .handle_decl() method, which can be overridden by subclasses.
This commit is contained in:
parent
83e01bf6c8
commit
669573726b
|
@ -39,6 +39,14 @@ attrfind = re.compile(
|
||||||
r'\s*([a-zA-Z_][-.a-zA-Z_0-9]*)(\s*=\s*'
|
r'\s*([a-zA-Z_][-.a-zA-Z_0-9]*)(\s*=\s*'
|
||||||
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
|
r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
|
||||||
|
|
||||||
|
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
|
||||||
|
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
|
||||||
|
|
||||||
|
|
||||||
|
class SGMLParseError(RuntimeError):
|
||||||
|
"""Exception raised for all parse errors."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
# SGML parser base class -- find tags and call handler functions.
|
# SGML parser base class -- find tags and call handler functions.
|
||||||
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
|
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
|
||||||
|
@ -144,7 +152,12 @@ class SGMLParser:
|
||||||
self.handle_data(rawdata[i])
|
self.handle_data(rawdata[i])
|
||||||
i = i+1
|
i = i+1
|
||||||
continue
|
continue
|
||||||
i = match.end(0)
|
# This is some sort of declaration; in "HTML as
|
||||||
|
# deployed," this should only be the document type
|
||||||
|
# declaration ("<!DOCTYPE html...>").
|
||||||
|
k = self.parse_declaration(i)
|
||||||
|
if k < 0: break
|
||||||
|
i = k
|
||||||
continue
|
continue
|
||||||
elif rawdata[i] == '&':
|
elif rawdata[i] == '&':
|
||||||
match = charref.match(rawdata, i)
|
match = charref.match(rawdata, i)
|
||||||
|
@ -162,7 +175,7 @@ class SGMLParser:
|
||||||
if rawdata[i-1] != ';': i = i-1
|
if rawdata[i-1] != ';': i = i-1
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
raise RuntimeError, 'neither < nor & ??'
|
raise SGMLParserError('neither < nor & ??')
|
||||||
# We get here only if incomplete matches but
|
# We get here only if incomplete matches but
|
||||||
# nothing else
|
# nothing else
|
||||||
match = incomplete.match(rawdata, i)
|
match = incomplete.match(rawdata, i)
|
||||||
|
@ -186,7 +199,7 @@ class SGMLParser:
|
||||||
def parse_comment(self, i):
|
def parse_comment(self, i):
|
||||||
rawdata = self.rawdata
|
rawdata = self.rawdata
|
||||||
if rawdata[i:i+4] != '<!--':
|
if rawdata[i:i+4] != '<!--':
|
||||||
raise RuntimeError, 'unexpected call to handle_comment'
|
raise SGMLParseError('unexpected call to parse_comment()')
|
||||||
match = commentclose.search(rawdata, i+4)
|
match = commentclose.search(rawdata, i+4)
|
||||||
if not match:
|
if not match:
|
||||||
return -1
|
return -1
|
||||||
|
@ -195,11 +208,42 @@ class SGMLParser:
|
||||||
j = match.end(0)
|
j = match.end(0)
|
||||||
return j-i
|
return j-i
|
||||||
|
|
||||||
|
# Internal -- parse declaration.
|
||||||
|
def parse_declaration(self, i):
|
||||||
|
rawdata = self.rawdata
|
||||||
|
j = i + 2
|
||||||
|
# in practice, this should look like: ((name|stringlit) S*)+ '>'
|
||||||
|
while 1:
|
||||||
|
c = rawdata[j:j+1]
|
||||||
|
if c == ">":
|
||||||
|
# end of declaration syntax
|
||||||
|
self.handle_decl(rawdata[i+2:j])
|
||||||
|
return j + 1
|
||||||
|
if c in "\"'":
|
||||||
|
m = declstringlit.match(rawdata, j)
|
||||||
|
if not m:
|
||||||
|
# incomplete or an error?
|
||||||
|
return -1
|
||||||
|
j = m.end()
|
||||||
|
elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
|
||||||
|
m = declname.match(rawdata, j)
|
||||||
|
if not m:
|
||||||
|
# incomplete or an error?
|
||||||
|
return -1
|
||||||
|
j = m.end()
|
||||||
|
elif i == len(rawdata):
|
||||||
|
# end of buffer between tokens
|
||||||
|
return -1
|
||||||
|
else:
|
||||||
|
raise SGMLParseError(
|
||||||
|
"unexpected char in declaration: %s" % `rawdata[i]`)
|
||||||
|
assert 0, "can't get here!"
|
||||||
|
|
||||||
# Internal -- parse processing instr, return length or -1 if not terminated
|
# Internal -- parse processing instr, return length or -1 if not terminated
|
||||||
def parse_pi(self, i):
|
def parse_pi(self, i):
|
||||||
rawdata = self.rawdata
|
rawdata = self.rawdata
|
||||||
if rawdata[i:i+2] != '<?':
|
if rawdata[i:i+2] != '<?':
|
||||||
raise RuntimeError, 'unexpected call to handle_pi'
|
raise SGMLParseError('unexpected call to parse_pi()')
|
||||||
match = piclose.search(rawdata, i+2)
|
match = piclose.search(rawdata, i+2)
|
||||||
if not match:
|
if not match:
|
||||||
return -1
|
return -1
|
||||||
|
@ -246,7 +290,7 @@ class SGMLParser:
|
||||||
else:
|
else:
|
||||||
match = tagfind.match(rawdata, i+1)
|
match = tagfind.match(rawdata, i+1)
|
||||||
if not match:
|
if not match:
|
||||||
raise RuntimeError, 'unexpected call to parse_starttag'
|
raise SGMLParseError('unexpected call to parse_starttag')
|
||||||
k = match.end(0)
|
k = match.end(0)
|
||||||
tag = rawdata[i+1:k].lower()
|
tag = rawdata[i+1:k].lower()
|
||||||
self.lasttag = tag
|
self.lasttag = tag
|
||||||
|
@ -383,6 +427,10 @@ class SGMLParser:
|
||||||
def handle_comment(self, data):
|
def handle_comment(self, data):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Example -- handle declaration, could be overridden
|
||||||
|
def handle_decl(self, decl):
|
||||||
|
pass
|
||||||
|
|
||||||
# Example -- handle processing instruction, could be overridden
|
# Example -- handle processing instruction, could be overridden
|
||||||
def handle_pi(self, data):
|
def handle_pi(self, data):
|
||||||
pass
|
pass
|
||||||
|
|
Loading…
Reference in New Issue