Patch #545300: Support marked sections.
This commit is contained in:
parent
a965649386
commit
3163a3b4b2
|
@ -4,6 +4,13 @@ import re
|
|||
|
||||
# Bound .match of a pattern for a declaration name: a letter followed by
# letters, digits, '-', '_' or '.', plus any trailing whitespace.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
|
||||
# Bound .match of a pattern for a single- or double-quoted string literal,
# plus any trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
|
||||
# Comment terminator: '--', optional whitespace, then '>'.
_commentclose = re.compile(r'--\s*>')
|
||||
# Standard marked-section terminator: ']' ']' '>' with optional whitespace
# allowed between the three characters.
_markedsectionclose = re.compile(r']\s*]\s*>')
|
||||
|
||||
# An analysis of the MS-Word extensions is available at
|
||||
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf
|
||||
|
||||
# Shorter terminator for the MS-Word extensions referenced above: a single
# ']' followed by optional whitespace and '>'.
_msmarkedsectionclose = re.compile(r']\s*>')
|
||||
|
||||
# The patterns are compiled; drop the module-level name 're'.
del re
|
||||
|
||||
|
@ -53,6 +60,13 @@ class ParserBase:
|
|||
# This is some sort of declaration; in "HTML as
|
||||
# deployed," this should only be the document type
|
||||
# declaration ("<!DOCTYPE html...>").
|
||||
# ISO 8879:1986, however, has more complex
|
||||
# declaration syntax for elements in <!...>, including:
|
||||
# --comment--
|
||||
# [marked section]
|
||||
# name in the following list: ENTITY, DOCTYPE, ELEMENT,
|
||||
# ATTLIST, NOTATION, SHORTREF, USEMAP,
|
||||
# LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
|
||||
rawdata = self.rawdata
|
||||
j = i + 2
|
||||
assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
|
||||
|
@ -60,9 +74,19 @@ class ParserBase:
|
|||
# Start of comment followed by buffer boundary,
|
||||
# or just a buffer boundary.
|
||||
return -1
|
||||
# in practice, this should look like: ((name|stringlit) S*)+ '>'
|
||||
# A simple, practical version could look like: ((name|stringlit) S*) + '>'
|
||||
n = len(rawdata)
|
||||
decltype, j = self._scan_name(j, i)
|
||||
if rawdata[j:j+1] == '--': #comment
|
||||
# Locate --.*-- as the body of the comment
|
||||
return self.parse_comment(i)
|
||||
elif rawdata[j] == '[': #marked section
|
||||
# Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
|
||||
# Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
|
||||
# Note that this is extended by Microsoft Office "Save as Web" function
|
||||
# to include [if...] and [endif].
|
||||
return self.parse_marked_section(i)
|
||||
else: #all other declaration elements
|
||||
decltype, j = self._scan_name(j, i)
|
||||
if j < 0:
|
||||
return j
|
||||
if decltype == "doctype":
|
||||
|
@ -87,8 +111,15 @@ class ParserBase:
|
|||
elif c in self._decl_otherchars:
|
||||
j = j + 1
|
||||
elif c == "[":
|
||||
# this could be handled in a separate doctype parser
|
||||
if decltype == "doctype":
|
||||
j = self._parse_doctype_subset(j + 1, i)
|
||||
elif decltype in ("attlist", "linktype", "link", "element"):
|
||||
# must tolerate []'d groups in a content model in an element declaration
|
||||
# also in data attribute specifications of attlist declaration
|
||||
# also link type declaration subsets in linktype declarations
|
||||
# also link attribute specification lists in link declarations
|
||||
self.error("unsupported '[' char in %s declaration" % decltype)
|
||||
else:
|
||||
self.error("unexpected '[' char in declaration")
|
||||
else:
|
||||
|
@ -98,6 +129,42 @@ class ParserBase:
|
|||
return j
|
||||
return -1 # incomplete
|
||||
|
||||
# Internal -- parse a marked section
|
||||
# Override this to handle MS-word extension syntax <![if word]>content<![endif]>
|
||||
def parse_marked_section(self, i, report=1):
    """Parse the marked section that starts at rawdata[i] ("<![").

    The status keyword returned by _scan_name() selects the terminator:
    "temp", "cdata", "ignore", "include" and "rcdata" end with "]]>",
    while the MS Office keywords "if", "else" and "endif" end with "]>".
    Any other keyword is reported through self.error().

    When *report* is true, the text between "<![" and the terminator is
    passed to self.unknown_decl().

    Returns the index just past the terminator, the negative value from
    _scan_name() when it reports one, or -1 when no terminator is found.
    """
    rawdata = self.rawdata
    assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
    sectName, j = self._scan_name(i+3, i)
    if j < 0:
        return j
    if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
        # look for standard ]]> ending
        closer = _markedsectionclose
    elif sectName in ("if", "else", "endif"):
        # look for MS Office ]> ending
        closer = _msmarkedsectionclose
    else:
        self.error('unknown status keyword %r in marked section'
                   % rawdata[i+3:j])
        # Only reached if an overriding error() returns instead of raising;
        # bail out rather than fall through with no terminator pattern.
        return -1
    match = closer.search(rawdata, i+3)
    if not match:
        return -1
    if report:
        self.unknown_decl(rawdata[i+3:match.start(0)])
    return match.end(0)
|
||||
|
||||
# Internal -- parse comment, return length or -1 if not terminated
|
||||
def parse_comment(self, i, report=1):
    """Parse the comment that starts at rawdata[i] ("<!--").

    Calls self.error() if rawdata[i:i+4] is not "<!--".  When *report* is
    true, the text between "<!--" and the closing delimiter is passed to
    self.handle_comment().

    Returns the index just past the closing delimiter, or -1 if the
    comment is not terminated in the current buffer.
    """
    text = self.rawdata
    body_start = i + 4
    if text[i:body_start] != '<!--':
        self.error('unexpected call to parse_comment()')
    closer = _commentclose.search(text, body_start)
    if closer is None:
        return -1
    if report:
        self.handle_comment(text[body_start:closer.start(0)])
    return closer.end(0)
|
||||
|
||||
# Internal -- scan past the internal subset in a <!DOCTYPE declaration,
|
||||
# returning the index just past any whitespace following the trailing ']'.
|
||||
def _parse_doctype_subset(self, i, declstartpos):
|
||||
|
|
|
@ -30,7 +30,6 @@ shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
|
|||
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
|
||||
piclose = re.compile('>')
|
||||
endbracket = re.compile('[<>]')
|
||||
commentclose = re.compile(r'--\s*>')
|
||||
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
|
||||
attrfind = re.compile(
|
||||
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
|
||||
|
@ -145,6 +144,10 @@ class SGMLParser(markupbase.ParserBase):
|
|||
break
|
||||
continue
|
||||
if rawdata.startswith("<!--", i):
|
||||
# Strictly speaking, a comment is --.*--
|
||||
# within a declaration tag <!...>.
|
||||
# This should be removed,
|
||||
# and comments handled only in parse_declaration.
|
||||
k = self.parse_comment(i)
|
||||
if k < 0: break
|
||||
i = k
|
||||
|
@ -202,19 +205,6 @@ class SGMLParser(markupbase.ParserBase):
|
|||
self.rawdata = rawdata[i:]
|
||||
# XXX if end: check for empty stack
|
||||
|
||||
# Internal -- parse comment, return length or -1 if not terminated
|
||||
def parse_comment(self, i, report=1):
    """Scan the "<!--" comment at rawdata[i]; return its end index or -1.

    self.error() is called when rawdata[i:i+4] is not "<!--".  With a
    true *report*, the comment body is handed to self.handle_comment().
    """
    buf = self.rawdata
    if buf[i:i+4] != '<!--':
        self.error('unexpected call to parse_comment()')
    found = commentclose.search(buf, i+4)
    if found:
        if report:
            body_end = found.start(0)
            self.handle_comment(buf[i+4: body_end])
        return found.end(0)
    # no closing delimiter in the buffer yet
    return -1
|
||||
|
||||
# Extensions for the DOCTYPE scanner:
|
||||
_decl_otherchars = '='
|
||||
|
||||
|
@ -471,6 +461,10 @@ class TestSGMLParser(SGMLParser):
|
|||
self.flush()
|
||||
print '*** unknown char ref: &#' + ref + ';'
|
||||
|
||||
def unknown_decl(self, data):
    """Flush pending output, then print the declaration text in brackets."""
    self.flush()
    message = '*** unknown decl: [' + data + ']'
    print(message)
|
||||
|
||||
def close(self):
|
||||
SGMLParser.close(self)
|
||||
self.flush()
|
||||
|
|
|
@ -16,6 +16,17 @@ class AnchorCollector(htmllib.HTMLParser):
|
|||
def anchor_bgn(self, *args):
    """Record the positional arguments of this call as one tuple."""
    collected = self.__anchors
    collected.append(args)
|
||||
|
||||
class DeclCollector(htmllib.HTMLParser):
    """HTMLParser that keeps every string passed to unknown_decl()."""

    def __init__(self, *args, **kwargs):
        # Create the store first, then hand all arguments to the base class.
        self.__decls = []
        htmllib.HTMLParser.__init__(self, *args, **kwargs)

    def get_decl_info(self):
        """Return the list of collected declaration strings."""
        return self.__decls

    def unknown_decl(self, data):
        """Append the declaration text *data* to the collected list."""
        self.__decls.append(data)
|
||||
|
||||
|
||||
class HTMLParserTestCase(unittest.TestCase):
|
||||
def test_anchor_collection(self):
|
||||
|
@ -33,6 +44,22 @@ class HTMLParserTestCase(unittest.TestCase):
|
|||
('', 'frob', ''),
|
||||
])
|
||||
|
||||
def test_decl_collection(self):
    """MS-Word style marked sections are reported through unknown_decl()."""
    # See SF patch #545300
    parser = DeclCollector(formatter.NullFormatter(), verbose=1)
    parser.feed(
        """<html>
<body>
hallo
<![if !supportEmptyParas]> <![endif]>
</body>
</html>
""")
    parser.close()
    # assertEqual is the supported spelling; assertEquals is a deprecated alias.
    self.assertEqual(parser.get_decl_info(),
                     ["if !supportEmptyParas",
                      "endif"
                      ])
|
||||
|
||||
def test_main():
|
||||
test_support.run_unittest(HTMLParserTestCase)
|
||||
|
|
Loading…
Reference in New Issue