In CDATA mode, make sure entity-reference syntax is not interpreted;

entity references are not allowed in that mode. Do a better job of scanning <!DOCTYPE ...> declarations; based on the code in HTMLParser.py.
2001-07-16 18:30:35 +00:00 · 2001-07-16 18:30:35 +00:00 · fb38c76e0f
parent e16c7aee4b
commit fb38c76e0f
1 changed files with 27 additions and 9 deletions
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py
@ -5,7 +5,8 @@
 # XXX There should be a way to distinguish between PCDATA (parsed
 # character data -- the normal case), RCDATA (replaceable character
 # data -- only char and entity references and end tags are special)
-# and CDATA (character data -- only end tags are special).
+# and CDATA (character data -- only end tags are special).  RCDATA is
+# not supported at all.


 import re
@ -34,6 +35,9 @@ endbracket = re.compile('[<>]')
 special = re.compile('<![^<>]*>')
 commentopen = re.compile('<!--')
 commentclose = re.compile(r'--\s*>')
+declopen = re.compile('<!')
+declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
+declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@ -160,6 +164,10 @@ class SGMLParser:
                    i = k
                    continue
            elif rawdata[i] == '&':
+                if self.literal:
+                    self.handle_data(rawdata[i])
+                    i = i+1
+                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
@ -210,11 +218,20 @@ class SGMLParser:

    # Internal -- parse declaration.
    def parse_declaration(self, i):
+        # This is some sort of declaration; in "HTML as
+        # deployed," this should only be the document type
+        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
+        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
+        if rawdata[j:j+1] in ("-", ""):
+            # Start of comment followed by buffer boundary,
+            # or just a buffer boundary.
+            return -1
+        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        while j < n:
-            c = rawdata[j:j+1]
+            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                self.handle_decl(rawdata[i+2:j])
@ -222,15 +239,16 @@ class SGMLParser:
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
                if not m:
-                    # incomplete or an error?
-                    return -1
+                    return -1 # incomplete
+                j = m.end()
+            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
+                m = declname.match(rawdata, j)
+                if not m:
+                    return -1 # incomplete
                j = m.end()
            else:
-                m = decldata.match(rawdata, j)
-                if not m:
-                    # incomplete or an error?
-                    return -1
-                j = m.end()
+                raise SGMLParseError(
+                    "unexpected char in declaration: %s" % `rawdata[j]`)
        # end of buffer between tokens
        return -1