From 68f8a8061d4f217ec97d8dc19925ba3b57621be1 Mon Sep 17 00:00:00 2001
From: Fred Drake <fdrake@acm.org>
Date: Mon, 24 Sep 2001 20:01:28 +0000
Subject: [PATCH] New base class for the SGMLParser and HTMLParser classes from
 the sgmllib and HTMLParser modules (and indirectly for the htmllib.HTMLParser
 class).

This has all the support for scanning over DOCTYPE declarations; it warrants
having a base class since this is a fair amount of tedious code (since it's
fairly strict), and should be in a separate module to avoid compiling many
REs that are not used (which would happen if this were placed in either then
sgmllib or HTMLParser module).
---
 Lib/markupbase.py | 306 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 306 insertions(+)
 create mode 100644 Lib/markupbase.py
diff --git a/Lib/markupbase.py b/Lib/markupbase.py
new file mode 100644
index 00000000000..0bb7c893b05
--- /dev/null
+++ b/Lib/markupbase.py
@@ -0,0 +1,306 @@
+"""Shared support for scanning document type declarations in HTML and XHTML."""
+
+import re
+import string
+
+_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
+_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
+
+del re
+
+
+class ParserBase:
+    """Parser base class which provides some common support methods used
+    by the SGML/HTML and XHTML parsers."""
+
+    def reset(self):
+        self.lineno = 1
+        self.offset = 0
+
+    def getpos(self):
+        """Return current line number and offset."""
+        return self.lineno, self.offset
+
+    # Internal -- update line number and offset.  This should be
+    # called for each piece of data exactly once, in order -- in other
+    # words the concatenation of all the input strings to this
+    # function should be exactly the entire input.
+    def updatepos(self, i, j):
+        if i >= j:
+            return j
+        rawdata = self.rawdata
+        nlines = string.count(rawdata, "\n", i, j)
+        if nlines:
+            self.lineno = self.lineno + nlines
+            pos = string.rindex(rawdata, "\n", i, j) # Should not fail
+            self.offset = j-(pos+1)
+        else:
+            self.offset = self.offset + j-i
+        return j
+
+    _decl_otherchars = ''
+
+    # Internal -- parse declaration (for use by subclasses).
+    def parse_declaration(self, i):
+        # This is some sort of declaration; in "HTML as
+        # deployed," this should only be the document type
+        # declaration ("<!DOCTYPE html...>").
+        rawdata = self.rawdata
+        import sys
+        j = i + 2
+        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
+        if rawdata[j:j+1] in ("-", ""):
+            # Start of comment followed by buffer boundary,
+            # or just a buffer boundary.
+            return -1
+        # in practice, this should look like: ((name|stringlit) S*)+ '>'
+        n = len(rawdata)
+        decltype, j = self._scan_name(j, i)
+        if j < 0:
+            return j
+        if decltype == "doctype":
+            self._decl_otherchars = ''
+        while j < n:
+            c = rawdata[j]
+            if c == ">":
+                # end of declaration syntax
+                data = rawdata[i+2:j]
+                if decltype == "doctype":
+                    self.handle_decl(data)
+                else:
+                    self.unknown_decl(data)
+                return j + 1
+            if c in "\"'":
+                m = _declstringlit_match(rawdata, j)
+                if not m:
+                    return -1 # incomplete
+                j = m.end()
+            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
+                name, j = self._scan_name(j, i)
+            elif c in self._decl_otherchars:
+                j = j + 1
+            elif c == "[":
+                if decltype == "doctype":
+                    j = self._parse_doctype_subset(j + 1, i)
+                else:
+                    self.error("unexpected '[' char in declaration")
+            else:
+                self.error(
+                    "unexpected %s char in declaration" % `rawdata[j]`)
+            if j < 0:
+                return j
+        return -1 # incomplete
+
+    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
+    # returning the index just past any whitespace following the trailing ']'.
+    def _parse_doctype_subset(self, i, declstartpos):
+        rawdata = self.rawdata
+        n = len(rawdata)
+        j = i
+        while j < n:
+            c = rawdata[j]
+            if c == "<":
+                s = rawdata[j:j+2]
+                if s == "<":
+                    # end of buffer; incomplete
+                    return -1
+                if s != "<!":
+                    self.updatepos(declstartpos, j + 1)
+                    self.error("unexpected char in internal subset (in %s)"
+                               % `s`)
+                if (j + 2) == n:
+                    # end of buffer; incomplete
+                    return -1
+                if (j + 4) > n:
+                    # end of buffer; incomplete
+                    return -1
+                if rawdata[j:j+4] == "<!--":
+                    j = self.parse_comment(j, report=0)
+                    if j < 0:
+                        return j
+                    continue
+                name, j = self._scan_name(j + 2, declstartpos)
+                if j == -1:
+                    return -1
+                if name not in ("attlist", "element", "entity", "notation"):
+                    self.updatepos(declstartpos, j + 2)
+                    self.error(
+                        "unknown declaration %s in internal subset" % `name`)
+                # handle the individual names
+                meth = getattr(self, "_parse_doctype_" + name)
+                j = meth(j, declstartpos)
+                if j < 0:
+                    return j
+            elif c == "%":
+                # parameter entity reference
+                if (j + 1) == n:
+                    # end of buffer; incomplete
+                    return -1
+                s, j = self._scan_name(j + 1, declstartpos)
+                if j < 0:
+                    return j
+                if rawdata[j] == ";":
+                    j = j + 1
+            elif c == "]":
+                j = j + 1
+                while j < n and rawdata[j] in string.whitespace:
+                    j = j + 1
+                if j < n:
+                    if rawdata[j] == ">":
+                        return j
+                    self.updatepos(declstartpos, j)
+                    self.error("unexpected char after internal subset")
+                else:
+                    return -1
+            elif c in string.whitespace:
+                j = j + 1
+            else:
+                self.updatepos(declstartpos, j)
+                self.error("unexpected char %s in internal subset" % `c`)
+        # end of buffer reached
+        return -1
+
+    # Internal -- scan past <!ELEMENT declarations
+    def _parse_doctype_element(self, i, declstartpos):
+        rawdata = self.rawdata
+        n = len(rawdata)
+        name, j = self._scan_name(i, declstartpos)
+        if j == -1:
+            return -1
+        # style content model; just skip until '>'
+        if '>' in rawdata[j:]:
+            return string.find(rawdata, ">", j) + 1
+        return -1
+
+    # Internal -- scan past <!ATTLIST declarations
+    def _parse_doctype_attlist(self, i, declstartpos):
+        rawdata = self.rawdata
+        name, j = self._scan_name(i, declstartpos)
+        c = rawdata[j:j+1]
+        if c == "":
+            return -1
+        if c == ">":
+            return j + 1
+        while 1:
+            # scan a series of attribute descriptions; simplified:
+            #   name type [value] [#constraint]
+            name, j = self._scan_name(j, declstartpos)
+            if j < 0:
+                return j
+            c = rawdata[j:j+1]
+            if c == "":
+                return -1
+            if c == "(":
+                # an enumerated type; look for ')'
+                if ")" in rawdata[j:]:
+                    j = string.find(rawdata, ")", j) + 1
+                else:
+                    return -1
+                while rawdata[j:j+1] in string.whitespace:
+                    j = j + 1
+                if not rawdata[j:]:
+                    # end of buffer, incomplete
+                    return -1
+            else:
+                name, j = self._scan_name(j, declstartpos)
+            c = rawdata[j:j+1]
+            if not c:
+                return -1
+            if c in "'\"":
+                m = _declstringlit_match(rawdata, j)
+                if m:
+                    j = m.end()
+                else:
+                    return -1
+                c = rawdata[j:j+1]
+                if not c:
+                    return -1
+            if c == "#":
+                if rawdata[j:] == "#":
+                    # end of buffer
+                    return -1
+                name, j = self._scan_name(j + 1, declstartpos)
+                if j < 0:
+                    return j
+                c = rawdata[j:j+1]
+                if not c:
+                    return -1
+            if c == '>':
+                # all done
+                return j + 1
+
+    # Internal -- scan past <!NOTATION declarations
+    def _parse_doctype_notation(self, i, declstartpos):
+        name, j = self._scan_name(i, declstartpos)
+        if j < 0:
+            return j
+        rawdata = self.rawdata
+        while 1:
+            c = rawdata[j:j+1]
+            if not c:
+                # end of buffer; incomplete
+                return -1
+            if c == '>':
+                return j + 1
+            if c in "'\"":
+                m = _declstringlit_match(rawdata, j)
+                if not m:
+                    return -1
+                j = m.end()
+            else:
+                name, j = self._scan_name(j, declstartpos)
+                if j < 0:
+                    return j
+
+    # Internal -- scan past <!ENTITY declarations
+    def _parse_doctype_entity(self, i, declstartpos):
+        rawdata = self.rawdata
+        if rawdata[i:i+1] == "%":
+            j = i + 1
+            while 1:
+                c = rawdata[j:j+1]
+                if not c:
+                    return -1
+                if c in string.whitespace:
+                    j = j + 1
+                else:
+                    break
+        else:
+            j = i
+        name, j = self._scan_name(j, declstartpos)
+        if j < 0:
+            return j
+        while 1:
+            c = self.rawdata[j:j+1]
+            if not c:
+                return -1
+            if c in "'\"":
+                m = _declstringlit_match(rawdata, j)
+                if m:
+                    j = m.end()
+                else:
+                    return -1    # incomplete
+            elif c == ">":
+                return j + 1
+            else:
+                name, j = self._scan_name(j, declstartpos)
+                if j < 0:
+                    return j
+
+    # Internal -- scan a name token and the new position and the token, or
+    # return -1 if we've reached the end of the buffer.
+    def _scan_name(self, i, declstartpos):
+        rawdata = self.rawdata
+        n = len(rawdata)
+        if i == n:
+            return None, -1
+        m = _declname_match(rawdata, i)
+        if m:
+            s = m.group()
+            name = s.strip()
+            if (i + len(s)) == n:
+                return None, -1  # end of buffer
+            return name.lower(), m.end()
+        else:
+            self.updatepos(declstartpos, i)
+            self.error("expected name token", self.getpos())