From a219efaa7be1a149ce61200fc9bc9d8831a813ef Mon Sep 17 00:00:00 2001
From: Guido van Rossum
Date: Tue, 18 Nov 1997 15:09:54 +0000
Subject: [PATCH] Sjoerd Mullender's xml parser (based on sgmllib, somewhat).

---
 Lib/xmllib.py | 568 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 568 insertions(+)
 create mode 100644 Lib/xmllib.py

diff --git a/Lib/xmllib.py b/Lib/xmllib.py
new file mode 100644
index 00000000000..38328affce5
--- /dev/null
+++ b/Lib/xmllib.py
@@ -0,0 +1,568 @@
+# A parser for XML, using the derived class as static DTD.
+# Author: Sjoerd Mullender
+
+import re
+import string
+
+
+# Regular expressions used for parsing
+
+_S = '[ \t\r\n]+'
+_opS = '[ \t\r\n]*'
+_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'
+interesting = re.compile('[&<]')
+incomplete = re.compile('&(' + _Name + '|#[0-9]*|#x[0-9a-fA-F]*)?|'
+                        '<([a-zA-Z_:][^<>]*|'
+                        '/([a-zA-Z_:][^<>]*)?|'
+                        '![^<>]*|'
+                        '\?[^<>]*)?')
+
+ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+);?')
+# entityref and charref consume one character past the reference so
+# that the caller can check whether that character is the ';' terminator.
+entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
+charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
+space = re.compile(_S)
+newline = re.compile('\n')
+
+# NOTE(review): the group names 'slash', 'proc', 'value' and the 'name'
+# group of attrfind, as well as the cdataclose/commentclose patterns, are
+# not referenced by the code visible next to these definitions -- confirm
+# them against parse_starttag(), parse_proc(), parse_cdata() and
+# parse_comment().
+starttagopen = re.compile('<' + _Name)
+endtagopen = re.compile('</')
+starttagend = re.compile(_opS + '(?P<slash>/?)>')
+endbracket = re.compile('>')
+tagfind = re.compile(_Name)
+cdataopen = re.compile('<!\[CDATA\[')
+cdataclose = re.compile('\]\]>')
+special = re.compile('<!(?P<special>[^<>]*)>')
+procopen = re.compile('<\?(?P<proc>' + _Name + ')' + _S)
+procclose = re.compile('\?>')
+commentopen = re.compile('<!--')
+commentclose = re.compile('-->')
+doubledash = re.compile('--')
+attrfind = re.compile(
+    _S + '(?P<name>' + _Name + ')'
+    '(' + _opS + '=' + _opS +
+    '(?P<value>\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))')
+
+
+# XML parser base class -- find tags and call handler functions.
+# Usage: p = XMLParser(); p.feed(data); ...; p.close().
+# The dtd is defined by deriving a class which defines methods
+# with special names to handle tags: start_foo and end_foo to handle
+# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
+# (Tags are converted to lower case for this purpose.)  The data
+# between tags is passed to the parser by calling self.handle_data()
+# with some data as argument (the data may be split up in arbitrary
+# chunks).  Entity references are passed by calling
+# self.handle_entityref() with the entity reference as argument.
+
+class XMLParser:
+
+    # Interface -- create a parser; 'verbose' is stored as given and
+    # all parsing state is initialised through reset()
+    def __init__(self, verbose=0):
+        self.verbose = verbose
+        self.reset()
+
+    # Interface -- return to the initial state.  Any text that was
+    # fed but not yet processed is thrown away
+    def reset(self):
+        # mode flags, both off initially
+        self.literal = 0
+        self.nomoretags = 0
+        # element bookkeeping
+        self.lasttag = '???'
+        self.stack = []
+        # input buffer and the line number counter (lines count from 1)
+        self.lineno = 1
+        self.rawdata = ''
+
+    # For derived classes only -- switch to literal mode (CDATA) for
+    # the rest of the input, i.e. until EOF
+    def setnomoretags(self):
+        self.nomoretags = 1
+        self.literal = 1
+
+    # For derived classes only -- switch to literal mode (CDATA);
+    # any extra arguments are accepted and ignored
+    def setliteral(self, *args):
+        self.literal = 1
+
+    # Interface -- feed some data to the parser.  Call this as
+    # often as you want, with as little or as much text as you
+    # want (may include '\n').  (This just saves the text, all the
+    # processing is done by goahead().)
+    def feed(self, data):
+        # Append to whatever is still pending, then parse as far as
+        # possible without treating the end of the buffer as EOF.
+        pending = self.rawdata
+        self.rawdata = pending + data
+        self.goahead(0)
+
+    # Interface -- process whatever is left in the buffer, treating
+    # the end of the buffer as end of input
+    def close(self):
+        self.goahead(1)
+
+    # Interface -- return a copy of 'data' in which character
+    # references (&#NNN; and &#xHHH;) are replaced by the character
+    # they denote and entity references (&name;) by their value in
+    # self.entitydefs.  Unknown entities are left in place as
+    # '&name;'.  A reference without the closing ';' is reported
+    # through self.syntax_error() and then translated anyway.
+    def translate_references(self, data):
+        pieces = []
+        pos = 0
+        match = ref.search(data, pos)
+        while match is not None:
+            start, stop = match.start(0), match.end(0)
+            if data[stop - 1] != ';':
+                self.syntax_error(self.lineno,
+                                  '; missing after entity/char reference')
+            # literal text between the previous reference and this one
+            pieces.append(data[pos:start])
+            name = match.group(1)
+            if name[:2] == '#x':
+                # hexadecimal character reference
+                replacement = chr(string.atoi(name[2:], 16))
+            elif name[:1] == '#':
+                # decimal character reference
+                replacement = chr(string.atoi(name[1:]))
+            else:
+                try:
+                    replacement = self.entitydefs[name]
+                except KeyError:
+                    # can't do it, so keep the entity ref in
+                    replacement = '&' + name + ';'
+            pieces.append(replacement)
+            pos = stop
+            match = ref.search(data, pos)
+        # no more references: the remainder is plain text
+        pieces.append(data[pos:])
+        return string.join(pieces, '')
+
+    # Internal -- handle data as far as reasonable.  May leave state
+    # and data to be processed by a subsequent call.  If 'end' is
+    # true, force handling all data as if followed by EOF marker.
+    def goahead(self, end):
+        # Scan self.rawdata from the start, dispatching plain text to
+        # handle_data() and markup to the parse_*/handle_* methods.
+        # 'i' is the index of the first unprocessed character, 'n' the
+        # buffer length.  The parse_* methods return the index just
+        # past the construct, or a negative value when the construct
+        # is not complete yet; in that case scanning stops and the
+        # rest stays in self.rawdata for the next call.  self.lineno
+        # is advanced by the number of '\n' in everything consumed.
+        # Nothing is returned.
+        rawdata = self.rawdata
+        i = 0
+        n = len(rawdata)
+        while i < n:
+            if self.nomoretags:
+                # everything that is left is plain data
+                data = rawdata[i:n]
+                self.handle_data(data)
+                self.lineno = self.lineno + string.count(data, '\n')
+                i = n
+                break
+            res = interesting.search(rawdata, i)
+            if res:
+                j = res.start(0)
+            else:
+                j = n
+            if i < j:
+                # plain text up to the next '<' or '&'
+                data = rawdata[i:j]
+                self.handle_data(data)
+                self.lineno = self.lineno + string.count(data, '\n')
+            i = j
+            if i == n: break
+            if rawdata[i] == '<':
+                if starttagopen.match(rawdata, i):
+                    if self.literal:
+                        # in literal mode the '<' is ordinary data
+                        data = rawdata[i]
+                        self.handle_data(data)
+                        self.lineno = self.lineno + string.count(data, '\n')
+                        i = i+1
+                        continue
+                    k = self.parse_starttag(i)
+                    if k < 0: break
+                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
+                    i = k
+                    continue
+                if endtagopen.match(rawdata, i):
+                    k = self.parse_endtag(i)
+                    if k < 0: break
+                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
+                    i = k
+                    # an end tag always terminates literal mode
+                    self.literal = 0
+                    continue
+                if commentopen.match(rawdata, i):
+                    if self.literal:
+                        data = rawdata[i]
+                        self.handle_data(data)
+                        self.lineno = self.lineno + string.count(data, '\n')
+                        i = i+1
+                        continue
+                    k = self.parse_comment(i)
+                    if k < 0: break
+                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
+                    i = k
+                    continue
+                if cdataopen.match(rawdata, i):
+                    k = self.parse_cdata(i)
+                    if k < 0: break
+                    # count the newlines of the whole section, i.e.
+                    # rawdata[i:k]; the empty slice rawdata[i:i] used
+                    # here before never advanced the line number
+                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
+                    i = k
+                    continue
+                res = procopen.match(rawdata, i)
+                if res:
+                    k = self.parse_proc(i, res)
+                    if k < 0: break
+                    self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
+                    i = k
+                    continue
+                res = special.match(rawdata, i)
+                if res:
+                    if self.literal:
+                        data = rawdata[i]
+                        self.handle_data(data)
+                        self.lineno = self.lineno + string.count(data, '\n')
+                        i = i+1
+                        continue
+                    self.handle_special(res.group('special'))
+                    self.lineno = self.lineno + string.count(res.group(0), '\n')
+                    i = res.end(0)
+                    continue
+            elif rawdata[i] == '&':
+                res = charref.match(rawdata, i)
+                if res is not None:
+                    i = res.end(0)
+                    if rawdata[i-1] != ';':
+                        self.syntax_error(self.lineno, '; missing in charref')
+                        # push the terminating character back
+                        i = i-1
+                    self.handle_charref(res.group('char')[:-1])
+                    # count only what was really consumed: a pushed
+                    # back character is counted when it is rescanned
+                    self.lineno = self.lineno + string.count(
+                        rawdata[res.start(0):i], '\n')
+                    continue
+                res = entityref.match(rawdata, i)
+                if res is not None:
+                    i = res.end(0)
+                    if rawdata[i-1] != ';':
+                        self.syntax_error(self.lineno, '; missing in entityref')
+                        # push the terminating character back
+                        i = i-1
+                    self.handle_entityref(res.group('name'))
+                    # see the charref case above
+                    self.lineno = self.lineno + string.count(
+                        rawdata[res.start(0):i], '\n')
+                    continue
+            else:
+                raise RuntimeError, 'neither < nor & ??'
+            # We get here only if incomplete matches but
+            # nothing else
+            res = incomplete.match(rawdata, i)
+            if not res:
+                data = rawdata[i]
+                self.handle_data(data)
+                self.lineno = self.lineno + string.count(data, '\n')
+                i = i+1
+                continue
+            j = res.end(0)
+            if j == n:
+                break # Really incomplete
+            self.syntax_error(self.lineno, 'bogus < or &')
+            data = res.group(0)
+            self.handle_data(data)
+            self.lineno = self.lineno + string.count(data, '\n')
+            i = j
+        # end while
+        if end and i < n:
+            # at EOF whatever could not be parsed is passed on as data
+            data = rawdata[i:n]
+            self.handle_data(data)
+            self.lineno = self.lineno + string.count(data, '\n')
+            i = n
+        # keep the unprocessed tail for the next call
+        self.rawdata = rawdata[i:]
+        # XXX if end: check for empty stack
+
+    # Internal -- parse comment, return length or -1 if not terminated
+    def parse_comment(self, i):
+        rawdata = self.rawdata
+        if rawdata[i:i+4] <> '