"""HTML 2.0 parser.
See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""
import sys
import regsub
import string
from sgmllib import SGMLParser
from formatter import AS_IS
class HTMLParser(SGMLParser):
def __init__(self, formatter):
SGMLParser.__init__(self)
self.formatter = formatter
self.savedata = None
self.isindex = 0
self.title = None
self.base = None
self.anchor = None
self.anchorlist = []
self.nofill = 0
self.list_stack = []
# ------ Methods used internally; some may be overridden
# --- Formatter interface, taking care of 'savedata' mode;
# shouldn't need to be overridden
def handle_data(self, data):
if self.savedata is not None:
self.savedata = self.savedata + data
else:
if self.nofill:
self.formatter.add_literal_data(data)
else:
self.formatter.add_flowing_data(data)
# --- Hooks to save data; shouldn't need to be overridden
def save_bgn(self):
self.savedata = ''
def save_end(self):
data = self.savedata
self.savedata = None
return string.join(string.split(data))
# --- Hooks for anchors; should probably be overridden
def anchor_bgn(self, href, name, type):
self.anchor = href
if self.anchor:
self.anchorlist.append(href)
def anchor_end(self):
if self.anchor:
self.handle_data("[%d]" % len(self.anchorlist))
self.anchor = None
# --- Hook for images; should probably be overridden
def handle_image(self, src, alt):
self.handle_data(alt)
# --- Hooks for forms; should probably be overridden
def form_bgn(self, action, method, enctype):
    """Default hook for the start of a form.

    Calls self.do_p([]) before and after passing a placeholder string
    to handle_data().  The action, method and enctype arguments are
    accepted but not used by this default implementation.
    """
    self.do_p([])
    # NOTE(review): this literal was split across two physical lines,
    # which a single-quoted string cannot do (syntax error).  It is
    # written with an escape here so the same one-newline value parses.
    # The literal may originally have held a markup placeholder that
    # was lost in extraction -- confirm against the upstream source.
    self.handle_data("\n")
    self.do_p([])
def handle_input(self, type, options):
self.handle_data("")
def select_bgn(self, name, size, multiple):
self.handle_data("")
def handle_option(self, value, selected):
self.handle_data("