major rewrite using different formatting paradigm
This commit is contained in:
parent
145b2e0168
commit
7ff5d7f7c7
958
Lib/htmllib.py
958
Lib/htmllib.py
|
@ -1,633 +1,377 @@
|
|||
# A parser for HTML documents
|
||||
# New HTML class
|
||||
|
||||
# XXX Check against HTML 2.0 spec
|
||||
|
||||
# XXX reorder methods according to hierarchy
|
||||
# - html structure: head, body, title, isindex
|
||||
# - headers
|
||||
# - lists, items
|
||||
# - paragraph styles
|
||||
# - forms
|
||||
# - character styles
|
||||
# - images
|
||||
# - bookkeeping
|
||||
# - output generation
|
||||
|
||||
|
||||
# HTML: HyperText Markup Language; an SGML-like syntax used by WWW to
|
||||
# describe hypertext documents
|
||||
#
|
||||
# SGML: Standard Generalized Markup Language
|
||||
#
|
||||
# WWW: World-Wide Web; a distributed hypertext system developed at CERN
|
||||
#
|
||||
# CERN: European Particle Physics Laboratory in Geneva, Switzerland
|
||||
|
||||
|
||||
# This file is only concerned with parsing and formatting HTML
|
||||
# documents, not with the other (hypertext and networking) aspects of
|
||||
# the WWW project. (It does support highlighting of anchors.)
|
||||
|
||||
|
||||
import os
|
||||
import sys
|
||||
import regex
|
||||
import regsub
|
||||
import string
|
||||
import sgmllib
|
||||
from sgmllib import SGMLParser
|
||||
|
||||
|
||||
class HTMLParser(sgmllib.SGMLParser):
|
||||
|
||||
# Copy base class entities and add some
|
||||
entitydefs = {}
|
||||
for key in sgmllib.SGMLParser.entitydefs.keys():
|
||||
entitydefs[key] = sgmllib.SGMLParser.entitydefs[key]
|
||||
entitydefs['bullet'] = '*'
|
||||
|
||||
# Provided -- handlers for tags introducing literal text
|
||||
|
||||
def start_listing(self, attrs):
|
||||
self.setliteral('listing')
|
||||
self.literal_bgn('listing', attrs)
|
||||
|
||||
def end_listing(self):
|
||||
self.literal_end('listing')
|
||||
|
||||
def start_xmp(self, attrs):
|
||||
self.setliteral('xmp')
|
||||
self.literal_bgn('xmp', attrs)
|
||||
|
||||
def end_xmp(self):
|
||||
self.literal_end('xmp')
|
||||
|
||||
def do_plaintext(self, attrs):
|
||||
self.setnomoretags()
|
||||
self.literal_bgn('plaintext', attrs)
|
||||
|
||||
# To be overridden -- begin/end literal mode
|
||||
def literal_bgn(self, tag, attrs): pass
|
||||
def literal_end(self, tag): pass
|
||||
|
||||
|
||||
# Next level of sophistication -- collect anchors, title, nextid and isindex
|
||||
class CollectingParser(HTMLParser):
|
||||
#
|
||||
def __init__(self):
|
||||
HTMLParser.__init__(self)
|
||||
self.savetext = None
|
||||
self.nextid = []
|
||||
self.isindex = 0
|
||||
self.title = ''
|
||||
self.inanchor = 0
|
||||
self.anchors = []
|
||||
self.anchornames = []
|
||||
self.anchortypes = []
|
||||
#
|
||||
def start_a(self, attrs):
    # Handle <a ...>: record the anchor's href, name and type in the
    # parallel lists self.anchors / self.anchornames / self.anchortypes
    # and set self.inanchor to its 1-based index (negated when the
    # anchor has no href).  An anchor with neither href nor name is
    # ignored and leaves self.inanchor at 0.
    #
    # attrs is a sequence of (attrname, value) pairs.
    self.inanchor = 0
    href = ''
    name = ''
    type = ''
    for attrname, value in attrs:
        if attrname == 'href':
            href = value
        # Attribute names carry no trailing '=' (compare 'href' above);
        # testing for 'name=' / 'type=' could never match.
        if attrname == 'name':
            name = value
        if attrname == 'type':
            type = string.lower(value)
    if not (href or name):
        return
    self.anchors.append(href)
    self.anchornames.append(name)
    self.anchortypes.append(type)
    self.inanchor = len(self.anchors)
    if not href:
        # Name-only anchor: flag it with a negative index
        self.inanchor = -self.inanchor
|
||||
#
|
||||
def end_a(self):
|
||||
if self.inanchor > 0:
|
||||
# Don't show anchors pointing into the current document
|
||||
if self.anchors[self.inanchor-1][:1] <> '#':
|
||||
self.handle_data('[' + `self.inanchor` + ']')
|
||||
self.inanchor = 0
|
||||
#
|
||||
def start_html(self, attrs): pass
|
||||
def end_html(self): pass
|
||||
#
|
||||
def start_head(self, attrs): pass
|
||||
def end_head(self): pass
|
||||
#
|
||||
def start_body(self, attrs): pass
|
||||
def end_body(self): pass
|
||||
#
|
||||
def do_nextid(self, attrs):
|
||||
self.nextid = attrs
|
||||
#
|
||||
def do_isindex(self, attrs):
|
||||
self.isindex = 1
|
||||
#
|
||||
def start_title(self, attrs):
|
||||
self.savetext = ''
|
||||
#
|
||||
def end_title(self):
|
||||
if self.savetext <> None:
|
||||
self.title = self.savetext
|
||||
self.savetext = None
|
||||
#
|
||||
def handle_data(self, text):
|
||||
if self.savetext is not None:
|
||||
self.savetext = self.savetext + text
|
||||
|
||||
|
||||
# Formatting parser -- takes a formatter and a style sheet as arguments
|
||||
|
||||
# XXX The use of style sheets should change: for each tag and end tag
|
||||
# there should be a style definition, and a style definition should
|
||||
# encompass many more parameters: font, justification, indentation,
|
||||
# vspace before, vspace after, hanging tag...
|
||||
|
||||
wordprog = regex.compile('[^ \t\n]*')
|
||||
spaceprog = regex.compile('[ \t\n]*')
|
||||
|
||||
class FormattingParser(CollectingParser):
|
||||
|
||||
def __init__(self, formatter, stylesheet):
|
||||
CollectingParser.__init__(self)
|
||||
self.fmt = formatter
|
||||
self.stl = stylesheet
|
||||
self.savetext = None
|
||||
self.compact = 0
|
||||
self.nofill = 0
|
||||
self.resetfont()
|
||||
self.setindent(self.stl.stdindent)
|
||||
|
||||
def resetfont(self):
|
||||
self.fontstack = []
|
||||
self.stylestack = []
|
||||
self.fontset = self.stl.stdfontset
|
||||
self.style = ROMAN
|
||||
self.passfont()
|
||||
|
||||
def passfont(self):
|
||||
font = self.fontset[self.style]
|
||||
self.fmt.setfont(font)
|
||||
|
||||
def pushstyle(self, style):
|
||||
self.stylestack.append(self.style)
|
||||
self.style = min(style, len(self.fontset)-1)
|
||||
self.passfont()
|
||||
|
||||
def popstyle(self):
|
||||
self.style = self.stylestack[-1]
|
||||
del self.stylestack[-1]
|
||||
self.passfont()
|
||||
|
||||
def pushfontset(self, fontset, style):
|
||||
self.fontstack.append(self.fontset)
|
||||
self.fontset = fontset
|
||||
self.pushstyle(style)
|
||||
|
||||
def popfontset(self):
|
||||
self.fontset = self.fontstack[-1]
|
||||
del self.fontstack[-1]
|
||||
self.popstyle()
|
||||
|
||||
def flush(self):
|
||||
self.fmt.flush()
|
||||
|
||||
def setindent(self, n):
|
||||
self.fmt.setleftindent(n)
|
||||
|
||||
def needvspace(self, n):
|
||||
self.fmt.needvspace(n)
|
||||
|
||||
def close(self):
|
||||
HTMLParser.close(self)
|
||||
self.fmt.flush()
|
||||
|
||||
def handle_literal(self, text):
|
||||
lines = string.splitfields(text, '\n')
|
||||
for i in range(1, len(lines)):
|
||||
lines[i] = string.expandtabs(lines[i], 8)
|
||||
for line in lines[:-1]:
|
||||
self.fmt.addword(line, 0)
|
||||
self.fmt.flush()
|
||||
self.fmt.nospace = 0
|
||||
for line in lines[-1:]:
|
||||
self.fmt.addword(line, 0)
|
||||
|
||||
def handle_data(self, text):
|
||||
if self.savetext is not None:
|
||||
self.savetext = self.savetext + text
|
||||
return
|
||||
if self.literal:
|
||||
self.handle_literal(text)
|
||||
return
|
||||
i = 0
|
||||
n = len(text)
|
||||
while i < n:
|
||||
j = i + wordprog.match(text, i)
|
||||
word = text[i:j]
|
||||
i = j + spaceprog.match(text, j)
|
||||
self.fmt.addword(word, i-j)
|
||||
if self.nofill and '\n' in text[j:i]:
|
||||
self.fmt.flush()
|
||||
self.fmt.nospace = 0
|
||||
i = j+1
|
||||
while text[i-1] <> '\n': i = i+1
|
||||
|
||||
def literal_bgn(self, tag, attrs):
|
||||
if tag == 'plaintext':
|
||||
self.flush()
|
||||
else:
|
||||
self.needvspace(1)
|
||||
self.pushfontset(self.stl.stdfontset, FIXED)
|
||||
self.setindent(self.stl.literalindent)
|
||||
|
||||
def literal_end(self, tag):
|
||||
self.needvspace(1)
|
||||
self.popfontset()
|
||||
self.setindent(self.stl.stdindent)
|
||||
|
||||
def start_title(self, attrs):
|
||||
self.flush()
|
||||
self.savetext = ''
|
||||
# NB end_title is unchanged
|
||||
|
||||
def do_p(self, attrs):
|
||||
if self.compact:
|
||||
self.flush()
|
||||
else:
|
||||
self.needvspace(1)
|
||||
|
||||
def start_h1(self, attrs):
|
||||
self.needvspace(2)
|
||||
self.setindent(self.stl.h1indent)
|
||||
self.pushfontset(self.stl.h1fontset, BOLD)
|
||||
self.fmt.setjust('c')
|
||||
|
||||
def end_h1(self):
|
||||
self.popfontset()
|
||||
self.needvspace(2)
|
||||
self.setindent(self.stl.stdindent)
|
||||
self.fmt.setjust('l')
|
||||
|
||||
def start_h2(self, attrs):
|
||||
self.needvspace(1)
|
||||
self.setindent(self.stl.h2indent)
|
||||
self.pushfontset(self.stl.h2fontset, BOLD)
|
||||
|
||||
def end_h2(self):
|
||||
self.popfontset()
|
||||
self.needvspace(1)
|
||||
self.setindent(self.stl.stdindent)
|
||||
|
||||
def start_h3(self, attrs):
|
||||
self.needvspace(1)
|
||||
self.setindent(self.stl.stdindent)
|
||||
self.pushfontset(self.stl.h3fontset, BOLD)
|
||||
|
||||
def end_h3(self):
|
||||
self.popfontset()
|
||||
self.needvspace(1)
|
||||
self.setindent(self.stl.stdindent)
|
||||
|
||||
def start_h4(self, attrs):
|
||||
self.needvspace(1)
|
||||
self.setindent(self.stl.stdindent)
|
||||
self.pushfontset(self.stl.stdfontset, BOLD)
|
||||
|
||||
def end_h4(self):
|
||||
self.popfontset()
|
||||
self.needvspace(1)
|
||||
self.setindent(self.stl.stdindent)
|
||||
|
||||
start_h5 = start_h4
|
||||
end_h5 = end_h4
|
||||
|
||||
start_h6 = start_h5
|
||||
end_h6 = end_h5
|
||||
|
||||
start_h7 = start_h6
|
||||
end_h7 = end_h6
|
||||
|
||||
def start_ul(self, attrs):
|
||||
self.needvspace(1)
|
||||
for attrname, value in attrs:
|
||||
if attrname == 'compact':
|
||||
self.compact = 1
|
||||
self.setindent(0)
|
||||
break
|
||||
else:
|
||||
self.setindent(self.stl.ulindent)
|
||||
|
||||
start_dir = start_menu = start_ol = start_ul
|
||||
|
||||
do_li = do_p
|
||||
|
||||
def end_ul(self):
|
||||
self.compact = 0
|
||||
self.needvspace(1)
|
||||
self.setindent(self.stl.stdindent)
|
||||
|
||||
end_dir = end_menu = end_ol = end_ul
|
||||
|
||||
def start_dl(self, attrs):
|
||||
for attrname, value in attrs:
|
||||
if attrname == 'compact':
|
||||
self.compact = 1
|
||||
self.needvspace(1)
|
||||
|
||||
def end_dl(self):
|
||||
self.compact = 0
|
||||
self.needvspace(1)
|
||||
self.setindent(self.stl.stdindent)
|
||||
|
||||
def do_dt(self, attrs):
|
||||
if self.compact:
|
||||
self.flush()
|
||||
else:
|
||||
self.needvspace(1)
|
||||
self.setindent(self.stl.stdindent)
|
||||
|
||||
def do_dd(self, attrs):
|
||||
self.fmt.addword('', 1)
|
||||
self.setindent(self.stl.ddindent)
|
||||
|
||||
def start_address(self, attrs):
|
||||
self.compact = 1
|
||||
self.needvspace(1)
|
||||
self.fmt.setjust('r')
|
||||
|
||||
def end_address(self):
|
||||
self.compact = 0
|
||||
self.needvspace(1)
|
||||
self.setindent(self.stl.stdindent)
|
||||
self.fmt.setjust('l')
|
||||
|
||||
def start_pre(self, attrs):
|
||||
self.needvspace(1)
|
||||
self.nofill = self.nofill + 1
|
||||
self.pushstyle(FIXED)
|
||||
|
||||
def end_pre(self):
|
||||
self.popstyle()
|
||||
self.nofill = self.nofill - 1
|
||||
self.needvspace(1)
|
||||
|
||||
start_typewriter = start_pre
|
||||
end_typewriter = end_pre
|
||||
|
||||
def do_img(self, attrs):
|
||||
self.fmt.addword('(image)', 0)
|
||||
|
||||
# Physical styles
|
||||
|
||||
def start_tt(self, attrs): self.pushstyle(FIXED)
|
||||
def end_tt(self): self.popstyle()
|
||||
|
||||
def start_b(self, attrs): self.pushstyle(BOLD)
|
||||
def end_b(self): self.popstyle()
|
||||
|
||||
def start_i(self, attrs): self.pushstyle(ITALIC)
|
||||
def end_i(self): self.popstyle()
|
||||
|
||||
def start_u(self, attrs): self.pushstyle(ITALIC) # Underline???
|
||||
def end_u(self): self.popstyle()
|
||||
|
||||
def start_r(self, attrs): self.pushstyle(ROMAN) # Not official
|
||||
def end_r(self): self.popstyle()
|
||||
|
||||
# Logical styles
|
||||
|
||||
start_em = start_i
|
||||
end_em = end_i
|
||||
|
||||
start_strong = start_b
|
||||
end_strong = end_b
|
||||
|
||||
start_code = start_tt
|
||||
end_code = end_tt
|
||||
|
||||
start_samp = start_tt
|
||||
end_samp = end_tt
|
||||
|
||||
start_kbd = start_tt
|
||||
end_kbd = end_tt
|
||||
|
||||
start_file = start_tt # unofficial
|
||||
end_file = end_tt
|
||||
|
||||
start_var = start_i
|
||||
end_var = end_i
|
||||
|
||||
start_dfn = start_i
|
||||
end_dfn = end_i
|
||||
|
||||
start_cite = start_i
|
||||
end_cite = end_i
|
||||
|
||||
start_hp1 = start_i
|
||||
end_hp1 = end_i  # </hp1> must pop the style that start_hp1 pushed (was aliased to start_i)
|
||||
|
||||
start_hp2 = start_b
|
||||
end_hp2 = end_b
|
||||
|
||||
def unknown_starttag(self, tag, attrs):
|
||||
print '*** unknown <' + tag + '>'
|
||||
|
||||
def unknown_endtag(self, tag):
|
||||
print '*** unknown </' + tag + '>'
|
||||
|
||||
|
||||
# An extension of the formatting parser which formats anchors differently.
|
||||
class AnchoringParser(FormattingParser):
|
||||
|
||||
def start_a(self, attrs):
|
||||
FormattingParser.start_a(self, attrs)
|
||||
if self.inanchor:
|
||||
self.fmt.bgn_anchor(self.inanchor)
|
||||
|
||||
def end_a(self):
|
||||
if self.inanchor:
|
||||
self.fmt.end_anchor(self.inanchor)
|
||||
self.inanchor = 0
|
||||
|
||||
|
||||
# Style sheet -- this is never instantiated, but the attributes
|
||||
# of the class object itself are used to specify fonts to be used
|
||||
# for various paragraph styles.
|
||||
# A font set is a non-empty list of fonts, in the order:
|
||||
# [roman, italic, bold, fixed].
|
||||
# When a style is not available the nearest lower style is used
|
||||
|
||||
ROMAN = 0
|
||||
ITALIC = 1
|
||||
BOLD = 2
|
||||
FIXED = 3
|
||||
|
||||
class NullStylesheet:
|
||||
# Fonts -- none
|
||||
stdfontset = [None]
|
||||
h1fontset = [None]
|
||||
h2fontset = [None]
|
||||
h3fontset = [None]
|
||||
# Indents
|
||||
stdindent = 2
|
||||
ddindent = 25
|
||||
ulindent = 4
|
||||
h1indent = 0
|
||||
h2indent = 0
|
||||
literalindent = 0
|
||||
|
||||
class HTMLParser(SGMLParser):
|
||||
|
||||
class X11Stylesheet(NullStylesheet):
|
||||
stdfontset = [
|
||||
'-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*',
|
||||
'-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*',
|
||||
'-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*',
|
||||
'-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*',
|
||||
]
|
||||
h1fontset = [
|
||||
'-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*',
|
||||
'-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*',
|
||||
'-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*',
|
||||
]
|
||||
h2fontset = [
|
||||
'-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*',
|
||||
'-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*',
|
||||
'-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*',
|
||||
]
|
||||
h3fontset = [
|
||||
'-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*',
|
||||
'-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*',
|
||||
'-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*',
|
||||
]
|
||||
ddindent = 40
|
||||
def __init__(self):
|
||||
SGMLParser.__init__(self)
|
||||
self.savedata = None
|
||||
self.isindex = 0
|
||||
self.title = ''
|
||||
self.para = None
|
||||
self.lists = []
|
||||
self.styles = []
|
||||
self.nofill = 0
|
||||
self.nospace = 1
|
||||
self.softspace = 0
|
||||
|
||||
# --- Data
|
||||
|
||||
# Style sheet with Macintosh fonts.  Each font set lists its fonts in
# the order [roman, italic, bold, fixed]; the entries are presumably
# (family, style, size) tuples -- confirm against the formatter back end.
class MacStylesheet(NullStylesheet):
    stdfontset = [
        ('Geneva', 'p', 10),
        ('Geneva', 'i', 10),
        ('Geneva', 'b', 10),
        ('Monaco', 'p', 10),
    ]
    h1fontset = [
        ('Geneva', 'p', 18),
        ('Geneva', 'i', 18),
        ('Geneva', 'b', 18),
        ('Monaco', 'p', 18),
    ]
    # The 14-point set belongs to h2.  It used to be assigned to
    # h3fontset and was then overwritten by the 12-point set below,
    # which left h2fontset at the inherited NullStylesheet value.
    h2fontset = [
        ('Geneva', 'p', 14),
        ('Geneva', 'i', 14),
        ('Geneva', 'b', 14),
        ('Monaco', 'p', 14),
    ]
    h3fontset = [
        ('Geneva', 'p', 12),
        ('Geneva', 'i', 12),
        ('Geneva', 'b', 12),
        ('Monaco', 'p', 12),
    ]
|
||||
def handle_image(self, src, alt):
|
||||
self.handle_data(alt)
|
||||
|
||||
def handle_data(self, data):
    # Handle a run of character data.
    #
    # In no-fill mode the data is passed on unchanged via
    # handle_literal().  Otherwise every run of blanks, tabs, newlines
    # and carriage returns is collapsed to one space, and spaces at
    # the chunk boundaries are tracked with two flags:
    #   self.nospace   -- drop a leading space from the next data
    #   self.softspace -- a trailing space was withheld and must be
    #                     emitted before the next data
    if self.nofill:
        self.handle_literal(data)
        return
    data = regsub.gsub('[ \t\n\r]+', ' ', data)
    if self.nospace and data[:1] == ' ':
        data = data[1:]
    if not data:
        return
    self.nospace = 0
    if self.softspace and data[:1] != ' ':
        data = ' ' + data
    # The pending space (if any) is now part of data; clear the flag
    # so that it is not emitted again in front of every later chunk.
    self.softspace = 0
    if data[-1:] == ' ':
        # Withhold the trailing space until more data follows
        data = data[:-1]
        self.softspace = 1
    self.output_data(data)
|
||||
|
||||
if os.name == 'mac':
|
||||
StdwinStylesheet = MacStylesheet
|
||||
else:
|
||||
StdwinStylesheet = X11Stylesheet
|
||||
def handle_literal(self, data):
|
||||
self.nospace = 0
|
||||
self.softspace = 0
|
||||
self.output_data(data)
|
||||
|
||||
def output_data(self, data):
    # Deliver processed data: while a save is in progress
    # (self.savedata is not None) append to the save buffer,
    # otherwise hand the data to write_data().
    buffered = self.savedata
    if buffered is None:
        self.write_data(data)
        return
    self.savedata = buffered + data
|
||||
|
||||
class GLStylesheet(NullStylesheet):
|
||||
stdfontset = [
|
||||
'Helvetica 10',
|
||||
'Helvetica-Italic 10',
|
||||
'Helvetica-Bold 10',
|
||||
'Courier 10',
|
||||
]
|
||||
h1fontset = [
|
||||
'Helvetica 18',
|
||||
'Helvetica-Italic 18',
|
||||
'Helvetica-Bold 18',
|
||||
'Courier 18',
|
||||
]
|
||||
h2fontset = [
|
||||
'Helvetica 14',
|
||||
'Helvetica-Italic 14',
|
||||
'Helvetica-Bold 14',
|
||||
'Courier 14',
|
||||
]
|
||||
h3fontset = [
|
||||
'Helvetica 12',
|
||||
'Helvetica-Italic 12',
|
||||
'Helvetica-Bold 12',
|
||||
'Courier 12',
|
||||
]
|
||||
def write_data(self, data):
|
||||
sys.stdout.write(data)
|
||||
|
||||
def save_bgn(self):
|
||||
self.savedata = ''
|
||||
self.nospace = 1
|
||||
self.softspace = 0
|
||||
|
||||
def save_end(self):
    # Stop saving data: return whatever is in self.savedata, clear
    # the buffer and reset the spacing flags (nospace on, softspace
    # off).
    collected, self.savedata = self.savedata, None
    self.nospace, self.softspace = 1, 0
    return collected
|
||||
|
||||
def new_para(self):
|
||||
pass
|
||||
|
||||
def new_style(self):
|
||||
pass
|
||||
|
||||
# --- Generic style changes
|
||||
|
||||
def para_bgn(self, tag):
|
||||
if not self.nospace:
|
||||
self.handle_literal('\n')
|
||||
self.nospace = 1
|
||||
self.softspace = 0
|
||||
if tag is not None:
|
||||
self.para = tag
|
||||
self.new_para()
|
||||
|
||||
def para_end(self):
|
||||
self.para_bgn('')
|
||||
|
||||
def push_list(self, tag):
|
||||
self.lists.append(tag)
|
||||
self.para_bgn(None)
|
||||
|
||||
def pop_list(self):
|
||||
del self.lists[-1]
|
||||
self.para_end()
|
||||
|
||||
def literal_bgn(self, tag, attrs):
|
||||
self.para_bgn(tag)
|
||||
|
||||
def literal_end(self, tag):
|
||||
self.para_end()
|
||||
|
||||
def push_style(self, tag):
|
||||
self.styles.append(tag)
|
||||
self.new_style()
|
||||
|
||||
def pop_style(self):
|
||||
del self.styles[-1]
|
||||
self.new_style()
|
||||
|
||||
def anchor_bgn(self, href, name, type):
|
||||
self.push_style(href and 'a' or None)
|
||||
|
||||
def anchor_end(self):
|
||||
self.pop_style()
|
||||
|
||||
# --- Top level tags
|
||||
|
||||
def start_html(self, attrs): pass
|
||||
def end_html(self): pass
|
||||
|
||||
def start_head(self, attrs): pass
|
||||
def end_head(self): pass
|
||||
|
||||
def start_body(self, attrs): pass
|
||||
def end_body(self): pass
|
||||
|
||||
def do_isindex(self, attrs):
|
||||
self.isindex = 1
|
||||
|
||||
def start_title(self, attrs):
|
||||
self.save_bgn()
|
||||
|
||||
def end_title(self):
|
||||
self.title = self.save_end()
|
||||
|
||||
# --- Old HTML 'literal text' tags
|
||||
|
||||
def start_listing(self, attrs):
|
||||
self.setliteral('listing')
|
||||
self.literal_bgn('listing', attrs)
|
||||
|
||||
def end_listing(self):
|
||||
self.literal_end('listing')
|
||||
|
||||
def start_xmp(self, attrs):
|
||||
self.setliteral('xmp')
|
||||
self.literal_bgn('xmp', attrs)
|
||||
|
||||
def end_xmp(self):
|
||||
self.literal_end('xmp')
|
||||
|
||||
def do_plaintext(self, attrs):
|
||||
self.setnomoretags()
|
||||
self.literal_bgn('plaintext', attrs)
|
||||
|
||||
# --- Anchors
|
||||
|
||||
def start_a(self, attrs):
    # Handle <a ...>: pick up the href, name and type attributes
    # (type is lower-cased) and open the anchor with anchor_bgn().
    # Attributes that are absent are passed as ''.
    #
    # attrs is a sequence of (attrname, value) pairs.
    href = ''
    name = ''
    type = ''
    for attrname, value in attrs:
        if attrname == 'href':
            href = value
        if attrname == 'name':
            name = value
        if attrname == 'type':
            type = string.lower(value)
    # Always open the anchor, even when it has neither href nor name:
    # end_a() calls anchor_end() unconditionally, and that pops a
    # style, so skipping anchor_bgn() here would unbalance the style
    # stack (popping an unrelated style, or failing when it is empty).
    self.anchor_bgn(href, name, type)
|
||||
|
||||
def end_a(self):
|
||||
self.anchor_end()
|
||||
|
||||
# --- Paragraph tags
|
||||
|
||||
def do_p(self, attrs):
|
||||
self.para_bgn(None)
|
||||
|
||||
def do_br(self, attrs):
|
||||
self.handle_literal('\n')
|
||||
self.nospace = 1
|
||||
self.softspace = 0
|
||||
|
||||
def do_hr(self, attrs):
|
||||
self.para_bgn(None)
|
||||
self.handle_literal('-'*40)
|
||||
self.para_end()
|
||||
|
||||
def start_h1(self, attrs):
|
||||
self.para_bgn('h1')
|
||||
|
||||
def start_h2(self, attrs):
|
||||
self.para_bgn('h2')
|
||||
|
||||
def start_h3(self, attrs):
|
||||
self.para_bgn('h3')
|
||||
|
||||
def start_h4(self, attrs):
|
||||
self.para_bgn('h4')
|
||||
|
||||
def start_h5(self, attrs):
|
||||
self.para_bgn('h5')
|
||||
|
||||
def start_h6(self, attrs):
|
||||
self.para_bgn('h6')
|
||||
|
||||
def end_h1(self):
|
||||
self.para_end()
|
||||
|
||||
end_h2 = end_h1
|
||||
end_h3 = end_h2
|
||||
end_h4 = end_h3
|
||||
end_h5 = end_h4
|
||||
end_h6 = end_h5
|
||||
|
||||
def start_ul(self, attrs):
|
||||
self.para_bgn(None)
|
||||
self.push_list('ul')
|
||||
|
||||
def start_ol(self, attrs):
|
||||
self.para_bgn(None)
|
||||
self.push_list('ol')
|
||||
|
||||
def end_ul(self):
|
||||
self.pop_list()
|
||||
self.para_end()
|
||||
|
||||
def do_li(self, attrs):
|
||||
self.para_bgn('li%d' % len(self.lists))
|
||||
|
||||
start_dir = start_menu = start_ul
|
||||
end_dir = end_menu = end_ol = end_ul
|
||||
|
||||
def start_dl(self, attrs):
|
||||
self.para_bgn(None)
|
||||
self.push_list('dl')
|
||||
|
||||
def end_dl(self):
|
||||
self.pop_list()
|
||||
self.para_end()
|
||||
|
||||
def do_dt(self, attrs):
|
||||
self.para_bgn('dt%d' % len(self.lists))
|
||||
|
||||
def do_dd(self, attrs):
|
||||
self.para_bgn('dd%d' % len(self.lists))
|
||||
|
||||
def start_address(self, attrs):
|
||||
self.para_bgn('address')
|
||||
|
||||
def end_address(self):
|
||||
self.para_end()
|
||||
|
||||
def start_pre(self, attrs):
|
||||
self.para_bgn('pre')
|
||||
self.nofill = self.nofill + 1
|
||||
|
||||
def end_pre(self):
|
||||
self.nofill = self.nofill - 1
|
||||
self.para_end()
|
||||
|
||||
start_typewriter = start_pre
|
||||
end_typewriter = end_pre
|
||||
|
||||
def do_img(self, attrs):
    # Handle <img ...>: pass the src and alt attribute values to
    # handle_image().  A missing src is passed as '' and a missing
    # alt as ' (image) '; if an attribute is repeated the last
    # occurrence wins.
    picked = {'src': '', 'alt': ' (image) '}
    for attrname, value in attrs:
        if attrname in ('alt', 'src'):
            picked[attrname] = value
    self.handle_image(picked['src'], picked['alt'])
|
||||
|
||||
# --- Character tags -- physical styles
|
||||
|
||||
def start_tt(self, attrs): self.push_style(FIXED)
|
||||
def end_tt(self): self.pop_style()
|
||||
|
||||
def start_b(self, attrs): self.push_style(BOLD)
|
||||
def end_b(self): self.pop_style()
|
||||
|
||||
def start_i(self, attrs): self.push_style(ITALIC)
|
||||
def end_i(self): self.pop_style()
|
||||
|
||||
def start_u(self, attrs): self.push_style(ITALIC) # Underline???
|
||||
def end_u(self): self.pop_style()
|
||||
|
||||
def start_r(self, attrs): self.push_style(ROMAN) # Not official
|
||||
def end_r(self): self.pop_style()
|
||||
|
||||
# --- Character tags -- logical styles
|
||||
|
||||
start_em = start_i
|
||||
end_em = end_i
|
||||
|
||||
start_strong = start_b
|
||||
end_strong = end_b
|
||||
|
||||
start_code = start_tt
|
||||
end_code = end_tt
|
||||
|
||||
start_samp = start_tt
|
||||
end_samp = end_tt
|
||||
|
||||
start_kbd = start_tt
|
||||
end_kbd = end_tt
|
||||
|
||||
start_file = start_tt # unofficial
|
||||
end_file = end_tt
|
||||
|
||||
start_var = start_i
|
||||
end_var = end_i
|
||||
|
||||
start_dfn = start_i
|
||||
end_dfn = end_i
|
||||
|
||||
start_cite = start_i
|
||||
end_cite = end_i
|
||||
|
||||
start_hp1 = start_i
|
||||
end_hp1 = end_i  # </hp1> must pop the style that start_hp1 pushed (was aliased to start_i)
|
||||
|
||||
start_hp2 = start_b
|
||||
end_hp2 = end_b
|
||||
|
||||
# --- Form tags
|
||||
|
||||
def start_form(self, attrs):
|
||||
self.para_bgn(None)
|
||||
|
||||
def end_form(self):
|
||||
self.para_end()
|
||||
|
||||
# --- Unhandled tags
|
||||
|
||||
def unknown_starttag(self, tag, attrs):
|
||||
pass
|
||||
|
||||
def unknown_endtag(self, tag):
|
||||
pass
|
||||
|
||||
# Test program -- produces no output but times how long it takes
|
||||
# to send a document to a null formatter, exclusive of I/O
|
||||
|
||||
def test():
|
||||
import fmt
|
||||
import time
|
||||
if sys.argv[1:]: file = sys.argv[1]
|
||||
else: file = 'test.html'
|
||||
data = open(file, 'r').read()
|
||||
t0 = time.time()
|
||||
fmtr = fmt.WritingFormatter(sys.stdout, 79)
|
||||
p = FormattingParser(fmtr, NullStylesheet)
|
||||
p.feed(data)
|
||||
p.close()
|
||||
t1 = time.time()
|
||||
print
|
||||
print '*** Formatting time:', round(t1-t0, 3), 'seconds.'
|
||||
|
||||
|
||||
# Test program using stdwin
|
||||
|
||||
def testStdwin():
|
||||
import stdwin, fmt
|
||||
from stdwinevents import *
|
||||
if sys.argv[1:]: file = sys.argv[1]
|
||||
else: file = 'test.html'
|
||||
data = open(file, 'r').read()
|
||||
window = stdwin.open('testStdwin')
|
||||
b = None
|
||||
while 1:
|
||||
etype, ewin, edetail = stdwin.getevent()
|
||||
if etype == WE_CLOSE:
|
||||
break
|
||||
if etype == WE_SIZE:
|
||||
window.setdocsize(0, 0)
|
||||
window.setorigin(0, 0)
|
||||
window.change((0, 0), (10000, 30000)) # XXX
|
||||
if etype == WE_DRAW:
|
||||
if not b:
|
||||
b = fmt.StdwinBackEnd(window, 1)
|
||||
f = fmt.BaseFormatter(b.d, b)
|
||||
p = FormattingParser(f, MacStylesheet)
|
||||
p.feed(data)
|
||||
p.close()
|
||||
b.finish()
|
||||
else:
|
||||
b.redraw(edetail)
|
||||
window.close()
|
||||
|
||||
|
||||
# Test program using GL
|
||||
|
||||
def testGL():
|
||||
import gl, GL, fmt
|
||||
if sys.argv[1:]: file = sys.argv[1]
|
||||
else: file = 'test.html'
|
||||
data = open(file, 'r').read()
|
||||
W, H = 600, 600
|
||||
gl.foreground()
|
||||
gl.prefsize(W, H)
|
||||
wid = gl.winopen('testGL')
|
||||
gl.ortho2(0, W, H, 0)
|
||||
gl.color(GL.WHITE)
|
||||
gl.clear()
|
||||
gl.color(GL.BLACK)
|
||||
b = fmt.GLBackEnd(wid)
|
||||
f = fmt.BaseFormatter(b.d, b)
|
||||
p = FormattingParser(f, GLStylesheet)
|
||||
p.feed(data)
|
||||
p.close()
|
||||
b.finish()
|
||||
#
|
||||
import time
|
||||
time.sleep(5)
|
||||
file = 'test.html'
|
||||
f = open(file, 'r')
|
||||
data = f.read()
|
||||
f.close()
|
||||
p = HTMLParser()
|
||||
p.feed(data)
|
||||
p.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test()
|
||||
test()
|
||||
|
|
Loading…
Reference in New Issue