added html parser and supporting cast

This commit is contained in:
Guido van Rossum 1995-02-27 13:16:55 +00:00
parent eb9e9d2b2a
commit 7c750e1e09
6 changed files with 3014 additions and 0 deletions

408
Lib/Para.py Normal file
View File

@ -0,0 +1,408 @@
# Text formatting abstractions
# Oft-used type object: the type of plain integers.  Para keeps anchor
# markers in its word list as integers and compares type(word) with Int
# to tell them apart from the word tuples.
Int = type(0)
# Represent a paragraph. This is a list of words with associated
# font and size information, plus indents and justification for the
# entire paragraph.
# Once the words have been added to a paragraph, it can be laid out
# for different line widths. Once laid out, it can be rendered at
# different screen locations. Once rendered, it can be queried
# for mouse hits, and parts of the text can be highlighted
class Para:
    """A paragraph: a word list plus indents and justification.

    self.words holds 7-tuples
        (font, text, width, space, stretch, ascent, descent)
    interleaved with plain integers that mark anchors: the id given to
    bgn_anchor() where an anchor starts and 0 where it ends.

    The drawing/measuring object "d" passed to several methods must
    supply the methods this class calls on it: setfont(), textwidth(),
    baseline(), lineheight(), text(), line() and invert().
    """

    def __init__(self):
        self.words = []         # The words (and anchor markers)
        self.just = 'l'         # Justification: 'l', 'r', 'lr' or 'c'
        self.indent_left = self.indent_right = self.indent_hang = 0
        # Final lay-out parameters, may change
        self.left = self.top = self.right = self.bottom = \
            self.width = self.height = self.lines = None

    def addword(self, d, font, text, space, stretch):
        """Add a word, computing size information for it with d.

        space and stretch are given in units of the width of a blank in
        the current font.  Words may also be added manually by appending
        a 7-tuple to self.words.
        """
        if font is not None:
            d.setfont(font)
        width = d.textwidth(text)
        ascent = d.baseline()
        descent = d.lineheight() - ascent
        spw = d.textwidth(' ')
        self.words.append(
            (font, text, width, space * spw, stretch * spw, ascent, descent))

    # Hooks to begin and end anchors -- insert numbers in the word list!
    def bgn_anchor(self, id):
        """Mark the start of anchor id in the word list."""
        self.words.append(id)

    def end_anchor(self, id):
        """Mark the end of the current anchor (always stored as 0)."""
        self.words.append(0)

    def getlength(self):
        """Return the total width (text plus space) added so far."""
        total = 0
        for word in self.words:
            if not isinstance(word, int):
                total = total + word[2] + word[3]
        return total

    def tabto(self, tab):
        """Tab to a position relative to the current left indent.

        All stretch is removed from the existing words and fixed space
        is added up to the tab stop.  If the text already extends beyond
        the tab stop no space is added (the stretch is still removed).
        """
        total = 0
        ascent, descent = 1, 0      # used when there are no words yet
        for i, word in enumerate(self.words):
            if isinstance(word, int):
                continue
            font, text, width, space, stretch, ascent, descent = word
            self.words[i] = font, text, width, space, 0, ascent, descent
            total = total + width + space
        if total < tab:
            # list.append() takes exactly one argument: pass one tuple.
            self.words.append(
                (None, '', 0, tab - total, 0, ascent, descent))

    def makehangingtag(self, hang):
        """Tab to hang, add hang to indent_left, set indent_hang to -hang."""
        self.tabto(hang)
        self.indent_left = self.indent_left + hang
        self.indent_hang = -hang

    def layout(self, linewidth):
        """Decide where the line breaks go for the given width.

        Fills self.lines with one tuple per line,
            (wordcount, firstfont, charcount, width, stretch,
             ascent, descent)
        where wordcount counts word-list entries (anchors included), and
        sets self.width and self.height.
        """
        self.width = linewidth
        self.lines = lines = []
        height = 0
        avail1 = self.width - self.indent_left - self.indent_right
        avail = avail1 - self.indent_hang   # first line only
        words = self.words
        n = len(words)
        i = 0
        lastfont = None
        while i < n:
            firstfont = lastfont
            charcount = width = stretch = ascent = descent = 0
            # Space/stretch of the last word placed; both start at 0 so
            # a line holding no word never reads an unbound name.
            lsp = lst = 0
            j = i
            while i < n:
                word = words[i]
                if isinstance(word, int):
                    # Break before an anchor start only when something
                    # is already on the line; otherwise no entry would
                    # be consumed and the outer loop would never end.
                    if word > 0 and width >= avail and width > 0:
                        break
                    i = i + 1
                    continue
                fo, te, wi, sp, st, asc, desc = word
                if width + wi > avail and width > 0 and wi > 0:
                    break
                if fo is not None:
                    lastfont = fo
                    if width == 0:
                        firstfont = fo
                charcount = charcount + len(te) + (sp > 0)
                width = width + wi + sp
                lsp = sp
                stretch = stretch + st
                lst = st
                ascent = max(ascent, asc)
                descent = max(descent, desc)
                i = i + 1
            # Anchor starts at the end of the line move to the next line
            while i > j and isinstance(words[i-1], int) and words[i-1] > 0:
                i = i - 1
            width = width - lsp         # no trailing space
            if i < n:
                stretch = stretch - lst
            else:
                stretch = 0             # never justify the last line
            lines.append(
                (i - j, firstfont, charcount, width, stretch,
                 ascent, descent))
            height = height + ascent + descent
            avail = avail1
        self.height = height

    def visit(self, wordfunc, anchorfunc):
        """Call a function for every entry of every laid-out line.

        wordfunc(self, line, word, h, v, h2, v2, isfirst, islast) is
        called for words, anchorfunc(self, line, word, h, v) for anchor
        markers.  The first call returning something other than None
        stops the walk and that value is returned.
        """
        avail1 = self.width - self.indent_left - self.indent_right
        avail = avail1 - self.indent_hang
        v = self.top
        i = 0
        for line in self.lines:
            wordcount, firstfont, charcount, width, stretch, \
                ascent, descent = line
            h = self.left + self.indent_left
            if i == 0:
                h = h + self.indent_hang
            extra = 0
            if self.just == 'r':
                h = h + avail - width
            elif self.just == 'c':
                # '//' keeps the integer division of the original '/'
                h = h + (avail - width) // 2
            elif self.just == 'lr' and stretch > 0:
                extra = avail - width
            v2 = v + ascent + descent
            last = i + wordcount - 1
            for j in range(i, i + wordcount):
                word = self.words[j]
                if isinstance(word, int):
                    ok = anchorfunc(self, line, word, h, v)
                    if ok is not None:
                        return ok
                    continue
                fo, te, wi, sp, st, asc, desc = word
                if extra > 0 and stretch > 0:
                    # Hand out the remaining extra space in proportion
                    # to this word's share of the remaining stretch.
                    ex = extra * st // stretch
                    extra = extra - ex
                    stretch = stretch - st
                else:
                    ex = 0
                h2 = h + wi + sp + ex
                ok = wordfunc(self, line, word, h, v, h2, v2,
                              (j == i), (j == last))
                if ok is not None:
                    return ok
                h = h2
            v = v2
            i = i + wordcount
            avail = avail1

    def render(self, d, left, top, right):
        """Draw the paragraph with d in the rectangle (left, top, right).

        The bottom is unspecified; the computed bottom is returned.
        The paragraph is laid out again if the width changed.
        """
        if self.width != right - left:
            self.layout(right - left)
        self.left = left
        self.top = top
        self.right = right
        self.bottom = self.top + self.height
        self.anchorid = 0
        try:
            self.d = d
            self.visit(self.__class__._renderword,
                       self.__class__._renderanchor)
        finally:
            self.d = None
        return self.bottom

    def _renderword(self, tuple, word, h, v, h2, v2, isfirst, islast):
        # Draw one word; words inside an anchor with a positive id get
        # a line drawn two units below the baseline.
        if word[0] is not None:
            self.d.setfont(word[0])
        baseline = v + tuple[5]
        self.d.text((h, baseline - word[5]), word[1])
        if self.anchorid > 0:
            self.d.line((h, baseline + 2), (h2, baseline + 2))

    def _renderanchor(self, tuple, word, h, v):
        self.anchorid = word

    def hitcheck(self, mouseh, mousev):
        """Return the list of anchor ids hit by the mouse position."""
        self.mouseh = mouseh
        self.mousev = mousev
        self.anchorid = 0
        self.hits = []
        self.visit(self.__class__._hitcheckword,
                   self.__class__._hitcheckanchor)
        return self.hits

    def _hitcheckword(self, tuple, word, h, v, h2, v2, isfirst, islast):
        if self.anchorid > 0 and h <= self.mouseh <= h2 and \
                v <= self.mousev <= v2:
            self.hits.append(self.anchorid)

    def _hitcheckanchor(self, tuple, word, h, v):
        self.anchorid = word

    def hasanchor(self, id):
        """Return whether id (or its negation) occurs in the word list."""
        return id in self.words or -id in self.words

    def extract(self):
        """Return the raw text of the paragraph.

        One blank is substituted for each non-empty inter-word space
        and the result is terminated with a newline.
        """
        parts = []
        for w in self.words:
            if not isinstance(w, int):
                parts.append(w[1])
                if w[3]:
                    parts.append(' ')
        return ''.join(parts) + '\n'

    def whereis(self, d, mouseh, mousev):
        """Return the character position hit by the mouse.

        The position is an offset into the text returned by extract().
        None is returned when mousev lies outside the paragraph.
        """
        if mousev < self.top or mousev > self.bottom:
            return None
        self.mouseh = mouseh
        self.mousev = mousev
        self.lastfont = None
        self.charcount = 0
        try:
            self.d = d
            return self.visit(self.__class__._whereisword,
                              self.__class__._whereisanchor)
        finally:
            self.d = None

    def _whereisword(self, tuple, word, h1, v1, h2, v2, isfirst, islast):
        fo, te, wi, sp, st, asc, desc = word
        if fo is not None:
            self.lastfont = fo
        h = h1
        # The first and last word of a line also catch hits in the margins
        if isfirst:
            h1 = 0
        if islast:
            h2 = 999999
        if not (v1 <= self.mousev <= v2 and h1 <= self.mouseh <= h2):
            self.charcount = self.charcount + len(te) + (sp > 0)
            return
        if self.lastfont is not None:
            self.d.setfont(self.lastfont)
        cc = 0
        for c in te:
            cw = self.d.textwidth(c)
            if self.mouseh <= h + cw // 2:
                return self.charcount + cc
            cc = cc + 1
            h = h + cw
        self.charcount = self.charcount + cc
        if self.mouseh <= (h + h2) // 2:
            return self.charcount
        else:
            return self.charcount + 1

    def _whereisanchor(self, tuple, word, h, v):
        pass

    def screenpos(self, d, pos):
        """Return the screen position for a character position.

        The result is a tuple (h, vtop, vbaseline, vbottom).  This is
        more or less the inverse of whereis().
        """
        if pos < 0:
            ascent, descent = self.lines[0][5:7]
            return self.left, self.top, self.top + ascent, \
                self.top + ascent + descent
        self.pos = pos
        self.lastfont = None
        try:
            self.d = d
            ok = self.visit(self.__class__._screenposword,
                            self.__class__._screenposanchor)
        finally:
            self.d = None
        if ok is None:
            # Position beyond the text: use the end of the last line
            ascent, descent = self.lines[-1][5:7]
            ok = self.right, self.bottom - ascent - descent, \
                self.bottom - descent, self.bottom
        return ok

    def _screenposword(self, tuple, word, h1, v1, h2, v2, isfirst, islast):
        fo, te, wi, sp, st, asc, desc = word
        if fo is not None:
            self.lastfont = fo
        cc = len(te) + (sp > 0)
        if self.pos > cc:
            self.pos = self.pos - cc
            return
        if self.pos < cc:
            # Same guard as in _whereisword: no font seen yet means
            # there is nothing to set.
            if self.lastfont is not None:
                self.d.setfont(self.lastfont)
            h = h1 + self.d.textwidth(te[:self.pos])
        else:
            h = h2
        ascent, descent = tuple[5:7]
        return h, v1, v1 + ascent, v2

    def _screenposanchor(self, tuple, word, h, v):
        pass

    def invert(self, d, pos1, pos2):
        """Invert the stretch of text between pos1 and pos2.

        If pos1 is None the beginning is implied; if pos2 is None the
        end is implied.  Calling it again with the same arguments
        undoes the effect.
        """
        if pos1 is None:
            pos1 = self.left, self.top, self.top, self.top
        else:
            pos1 = self.screenpos(d, pos1)
        if pos2 is None:
            pos2 = self.right, self.bottom, self.bottom, self.bottom
        else:
            pos2 = self.screenpos(d, pos2)
        h1, top1, baseline1, bottom1 = pos1
        h2, top2, baseline2, bottom2 = pos2
        if bottom1 <= top2:
            # Rest of the first line, then any full lines in between
            d.invert((h1, top1), (self.right, bottom1))
            h1 = self.left
            if bottom1 < top2:
                d.invert((h1, bottom1), (self.right, top2))
            top1, bottom1 = top2, bottom2
        d.invert((h1, top1), (h2, bottom2))
# Test class Para
# XXX This was last used on the Mac, hence the weird fonts...
def test():
    """Interactive test of class Para in a stdwin window.

    Four copies of a sample paragraph are shown, one for each
    justification.  Pressing and releasing the mouse button inside one
    paragraph inverts the text between the two positions.  The loop
    ends when the window is closed.
    """
    import stdwin
    # A star-import is not allowed inside a function on current
    # interpreters, so the event constants are reached via the module.
    import stdwinevents
    words = 'The', 'quick', 'brown', 'fox', 'jumps', 'over', \
        'the', 'lazy', 'dog.'
    paralist = []
    for just in 'l', 'r', 'lr', 'c':
        p = Para()
        p.just = just
        p.addword(stdwin, ('New York', 'p', 12), words[0], 1, 1)
        for word in words[1:-1]:
            p.addword(stdwin, None, word, 1, 1)
        p.addword(stdwin, None, words[-1], 2, 4)
        p.addword(stdwin, ('New York', 'b', 18), 'Bye!', 0, 0)
        p.addword(stdwin, ('New York', 'p', 10), 'Bye!', 0, 0)
        paralist.append(p)
    window = stdwin.open('Para.test()')
    start = stop = selpara = None
    while 1:
        etype, win, detail = stdwin.getevent()
        if etype == stdwinevents.WE_CLOSE:
            break
        if etype == stdwinevents.WE_SIZE:
            window.change((0, 0), (1000, 1000))
        if etype == stdwinevents.WE_DRAW:
            width, height = window.getwinsize()
            d = None
            try:
                d = window.begindrawing()
                d.cliprect(detail)
                d.erase(detail)
                v = 0
                for p in paralist:
                    v = p.render(d, 0, v, width)
                    if p == selpara and \
                            start is not None and stop is not None:
                        p.invert(d, start, stop)
            finally:
                if d: d.close()
        if etype == stdwinevents.WE_MOUSE_DOWN:
            if selpara and start is not None and stop is not None:
                # Undo the highlighting of the previous selection
                d = window.begindrawing()
                try:
                    selpara.invert(d, start, stop)
                finally:
                    d.close()
            start = stop = selpara = None
            mouseh, mousev = detail[0]
            for p in paralist:
                start = p.whereis(stdwin, mouseh, mousev)
                if start is not None:
                    selpara = p
                    break
        if etype == stdwinevents.WE_MOUSE_UP and \
                start is not None and selpara:
            mouseh, mousev = detail[0]
            stop = selpara.whereis(stdwin, mouseh, mousev)
            if stop is None:
                start = selpara = None
            else:
                if start > stop:
                    start, stop = stop, start
                d = window.begindrawing()
                try:
                    selpara.invert(d, start, stop)
                finally:
                    d.close()
    window.close()

621
Lib/fmt.py Normal file
View File

@ -0,0 +1,621 @@
# Text formatting abstractions
import string
import Para
# A formatter back-end object has one method that is called by the formatter:
# addpara(p), where p is a paragraph object. For example:
# Formatter back-end to do nothing at all with the paragraphs
class NullBackEnd:
    """Formatter back-end that ignores everything it is given."""

    def __init__(self):
        return None

    def addpara(self, p):
        """Receive paragraph p and do nothing with it."""
        return None

    def bgn_anchor(self, id):
        """Ignore the start of anchor id."""
        return None

    def end_anchor(self, id):
        """Ignore the end of anchor id."""
        return None
# Formatter back-end to collect the paragraphs in a list
class SavingBackEnd(NullBackEnd):
    """Formatter back-end that collects the paragraphs in a list.

    Positions in the collected text are given as "long" positions:
    pairs (paragraph index, character offset in that paragraph).
    """

    def __init__(self):
        self.paralist = []

    def addpara(self, p):
        """Append paragraph p to self.paralist."""
        self.paralist.append(p)

    def hitcheck(self, h, v):
        """Return the anchor ids hit at (h, v), without duplicates."""
        hits = []
        for p in self.paralist:
            if p.top <= v <= p.bottom:
                for id in p.hitcheck(h, v):
                    if id not in hits:
                        hits.append(id)
        return hits

    def extract(self):
        """Return the concatenated text of all paragraphs."""
        return ''.join([p.extract() for p in self.paralist])

    def extractpart(self, long1, long2):
        """Return the text between two long positions (in either order)."""
        if long1 > long2:
            long1, long2 = long2, long1
        para1, pos1 = long1
        para2, pos2 = long2
        pieces = []
        while para1 < para2:
            ptext = self.paralist[para1].extract()
            pieces.append(ptext[pos1:])
            pos1 = 0
            para1 = para1 + 1
        ptext = self.paralist[para2].extract()
        pieces.append(ptext[pos1:pos2])
        return ''.join(pieces)

    def whereis(self, d, h, v):
        """Return the long position at (h, v), or None if nothing is hit."""
        for i in range(len(self.paralist)):
            result = self.paralist[i].whereis(d, h, v)
            if result is not None:
                return i, result
        return None

    def roundtowords(self, long1, long2):
        """Widen (long1, long2) outwards to the nearest blanks."""
        i, offset = long1
        text = self.paralist[i].extract()
        while offset > 0 and text[offset-1] != ' ':
            offset = offset - 1
        long1 = i, offset

        i, offset = long2
        text = self.paralist[i].extract()
        n = len(text)
        # n-1: never move past the newline that terminates extract()
        while offset < n-1 and text[offset] != ' ':
            offset = offset + 1
        long2 = i, offset

        return long1, long2

    def roundtoparagraphs(self, long1, long2):
        """Widen (long1, long2) to cover whole paragraphs."""
        long1 = long1[0], 0
        long2 = long2[0], len(self.paralist[long2[0]].extract())
        return long1, long2
# Formatter back-end to send the text directly to the drawing object
class WritingBackEnd(NullBackEnd):
    """Formatter back-end that renders each paragraph as it arrives.

    Paragraphs are rendered with drawing object d at the given width,
    each one starting where the previous one ended.
    """

    def __init__(self, d, width):
        self.d, self.width = d, width
        self.lineno = 0

    def addpara(self, p):
        """Render p below the previous paragraph; remember its bottom."""
        bottom = p.render(self.d, 0, self.lineno, self.width)
        self.lineno = bottom
# A formatter receives a stream of formatting instructions and assembles
# these into a stream of paragraphs on to a back-end. The assembly is
# parametrized by a text measurement object, which must match the output
# operations of the back-end. The back-end is responsible for splitting
# paragraphs up in lines of a given maximum width. (This is done because
# in a windowing environment, when the window size changes, there is no
# need to redo the assembly into paragraphs, but the splitting into lines
# must be done taking the new window size into account.)
# Formatter base class. Initialize it with a text measurement object,
# which is used for text measurements, and a back-end object,
# which receives the completed paragraphs. The formatting methods are:
# setfont(font)
# setleftindent(nspaces)
# setjust(type) where type is 'l', 'c', 'r', or 'lr'
# flush()
# vspace(nlines)
# needvspace(nlines)
# addword(word, nspaces)
class BaseFormatter:
    """Assemble formatting instructions into paragraphs for a back-end.

    d is the object used for text measurements (setfont, textwidth,
    lineheight, baseline); b is the back-end whose addpara() receives
    each completed paragraph.
    """

    def __init__(self, d, b):
        # Drawing object used for text measurements
        self.d = d

        # BackEnd object receiving completed paragraphs
        self.b = b

        # Parameters of the formatting model
        self.leftindent = 0
        self.rightindent = 0
        self.just = 'l'
        self.font = None
        self.blanklines = 0

        # Parameters derived from the current font
        self.space = d.textwidth(' ')
        self.line = d.lineheight()
        self.ascent = d.baseline()
        self.descent = self.line - self.ascent

        # Parameter derived from the default font
        self.n_space = self.space

        # Current paragraph being built
        self.para = None
        self.nospace = 1

        # Font to set on the next word
        self.nextfont = None

    def newpara(self):
        """Return a fresh, empty paragraph object."""
        return Para.Para()

    def setfont(self, font):
        """Make font current and refresh the derived measurements."""
        if font is None:
            return
        self.font = self.nextfont = font
        d = self.d
        d.setfont(font)
        self.space = d.textwidth(' ')
        self.line = d.lineheight()
        self.ascent = d.baseline()
        self.descent = self.line - self.ascent

    def setleftindent(self, nspaces):
        """Set the left indent, in blanks of the default font.

        If the paragraph being built is short enough to fit in the
        added indent it is turned into a hanging tag; otherwise it is
        flushed.
        """
        self.leftindent = int(self.n_space * nspaces)
        if self.para:
            hang = self.leftindent - self.para.indent_left
            if hang > 0 and self.para.getlength() <= hang:
                self.para.makehangingtag(hang)
                self.nospace = 1
            else:
                self.flush()

    def setrightindent(self, nspaces):
        """Set the right indent, in blanks of the default font.

        The paragraph being built, if any, gets the new indent and is
        flushed; paragraphs started later pick it up in addword().
        """
        self.rightindent = int(self.n_space * nspaces)
        if self.para:
            self.para.indent_right = self.rightindent
            self.flush()

    def setjust(self, just):
        """Set the justification: 'l', 'c', 'r' or 'lr'."""
        self.just = just
        if self.para:
            self.para.just = self.just

    def flush(self):
        """Hand the paragraph being built, if any, to the back-end."""
        if self.para:
            self.b.addpara(self.para)
            self.para = None
            if self.font is not None:
                self.d.setfont(self.font)
        self.nospace = 1

    def vspace(self, nlines):
        """Add nlines of vertical space as a paragraph of its own."""
        self.flush()
        if nlines > 0:
            self.para = self.newpara()
            # An empty word whose ascent is the requested height
            filler = None, '', 0, 0, 0, int(nlines * self.line), 0
            self.para.words.append(filler)
            self.flush()
            self.blanklines = self.blanklines + nlines

    def needvspace(self, nlines):
        """Make sure at least nlines blank lines precede what follows."""
        self.flush() # Just to be sure
        if nlines > self.blanklines:
            self.vspace(nlines - self.blanklines)

    def addword(self, text, space):
        """Add a word followed by space blanks of the current font."""
        if self.nospace and not text:
            return
        self.nospace = 0
        self.blanklines = 0
        if not self.para:
            self.para = self.newpara()
            self.para.indent_left = self.leftindent
            self.para.indent_right = self.rightindent
            self.para.just = self.just
            self.nextfont = self.font
        space = int(space * self.space)
        # list.append() takes exactly one argument: pass one 7-tuple.
        self.para.words.append(
            (self.nextfont, text, self.d.textwidth(text),
             space, space, self.ascent, self.descent))
        self.nextfont = None

    def bgn_anchor(self, id):
        """Start anchor id, opening a paragraph first if necessary."""
        if not self.para:
            # Force addword() to accept the empty word
            self.nospace = 0
            self.addword('', 0)
        self.para.bgn_anchor(id)

    def end_anchor(self, id):
        """End anchor id, opening a paragraph first if necessary."""
        if not self.para:
            self.nospace = 0
            self.addword('', 0)
        self.para.end_anchor(id)
# Measuring object for measuring text as viewed on a tty
class NullMeasurer:
    """Measuring object for text as viewed on a tty.

    Every character is one unit wide and every line one unit high.
    """

    def __init__(self):
        return None

    def setfont(self, font):
        """Accept and ignore a font."""
        return None

    def textwidth(self, text):
        """Return the number of characters in text."""
        nchars = len(text)
        return nchars

    def lineheight(self):
        """Return the line height, which is always 1."""
        return 1

    def baseline(self):
        """Return the baseline offset, which is always 0."""
        return 0
# Drawing object for writing plain ASCII text to a file
class FileWriter:
    """Drawing object for writing plain ASCII text to a file.

    The current line and column are tracked so that text can be placed
    at a given (h, v) position by writing newlines and blanks.
    """

    def __init__(self, fp):
        self.fp = fp
        self.lineno, self.colno = 0, 0

    def setfont(self, font):
        """Accept and ignore a font."""
        pass

    def text(self, pos, str):
        """Write str at position pos, a pair (h, v) of column and line.

        Empty strings are ignored.  ValueError is raised if str
        contains a newline.
        """
        h, v = pos
        if not str:
            return
        if '\n' in str:
            raise ValueError('can\'t write \\n')
        while self.lineno < v:
            self.fp.write('\n')
            self.colno, self.lineno = 0, self.lineno + 1
        while self.lineno > v:
            # XXX This should never happen...
            self.fp.write('\033[A') # ANSI up arrow
            self.lineno = self.lineno - 1
        if self.colno < h:
            self.fp.write(' ' * (h - self.colno))
        elif self.colno > h:
            # Back up over what was already written on this line
            self.fp.write('\b' * (self.colno - h))
        self.colno = h
        self.fp.write(str)
        self.colno = h + len(str)
# Formatting class to do nothing at all with the data
class NullFormatter(BaseFormatter):
    """Formatter that does nothing at all with the data.

    It measures with a NullMeasurer and sends paragraphs to a
    NullBackEnd.
    """

    def __init__(self):
        BaseFormatter.__init__(self, NullMeasurer(), NullBackEnd())
# Formatting class to write directly to a file
class WritingFormatter(BaseFormatter):
    """Formatter that writes directly to file fp, width columns wide."""

    def __init__(self, fp, width):
        measurer = NullMeasurer()
        backend = WritingBackEnd(FileWriter(fp), width)
        BaseFormatter.__init__(self, measurer, backend)
        self.blanklines = 1

    def needvspace(self, nlines):
        """Like BaseFormatter.needvspace, but never asks for more than 1.

        This suppresses multiple blank lines.
        """
        capped = min(1, nlines)
        BaseFormatter.needvspace(self, capped)
# A "FunnyFormatter" writes ASCII text with a twist: *bold words*,
# _italic text_ and _underlined words_, and `quoted text'.
# It assumes that the fonts are 'r', 'i', 'b', 'u', 'q': (roman,
# italic, bold, underline, quote).
# Moreover, if the font is in upper case, the text is converted to
# UPPER CASE.
class FunnyFormatter(WritingFormatter):
    """WritingFormatter that runs finalize() on each paragraph it flushes."""

    def flush(self):
        """Mark up the paragraph being built, if any, then flush it."""
        para = self.para
        if para:
            finalize(para)
        WritingFormatter.flush(self)
# Surrounds *bold words* and _italic text_ in a paragraph with
# appropriate markers, fixing the size (assuming these characters'
# width is 1).
# Marker put in front of the first word set in a given font ...
openchar = \
    {'b':'*', 'i':'_', 'u':'_', 'q':'`', 'B':'*', 'I':'_', 'U':'_', 'Q':'`'}
# ... and marker put after the last word set in that font
closechar = \
    {'b':'*', 'i':'_', 'u':'_', 'q':'\'', 'B':'*', 'I':'_', 'U':'_', 'Q':'\''}

def finalize(para):
    """Add font markers to the words of para, in place.

    A run of words in a font listed in openchar/closechar gets the
    opening marker before its first word and the closing marker after
    its last non-empty word; the stored widths grow by the length of
    the markers.  Words in an upper case font are converted to upper
    case.  Integer entries of para.words (anchor markers) are left
    alone.
    """
    words = para.words
    oldfont = curfont = 'r'
    # Temporary roman word that closes a run still open at the end;
    # deleted again below.
    words.append(('r', '', 0, 0, 0, 0))
    for i in range(len(words)):
        if isinstance(words[i], int):
            continue        # anchor marker, not a word
        fo, te, wi = words[i][:3]
        if fo is not None:
            curfont = fo
        if curfont != oldfont:
            if oldfont in closechar:
                c = closechar[oldfont]
                # Attach the closing marker to the last word with text
                j = i - 1
                while j > 0 and (isinstance(words[j], int) or
                                 words[j][1] == ''):
                    j = j - 1
                fo1, te1, wi1 = words[j][:3]
                te1 = te1 + c
                wi1 = wi1 + len(c)
                words[j] = (fo1, te1, wi1) + words[j][3:]
            if curfont in openchar and te:
                c = openchar[curfont]
                te = c + te
                wi = len(c) + wi
                words[i] = (fo, te, wi) + words[i][3:]
            if te:
                oldfont = curfont
            else:
                oldfont = 'r'
        if curfont in string.ascii_uppercase:
            te = te.upper()
            words[i] = (fo, te, wi) + words[i][3:]
    del words[-1]
# Formatter back-end to draw the text in a window.
# This has an option to draw while the paragraphs are being added,
# to minimize the delay before the user sees anything.
# This manages the entire "document" of the window.
class StdwinBackEnd(SavingBackEnd):
    """Formatter back-end that draws the paragraphs in a window.

    If drawnow is true each paragraph is rendered as soon as it is
    added; otherwise it is only laid out and positioned.  The object
    also keeps the current selection, a pair of long positions
    (paragraph index, character offset), or None.
    """

    def __init__(self, window, drawnow):
        self.window = window
        self.drawnow = drawnow
        self.width = window.getwinsize()[0]
        self.selection = None
        self.height = 0
        window.setorigin(0, 0)
        window.setdocsize(0, 0)
        # Drawing object kept open until finish() is called
        self.d = window.begindrawing()
        SavingBackEnd.__init__(self)

    def finish(self):
        """Close the drawing object and set the final document size."""
        self.d.close()
        self.d = None
        self.window.setdocsize(0, self.height)

    def addpara(self, p):
        """Save p and place (and possibly draw) it below the others."""
        self.paralist.append(p)
        if self.drawnow:
            self.height = \
                p.render(self.d, 0, self.height, self.width)
        else:
            p.layout(self.width)
            p.left = 0
            p.top = self.height
            p.right = self.width
            p.bottom = self.height + p.height
            self.height = p.bottom

    def resize(self):
        """Lay out all paragraphs again for the current window width."""
        self.window.change((0, 0), (self.width, self.height))
        self.width = self.window.getwinsize()[0]
        self.height = 0
        for p in self.paralist:
            p.layout(self.width)
            p.left = 0
            p.top = self.height
            p.right = self.width
            p.bottom = self.height + p.height
            self.height = p.bottom
        self.window.change((0, 0), (self.width, self.height))
        self.window.setdocsize(0, self.height)

    def redraw(self, area):
        """Redraw the paragraphs that overlap area."""
        d = self.window.begindrawing()
        try:
            (left, top), (right, bottom) = area
            d.erase(area)
            d.cliprect(area)
            for p in self.paralist:
                if top < p.bottom and p.top < bottom:
                    p.render(d, p.left, p.top, p.right)
            if self.selection:
                self.invert(d, self.selection)
        finally:
            # Close the drawing object even if rendering fails
            d.close()

    def setselection(self, new):
        """Change the selection to new, updating the highlighting."""
        if new:
            long1, long2 = new
            pos1 = long1[:3]
            pos2 = long2[:3]
            new = pos1, pos2
        if new != self.selection:
            d = self.window.begindrawing()
            try:
                if self.selection:
                    self.invert(d, self.selection)
                if new:
                    self.invert(d, new)
            finally:
                d.close()
            self.selection = new

    def getselection(self):
        """Return the current selection (possibly None)."""
        return self.selection

    def extractselection(self):
        """Return the selected text, or None without a selection."""
        if self.selection:
            a, b = self.selection
            return self.extractpart(a, b)
        else:
            return None

    def invert(self, d, region):
        """Invert the text between the two long positions in region."""
        long1, long2 = region
        if long1 > long2:
            long1, long2 = long2, long1
        para1, pos1 = long1
        para2, pos2 = long2
        while para1 < para2:
            self.paralist[para1].invert(d, pos1, None)
            pos1 = None
            para1 = para1 + 1
        self.paralist[para2].invert(d, pos1, pos2)

    def search(self, prog):
        """Select and show the next paragraph matching prog.

        prog is a string, which is lower-cased and compiled with the
        "regex" module, or an object with that module's compiled
        pattern interface.  The first match after the paragraph of the
        current selection is preferred; failing that, the first match
        before it is used.  Return 1 if a match was found, else 0.
        """
        import regex
        if type(prog) == type(''):
            prog = regex.compile(prog.lower())
        if self.selection:
            iold = self.selection[0][0]
        else:
            iold = -1
        hit = None
        for i in range(len(self.paralist)):
            # Skip the selected paragraph, and earlier paragraphs once
            # one match before the selection has been recorded
            if i == iold or i < iold and hit:
                continue
            p = self.paralist[i]
            text = p.extract().lower()
            if prog.search(text) >= 0:
                a, b = prog.regs[0]
                long1 = i, a
                long2 = i, b
                hit = long1, long2
                if i > iold:
                    break
        if hit:
            self.setselection(hit)
            i = hit[0][0]
            p = self.paralist[i]
            self.window.show((p.left, p.top), (p.right, p.bottom))
            return 1
        else:
            return 0

    def showanchor(self, id):
        """Select and show the first paragraph containing anchor id."""
        for i in range(len(self.paralist)):
            p = self.paralist[i]
            if p.hasanchor(id):
                long1 = i, 0
                long2 = i, len(p.extract())
                hit = long1, long2
                self.setselection(hit)
                self.window.show(
                    (p.left, p.top), (p.right, p.bottom))
                break
# GL extensions
class GLFontCache:
    """Cache of font handles obtained from the "fm" module.

    Font keys are strings of the form "name size".  An empty key means
    'Times-Roman 12'; a key without a blank gets size 12.
    """

    def __init__(self):
        self.reset()
        self.setfont('')

    def reset(self):
        """Forget the current font and all cached handles."""
        self.fontkey = None
        self.fonthandle = None
        self.fontinfo = None
        self.fontcache = {}

    def close(self):
        """Same as reset()."""
        self.reset()

    def setfont(self, fontkey):
        """Make the font described by fontkey the current font.

        Handles are cached under the key as given, under the
        normalized key "name size" and, for the unscaled font, under
        "name 1".  ValueError is raised if the size is not a number.
        """
        if fontkey == '':
            fontkey = 'Times-Roman 12'
        elif ' ' not in fontkey:
            fontkey = fontkey + ' 12'
        if fontkey == self.fontkey:
            return
        if fontkey in self.fontcache:
            handle = self.fontcache[fontkey]
        else:
            i = fontkey.index(' ')
            name, sizestr = fontkey[:i], fontkey[i:]
            # The size used to be computed with eval(), which would
            # execute any expression found in the key; only plain
            # numbers are accepted now.
            try:
                size = int(sizestr)
            except ValueError:
                size = float(sizestr)
            key1 = name + ' 1'
            key = name + ' ' + repr(size)
            # NB key may differ from fontkey!
            if key in self.fontcache:
                handle = self.fontcache[key]
            else:
                if key1 in self.fontcache:
                    handle = self.fontcache[key1]
                else:
                    import fm
                    handle = fm.findfont(name)
                    self.fontcache[key1] = handle
                handle = handle.scalefont(size)
                self.fontcache[fontkey] = \
                    self.fontcache[key] = handle
        self.fontkey = fontkey
        if self.fonthandle != handle:
            self.fonthandle = handle
            self.fontinfo = handle.getfontinfo()
            handle.setfont()
class GLMeasurer(GLFontCache):
    """Measuring object based on the current font of a GLFontCache.

    Widths come from the font handle; heights come from fields 3 and 6
    of the tuple returned by the handle's getfontinfo().
    """

    def textwidth(self, text):
        """Return the width of text in the current font."""
        handle = self.fonthandle
        return handle.getstrwidth(text)

    def baseline(self):
        """Return field 6 minus field 3 of the font info."""
        info = self.fontinfo
        return info[6] - info[3]

    def lineheight(self):
        """Return field 6 of the font info."""
        info = self.fontinfo
        return info[6]
class GLWriter(GLFontCache):
    """Drawing object that writes text with the "gl" and "fm" modules.

    NOTES:
    (1) Use gl.ortho2 to use X pixel coordinates!
    """

    def text(self, pos, text):
        """Draw text at position pos, a pair (h, v).

        The vertical coordinate is shifted by field 6 minus field 3 of
        the current font info, the value GLMeasurer.baseline() uses.
        """
        import gl, fm
        h, v = pos
        gl.cmov2i(h, v + self.fontinfo[6] - self.fontinfo[3])
        fm.prstr(text)

    def setfont(self, fontkey):
        """Select the font for fontkey and activate its handle."""
        oldhandle = self.fonthandle
        # The unbound base-class method needs self passed explicitly
        GLFontCache.setfont(self, fontkey)
        if self.fonthandle != oldhandle:
            self.fonthandle.setfont()
class GLMeasurerWriter(GLMeasurer, GLWriter):
    """Object combining the methods of GLMeasurer and GLWriter."""
class GLBackEnd(SavingBackEnd):
    """Formatter back-end that renders into the GL window wid.

    Paragraphs are saved and rendered with a GLMeasurerWriter as soon
    as they are added.
    """

    def __init__(self, wid):
        import gl
        gl.winset(wid)
        self.wid = wid
        # NOTE(review): element [1] of gl.getsize() is used as the
        # width here and in redraw() -- confirm against the gl module.
        self.width = gl.getsize()[1]
        self.height = 0
        self.d = GLMeasurerWriter()
        SavingBackEnd.__init__(self)

    def finish(self):
        """Nothing to do."""
        pass

    def addpara(self, p):
        """Save p and render it below the previous paragraphs."""
        self.paralist.append(p)
        self.height = p.render(self.d, 0, self.height, self.width)

    def redraw(self):
        """Render all paragraphs again, at the current window width."""
        import gl
        gl.winset(self.wid)
        width = gl.getsize()[1]
        if width != self.width:
            self.width = width
            # Positions are recomputed by render() below
            for p in self.paralist:
                p.top = p.bottom = None
        d = self.d
        v = 0
        for p in self.paralist:
            v = p.render(d, 0, v, width)

635
Lib/htmllib.py Normal file
View File

@ -0,0 +1,635 @@
# A parser for HTML documents
# HTML: HyperText Markup Language; an SGML-like syntax used by WWW to
# describe hypertext documents
#
# SGML: Standard Generalized Markup Language
#
# WWW: World-Wide Web; a distributed hypertext system developed at CERN
#
# CERN: European Particle Physics Laboratory in Geneva, Switzerland
# This file is only concerned with parsing and formatting HTML
# documents, not with the other (hypertext and networking) aspects of
# the WWW project. (It does support highlighting of anchors.)
import os
import sys
import regex
import string
import sgmllib
class HTMLParser(sgmllib.SGMLParser):
    """SGML parser with handlers for the HTML tags holding literal text.

    The tags <listing>, <xmp> and <plaintext> are handled here;
    subclasses override literal_bgn() and literal_end() to act on
    them.
    """

    # Copy base class entities and add some.  Using copy() instead of a
    # loop in the class body avoids leaving the loop variable behind as
    # a class attribute.
    entitydefs = sgmllib.SGMLParser.entitydefs.copy()
    entitydefs['bullet'] = '*'

    # Provided -- handlers for tags introducing literal text

    def start_listing(self, attrs):
        self.setliteral('listing')
        self.literal_bgn('listing', attrs)

    def end_listing(self):
        self.literal_end('listing')

    def start_xmp(self, attrs):
        self.setliteral('xmp')
        self.literal_bgn('xmp', attrs)

    def end_xmp(self):
        self.literal_end('xmp')

    def do_plaintext(self, attrs):
        # <plaintext> has no end tag: the parser is told to stop
        # recognizing tags altogether
        self.setnomoretags()
        self.literal_bgn('plaintext', attrs)

    # To be overridden -- begin/end literal mode

    def literal_bgn(self, tag, attrs):
        """Called when literal text introduced by tag begins."""
        pass

    def literal_end(self, tag):
        """Called when literal text introduced by tag ends."""
        pass
# Next level of sophistication -- collect anchors, title, nextid and isindex
class CollectingParser(HTMLParser):
    """HTML parser that collects anchors, title, nextid and isindex.

    For the n-th collected anchor (counting from 1) the parallel lists
    anchors, anchornames and anchortypes hold its href, name and type
    at index n-1.  While inside an anchor, self.inanchor is n, or -n
    if the anchor has no href; outside anchors it is 0.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.savetext = None
        self.nextid = ''
        self.isindex = 0
        self.title = ''
        self.inanchor = 0
        self.anchors = []
        self.anchornames = []
        self.anchortypes = []

    def start_a(self, attrs):
        """Record an anchor that has an href or a name."""
        self.inanchor = 0
        href = ''
        name = ''
        atype = ''
        for attrname, value in attrs:
            if attrname == 'href':
                href = value
            # The original only tested for 'name=' and 'type='; the
            # plain names are accepted as well, like 'href' above.
            if attrname in ('name', 'name='):
                name = value
            if attrname in ('type', 'type='):
                atype = value.lower()
        if not (href or name):
            return
        self.anchors.append(href)
        self.anchornames.append(name)
        self.anchortypes.append(atype)
        self.inanchor = len(self.anchors)
        if not href:
            self.inanchor = -self.inanchor

    def end_a(self):
        """Close the anchor, adding a "[n]" marker after its text."""
        if self.inanchor > 0:
            # Don't show anchors pointing into the current document
            if self.anchors[self.inanchor-1][:1] != '#':
                self.handle_data('[' + repr(self.inanchor) + ']')
        self.inanchor = 0

    def start_header(self, attrs): pass
    def end_header(self): pass

    # (head is the same as header)
    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    def do_nextid(self, attrs):
        # The attribute list is stored as it was received
        self.nextid = attrs

    def do_isindex(self, attrs):
        self.isindex = 1

    def start_title(self, attrs):
        # From here on handle_data() collects the text of the title
        self.savetext = ''

    def end_title(self):
        if self.savetext is not None:
            self.title = self.savetext
            self.savetext = None

    def handle_data(self, text):
        """Collect text while a title is being saved; ignore it otherwise."""
        if self.savetext is not None:
            self.savetext = self.savetext + text
# Formatting parser -- takes a formatter and a style sheet as arguments
# XXX The use of style sheets should change: for each tag and end tag
# there should be a style definition, and a style definition should
# encompass many more parameters: font, justification, indentation,
# vspace before, vspace after, hanging tag...
# Patterns used by FormattingParser.handle_data to split running text:
# wordprog matches a (possibly empty) run of characters other than blank,
# tab and newline; spaceprog a (possibly empty) run of blanks, tabs and
# newlines.  handle_data adds the value of match(text, pos) to pos, so it
# relies on match() returning the length of the match.
wordprog = regex.compile('[^ \t\n]*')
spaceprog = regex.compile('[ \t\n]*')
class FormattingParser(CollectingParser):
def __init__(self, formatter, stylesheet):
CollectingParser.__init__(self)
self.fmt = formatter
self.stl = stylesheet
self.savetext = None
self.compact = 0
self.nofill = 0
self.resetfont()
self.setindent(self.stl.stdindent)
def resetfont(self):
self.fontstack = []
self.stylestack = []
self.fontset = self.stl.stdfontset
self.style = ROMAN
self.passfont()
def passfont(self):
font = self.fontset[self.style]
self.fmt.setfont(font)
def pushstyle(self, style):
self.stylestack.append(self.style)
self.style = min(style, len(self.fontset)-1)
self.passfont()
def popstyle(self):
self.style = self.stylestack[-1]
del self.stylestack[-1]
self.passfont()
def pushfontset(self, fontset, style):
self.fontstack.append(self.fontset)
self.fontset = fontset
self.pushstyle(style)
def popfontset(self):
self.fontset = self.fontstack[-1]
del self.fontstack[-1]
self.popstyle()
def flush(self):
self.fmt.flush()
def setindent(self, n):
self.fmt.setleftindent(n)
def needvspace(self, n):
self.fmt.needvspace(n)
def close(self):
HTMLParser.close(self)
self.fmt.flush()
def handle_literal(self, text):
lines = string.splitfields(text, '\n')
for i in range(1, len(lines)):
lines[i] = string.expandtabs(lines[i], 8)
for line in lines[:-1]:
self.fmt.addword(line, 0)
self.fmt.flush()
self.fmt.nospace = 0
for line in lines[-1:]:
self.fmt.addword(line, 0)
def handle_data(self, text):
if self.savetext is not None:
self.savetext = self.savetext + text
return
if self.literal:
self.handle_literal(text)
return
i = 0
n = len(text)
while i < n:
j = i + wordprog.match(text, i)
word = text[i:j]
i = j + spaceprog.match(text, j)
self.fmt.addword(word, i-j)
if self.nofill and '\n' in text[j:i]:
self.fmt.flush()
self.fmt.nospace = 0
i = j+1
while text[i-1] <> '\n': i = i+1
def literal_bgn(self, tag, attrs):
if tag == 'plaintext':
self.flush()
else:
self.needvspace(1)
self.pushfontset(self.stl.stdfontset, FIXED)
self.setindent(self.stl.literalindent)
def literal_end(self, tag):
self.needvspace(1)
self.popfontset()
self.setindent(self.stl.stdindent)
def start_title(self, attrs):
self.flush()
self.savetext = ''
# NB end_title is unchanged
def do_p(self, attrs):
if self.compact:
self.flush()
else:
self.needvspace(1)
def start_h1(self, attrs):
self.needvspace(2)
self.setindent(self.stl.h1indent)
self.pushfontset(self.stl.h1fontset, BOLD)
self.fmt.setjust('c')
def end_h1(self):
self.popfontset()
self.needvspace(2)
self.setindent(self.stl.stdindent)
self.fmt.setjust('l')
def start_h2(self, attrs):
self.needvspace(1)
self.setindent(self.stl.h2indent)
self.pushfontset(self.stl.h2fontset, BOLD)
def end_h2(self):
self.popfontset()
self.needvspace(1)
self.setindent(self.stl.stdindent)
def start_h3(self, attrs):
self.needvspace(1)
self.setindent(self.stl.stdindent)
self.pushfontset(self.stl.h3fontset, BOLD)
def end_h3(self):
self.popfontset()
self.needvspace(1)
self.setindent(self.stl.stdindent)
def start_h4(self, attrs):
self.needvspace(1)
self.setindent(self.stl.stdindent)
self.pushfontset(self.stl.stdfontset, BOLD)
def end_h4(self):
self.popfontset()
self.needvspace(1)
self.setindent(self.stl.stdindent)
start_h5 = start_h4
end_h5 = end_h4
start_h6 = start_h5
end_h6 = end_h5
start_h7 = start_h6
end_h7 = end_h6
def start_ul(self, attrs):
self.needvspace(1)
for attrname, value in attrs:
if attrname == 'compact':
self.compact = 1
self.setindent(0)
break
else:
self.setindent(self.stl.ulindent)
start_dir = start_menu = start_ol = start_ul
do_li = do_p
def end_ul(self):
self.compact = 0
self.needvspace(1)
self.setindent(self.stl.stdindent)
end_dir = end_menu = end_ol = end_ul
def start_dl(self, attrs):
    # Begin a definition list; a 'compact' attribute turns on
    # compact mode.  attrs is a sequence of (name, value) pairs.
    for attrname, value in attrs:
        if attrname == 'compact':
            self.compact = 1
    self.needvspace(1)
def end_dl(self):
    # End a definition list: leave compact mode, request a blank
    # line and restore the standard indent.
    self.compact = 0
    self.needvspace(1)
    self.setindent(self.stl.stdindent)
def do_dt(self, attrs):
    # Definition term: start a new paragraph (only flushed, without a
    # blank line, in compact mode) at the standard indent.
    if self.compact:
        self.flush()
    else:
        self.needvspace(1)
    self.setindent(self.stl.stdindent)
def do_dd(self, attrs):
    # Definition description: add an empty word followed by one space,
    # then move to the description indent.
    self.fmt.addword('', 1)
    self.setindent(self.stl.ddindent)
def start_address(self, attrs):
    # <address>: compact mode, one blank line, right-justified.
    self.compact = 1
    self.needvspace(1)
    self.fmt.setjust('r')
def end_address(self):
    # </address>: undo start_address and restore the standard indent.
    self.compact = 0
    self.needvspace(1)
    self.setindent(self.stl.stdindent)
    self.fmt.setjust('l')
def start_pre(self, attrs):
    # <pre>: one blank line, then bump the nofill counter and switch
    # to the fixed style.  nofill is a counter, so nested elements
    # balance.
    self.needvspace(1)
    self.nofill = self.nofill + 1
    self.pushstyle(FIXED)
def end_pre(self):
    # </pre>: undo start_pre in reverse order.
    self.popstyle()
    self.nofill = self.nofill - 1
    self.needvspace(1)
# <typewriter> is formatted like <pre>.
start_typewriter = start_pre
end_typewriter = end_pre
def do_img(self, attrs):
    # Images are not rendered; the placeholder word '(image)' is
    # emitted instead, with no space after it.
    self.fmt.addword('(image)', 0)
# Physical styles
# Each start handler pushes one of the style constants; each end
# handler pops it again.  The attrs argument is ignored throughout.
def start_tt(self, attrs): self.pushstyle(FIXED)
def end_tt(self): self.popstyle()
def start_b(self, attrs): self.pushstyle(BOLD)
def end_b(self): self.popstyle()
def start_i(self, attrs): self.pushstyle(ITALIC)
def end_i(self): self.popstyle()
# There is no underline style, so <u> is shown in italics.
def start_u(self, attrs): self.pushstyle(ITALIC) # Underline???
def end_u(self): self.popstyle()
def start_r(self, attrs): self.pushstyle(ROMAN) # Not official
def end_r(self): self.popstyle()
# Logical styles
# Each logical style is an alias for one of the physical style
# handlers: start_xxx must be bound to a start handler and end_xxx to
# the matching end handler, otherwise the style stack gets out of
# balance.
start_em = start_i
end_em = end_i
start_strong = start_b
end_strong = end_b
start_code = start_tt
end_code = end_tt
start_samp = start_tt
end_samp = end_tt
start_kbd = start_tt
end_kbd = end_tt
start_file = start_tt # unofficial
end_file = end_tt
start_var = start_i
end_var = end_i
start_dfn = start_i
end_dfn = end_i
start_cite = start_i
end_cite = end_i
start_hp1 = start_i
# This was bound to start_i, which takes an attrs argument and pushes
# a style instead of popping one.
end_hp1 = end_i
start_hp2 = start_b
end_hp2 = end_b
def unknown_starttag(self, tag, attrs):
    # Report a start tag that has no handler on standard output.
    message = '*** unknown <' + tag + '>'
    print(message)

def unknown_endtag(self, tag):
    # Report an end tag that has no handler on standard output.
    message = '*** unknown </' + tag + '>'
    print(message)
# An extension of the formatting parser which formats anchors differently.
class AnchoringParser(FormattingParser):
    # Formatting parser that also reports anchor boundaries to the
    # formatter, so that a back-end can mark up the anchor text.

    def start_a(self, attrs):
        # Let the base class process the tag; if that left an anchor
        # id in self.inanchor, tell the formatter an anchor starts.
        FormattingParser.start_a(self, attrs)
        anchor = self.inanchor
        if anchor:
            self.fmt.bgn_anchor(anchor)

    def end_a(self):
        # Close the anchor that is open, if any, and forget its id.
        anchor = self.inanchor
        if anchor:
            self.fmt.end_anchor(anchor)
            self.inanchor = 0
# Style sheet -- this is never instantiated, but the attributes
# of the class object itself are used to specify fonts to be used
# for various paragraph styles.
# A font set is a non-empty list of fonts, in the order:
# [roman, italic, bold, fixed].
# When a style is not available the nearest lower style is used
# Style constants; each is the index of that style in a font set.
ROMAN, ITALIC, BOLD, FIXED = 0, 1, 2, 3
class NullStylesheet:
    # Base stylesheet: no real fonts, only indents.  The other
    # stylesheets derive from this class and override the font sets.
    # Fonts -- none
    # (a font set holding only None: no font is ever selected)
    stdfontset = [None]
    h1fontset = [None]
    h2fontset = [None]
    h3fontset = [None]
    # Indents
    # (presumably in units of the width of a space -- confirm against
    # the parser's setindent)
    stdindent = 2
    ddindent = 25
    ulindent = 4
    h1indent = 0
    h2indent = 0
    literalindent = 0
class X11Stylesheet(NullStylesheet):
    # Stylesheet using X11 font names (Helvetica, with Courier as the
    # fixed font).  The heading font sets have no fixed entry.
    stdfontset = [
        '-*-helvetica-medium-r-normal-*-*-100-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-100-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-100-100-*-*-*-*-*',
        '-*-courier-medium-r-normal-*-*-100-100-*-*-*-*-*',
        ]
    h1fontset = [
        '-*-helvetica-medium-r-normal-*-*-180-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-180-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-180-100-*-*-*-*-*',
        ]
    h2fontset = [
        '-*-helvetica-medium-r-normal-*-*-140-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-140-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-140-100-*-*-*-*-*',
        ]
    h3fontset = [
        '-*-helvetica-medium-r-normal-*-*-120-100-*-*-*-*-*',
        '-*-helvetica-medium-o-normal-*-*-120-100-*-*-*-*-*',
        '-*-helvetica-bold-r-normal-*-*-120-100-*-*-*-*-*',
        ]
    # Wider description indent than the base stylesheet.
    ddindent = 40
class MacStylesheet(NullStylesheet):
    # Stylesheet using Macintosh fonts, given as (name, style, size)
    # tuples: Geneva for text, Monaco as the fixed font.
    stdfontset = [
        ('Geneva', 'p', 10),
        ('Geneva', 'i', 10),
        ('Geneva', 'b', 10),
        ('Monaco', 'p', 10),
        ]
    h1fontset = [
        ('Geneva', 'p', 18),
        ('Geneva', 'i', 18),
        ('Geneva', 'b', 18),
        ('Monaco', 'p', 18),
        ]
    # The 14-point set used to be a first definition of h3fontset,
    # which the 12-point definition below overwrote; <h2> then fell
    # back to the base stylesheet's [None].
    h2fontset = [
        ('Geneva', 'p', 14),
        ('Geneva', 'i', 14),
        ('Geneva', 'b', 14),
        ('Monaco', 'p', 14),
        ]
    h3fontset = [
        ('Geneva', 'p', 12),
        ('Geneva', 'i', 12),
        ('Geneva', 'b', 12),
        ('Monaco', 'p', 12),
        ]
# StdwinStylesheet is the stylesheet with this platform's font names:
# the Macintosh one on the Mac, the X11 one everywhere else.
if os.name == 'mac':
    StdwinStylesheet = MacStylesheet
else:
    StdwinStylesheet = X11Stylesheet
class GLStylesheet(NullStylesheet):
    # Stylesheet whose fonts are 'Name size' strings (Helvetica, with
    # Courier as the fixed font).
    stdfontset = [
        'Helvetica 10',
        'Helvetica-Italic 10',
        'Helvetica-Bold 10',
        'Courier 10',
        ]
    h1fontset = [
        'Helvetica 18',
        'Helvetica-Italic 18',
        'Helvetica-Bold 18',
        'Courier 18',
        ]
    h2fontset = [
        'Helvetica 14',
        'Helvetica-Italic 14',
        'Helvetica-Bold 14',
        'Courier 14',
        ]
    h3fontset = [
        'Helvetica 12',
        'Helvetica-Italic 12',
        'Helvetica-Bold 12',
        'Courier 12',
        ]
# Test program -- formats a document as plain text on standard output
# and reports how long that took, exclusive of reading the input file
def test():
    # Format the file named by the first command line argument
    # (default 'test.html') as 79-column plain text on standard
    # output, then print the time the formatting took.
    import fmt
    import time
    if sys.argv[1:]:
        file = sys.argv[1]
    else:
        file = 'test.html'
    # Read the whole document up front and close the file explicitly
    # instead of leaving that to garbage collection.
    fp = open(file, 'r')
    try:
        data = fp.read()
    finally:
        fp.close()
    t0 = time.time()
    fmtr = fmt.WritingFormatter(sys.stdout, 79)
    p = FormattingParser(fmtr, NullStylesheet)
    p.feed(data)
    p.close()
    t1 = time.time()
    print('')
    print('*** Formatting time: ' + str(round(t1-t0, 3)) + ' seconds.')
# Test program using stdwin
def testStdwin():
    # Show the file named by the first command line argument
    # (default 'test.html') in a stdwin window until it is closed.
    import stdwin, fmt
    from stdwinevents import WE_CLOSE, WE_SIZE, WE_DRAW
    if sys.argv[1:]:
        file = sys.argv[1]
    else:
        file = 'test.html'
    fp = open(file, 'r')
    try:
        data = fp.read()
    finally:
        fp.close()
    window = stdwin.open('testStdwin')
    b = None
    while 1:
        etype, ewin, edetail = stdwin.getevent()
        if etype == WE_CLOSE:
            break
        if etype == WE_SIZE:
            window.setdocsize(0, 0)
            window.setorigin(0, 0)
            window.change((0, 0), (10000, 30000)) # XXX
        if etype == WE_DRAW:
            if not b:
                # First draw event: format the document, drawing the
                # paragraphs as they are produced.
                b = fmt.StdwinBackEnd(window, 1)
                f = fmt.BaseFormatter(b.d, b)
                # Use the stylesheet chosen for this platform; this
                # used to be MacStylesheet unconditionally, which
                # passes Mac font tuples to X11.
                p = FormattingParser(f, StdwinStylesheet)
                p.feed(data)
                p.close()
                b.finish()
            else:
                b.redraw(edetail)
    window.close()
# Test program using GL
def testGL():
    # Render the file named by the first command line argument
    # (default 'test.html') in a 600x600 GL window, then keep the
    # window up for five seconds.
    import gl, GL, fmt
    import time
    file = 'test.html'
    if sys.argv[1:]:
        file = sys.argv[1]
    data = open(file, 'r').read()
    W, H = 600, 600
    gl.foreground()
    gl.prefsize(W, H)
    wid = gl.winopen('testGL')
    # Origin at the top left, one unit per pixel.
    gl.ortho2(0, W, H, 0)
    gl.color(GL.WHITE)
    gl.clear()
    gl.color(GL.BLACK)
    backend = fmt.GLBackEnd(wid)
    formatter = fmt.BaseFormatter(backend.d, backend)
    parser = FormattingParser(formatter, GLStylesheet)
    parser.feed(data)
    parser.close()
    backend.finish()
    #
    time.sleep(5)
# Run the plain text test when this file is executed as a script.
if __name__ == '__main__':
    test()

408
Lib/lib-old/Para.py Normal file
View File

@ -0,0 +1,408 @@
# Text formatting abstractions
# Oft-used type object
Int = type(0)
# Represent a paragraph. This is a list of words with associated
# font and size information, plus indents and justification for the
# entire paragraph.
# Once the words have been added to a paragraph, it can be laid out
# for different line widths. Once laid out, it can be rendered at
# different screen locations. Once rendered, it can be queried
# for mouse hits, and parts of the text can be highlighted
class Para:
#
def __init__(self):
    # Create an empty, left-justified paragraph without indents.
    self.words = []     # The words
    self.just = 'l'     # Justification: 'l', 'r', 'lr' or 'c'
    self.indent_left = 0
    self.indent_right = 0
    self.indent_hang = 0
    # Final lay-out parameters, may change; None until layout() and
    # render() have filled them in
    self.left = None
    self.top = None
    self.right = None
    self.bottom = None
    self.width = None
    self.height = None
    self.lines = None
#
# Add a word, computing size information for it.
# Words may also be added manually by appending to self.words
# Each word should be a 7-tuple:
# (font, text, width, space, stretch, ascent, descent)
def addword(self, d, font, text, space, stretch):
    # Append one word, measured with drawing object d.
    # font may be None to keep the current font; space and stretch
    # are multiples of the width of a space in the current font.
    if font is not None:
        d.setfont(font)
    width = d.textwidth(text)
    ascent = d.baseline()
    descent = d.lineheight() - ascent
    unit = d.textwidth(' ')
    entry = (font, text, width, space * unit, stretch * unit,
             ascent, descent)
    self.words.append(entry)
#
# Hooks to begin and end anchors -- insert numbers in the word list!
def bgn_anchor(self, id):
    # Mark the start of an anchor by putting its number in the word
    # list.  layout() treats numbers greater than zero as the start
    # of an anchor.
    self.words.append(id)
#
def end_anchor(self, id):
    # Mark the end of an anchor.  The id is ignored: the end marker
    # is always the number 0.
    self.words.append(0)
#
# Return the total length (width) of the text added so far, in pixels
def getlength(self):
    # Sum the width and trailing space of every word; the anchor
    # markers (plain numbers) in the word list take no room.
    length = 0
    for item in self.words:
        if type(item) == Int:
            continue
        length = length + item[2] + item[3]
    return length
#
# Tab to a given position (relative to the current left indent):
# remove all stretch, add fixed space up to the new indent.
# If the current position is already beyond the tab stop,
# don't add any new space (but still remove the stretch)
def tabto(self, tab):
    # Remove the stretch from every word and, if the text is shorter
    # than tab, append an empty word with enough fixed space to reach
    # the tab stop.  The filler takes the ascent and descent of the
    # last word (1 and 0 if there are no words).
    total = 0
    asc, desc = 1, 0
    for i in range(len(self.words)):
        word = self.words[i]
        if type(word) == Int: continue
        fo, te, wi, sp, st, asc, desc = word
        self.words[i] = fo, te, wi, sp, 0, asc, desc
        total = total + wi + sp
    if total < tab:
        # append() takes exactly one argument, so the word is passed
        # as an explicit tuple.
        self.words.append((None, '', 0, tab-total, 0, asc, desc))
#
# Make a hanging tag: tab to hang, increment indent_left by hang,
# and reset indent_hang to -hang
def makehangingtag(self, hang):
    # Turn the text so far into a hanging tag of width hang: pad it
    # to that width, then move the left indent right by hang and let
    # the first line hang back by the same amount.
    self.tabto(hang)
    self.indent_hang = -hang
    self.indent_left = self.indent_left + hang
#
# Decide where the line breaks will be given some screen width
def layout(self, linewidth):
    # Break the word list into lines for a given width.
    # Sets self.width, self.height and self.lines; each entry of
    # self.lines is a tuple
    #   (itemcount, firstfont, charcount, width, stretch,
    #    ascent, descent)
    # where itemcount counts words and anchor markers alike.
    self.width = linewidth
    height = 0
    self.lines = lines = []
    avail1 = self.width - self.indent_left - self.indent_right
    # Only the first line is affected by the hanging indent
    avail = avail1 - self.indent_hang
    words = self.words
    i = 0
    n = len(words)
    lastfont = None
    while i < n:
        firstfont = lastfont
        charcount = 0
        width = 0
        stretch = 0
        ascent = 0
        descent = 0
        # Space and stretch of the last word on the line.  Both are
        # reset for every line, so that a line without words does not
        # use stale or unbound values.
        lsp = 0
        lst = 0
        j = i
        while i < n:
            word = words[i]
            if type(word) == Int:
                # An anchor starting on a full line goes to the next
                # line -- but only if this line has some text,
                # otherwise no line would ever accept it.
                if word > 0 and width >= avail and width > 0:
                    break
                i = i+1
                continue
            fo, te, wi, sp, st, asc, desc = word
            # The first word is always accepted, however wide it is
            if width + wi > avail and width > 0 and wi > 0:
                break
            if fo is not None:
                lastfont = fo
                if width == 0:
                    firstfont = fo
            charcount = charcount + len(te) + (sp > 0)
            width = width + wi + sp
            lsp = sp
            stretch = stretch + st
            lst = st
            ascent = max(ascent, asc)
            descent = max(descent, desc)
            i = i+1
        # Move anchor starts at the end of the line to the next line
        scanned = i
        while i > j and type(words[i-1]) == Int and \
              words[i-1] > 0: i = i-1
        if i == j:
            # The line held nothing but anchor starts.  Keep them
            # here; giving them all back would repeat this line
            # forever.
            i = scanned
        # The space after the last word does not count
        width = width - lsp
        if i < n:
            stretch = stretch - lst
        else:
            # The last line of a paragraph is never stretched
            stretch = 0
        tuple = i-j, firstfont, charcount, width, stretch, \
                ascent, descent
        lines.append(tuple)
        height = height + ascent + descent
        avail = avail1
    self.height = height
#
# Call a function for all words in a line
def visit(self, wordfunc, anchorfunc):
    # Walk the laid-out lines, calling
    #   wordfunc(self, line, word, h, v, h2, v2, isfirst, islast)
    # for each word and
    #   anchorfunc(self, line, marker, h, v)
    # for each anchor marker.  (h, v) is the top left corner of the
    # item and (h2, v2) the bottom right corner of a word including
    # its trailing space.  The walk stops as soon as a callback
    # returns something other than None, and that value is returned.
    # Uses self.lines, self.left and self.top, so layout() must have
    # been called and a position assigned.
    avail1 = self.width - self.indent_left - self.indent_right
    avail = avail1 - self.indent_hang
    v = self.top
    i = 0
    for tuple in self.lines:
        wordcount, firstfont, charcount, width, stretch, \
                   ascent, descent = tuple
        h = self.left + self.indent_left
        # Only the first line gets the hanging indent
        if i == 0: h = h + self.indent_hang
        extra = 0
        if self.just == 'r': h = h + avail - width
        elif self.just == 'c': h = h + (avail - width) / 2
        elif self.just == 'lr' and stretch > 0:
            extra = avail - width
        v2 = v + ascent + descent
        for j in range(i, i+wordcount):
            word = self.words[j]
            if type(word) == Int:
                ok = anchorfunc(self, tuple, word, \
                                h, v)
                if ok <> None: return ok
                continue
            fo, te, wi, sp, st, as, de = word
            if extra > 0 and stretch > 0:
                # Give this word its share of the remaining slack, in
                # proportion to its stretch
                ex = extra * st / stretch
                extra = extra - ex
                stretch = stretch - st
            else:
                ex = 0
            h2 = h + wi + sp + ex
            ok = wordfunc(self, tuple, word, h, v, \
                    h2, v2, (j==i), (j==i+wordcount-1))
            if ok <> None: return ok
            h = h2
        v = v2
        i = i + wordcount
        avail = avail1
#
# Render a paragraph in "drawing object" d, using the rectangle
# given by (left, top, right) with an unspecified bottom.
# Return the computed bottom of the text.
def render(self, d, left, top, right):
    # Draw the paragraph with drawing object d between the columns
    # left and right, starting at row top; lay it out again first if
    # the width changed.  Returns the row below the last line.
    span = right - left
    if self.width != span:
        self.layout(span)
    self.left = left
    self.top = top
    self.right = right
    self.bottom = self.top + self.height
    self.anchorid = 0
    cls = self.__class__
    try:
        # The drawing object is only kept for the duration of the walk
        self.d = d
        self.visit(cls._renderword, cls._renderanchor)
    finally:
        self.d = None
    return self.bottom
#
def _renderword(self, tuple, word, h, v, h2, v2, isfirst, islast):
    # visit() callback for render(): draw one word, underlined while
    # inside an anchor.  tuple is the line entry, word the 7-tuple.
    if word[0] <> None: self.d.setfont(word[0])
    # Align the word's own baseline with the baseline of the line:
    # tuple[5] is the line ascent, word[5] the word ascent.
    baseline = v + tuple[5]
    self.d.text((h, baseline - word[5]), word[1])
    if self.anchorid > 0:
        self.d.line((h, baseline+2), (h2, baseline+2))
#
def _renderanchor(self, tuple, word, h, v):
    # visit() callback for render(): remember the current anchor
    # marker (0 means no anchor is open).
    self.anchorid = word
#
# Return which anchor(s) was hit by the mouse
def hitcheck(self, mouseh, mousev):
    # Return the list of anchor ids whose text contains the point
    # (mouseh, mousev); an id is listed once per word that is hit.
    self.mouseh, self.mousev = mouseh, mousev
    self.anchorid = 0
    self.hits = []
    cls = self.__class__
    self.visit(cls._hitcheckword, cls._hitcheckanchor)
    return self.hits
#
def _hitcheckword(self, tuple, word, h, v, h2, v2, isfirst, islast):
    # visit() callback for hitcheck(): record the current anchor if
    # the mouse position lies within this word's rectangle (edges
    # included).
    if self.anchorid > 0 and h <= self.mouseh <= h2 and \
       v <= self.mousev <= v2:
        self.hits.append(self.anchorid)
#
def _hitcheckanchor(self, tuple, word, h, v):
    # visit() callback for hitcheck(): remember the current anchor
    # marker (0 means no anchor is open).
    self.anchorid = word
#
# Return whether the given anchor id is present
def hasanchor(self, id):
    # True if the word list contains the marker id or its negation.
    markers = self.words
    return (id in markers) or (-id in markers)
#
# Extract the raw text from the word list, substituting one space
# for non-empty inter-word space, and terminating with '\n'
def extract(self):
    # Concatenate the text of all words, writing a single space for
    # any non-empty space after a word, and end with a newline.
    # Anchor markers are skipped.
    result = ''
    for item in self.words:
        if type(item) == Int:
            continue
        piece = item[1]
        if item[3]:
            piece = piece + ' '
        result = result + piece
    return result + '\n'
#
# Return which character position was hit by the mouse, as
# an offset in the entire text as returned by extract().
# Return None if the mouse was not in this paragraph
def whereis(self, d, mouseh, mousev):
    # Translate a mouse position into a character offset in the text
    # that extract() returns; None if the mouse is above or below the
    # paragraph.  d is used to measure individual characters.
    if not (self.top <= mousev <= self.bottom):
        return None
    self.mouseh, self.mousev = mouseh, mousev
    self.lastfont = None
    self.charcount = 0
    cls = self.__class__
    try:
        # The drawing object is only kept for the duration of the walk
        self.d = d
        return self.visit(cls._whereisword, cls._whereisanchor)
    finally:
        self.d = None
#
def _whereisword(self, tuple, word, h1, v1, h2, v2, isfirst, islast):
    # visit() callback for whereis().  Returns None (after adding the
    # word's characters to self.charcount) when the mouse is not in
    # this word, else the character offset that was hit.
    fo, te, wi, sp, st, as, de = word
    if fo <> None: self.lastfont = fo
    h = h1
    # The first and last word of a line also catch clicks in the
    # margin on their side
    if isfirst: h1 = 0
    if islast: h2 = 999999
    if not (v1 <= self.mousev <= v2 and h1 <= self.mouseh <= h2):
        self.charcount = self.charcount + len(te) + (sp > 0)
        return
    if self.lastfont <> None:
        self.d.setfont(self.lastfont)
    cc = 0
    for c in te:
        cw = self.d.textwidth(c)
        # A click in the left half of a character selects the
        # position before it
        if self.mouseh <= h + cw/2:
            return self.charcount + cc
        cc = cc+1
        h = h+cw
    self.charcount = self.charcount + cc
    # Past the text: decide between before and after the space
    if self.mouseh <= (h+h2) / 2:
        return self.charcount
    else:
        return self.charcount + 1
#
def _whereisanchor(self, tuple, word, h, v):
    # visit() callback for whereis(): anchor markers are ignored.
    pass
#
# Return screen position corresponding to position in paragraph.
# Return tuple (h, vtop, vbaseline, vbottom).
# This is more or less the inverse of whereis()
def screenpos(self, d, pos):
    # Map a character offset to (h, vtop, vbaseline, vbottom).
    # A negative offset gives the start of the first line; an offset
    # past the last word gives the right end of the last line.
    if pos < 0:
        ascent, descent = self.lines[0][5:7]
        top = self.top
        return self.left, top, top + ascent, top + ascent + descent
    self.pos = pos
    self.lastfont = None
    cls = self.__class__
    try:
        # The drawing object is only kept for the duration of the walk
        self.d = d
        found = self.visit(cls._screenposword, cls._screenposanchor)
    finally:
        self.d = None
    if found is None:
        ascent, descent = self.lines[-1][5:7]
        bottom = self.bottom
        found = (self.right, bottom - ascent - descent,
                 bottom - descent, bottom)
    return found
#
def _screenposword(self, tuple, word, h1, v1, h2, v2, isfirst, islast):
    # visit() callback for screenpos().  self.pos counts down the
    # characters still to skip; returns None until the word holding
    # the position is reached, then (h, vtop, vbaseline, vbottom).
    fo, te, wi, sp, st, asc, desc = word
    if fo is not None: self.lastfont = fo
    cc = len(te) + (sp > 0)
    if self.pos > cc:
        self.pos = self.pos - cc
        return
    if self.pos < cc:
        # Only select a font if one has been seen: a paragraph whose
        # words all have font None would otherwise pass None to
        # setfont().
        if self.lastfont is not None:
            self.d.setfont(self.lastfont)
        h = h1 + self.d.textwidth(te[:self.pos])
    else:
        h = h2
    ascent, descent = tuple[5:7]
    return h, v1, v1+ascent, v2
#
def _screenposanchor(self, tuple, word, h, v):
    # visit() callback for screenpos(): anchor markers are ignored.
    pass
#
# Invert the stretch of text between pos1 and pos2.
# If pos1 is None, the beginning is implied;
# if pos2 is None, the end is implied.
# Undoes its own effect when called again with the same arguments
def invert(self, d, pos1, pos2):
    # Invert the text between character offsets pos1 and pos2 using
    # drawing object d.  None for pos1 means the start of the
    # paragraph, None for pos2 the end.
    if pos1 is None:
        first = self.left, self.top, self.top, self.top
    else:
        first = self.screenpos(d, pos1)
    if pos2 is None:
        last = self.right, self.bottom, self.bottom, self.bottom
    else:
        last = self.screenpos(d, pos2)
    h1, top1, base1, bottom1 = first
    h2, top2, base2, bottom2 = last
    if bottom1 <= top2:
        # The two ends are on different lines: do the rest of the
        # first line, then any whole lines in between.
        d.invert((h1, top1), (self.right, bottom1))
        h1 = self.left
        if bottom1 < top2:
            d.invert((h1, bottom1), (self.right, top2))
        top1 = top2
    # The (remaining part of the) last line
    d.invert((h1, top1), (h2, bottom2))
# Test class Para
# XXX This was last used on the Mac, hence the weird fonts...
def test():
    # Interactive test: show the same sentence in four paragraphs,
    # one for each justification, in a stdwin window, and let the
    # user select a stretch of text in one paragraph with the mouse.
    import stdwin
    from stdwinevents import *
    words = 'The', 'quick', 'brown', 'fox', 'jumps', 'over', \
            'the', 'lazy', 'dog.'
    paralist = []
    for just in 'l', 'r', 'lr', 'c':
        p = Para()
        p.just = just
        # The stdwin module itself is used for the measurements
        p.addword(stdwin, ('New York', 'p', 12), words[0], 1, 1)
        for word in words[1:-1]:
            p.addword(stdwin, None, word, 1, 1)
        p.addword(stdwin, None, words[-1], 2, 4)
        p.addword(stdwin, ('New York', 'b', 18), 'Bye!', 0, 0)
        p.addword(stdwin, ('New York', 'p', 10), 'Bye!', 0, 0)
        paralist.append(p)
    window = stdwin.open('Para.test()')
    # Current selection: paragraph and the two character offsets
    start = stop = selpara = None
    while 1:
        etype, win, detail = stdwin.getevent()
        if etype == WE_CLOSE:
            break
        if etype == WE_SIZE:
            window.change((0, 0), (1000, 1000))
        if etype == WE_DRAW:
            width, height = window.getwinsize()
            d = None
            try:
                d = window.begindrawing()
                d.cliprect(detail)
                d.erase(detail)
                v = 0
                # Each paragraph starts where the previous one ended
                for p in paralist:
                    v = p.render(d, 0, v, width)
                    if p == selpara and \
                       start <> None and stop <> None:
                        p.invert(d, start, stop)
            finally:
                if d: d.close()
        if etype == WE_MOUSE_DOWN:
            # Inverting the old selection again removes it
            if selpara and start <> None and stop <> None:
                d = window.begindrawing()
                selpara.invert(d, start, stop)
                d.close()
            start = stop = selpara = None
            mouseh, mousev = detail[0]
            for p in paralist:
                start = p.whereis(stdwin, mouseh, mousev)
                if start <> None:
                    selpara = p
                    break
        if etype == WE_MOUSE_UP and start <> None and selpara:
            mouseh, mousev = detail[0]
            stop = selpara.whereis(stdwin, mouseh, mousev)
            if stop == None: start = selpara = None
            else:
                if start > stop:
                    start, stop = stop, start
                d = window.begindrawing()
                selpara.invert(d, start, stop)
                d.close()
    window.close()

621
Lib/lib-old/fmt.py Normal file
View File

@ -0,0 +1,621 @@
# Text formatting abstractions
import string
import Para
# A formatter back-end object has one method that is called by the formatter:
# addpara(p), where p is a paragraph object. For example:
# Formatter back-end to do nothing at all with the paragraphs
class NullBackEnd:
    # Back-end that discards everything.  It also serves as the base
    # class of the other back-ends.
    #
    def __init__(self):
        pass
    #
    def addpara(self, p):
        # Receive a completed paragraph p; ignored here.
        pass
    #
    def bgn_anchor(self, id):
        # Start of the anchor with the given id; ignored here.
        pass
    #
    def end_anchor(self, id):
        # End of the anchor with the given id; ignored here.
        pass
# Formatter back-end to collect the paragraphs in a list
class SavingBackEnd(NullBackEnd):
    # Back-end that keeps all paragraphs in self.paralist.
    # A position in the document (a "long" position) is a pair
    # (paragraph index, character offset in that paragraph's text).
    #
    def __init__(self):
        self.paralist = []
    #
    def addpara(self, p):
        self.paralist.append(p)
    #
    def hitcheck(self, h, v):
        # Return the ids of the anchors at position (h, v), each id
        # listed once.
        found = []
        for para in self.paralist:
            if not (para.top <= v <= para.bottom):
                continue
            for id in para.hitcheck(h, v):
                if id not in found:
                    found.append(id)
        return found
    #
    def extract(self):
        # Return the text of all paragraphs, concatenated.
        result = ''
        for para in self.paralist:
            result = result + (para.extract())
        return result
    #
    def extractpart(self, long1, long2):
        # Return the text between two long positions, in either order.
        if long1 > long2:
            long1, long2 = long2, long1
        index, start = long1
        lastindex, stop = long2
        result = ''
        # Whole tail of every paragraph before the last one
        while index < lastindex:
            chunk = self.paralist[index].extract()
            result = result + chunk[start:]
            start = 0
            index = index + 1
        chunk = self.paralist[lastindex].extract()
        return result + chunk[start:stop]
    #
    def whereis(self, d, h, v):
        # Return the long position at screen position (h, v), or None
        # if no paragraph contains it.
        for index in range(len(self.paralist)):
            offset = self.paralist[index].whereis(d, h, v)
            if offset != None:
                return index, offset
        return None
    #
    def roundtowords(self, long1, long2):
        # Widen the range (long1, long2) to word boundaries: long1
        # moves back to just after the preceding space, long2 forward
        # to the next space or the final newline.
        index, offset = long1
        text = self.paralist[index].extract()
        while offset > 0 and text[offset-1] != ' ':
            offset = offset - 1
        long1 = index, offset
        #
        index, offset = long2
        text = self.paralist[index].extract()
        limit = len(text) - 1
        while offset < limit and text[offset] != ' ':
            offset = offset + 1
        long2 = index, offset
        #
        return long1, long2
    #
    def roundtoparagraphs(self, long1, long2):
        # Widen the range (long1, long2) to whole paragraphs.
        lastindex = long2[0]
        long1 = long1[0], 0
        long2 = lastindex, len(self.paralist[lastindex].extract())
        return long1, long2
# Formatter back-end to send the text directly to the drawing object
class WritingBackEnd(NullBackEnd):
    # Back-end that renders each paragraph as soon as it arrives and
    # keeps no record of it.
    #
    def __init__(self, d, width):
        # d is the drawing object to render on, width the line width.
        self.d = d
        self.width = width
        # Vertical position at which the next paragraph starts
        self.lineno = 0
    #
    def addpara(self, p):
        # Render p below the previous paragraph; render() returns the
        # bottom of the paragraph, which is where the next one goes.
        self.lineno = p.render(self.d, 0, self.lineno, self.width)
# A formatter receives a stream of formatting instructions and assembles
# these into a stream of paragraphs on to a back-end. The assembly is
# parametrized by a text measurement object, which must match the output
# operations of the back-end. The back-end is responsible for splitting
# paragraphs up in lines of a given maximum width. (This is done because
# in a windowing environment, when the window size changes, there is no
# need to redo the assembly into paragraphs, but the splitting into lines
# must be done taking the new window size into account.)
# Formatter base class. Initialize it with a text measurement object,
# which is used for text measurements, and a back-end object,
# which receives the completed paragraphs. The formatting methods are:
# setfont(font)
# setleftindent(nspaces)
# setjust(type) where type is 'l', 'c', 'r', or 'lr'
# flush()
# vspace(nlines)
# needvspace(nlines)
# addword(word, nspaces)
class BaseFormatter:
    # Assemble words into paragraphs and hand each completed
    # paragraph to a back-end.
    #
    def __init__(self, d, b):
        # Drawing object used for text measurements
        self.d = d
        #
        # BackEnd object receiving completed paragraphs
        self.b = b
        #
        # Parameters of the formatting model
        self.leftindent = 0
        # The right indent is set here too: setrightindent() used to
        # be the only place that created this attribute.
        self.rightindent = 0
        self.just = 'l'
        self.font = None
        self.blanklines = 0
        #
        # Parameters derived from the current font
        self.space = d.textwidth(' ')
        self.line = d.lineheight()
        self.ascent = d.baseline()
        self.descent = self.line - self.ascent
        #
        # Parameter derived from the default font
        self.n_space = self.space
        #
        # Current paragraph being built
        self.para = None
        self.nospace = 1
        #
        # Font to set on the next word
        self.nextfont = None
    #
    def newpara(self):
        # Create an empty paragraph object; derived classes may
        # override this to use another paragraph class.
        return Para.Para()
    #
    def setfont(self, font):
        # Select a font for the following words and update the
        # measurements derived from it.  None leaves the font alone.
        if font is None: return
        self.font = self.nextfont = font
        d = self.d
        d.setfont(font)
        self.space = d.textwidth(' ')
        self.line = d.lineheight()
        self.ascent = d.baseline()
        self.descent = self.line - self.ascent
    #
    def setleftindent(self, nspaces):
        # Set the left indent, in units of the width of a space in the
        # default font.  If the text collected so far fits in the
        # extra indent it becomes a hanging tag; otherwise the current
        # paragraph is flushed.
        self.leftindent = int(self.n_space * nspaces)
        if self.para:
            hang = self.leftindent - self.para.indent_left
            if hang > 0 and self.para.getlength() <= hang:
                self.para.makehangingtag(hang)
                self.nospace = 1
            else:
                self.flush()
    #
    def setrightindent(self, nspaces):
        # Set the right indent, in the same units as the left indent.
        # The current paragraph gets the new indent and is flushed.
        self.rightindent = int(self.n_space * nspaces)
        if self.para:
            self.para.indent_right = self.rightindent
            self.flush()
    #
    def setjust(self, just):
        # Set the justification ('l', 'c', 'r' or 'lr'), also for the
        # paragraph being built.
        self.just = just
        if self.para:
            self.para.just = self.just
    #
    def flush(self):
        # Hand the current paragraph, if any, to the back-end.
        if self.para:
            self.b.addpara(self.para)
            self.para = None
            # The back-end may have changed the font of the drawing
            # object; select the current font again.
            if self.font is not None:
                self.d.setfont(self.font)
        self.nospace = 1
    #
    def vspace(self, nlines):
        # Add nlines lines of vertical space, as a paragraph holding a
        # single empty word of that height.
        self.flush()
        if nlines > 0:
            self.para = self.newpara()
            tuple = None, '', 0, 0, 0, int(nlines*self.line), 0
            self.para.words.append(tuple)
            self.flush()
            self.blanklines = self.blanklines + nlines
    #
    def needvspace(self, nlines):
        # Make sure that at least nlines blank lines precede the text
        # that follows.
        self.flush() # Just to be sure
        if nlines > self.blanklines:
            self.vspace(nlines - self.blanklines)
    #
    def addword(self, text, space):
        # Add a word followed by space spaces.  An empty word at the
        # start of a paragraph is dropped.
        if self.nospace and not text:
            return
        self.nospace = 0
        self.blanklines = 0
        if not self.para:
            self.para = self.newpara()
            self.para.indent_left = self.leftindent
            # Carry the right indent over to the new paragraph as
            # well; it used to be lost after the first flush.
            self.para.indent_right = self.rightindent
            self.para.just = self.just
            self.nextfont = self.font
        space = int(space * self.space)
        # append() takes exactly one argument, so the word is passed
        # as an explicit tuple.
        self.para.words.append((self.nextfont, text,
                self.d.textwidth(text), space, space,
                self.ascent, self.descent))
        self.nextfont = None
    #
    def bgn_anchor(self, id):
        # Start an anchor.  An anchor marker needs a paragraph to live
        # in, so an empty word is forced in if there is none yet.
        if not self.para:
            self.nospace = 0
            self.addword('', 0)
        self.para.bgn_anchor(id)
    #
    def end_anchor(self, id):
        # End an anchor; see bgn_anchor().
        if not self.para:
            self.nospace = 0
            self.addword('', 0)
        self.para.end_anchor(id)
# Measuring object for measuring text as viewed on a tty
class NullMeasurer:
    # Measures text for a character-cell display: every character is
    # one unit wide and every line one unit high.
    #
    def __init__(self):
        pass
    #
    def setfont(self, font):
        # Fonts make no difference to the measurements.
        pass
    #
    def textwidth(self, text):
        return len(text)
    #
    def lineheight(self):
        return 1
    #
    def baseline(self):
        # An ascent of 0 puts the whole line height in the descent.
        return 0
# Drawing object for writing plain ASCII text to a file
class FileWriter:
    # Drawing object that writes plain text to a file, keeping track
    # of the current line and column to position the text.
    #
    def __init__(self, fp):
        # fp is the file object to write to.
        self.fp = fp
        self.lineno, self.colno = 0, 0
    #
    def setfont(self, font):
        # Fonts cannot be shown in plain text.
        pass
    #
    def text(self, pos, str):
        # Write str at column h of line v, where pos is the pair
        # (h, v).  Raises ValueError if str contains a newline.
        # (The pair is unpacked here and the exception raised as a
        # call; the older spellings are rejected by current Python.)
        h, v = pos
        if not str: return
        if '\n' in str:
            raise ValueError('can\'t write \\n')
        # Move down with newlines
        while self.lineno < v:
            self.fp.write('\n')
            self.colno, self.lineno = 0, self.lineno + 1
        while self.lineno > v:
            # XXX This should never happen...
            self.fp.write('\033[A') # ANSI up arrow
            self.lineno = self.lineno - 1
        # Move right with spaces, left with backspaces
        if self.colno < h:
            self.fp.write(' ' * (h - self.colno))
        elif self.colno > h:
            self.fp.write('\b' * (self.colno - h))
        self.colno = h
        self.fp.write(str)
        self.colno = h + len(str)
# Formatting class to do nothing at all with the data
class NullFormatter(BaseFormatter):
    # Formatter that measures text as on a tty and throws the
    # resulting paragraphs away.
    #
    def __init__(self):
        d = NullMeasurer()
        b = NullBackEnd()
        BaseFormatter.__init__(self, d, b)
# Formatting class to write directly to a file
class WritingFormatter(BaseFormatter):
    # Formatter that writes plain text, at most width columns wide, to
    # the file object fp.
    #
    def __init__(self, fp, width):
        # Text is measured in character cells and drawn by a
        # FileWriter on fp
        dm = NullMeasurer()
        dw = FileWriter(fp)
        b = WritingBackEnd(dw, width)
        BaseFormatter.__init__(self, dm, b)
        # Pretend one blank line was already written, so that a first
        # request for one blank line adds nothing
        self.blanklines = 1
    #
    # Suppress multiple blank lines
    def needvspace(self, nlines):
        BaseFormatter.needvspace(self, min(1, nlines))
# A "FunnyFormatter" writes ASCII text with a twist: *bold words*,
# _italic text_ and _underlined words_, and `quoted text'.
# It assumes that the fonts are 'r', 'i', 'b', 'u', 'q': (roman,
# italic, bold, underline, quote).
# Moreover, if the font is in upper case, the text is converted to
# UPPER CASE.
class FunnyFormatter(WritingFormatter):
    #
    def flush(self):
        # Add the font markers to the paragraph before it is handed to
        # the back-end.
        if self.para: finalize(self.para)
        WritingFormatter.flush(self)
# Surrounds *bold words* and _italic text_ in a paragraph with
# appropriate markers, fixing the size (assuming these characters'
# width is 1).
# Marker written before (openchar) and after (closechar) a stretch of
# text, indexed by font letter.  Font 'r' has no entry and so gets no
# markers.
openchar = \
    {'b':'*', 'i':'_', 'u':'_', 'q':'`', 'B':'*', 'I':'_', 'U':'_', 'Q':'`'}
closechar = \
    {'b':'*', 'i':'_', 'u':'_', 'q':'\'', 'B':'*', 'I':'_', 'U':'_', 'Q':'\''}
def finalize(para):
    # Rewrite the words of para in place: put the opening marker of a
    # font before the first word in that font and the closing marker
    # after the last one, and convert the text of upper case fonts to
    # upper case.  Word widths are adjusted for the added markers.
    # NOTE(review): every entry of para.words is sliced like a word
    # tuple, so a paragraph containing anchor markers would fail here.
    oldfont = curfont = 'r'
    # A word in font 'r' at the end makes sure the last font is
    # closed.  append() takes exactly one argument, so the word is
    # passed as an explicit tuple.
    para.words.append(('r', '', 0, 0, 0, 0)) # temporary, deleted at end
    for i in range(len(para.words)):
        fo, te, wi = para.words[i][:3]
        if fo != None: curfont = fo
        if curfont != oldfont:
            if closechar.has_key(oldfont):
                # Close the old font after the last non-empty word
                c = closechar[oldfont]
                j = i-1
                while j > 0 and para.words[j][1] == '': j = j-1
                fo1, te1, wi1 = para.words[j][:3]
                te1 = te1 + c
                wi1 = wi1 + len(c)
                para.words[j] = (fo1, te1, wi1) + \
                        para.words[j][3:]
            if openchar.has_key(curfont) and te:
                c = openchar[curfont]
                te = c + te
                wi = len(c) + wi
                para.words[i] = (fo, te, wi) + \
                        para.words[i][3:]
            # An empty word does not open the new font yet; try again
            # at the next word
            if te: oldfont = curfont
            else: oldfont = 'r'
        if curfont in string.uppercase:
            te = string.upper(te)
            para.words[i] = (fo, te, wi) + para.words[i][3:]
    del para.words[-1]
# Formatter back-end to draw the text in a window.
# This has an option to draw while the paragraphs are being added,
# to minimize the delay before the user sees anything.
# This manages the entire "document" of the window.
class StdwinBackEnd(SavingBackEnd):
    # Back-end that shows the paragraphs in a stdwin window and
    # manages a selection, given as a pair of long positions
    # (paragraph index, character offset).
    #
    def __init__(self, window, drawnow):
        # window is the stdwin window to draw in.  If drawnow is true,
        # paragraphs are drawn as they are added.
        self.window = window
        self.drawnow = drawnow
        self.width = window.getwinsize()[0]
        self.selection = None
        # Total height of the paragraphs added so far
        self.height = 0
        window.setorigin(0, 0)
        window.setdocsize(0, 0)
        # Drawing object, kept open until finish() is called
        self.d = window.begindrawing()
        SavingBackEnd.__init__(self)
    #
    def finish(self):
        # Call this when all paragraphs have been added: closes the
        # drawing object and sets the document height.
        self.d.close()
        self.d = None
        self.window.setdocsize(0, self.height)
    #
    def addpara(self, p):
        # Add paragraph p below the previous one, either drawing it or
        # only computing its position.
        self.paralist.append(p)
        if self.drawnow:
            self.height = \
                    p.render(self.d, 0, self.height, self.width)
        else:
            p.layout(self.width)
            p.left = 0
            p.top = self.height
            p.right = self.width
            p.bottom = self.height + p.height
            self.height = p.bottom
    #
    def resize(self):
        # Lay out all paragraphs again for the current window width.
        # window.change() is called for both the old and the new
        # extent of the document.
        self.window.change((0, 0), (self.width, self.height))
        self.width = self.window.getwinsize()[0]
        self.height = 0
        for p in self.paralist:
            p.layout(self.width)
            p.left = 0
            p.top = self.height
            p.right = self.width
            p.bottom = self.height + p.height
            self.height = p.bottom
        self.window.change((0, 0), (self.width, self.height))
        self.window.setdocsize(0, self.height)
    #
    def redraw(self, area):
        # Redraw the rectangle area, given as
        # ((left, top), (right, bottom)).
        d = self.window.begindrawing()
        (left, top), (right, bottom) = area
        d.erase(area)
        d.cliprect(area)
        # Only the paragraphs that overlap the area vertically
        for p in self.paralist:
            if top < p.bottom and p.top < bottom:
                v = p.render(d, p.left, p.top, p.right)
        if self.selection:
            self.invert(d, self.selection)
        d.close()
    #
    def setselection(self, new):
        # Make new the selection and update the display.  new is a
        # pair of long positions, or None for no selection.
        if new:
            long1, long2 = new
            pos1 = long1[:3]
            pos2 = long2[:3]
            new = pos1, pos2
        if new <> self.selection:
            d = self.window.begindrawing()
            # Inverting the old selection again removes it
            if self.selection:
                self.invert(d, self.selection)
            if new:
                self.invert(d, new)
            d.close()
            self.selection = new
    #
    def getselection(self):
        return self.selection
    #
    def extractselection(self):
        # Return the selected text, or None without a selection.
        if self.selection:
            a, b = self.selection
            return self.extractpart(a, b)
        else:
            return None
    #
    def invert(self, d, region):
        # Invert the region, a pair of long positions in either order,
        # using drawing object d.
        long1, long2 = region
        if long1 > long2: long1, long2 = long2, long1
        para1, pos1 = long1
        para2, pos2 = long2
        # Every paragraph before the last one is inverted up to its
        # end (None); all but the first from their start (None)
        while para1 < para2:
            self.paralist[para1].invert(d, pos1, None)
            pos1 = None
            para1 = para1 + 1
        self.paralist[para2].invert(d, pos1, pos2)
    #
    def search(self, prog):
        # Search for prog, a string or a compiled regex object, and
        # select and show the match.  Returns 1 if found, else 0.
        # The paragraph text is lowercased before matching, and so is
        # prog when it is given as a string.  The search prefers the
        # first match after the selected paragraph and otherwise takes
        # the first match before it; the selected paragraph itself is
        # skipped.
        import regex, string
        if type(prog) == type(''):
            prog = regex.compile(string.lower(prog))
        if self.selection:
            iold = self.selection[0][0]
        else:
            iold = -1
        hit = None
        for i in range(len(self.paralist)):
            # Before the selection, the first match is enough
            if i == iold or i < iold and hit:
                continue
            p = self.paralist[i]
            text = string.lower(p.extract())
            if prog.search(text) >= 0:
                a, b = prog.regs[0]
                long1 = i, a
                long2 = i, b
                hit = long1, long2
                if i > iold:
                    break
        if hit:
            self.setselection(hit)
            i = hit[0][0]
            p = self.paralist[i]
            self.window.show((p.left, p.top), (p.right, p.bottom))
            return 1
        else:
            return 0
    #
    def showanchor(self, id):
        # Select and show the first paragraph that contains the anchor
        # with the given id; do nothing if there is none.
        for i in range(len(self.paralist)):
            p = self.paralist[i]
            if p.hasanchor(id):
                long1 = i, 0
                long2 = i, len(p.extract())
                hit = long1, long2
                self.setselection(hit)
                self.window.show( \
                        (p.left, p.top), (p.right, p.bottom))
                break
# GL extensions
class GLFontCache:
    # Selects fonts of the fm module by a 'Name size' key and caches
    # the font handles.  Base class of the GL measuring and drawing
    # objects.
    #
    def __init__(self):
        self.reset()
        # Start out with the default font
        self.setfont('')
    #
    def reset(self):
        # Forget the current font and all cached handles.
        self.fontkey = None
        self.fonthandle = None
        self.fontinfo = None
        self.fontcache = {}
    #
    def close(self):
        self.reset()
    #
    def setfont(self, fontkey):
        # Make fontkey the current font.  fontkey is 'Name size'; the
        # size defaults to 12 and an empty key means 'Times-Roman 12'.
        if fontkey == '':
            fontkey = 'Times-Roman 12'
        elif ' ' not in fontkey:
            fontkey = fontkey + ' 12'
        if fontkey == self.fontkey:
            return
        if self.fontcache.has_key(fontkey):
            handle = self.fontcache[fontkey]
        else:
            import string
            i = string.index(fontkey, ' ')
            name, sizestr = fontkey[:i], fontkey[i:]
            # NOTE(review): eval() executes whatever follows the font
            # name.  That is only acceptable if font keys never come
            # from untrusted input -- confirm, or parse the number
            # explicitly.
            size = eval(sizestr)
            # The unscaled font is cached under size 1
            key1 = name + ' 1'
            key = name + ' ' + `size`
            # NB key may differ from fontkey!
            if self.fontcache.has_key(key):
                handle = self.fontcache[key]
            else:
                if self.fontcache.has_key(key1):
                    handle = self.fontcache[key1]
                else:
                    import fm
                    handle = fm.findfont(name)
                    self.fontcache[key1] = handle
                handle = handle.scalefont(size)
                self.fontcache[fontkey] = \
                        self.fontcache[key] = handle
        self.fontkey = fontkey
        # Only switch fonts if the handle really changed
        if self.fonthandle <> handle:
            self.fonthandle = handle
            self.fontinfo = handle.getfontinfo()
            handle.setfont()
class GLMeasurer(GLFontCache):
    # Text measuring object for GL fonts.
    # NOTE(review): fontinfo[6] is used as the line height and
    # fontinfo[3] presumably is the descent -- confirm against the
    # getfontinfo() documentation of the fm module.
    #
    def textwidth(self, text):
        return self.fonthandle.getstrwidth(text)
    #
    def baseline(self):
        return self.fontinfo[6] - self.fontinfo[3]
    #
    def lineheight(self):
        return self.fontinfo[6]
class GLWriter(GLFontCache):
    # Drawing object that writes text in a GL window.
    #
    # NOTES:
    # (1) Use gl.ortho2 to use X pixel coordinates!
    #
    def text(self, pos, text):
        # Draw text with its top left corner at pos, a pair (h, v).
        # The character position is moved down to the baseline first.
        import gl, fm
        h, v = pos
        gl.cmov2i(h, v + self.fontinfo[6] - self.fontinfo[3])
        fm.prstr(text)
    #
    def setfont(self, fontkey):
        # Select a font as in GLFontCache.setfont() and activate it if
        # the handle changed.  (The base class call used to be made
        # without self, and the handle activated through an undefined
        # name.)
        oldhandle = self.fonthandle
        GLFontCache.setfont(self, fontkey)
        if self.fonthandle != oldhandle:
            self.fonthandle.setfont()
class GLMeasurerWriter(GLMeasurer, GLWriter):
    # A single object that can both measure and draw text, with the
    # methods of both base classes.
    pass
class GLBackEnd(SavingBackEnd):
    # Back-end that draws the paragraphs in a GL window as they are
    # added.
    #
    def __init__(self, wid):
        # wid is the id of the GL window to draw in.
        import gl
        gl.winset(wid)
        self.wid = wid
        # NOTE(review): this takes element [1] of gl.getsize() as the
        # width; confirm the order of the sizes that gl.getsize()
        # returns.
        self.width = gl.getsize()[1]
        # Total height of the paragraphs added so far
        self.height = 0
        # One object does both the measuring and the drawing
        self.d = GLMeasurerWriter()
        SavingBackEnd.__init__(self)
    #
    def finish(self):
        # Nothing to finish; present so that this back-end can be used
        # like StdwinBackEnd.
        pass
    #
    def addpara(self, p):
        # Save paragraph p and draw it below the previous one.
        self.paralist.append(p)
        self.height = p.render(self.d, 0, self.height, self.width)
    #
    def redraw(self):
        # Draw all paragraphs again, from the top of the window.
        import gl
        gl.winset(self.wid)
        width = gl.getsize()[1]
        if width <> self.width:
            # The window size changed: forget the old positions.
            # (setdocsize is assigned here but not used.)
            setdocsize = 1
            self.width = width
            for p in self.paralist:
                p.top = p.bottom = None
        d = self.d
        v = 0
        for p in self.paralist:
            v = p.render(d, 0, v, width)

321
Lib/sgmllib.py Normal file
View File

@ -0,0 +1,321 @@
# A parser for SGML, using the derived class as static DTD.
# XXX This only supports those SGML features used by HTML.
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
import regex
import string
# Regular expressions used for parsing
# The patterns are written in the syntax of the regex module, in which
# \| separates alternatives.
# incomplete: the start of markup, including a tag or a reference that
# may have been cut off at the end of the data
incomplete = regex.compile( \
          '<!-?\|</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*\|</?\|' + \
          '&#[a-zA-Z0-9]*\|&[a-zA-Z][a-zA-Z0-9]*\|&')
# entityref: an entity reference such as &amp; (ended by ';' or '.')
entityref = regex.compile('&[a-zA-Z][a-zA-Z0-9]*[;.]')
# charref: a character reference such as &#38;
charref = regex.compile('&#[a-zA-Z0-9]+;')
# starttagopen: the first two characters of a start tag
starttagopen = regex.compile('<[a-zA-Z]')
# endtag: a complete end tag such as </foo>
endtag = regex.compile('</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*>')
# commentopen: the start of a comment
commentopen = regex.compile('<!--')
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.) The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks). Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
class SGMLParser:
# Interface -- initialize and reset this instance
def __init__(self):
self.reset()
# Interface -- reset this instance. Loses all unprocessed data
def reset(self):
self.rawdata = ''
self.stack = []
self.nomoretags = 0
self.literal = 0
# For derived classes only -- enter literal mode (CDATA) till EOF
def setnomoretags(self):
self.nomoretags = self.literal = 1
# For derived classes only -- enter literal mode (CDATA)
def setliteral(self, *args):
self.literal = 1
# Interface -- feed some data to the parser. Call this as
# often as you want, with as little or as much text as you
# want (may include '\n'). (This just saves the text, all the
# processing is done by process() or close().)
def feed(self, data):
self.rawdata = self.rawdata + data
self.goahead(0)
# Interface -- handle the remaining data
def close(self):
self.goahead(1)
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
def goahead(self, end):
rawdata = self.rawdata
i = 0
n = len(rawdata)
while i < n:
if self.nomoretags:
self.handle_data(rawdata[i:n])
i = n
break
j = incomplete.search(rawdata, i)
if j < 0: j = n
if i < j: self.handle_data(rawdata[i:j])
i = j
if i == n: break
if rawdata[i] == '<':
if starttagopen.match(rawdata, i) >= 0:
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue
k = self.parse_starttag(i)
if k < 0: break
i = i + k
continue
k = endtag.match(rawdata, i)
if k >= 0:
j = i+k
self.parse_endtag(rawdata[i:j])
i = j
self.literal = 0
continue
if commentopen.match(rawdata, i) >= 0:
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue
k = self.parse_comment(i)
if k < 0: break
i = i+k
continue
elif rawdata[i] == '&':
k = charref.match(rawdata, i)
if k >= 0:
j = i+k
self.handle_charref(rawdata[i+2:j-1])
i = j
continue
k = entityref.match(rawdata, i)
if k >= 0:
j = i+k
self.handle_entityref(rawdata[i+1:j-1])
i = j
continue
else:
raise RuntimeError, 'neither < nor & ??'
# We get here only if incomplete matches but
# nothing else
k = incomplete.match(rawdata, i)
if k < 0: raise RuntimeError, 'no incomplete match ??'
j = i+k
if j == n: break # Really incomplete
self.handle_data(rawdata[i:j])
i = j
# end while
if end and i < n:
self.handle_data(rawdata[i:n])
i = n
self.rawdata = rawdata[i:]
# XXX if end: check for empty stack
# Internal -- parse comment, return length or -1 if not ternimated
def parse_comment(self, i):
rawdata = self.rawdata
if rawdata[i:i+4] <> '<!--':
raise RuntimeError, 'unexpected call to handle_comment'
try:
j = string.index(rawdata, '--', i+4)
except string.index_error:
return -1
self.handle_comment(rawdata[i+4: j])
j = j+2
n = len(rawdata)
while j < n and rawdata[j] in ' \t\n': j = j+1
if j == n: return -1 # Wait for final '>'
if rawdata[j] == '>':
j = j+1
else:
print '*** comment not terminated with >'
print repr(rawdata[j-5:j]), '*!*', repr(rawdata[j:j+5])
return j-i
# Internal -- handle starttag, return length or -1 if not terminated
def parse_starttag(self, i):
rawdata = self.rawdata
try:
j = string.index(rawdata, '>', i)
except string.index_error:
return -1
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
tagfind = regex.compile('[a-zA-Z][a-zA-Z0-9]*')
attrfind = regex.compile( \
'[ \t\n]+\([a-zA-Z][a-zA-Z0-9]*\)' + \
'\([ \t\n]*=[ \t\n]*' + \
'\(\'[^\']*\';\|"[^"]*"\|[-a-zA-Z0-9./:+*%?!()_#]+\)\)?')
k = tagfind.match(rawdata, i+1)
if k < 0:
raise RuntimeError, 'unexpected call to parse_starttag'
k = i+1+k
tag = string.lower(rawdata[i+1:k])
while k < j:
l = attrfind.match(rawdata, k)
if l < 0: break
regs = attrfind.regs
a1, b1 = regs[1]
a2, b2 = regs[2]
a3, b3 = regs[3]
attrname = rawdata[a1:b1]
if '=' in rawdata[k:k+l]:
attrvalue = rawdata[a3:b3]
if attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
else:
attrvalue = ''
attrs.append(string.lower(attrname), attrvalue)
k = k + l
j = j+1
try:
method = getattr(self, 'start_' + tag)
except AttributeError:
try:
method = getattr(self, 'do_' + tag)
except AttributeError:
self.unknown_starttag(tag, attrs)
return j-i
method(attrs)
return j-i
self.stack.append(tag)
method(attrs)
return j-i
# Internal -- parse endtag
def parse_endtag(self, data):
if data[:2] <> '</' or data[-1:] <> '>':
raise RuntimeError, 'unexpected call to parse_endtag'
tag = string.lower(string.strip(data[2:-1]))
try:
method = getattr(self, 'end_' + tag)
except AttributeError:
self.unknown_endtag(tag)
return
if self.stack and self.stack[-1] == tag:
del self.stack[-1]
else:
print '*** Unbalanced </' + tag + '>'
print '*** Stack:', self.stack
found = None
for i in range(len(self.stack)):
if self.stack[i] == tag: found = i
if found <> None:
del self.stack[found:]
method()
# Example -- handle character reference, no need to override
def handle_charref(self, name):
try:
n = string.atoi(name)
except string.atoi_error:
self.unknown_charref(name)
return
if not 0 <= n <= 255:
self.unknown_charref(name)
return
self.handle_data(chr(n))
# Definition of entities -- derived classes may override
entitydefs = \
{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
# Example -- handle entity reference, no need to override
def handle_entityref(self, name):
table = self.__class__.entitydefs
name = string.lower(name)
if table.has_key(name):
self.handle_data(table[name])
else:
self.unknown_entityref(name)
return
# Example -- handle data, should be overridden
def handle_data(self, data):
pass
# Example -- handle comment, could be overridden
def handle_comment(self, data):
pass
# To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs): pass
def unknown_endtag(self, tag): pass
def unknown_charref(self, ref): pass
def unknown_entityref(self, ref): pass
class TestSGML(SGMLParser):
def handle_data(self, data):
r = repr(data)
if len(r) > 72:
r = r[:35] + '...' + r[-35:]
print 'data:', r
def handle_comment(self, data):
r = repr(data)
if len(r) > 68:
r = r[:32] + '...' + r[-32:]
print 'comment:', r
def unknown_starttag(self, tag, attrs):
print 'start tag: <' + tag,
for name, value in attrs:
print name + '=' + '"' + value + '"',
print '>'
def unknown_endtag(self, tag):
print 'end tag: </' + tag + '>'
def unknown_entityref(self, ref):
print '*** unknown entity ref: &' + ref + ';'
def unknown_charref(self, ref):
print '*** unknown char ref: &#' + ref + ';'
# Test driver -- feed a file to a TestSGML parser one line at a time,
# so that constructs split over feed() calls are exercised, then
# close the parser.  'file' is the name of the file to read; it
# defaults to 'test.html' in the current directory.  The file is
# closed again even if the parser raises an exception.
def test(file = 'test.html'):
    f = open(file, 'r')
    try:
        x = TestSGML()
        while 1:
            line = f.readline()
            if not line: break
            x.feed(line)
        # End of file: let the parser flush whatever it held back
        x.close()
    finally:
        f.close()
#test()