Instead of printing, use self.message() or self.note().

This commit is contained in:
Guido van Rossum 1998-07-08 03:04:39 +00:00
parent 0fd9408c40
commit 125700addb
2 changed files with 63 additions and 72 deletions

View File

@ -249,6 +249,17 @@ class Checker:
self.errors = {} self.errors = {}
self.urlopener = MyURLopener() self.urlopener = MyURLopener()
self.changed = 0 self.changed = 0
def note(self, level, format, *args):
if self.verbose > level:
if args:
format = format%args
self.message(format)
def message(self, format, *args):
if args:
format = format%args
print format
def __getstate__(self): def __getstate__(self):
return (self.roots, self.todo, self.done, self.bad, self.round) return (self.roots, self.todo, self.done, self.bad, self.round)
@ -280,23 +291,18 @@ class Checker:
if self.robots.has_key(root): return if self.robots.has_key(root): return
url = urlparse.urljoin(root, "/robots.txt") url = urlparse.urljoin(root, "/robots.txt")
self.robots[root] = rp = robotparser.RobotFileParser() self.robots[root] = rp = robotparser.RobotFileParser()
if self.verbose > 2: self.note(2, "Parsing %s", url)
print "Parsing", url rp.debug = self.verbose > 3
rp.debug = self.verbose > 3
rp.set_url(url) rp.set_url(url)
try: try:
rp.read() rp.read()
except IOError, msg: except IOError, msg:
if self.verbose > 1: self.note(1, "I/O error parsing %s: %s", url, msg)
print "I/O error parsing", url, ":", msg
def run(self): def run(self):
while self.todo: while self.todo:
self.round = self.round + 1 self.round = self.round + 1
if self.verbose > 0: self.note(0, "\nRound %d (%s)\n", self.round, self.status())
print
print "Round %d (%s)" % (self.round, self.status())
print
urls = self.todo.keys() urls = self.todo.keys()
urls.sort() urls.sort()
del urls[self.roundsize:] del urls[self.roundsize:]
@ -310,40 +316,37 @@ class Checker:
len(self.bad)) len(self.bad))
def report(self): def report(self):
print self.message("")
if not self.todo: print "Final", if not self.todo: s = "Final"
else: print "Interim", else: s = "Interim"
print "Report (%s)" % self.status() self.message("%s Report (%s)", s, self.status())
self.report_errors() self.report_errors()
def report_errors(self): def report_errors(self):
if not self.bad: if not self.bad:
print self.message("\nNo errors")
print "No errors"
return return
print self.message("\nError Report:")
print "Error Report:"
sources = self.errors.keys() sources = self.errors.keys()
sources.sort() sources.sort()
for source in sources: for source in sources:
triples = self.errors[source] triples = self.errors[source]
print self.message("")
if len(triples) > 1: if len(triples) > 1:
print len(triples), "Errors in", source self.message("%d Errors in %s", len(triples), source)
else: else:
print "Error in", source self.message("Error in %s", source)
for url, rawlink, msg in triples: for url, rawlink, msg in triples:
print " HREF", url, if rawlink != url: s = " (%s)" % rawlink
if rawlink != url: print "(%s)" % rawlink, else: s = ""
print self.message(" HREF %s%s\n msg %s", url, s, msg)
print " msg", msg
def dopage(self, url): def dopage(self, url):
if self.verbose > 1: if self.verbose > 1:
if self.verbose > 2: if self.verbose > 2:
self.show("Check ", url, " from", self.todo[url]) self.show("Check ", url, " from", self.todo[url])
else: else:
print "Check ", url self.message("Check %s", url)
page = self.getpage(url) page = self.getpage(url)
if page: if page:
for info in page.getlinkinfos(): for info in page.getlinkinfos():
@ -360,18 +363,15 @@ class Checker:
def newdonelink(self, url, origin): def newdonelink(self, url, origin):
self.done[url].append(origin) self.done[url].append(origin)
if self.verbose > 3: self.note(3, " Done link %s", url)
print " Done link", url
def newtodolink(self, url, origin): def newtodolink(self, url, origin):
if self.todo.has_key(url): if self.todo.has_key(url):
self.todo[url].append(origin) self.todo[url].append(origin)
if self.verbose > 3: self.note(3, " Seen todo link %s", url)
print " Seen todo link", url
else: else:
self.todo[url] = [origin] self.todo[url] = [origin]
if self.verbose > 3: self.note(3, " New todo link %s", url)
print " New todo link", url
def markdone(self, url): def markdone(self, url):
self.done[url] = self.todo[url] self.done[url] = self.todo[url]
@ -381,18 +381,21 @@ class Checker:
def inroots(self, url): def inroots(self, url):
for root in self.roots: for root in self.roots:
if url[:len(root)] == root: if url[:len(root)] == root:
root = urlparse.urljoin(root, "/") return self.isallowed(root, url)
return self.robots[root].can_fetch(AGENTNAME, url)
return 0 return 0
def isallowed(self, root, url):
root = urlparse.urljoin(root, "/")
return self.robots[root].can_fetch(AGENTNAME, url)
def getpage(self, url): def getpage(self, url):
if url[:7] == 'mailto:' or url[:5] == 'news:': if url[:7] == 'mailto:' or url[:5] == 'news:':
if self.verbose > 1: print " Not checking mailto/news URL" self.note(1, " Not checking mailto/news URL")
return None return None
isint = self.inroots(url) isint = self.inroots(url)
if not isint: if not isint:
if not self.checkext: if not self.checkext:
if self.verbose > 1: print " Not checking ext link" self.note(1, " Not checking ext link")
return None return None
f = self.openpage(url) f = self.openpage(url)
if f: if f:
@ -400,11 +403,10 @@ class Checker:
return None return None
text, nurl = self.readhtml(url) text, nurl = self.readhtml(url)
if nurl != url: if nurl != url:
if self.verbose > 1: self.note(1, " Redirected to %s", nurl)
print " Redirected to", nurl
url = nurl url = nurl
if text: if text:
return Page(text, url, verbose=self.verbose, maxpage=self.maxpage) return Page(text, url, maxpage=self.maxpage, checker=self)
def readhtml(self, url): def readhtml(self, url):
text = None text = None
@ -429,8 +431,7 @@ class Checker:
return self.urlopener.open(url) return self.urlopener.open(url)
except IOError, msg: except IOError, msg:
msg = self.sanitize(msg) msg = self.sanitize(msg)
if self.verbose > 0: self.note(0, "Error %s", msg)
print "Error ", msg
if self.verbose > 0: if self.verbose > 0:
self.show(" HREF ", url, " from", self.todo[url]) self.show(" HREF ", url, " from", self.todo[url])
self.setbad(url, msg) self.setbad(url, msg)
@ -446,21 +447,18 @@ class Checker:
if ctype == 'text/html': if ctype == 'text/html':
return 1 return 1
else: else:
if self.verbose > 1: self.note(1, " Not HTML, mime type %s", ctype)
print " Not HTML, mime type", ctype
return 0 return 0
def setgood(self, url): def setgood(self, url):
if self.bad.has_key(url): if self.bad.has_key(url):
del self.bad[url] del self.bad[url]
self.changed = 1 self.changed = 1
if self.verbose > 0: self.note(0, "(Clear previously seen error)")
print "(Clear previously seen error)"
def setbad(self, url, msg): def setbad(self, url, msg):
if self.bad.has_key(url) and self.bad[url] == msg: if self.bad.has_key(url) and self.bad[url] == msg:
if self.verbose > 0: self.note(0, "(Seen this error before)")
print "(Seen this error before)"
return return
self.bad[url] = msg self.bad[url] = msg
self.changed = 1 self.changed = 1
@ -485,15 +483,15 @@ class Checker:
# changed into methods so they can be overridden in subclasses. # changed into methods so they can be overridden in subclasses.
def show(self, p1, link, p2, origins): def show(self, p1, link, p2, origins):
print p1, link self.message("%s %s", p1, link)
i = 0 i = 0
for source, rawlink in origins: for source, rawlink in origins:
i = i+1 i = i+1
if i == 2: if i == 2:
p2 = ' '*len(p2) p2 = ' '*len(p2)
print p2, source, if rawlink != link: s = " (%s)" % rawlink
if rawlink != link: print "(%s)" % rawlink, else: s = ""
print self.message("%s %s%s", p2, source, s)
def sanitize(self, msg): def sanitize(self, msg):
if isinstance(IOError, ClassType) and isinstance(msg, IOError): if isinstance(IOError, ClassType) and isinstance(msg, IOError):
@ -521,16 +519,11 @@ class Checker:
def save_pickle(self, dumpfile=DUMPFILE): def save_pickle(self, dumpfile=DUMPFILE):
if not self.changed: if not self.changed:
if self.verbose > 0: self.note(0, "\nNo need to save checkpoint")
print
print "No need to save checkpoint"
elif not dumpfile: elif not dumpfile:
if self.verbose > 0: self.note(0, "No dumpfile, won't save checkpoint")
print "No dumpfile, won't save checkpoint"
else: else:
if self.verbose > 0: self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
print
print "Saving checkpoint to %s ..." % dumpfile
newfile = dumpfile + ".new" newfile = dumpfile + ".new"
f = open(newfile, "wb") f = open(newfile, "wb")
pickle.dump(self, f) pickle.dump(self, f)
@ -540,29 +533,26 @@ class Checker:
except os.error: except os.error:
pass pass
os.rename(newfile, dumpfile) os.rename(newfile, dumpfile)
if self.verbose > 0: self.note(0, "Done.")
print "Done."
return 1 return 1
class Page: class Page:
def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE): def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
self.text = text self.text = text
self.url = url self.url = url
self.verbose = verbose self.verbose = verbose
self.maxpage = maxpage self.maxpage = maxpage
self.checker = checker
def getlinkinfos(self): def getlinkinfos(self):
size = len(self.text) size = len(self.text)
if size > self.maxpage: if size > self.maxpage:
if self.verbose > 0: self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
print "Skip huge file", self.url
print " (%.0f Kbytes)" % (size*0.001)
return [] return []
if self.verbose > 2: self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
print " Parsing", self.url, "(%d bytes)" % size parser = MyHTMLParser(verbose=self.verbose, checker=self.checker)
parser = MyHTMLParser(verbose=self.verbose)
parser.feed(self.text) parser.feed(self.text)
parser.close() parser.close()
rawlinks = parser.getlinks() rawlinks = parser.getlinks()
@ -631,10 +621,11 @@ class MyURLopener(urllib.FancyURLopener):
class MyHTMLParser(sgmllib.SGMLParser): class MyHTMLParser(sgmllib.SGMLParser):
def __init__(self, verbose=VERBOSE): def __init__(self, verbose=VERBOSE, checker=None):
self.myverbose = verbose # now unused
self.checker = checker
self.base = None self.base = None
self.links = {} self.links = {}
self.myverbose = verbose
sgmllib.SGMLParser.__init__(self) sgmllib.SGMLParser.__init__(self)
def start_a(self, attributes): def start_a(self, attributes):
@ -662,8 +653,8 @@ class MyHTMLParser(sgmllib.SGMLParser):
if name == 'href': if name == 'href':
if value: value = string.strip(value) if value: value = string.strip(value)
if value: if value:
if self.myverbose > 1: if self.checker:
print " Base", value self.checker.note(1, " Base %s", value)
self.base = value self.base = value
def getlinks(self): def getlinks(self):

View File

@ -76,7 +76,7 @@ class Sucker(webchecker.Checker):
f = open(path, "wb") f = open(path, "wb")
f.write(text) f.write(text)
f.close() f.close()
print "saved", path self.message("saved %s", path)
def savefilename(self, url): def savefilename(self, url):
type, rest = urllib.splittype(url) type, rest = urllib.splittype(url)