mirror of https://github.com/python/cpython
Instead of printing, use self.message() or self.note().
This commit is contained in:
parent
0fd9408c40
commit
125700addb
|
@ -249,6 +249,17 @@ class Checker:
|
||||||
self.errors = {}
|
self.errors = {}
|
||||||
self.urlopener = MyURLopener()
|
self.urlopener = MyURLopener()
|
||||||
self.changed = 0
|
self.changed = 0
|
||||||
|
|
||||||
|
def note(self, level, format, *args):
|
||||||
|
if self.verbose > level:
|
||||||
|
if args:
|
||||||
|
format = format%args
|
||||||
|
self.message(format)
|
||||||
|
|
||||||
|
def message(self, format, *args):
|
||||||
|
if args:
|
||||||
|
format = format%args
|
||||||
|
print format
|
||||||
|
|
||||||
def __getstate__(self):
|
def __getstate__(self):
|
||||||
return (self.roots, self.todo, self.done, self.bad, self.round)
|
return (self.roots, self.todo, self.done, self.bad, self.round)
|
||||||
|
@ -280,23 +291,18 @@ class Checker:
|
||||||
if self.robots.has_key(root): return
|
if self.robots.has_key(root): return
|
||||||
url = urlparse.urljoin(root, "/robots.txt")
|
url = urlparse.urljoin(root, "/robots.txt")
|
||||||
self.robots[root] = rp = robotparser.RobotFileParser()
|
self.robots[root] = rp = robotparser.RobotFileParser()
|
||||||
if self.verbose > 2:
|
self.note(2, "Parsing %s", url)
|
||||||
print "Parsing", url
|
rp.debug = self.verbose > 3
|
||||||
rp.debug = self.verbose > 3
|
|
||||||
rp.set_url(url)
|
rp.set_url(url)
|
||||||
try:
|
try:
|
||||||
rp.read()
|
rp.read()
|
||||||
except IOError, msg:
|
except IOError, msg:
|
||||||
if self.verbose > 1:
|
self.note(1, "I/O error parsing %s: %s", url, msg)
|
||||||
print "I/O error parsing", url, ":", msg
|
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
while self.todo:
|
while self.todo:
|
||||||
self.round = self.round + 1
|
self.round = self.round + 1
|
||||||
if self.verbose > 0:
|
self.note(0, "\nRound %d (%s)\n", self.round, self.status())
|
||||||
print
|
|
||||||
print "Round %d (%s)" % (self.round, self.status())
|
|
||||||
print
|
|
||||||
urls = self.todo.keys()
|
urls = self.todo.keys()
|
||||||
urls.sort()
|
urls.sort()
|
||||||
del urls[self.roundsize:]
|
del urls[self.roundsize:]
|
||||||
|
@ -310,40 +316,37 @@ class Checker:
|
||||||
len(self.bad))
|
len(self.bad))
|
||||||
|
|
||||||
def report(self):
|
def report(self):
|
||||||
print
|
self.message("")
|
||||||
if not self.todo: print "Final",
|
if not self.todo: s = "Final"
|
||||||
else: print "Interim",
|
else: s = "Interim"
|
||||||
print "Report (%s)" % self.status()
|
self.message("%s Report (%s)", s, self.status())
|
||||||
self.report_errors()
|
self.report_errors()
|
||||||
|
|
||||||
def report_errors(self):
|
def report_errors(self):
|
||||||
if not self.bad:
|
if not self.bad:
|
||||||
print
|
self.message("\nNo errors")
|
||||||
print "No errors"
|
|
||||||
return
|
return
|
||||||
print
|
self.message("\nError Report:")
|
||||||
print "Error Report:"
|
|
||||||
sources = self.errors.keys()
|
sources = self.errors.keys()
|
||||||
sources.sort()
|
sources.sort()
|
||||||
for source in sources:
|
for source in sources:
|
||||||
triples = self.errors[source]
|
triples = self.errors[source]
|
||||||
print
|
self.message("")
|
||||||
if len(triples) > 1:
|
if len(triples) > 1:
|
||||||
print len(triples), "Errors in", source
|
self.message("%d Errors in %s", len(triples), source)
|
||||||
else:
|
else:
|
||||||
print "Error in", source
|
self.message("Error in %s", source)
|
||||||
for url, rawlink, msg in triples:
|
for url, rawlink, msg in triples:
|
||||||
print " HREF", url,
|
if rawlink != url: s = " (%s)" % rawlink
|
||||||
if rawlink != url: print "(%s)" % rawlink,
|
else: s = ""
|
||||||
print
|
self.message(" HREF %s%s\n msg %s", url, s, msg)
|
||||||
print " msg", msg
|
|
||||||
|
|
||||||
def dopage(self, url):
|
def dopage(self, url):
|
||||||
if self.verbose > 1:
|
if self.verbose > 1:
|
||||||
if self.verbose > 2:
|
if self.verbose > 2:
|
||||||
self.show("Check ", url, " from", self.todo[url])
|
self.show("Check ", url, " from", self.todo[url])
|
||||||
else:
|
else:
|
||||||
print "Check ", url
|
self.message("Check %s", url)
|
||||||
page = self.getpage(url)
|
page = self.getpage(url)
|
||||||
if page:
|
if page:
|
||||||
for info in page.getlinkinfos():
|
for info in page.getlinkinfos():
|
||||||
|
@ -360,18 +363,15 @@ class Checker:
|
||||||
|
|
||||||
def newdonelink(self, url, origin):
|
def newdonelink(self, url, origin):
|
||||||
self.done[url].append(origin)
|
self.done[url].append(origin)
|
||||||
if self.verbose > 3:
|
self.note(3, " Done link %s", url)
|
||||||
print " Done link", url
|
|
||||||
|
|
||||||
def newtodolink(self, url, origin):
|
def newtodolink(self, url, origin):
|
||||||
if self.todo.has_key(url):
|
if self.todo.has_key(url):
|
||||||
self.todo[url].append(origin)
|
self.todo[url].append(origin)
|
||||||
if self.verbose > 3:
|
self.note(3, " Seen todo link %s", url)
|
||||||
print " Seen todo link", url
|
|
||||||
else:
|
else:
|
||||||
self.todo[url] = [origin]
|
self.todo[url] = [origin]
|
||||||
if self.verbose > 3:
|
self.note(3, " New todo link %s", url)
|
||||||
print " New todo link", url
|
|
||||||
|
|
||||||
def markdone(self, url):
|
def markdone(self, url):
|
||||||
self.done[url] = self.todo[url]
|
self.done[url] = self.todo[url]
|
||||||
|
@ -381,18 +381,21 @@ class Checker:
|
||||||
def inroots(self, url):
|
def inroots(self, url):
|
||||||
for root in self.roots:
|
for root in self.roots:
|
||||||
if url[:len(root)] == root:
|
if url[:len(root)] == root:
|
||||||
root = urlparse.urljoin(root, "/")
|
return self.isallowed(root, url)
|
||||||
return self.robots[root].can_fetch(AGENTNAME, url)
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
def isallowed(self, root, url):
|
||||||
|
root = urlparse.urljoin(root, "/")
|
||||||
|
return self.robots[root].can_fetch(AGENTNAME, url)
|
||||||
|
|
||||||
def getpage(self, url):
|
def getpage(self, url):
|
||||||
if url[:7] == 'mailto:' or url[:5] == 'news:':
|
if url[:7] == 'mailto:' or url[:5] == 'news:':
|
||||||
if self.verbose > 1: print " Not checking mailto/news URL"
|
self.note(1, " Not checking mailto/news URL")
|
||||||
return None
|
return None
|
||||||
isint = self.inroots(url)
|
isint = self.inroots(url)
|
||||||
if not isint:
|
if not isint:
|
||||||
if not self.checkext:
|
if not self.checkext:
|
||||||
if self.verbose > 1: print " Not checking ext link"
|
self.note(1, " Not checking ext link")
|
||||||
return None
|
return None
|
||||||
f = self.openpage(url)
|
f = self.openpage(url)
|
||||||
if f:
|
if f:
|
||||||
|
@ -400,11 +403,10 @@ class Checker:
|
||||||
return None
|
return None
|
||||||
text, nurl = self.readhtml(url)
|
text, nurl = self.readhtml(url)
|
||||||
if nurl != url:
|
if nurl != url:
|
||||||
if self.verbose > 1:
|
self.note(1, " Redirected to %s", nurl)
|
||||||
print " Redirected to", nurl
|
|
||||||
url = nurl
|
url = nurl
|
||||||
if text:
|
if text:
|
||||||
return Page(text, url, verbose=self.verbose, maxpage=self.maxpage)
|
return Page(text, url, maxpage=self.maxpage, checker=self)
|
||||||
|
|
||||||
def readhtml(self, url):
|
def readhtml(self, url):
|
||||||
text = None
|
text = None
|
||||||
|
@ -429,8 +431,7 @@ class Checker:
|
||||||
return self.urlopener.open(url)
|
return self.urlopener.open(url)
|
||||||
except IOError, msg:
|
except IOError, msg:
|
||||||
msg = self.sanitize(msg)
|
msg = self.sanitize(msg)
|
||||||
if self.verbose > 0:
|
self.note(0, "Error %s", msg)
|
||||||
print "Error ", msg
|
|
||||||
if self.verbose > 0:
|
if self.verbose > 0:
|
||||||
self.show(" HREF ", url, " from", self.todo[url])
|
self.show(" HREF ", url, " from", self.todo[url])
|
||||||
self.setbad(url, msg)
|
self.setbad(url, msg)
|
||||||
|
@ -446,21 +447,18 @@ class Checker:
|
||||||
if ctype == 'text/html':
|
if ctype == 'text/html':
|
||||||
return 1
|
return 1
|
||||||
else:
|
else:
|
||||||
if self.verbose > 1:
|
self.note(1, " Not HTML, mime type %s", ctype)
|
||||||
print " Not HTML, mime type", ctype
|
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
def setgood(self, url):
|
def setgood(self, url):
|
||||||
if self.bad.has_key(url):
|
if self.bad.has_key(url):
|
||||||
del self.bad[url]
|
del self.bad[url]
|
||||||
self.changed = 1
|
self.changed = 1
|
||||||
if self.verbose > 0:
|
self.note(0, "(Clear previously seen error)")
|
||||||
print "(Clear previously seen error)"
|
|
||||||
|
|
||||||
def setbad(self, url, msg):
|
def setbad(self, url, msg):
|
||||||
if self.bad.has_key(url) and self.bad[url] == msg:
|
if self.bad.has_key(url) and self.bad[url] == msg:
|
||||||
if self.verbose > 0:
|
self.note(0, "(Seen this error before)")
|
||||||
print "(Seen this error before)"
|
|
||||||
return
|
return
|
||||||
self.bad[url] = msg
|
self.bad[url] = msg
|
||||||
self.changed = 1
|
self.changed = 1
|
||||||
|
@ -485,15 +483,15 @@ class Checker:
|
||||||
# changed into methods so they can be overridden in subclasses.
|
# changed into methods so they can be overridden in subclasses.
|
||||||
|
|
||||||
def show(self, p1, link, p2, origins):
|
def show(self, p1, link, p2, origins):
|
||||||
print p1, link
|
self.message("%s %s", p1, link)
|
||||||
i = 0
|
i = 0
|
||||||
for source, rawlink in origins:
|
for source, rawlink in origins:
|
||||||
i = i+1
|
i = i+1
|
||||||
if i == 2:
|
if i == 2:
|
||||||
p2 = ' '*len(p2)
|
p2 = ' '*len(p2)
|
||||||
print p2, source,
|
if rawlink != link: s = " (%s)" % rawlink
|
||||||
if rawlink != link: print "(%s)" % rawlink,
|
else: s = ""
|
||||||
print
|
self.message("%s %s%s", p2, source, s)
|
||||||
|
|
||||||
def sanitize(self, msg):
|
def sanitize(self, msg):
|
||||||
if isinstance(IOError, ClassType) and isinstance(msg, IOError):
|
if isinstance(IOError, ClassType) and isinstance(msg, IOError):
|
||||||
|
@ -521,16 +519,11 @@ class Checker:
|
||||||
|
|
||||||
def save_pickle(self, dumpfile=DUMPFILE):
|
def save_pickle(self, dumpfile=DUMPFILE):
|
||||||
if not self.changed:
|
if not self.changed:
|
||||||
if self.verbose > 0:
|
self.note(0, "\nNo need to save checkpoint")
|
||||||
print
|
|
||||||
print "No need to save checkpoint"
|
|
||||||
elif not dumpfile:
|
elif not dumpfile:
|
||||||
if self.verbose > 0:
|
self.note(0, "No dumpfile, won't save checkpoint")
|
||||||
print "No dumpfile, won't save checkpoint"
|
|
||||||
else:
|
else:
|
||||||
if self.verbose > 0:
|
self.note(0, "\nSaving checkpoint to %s ...", dumpfile)
|
||||||
print
|
|
||||||
print "Saving checkpoint to %s ..." % dumpfile
|
|
||||||
newfile = dumpfile + ".new"
|
newfile = dumpfile + ".new"
|
||||||
f = open(newfile, "wb")
|
f = open(newfile, "wb")
|
||||||
pickle.dump(self, f)
|
pickle.dump(self, f)
|
||||||
|
@ -540,29 +533,26 @@ class Checker:
|
||||||
except os.error:
|
except os.error:
|
||||||
pass
|
pass
|
||||||
os.rename(newfile, dumpfile)
|
os.rename(newfile, dumpfile)
|
||||||
if self.verbose > 0:
|
self.note(0, "Done.")
|
||||||
print "Done."
|
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|
||||||
class Page:
|
class Page:
|
||||||
|
|
||||||
def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE):
|
def __init__(self, text, url, verbose=VERBOSE, maxpage=MAXPAGE, checker=None):
|
||||||
self.text = text
|
self.text = text
|
||||||
self.url = url
|
self.url = url
|
||||||
self.verbose = verbose
|
self.verbose = verbose
|
||||||
self.maxpage = maxpage
|
self.maxpage = maxpage
|
||||||
|
self.checker = checker
|
||||||
|
|
||||||
def getlinkinfos(self):
|
def getlinkinfos(self):
|
||||||
size = len(self.text)
|
size = len(self.text)
|
||||||
if size > self.maxpage:
|
if size > self.maxpage:
|
||||||
if self.verbose > 0:
|
self.note(0, "Skip huge file %s (%.0f Kbytes)", self.url, (size*0.001))
|
||||||
print "Skip huge file", self.url
|
|
||||||
print " (%.0f Kbytes)" % (size*0.001)
|
|
||||||
return []
|
return []
|
||||||
if self.verbose > 2:
|
self.checker.note(2, " Parsing %s (%d bytes)", self.url, size)
|
||||||
print " Parsing", self.url, "(%d bytes)" % size
|
parser = MyHTMLParser(verbose=self.verbose, checker=self.checker)
|
||||||
parser = MyHTMLParser(verbose=self.verbose)
|
|
||||||
parser.feed(self.text)
|
parser.feed(self.text)
|
||||||
parser.close()
|
parser.close()
|
||||||
rawlinks = parser.getlinks()
|
rawlinks = parser.getlinks()
|
||||||
|
@ -631,10 +621,11 @@ class MyURLopener(urllib.FancyURLopener):
|
||||||
|
|
||||||
class MyHTMLParser(sgmllib.SGMLParser):
|
class MyHTMLParser(sgmllib.SGMLParser):
|
||||||
|
|
||||||
def __init__(self, verbose=VERBOSE):
|
def __init__(self, verbose=VERBOSE, checker=None):
|
||||||
|
self.myverbose = verbose # now unused
|
||||||
|
self.checker = checker
|
||||||
self.base = None
|
self.base = None
|
||||||
self.links = {}
|
self.links = {}
|
||||||
self.myverbose = verbose
|
|
||||||
sgmllib.SGMLParser.__init__(self)
|
sgmllib.SGMLParser.__init__(self)
|
||||||
|
|
||||||
def start_a(self, attributes):
|
def start_a(self, attributes):
|
||||||
|
@ -662,8 +653,8 @@ class MyHTMLParser(sgmllib.SGMLParser):
|
||||||
if name == 'href':
|
if name == 'href':
|
||||||
if value: value = string.strip(value)
|
if value: value = string.strip(value)
|
||||||
if value:
|
if value:
|
||||||
if self.myverbose > 1:
|
if self.checker:
|
||||||
print " Base", value
|
self.checker.note(1, " Base %s", value)
|
||||||
self.base = value
|
self.base = value
|
||||||
|
|
||||||
def getlinks(self):
|
def getlinks(self):
|
||||||
|
|
|
@ -76,7 +76,7 @@ class Sucker(webchecker.Checker):
|
||||||
f = open(path, "wb")
|
f = open(path, "wb")
|
||||||
f.write(text)
|
f.write(text)
|
||||||
f.close()
|
f.close()
|
||||||
print "saved", path
|
self.message("saved %s", path)
|
||||||
|
|
||||||
def savefilename(self, url):
|
def savefilename(self, url):
|
||||||
type, rest = urllib.splittype(url)
|
type, rest = urllib.splittype(url)
|
||||||
|
|
Loading…
Reference in New Issue