web tree checker

1997-01-30 02:44:48 +00:00 · 1997-01-30 02:44:48 +00:00 · 272b37d686
parent d7e4705d8f
commit 272b37d686
1 changed files with 488 additions and 0 deletions
--- a/Tools/webchecker/webchecker.py
+++ b/Tools/webchecker/webchecker.py
@ -0,0 +1,488 @@
+#! /usr/bin/env python
+
+"""Web tree checker.
+
+This utility is handy to check a subweb of the world-wide web for
+errors.  A subweb is specified by giving one or more ``root URLs''; a
+page belongs to the subweb if one of the root URLs is an initial
+prefix of it.
+
+File URL extension:
+
+In order to ease the checking of subwebs via the local file system,
+the interpretation of ``file:'' URLs is extended to mimic the behavior
+of your average HTTP daemon: if a directory pathname is given, the
+file index.html in that directory is returned if it exists, otherwise
+a directory listing is returned.  Now, you can point webchecker to the
+document tree in the local file system of your HTTP daemon, and have
+most of it checked.  In fact the default works this way if your local
+web tree is located at /usr/local/etc/httpd/htdocs (the default for
+the NCSA HTTP daemon and probably others).
+
+Reports printed:
+
+When done, it reports links to pages outside the web (unless -q is
+specified), and pages with bad links within the subweb.  When
+interrupted, it prints those same reports for the pages that it has
+checked already.
+
+In verbose mode, additional messages are printed during the
+information gathering phase.  By default, it prints a summary of its
+work status every 50 URLs (adjustable with the -r option), and it
+reports errors as they are encountered.  Use the -q option to disable
+this output.
+
+Checkpoint feature:
+
+Whether interrupted or not, it dumps its state (a Python pickle) to a
+checkpoint file and the -R option allows it to restart from the
+checkpoint (assuming that the pages on the subweb that were already
+processed haven't changed).  Even when it has run till completion, -R
+can still be useful -- it will print the reports again, and -Rq prints
+the errors only.  In this case, the checkpoint file is not written
+again.  The checkpoint file can be set with the -d option.
+
+The checkpoint file is written as a Python pickle.  Remember that
+Python's pickle module is currently quite slow.  Give it the time it
+needs to load and save the checkpoint file.  When interrupted while
+writing the checkpoint file, the old checkpoint file is not
+overwritten, but all work done in the current run is lost.
+
+Miscellaneous:
+
+- Because the HTML parser is a bit slow, very large HTML files are
+  skipped.  The size limit can be set with the -m option.
+
+- Before fetching a page, it guesses its type based on its extension.
+If it is a known extension and the type is not text/html, the page is
+not fetched.  This is a huge optimization but occasionally it means
+links can be missed.  The mimetypes.py module (also in this directory)
+has a built-in table mapping most currently known suffixes, and in
+addition attempts to read the mime.types configuration files in the
+default locations of Netscape and the NCSA HTTP daemon.
+
+- It only follows links indicated by <A> tags.  It doesn't follow
+links in <FORM> or <IMG> or whatever other tags might contain
+hyperlinks.  It does honor the <BASE> tag.
+
+- It could be argued that it should also check external links for
+validity.  This is true, but it is more error-prone.  I think I will
+make this an option in the future.
+
+
+Usage: webchecker.py [option] ... [rooturl] ...
+
+Options:
+
+-R        -- restart from checkpoint file
+-d file   -- checkpoint filename (default %(DUMPFILE)s)
+-m bytes  -- skip HTML pages larger than this size (default %(MAXPAGE)d)
+-q        -- quiet operation (also suppresses external links report)
+-r number -- number of links processed per round (default %(ROUNDSIZE)d)
+-v        -- verbose operation; repeating -v will increase verbosity
+
+Arguments:
+
+rooturl   -- URL to start checking
+             (default %(DEFROOT)s)
+
+"""
+
+
+import sys
+import os
+from types import *
+import string
+import StringIO
+import getopt
+import pickle
+
+import urllib
+import urlparse
+import htmllib
+import formatter
+
+import mimetypes
+
+
+# Tunable parameters
+DEFROOT = "file:/usr/local/etc/httpd/htdocs/"	# Default root URL
+MAXPAGE = 50000				# Ignore files bigger than this
+ROUNDSIZE = 50				# Number of links processed per round
+DUMPFILE = "@webchecker.pickle"		# Pickled checkpoint
+
+
+# Global variables
+verbose = 1  # -q sets it to 0; each -v adds one
+maxpage = MAXPAGE  # overridden by -m
+roundsize = ROUNDSIZE  # overridden by -r
+
+
+def main():  # Parse options, run the checker, print reports, save checkpoint
+    global verbose, maxpage, roundsize
+    dumpfile = DUMPFILE
+    restart = 0
+
+    try:
+	opts, args = getopt.getopt(sys.argv[1:], 'Rd:m:qr:v')
+    except getopt.error, msg:  # bad option: print usage on stderr, exit 2
+	sys.stdout = sys.stderr
+	print msg
+	print __doc__ % globals()
+	sys.exit(2)
+    for o, a in opts:
+	if o == '-R':
+	    restart = 1
+	if o == '-d':
+	    dumpfile = a
+	if o == '-m':
+	    maxpage = string.atoi(a)
+	if o == '-q':
+	    verbose = 0
+	if o == '-r':
+	    roundsize = string.atoi(a)
+	if o == '-v':
+	    verbose = verbose + 1
+
+    if restart:  # -R: reload a pickled Checker instead of creating one
+	if verbose > 0:
+	    print "Loading checkpoint from %s ..." % dumpfile
+	f = open(dumpfile, "rb")
+	c = pickle.load(f)
+	f.close()
+	if verbose > 0:
+	    print "Done."
+	    print "Root:", string.join(c.roots, "\n      ")
+    else:
+	c = Checker()
+	if not args:  # no root URL given: fall back to DEFROOT
+	    args.append(DEFROOT)
+
+    for arg in args:  # also runs after -R, so extra roots can be added
+	c.addroot(arg)
+
+    if not c.todo:  # nothing left to check, so nothing new to save
+	needsave = 0
+    else:
+	needsave = 1
+    try:
+	c.run()
+    except KeyboardInterrupt:  # fall through: report and save partial work
+	if verbose > 0:
+	    print "[interrupted]"
+    c.report()
+    if not needsave:
+	if verbose > 0:
+	    print
+	    print "No need to save checkpoint"
+    elif dumpfile:  # an empty -d argument disables saving
+	if verbose > 0:
+	    print
+	    print "Saving checkpoint to %s ..." % dumpfile
+	newfile = dumpfile + ".new"  # write a new file, then swap it in
+	f = open(newfile, "wb")
+	pickle.dump(c, f)
+	f.flush()
+	f.close()
+	try:
+	    os.unlink(dumpfile)
+	except os.error:  # ignored, e.g. when no old checkpoint exists
+	    pass
+	os.rename(newfile, dumpfile)
+	if verbose > 0:
+	    print "Done."
+	    if dumpfile == DUMPFILE:
+		print "Use ``%s -R'' to restart." % sys.argv[0]
+	    else:
+		print "Use ``%s -R -d %s'' to restart." % (sys.argv[0],
+							   dumpfile)
+
+
+class Checker:
+
+    def __init__(self):
+	self.roots = []
+	self.todo = {}
+	self.done = {}
+	self.ext = {}
+	self.bad = {}
+	self.urlopener = MyURLopener()
+	self.round = 0
+
+    def addroot(self, root):
+	if root not in self.roots:
+	    self.roots.append(root)
+	    self.todo[root] = []
+
+    def run(self):
+	while self.todo:
+	    self.round = self.round + 1
+	    if verbose > 0:
+		print
+		print "Round", self.round,
+		print "(%d to do, %d done, %d external, %d bad)" % (
+		    len(self.todo), len(self.done),
+		    len(self.ext), len(self.bad))
+		print
+	    urls = self.todo.keys()[:roundsize]
+	    for url in urls:
+		self.dopage(url)
+		self.done[url] = self.todo[url]
+		del self.todo[url]
+
+    def report(self):
+	print
+	if not self.todo: print "Final",
+	else: print "Interim",
+	print "Report (%d to do, %d done, %d external, %d bad)" % (
+	    len(self.todo), len(self.done),
+	    len(self.ext), len(self.bad))
+	if verbose > 0:
+	    self.report_extrefs()
+	# Report errors last because the output may get truncated
+	self.report_errors()
+
+    def report_extrefs(self):
+	if not self.ext:
+	    print
+	    print "No external URLs"
+	    return
+	print
+	print "External URLs:"
+	print
+	urls = self.ext.keys()
+	urls.sort()
+	for url in urls:
+	    show("HREF ", url, " from", self.ext[url])
+
+    def report_errors(self):
+	if not self.bad:
+	    print
+	    print "No errors"
+	    return
+	print
+	print "Error Report:"
+	urls = self.bad.keys()
+	urls.sort()
+	bysource = {}
+	for url in urls:
+	    try:
+		origins = self.done[url]
+	    except KeyError:
+		origins = self.todo[url]
+	    for source, rawlink in origins:
+		triple = url, rawlink, self.bad[url]
+		try:
+		    bysource[source].append(triple)
+		except KeyError:
+		    bysource[source] = [triple]
+	sources = bysource.keys()
+	sources.sort()
+	for source in sources:
+	    triples = bysource[source]
+	    print
+	    if len(triples) > 1:
+		print len(triples), "Errors in", source
+	    else:
+		print "Error in", source
+	    for url, rawlink, msg in triples:
+		print "  HREF", url,
+		if rawlink != url: print "(%s)" % rawlink,
+		print
+		print "   msg", msg
+
+    def dopage(self, url):
+	if verbose > 1:
+	    if verbose > 2:
+		show("Page  ", url, "  from", self.todo[url])
+	    else:
+		print "Page  ", url
+	page = self.getpage(url)
+	if not page:
+	    return
+	for info in page.getlinkinfos():
+	    link, rawlink = info
+	    origin = url, rawlink
+	    if not self.inroots(link):
+		try:
+		    self.ext[link].append(origin)
+		    if verbose > 3:
+			print "  New ext link", link,
+			if link != rawlink: print "(%s)" % rawlink,
+			print
+		except KeyError:
+		    if verbose > 3:
+			print "  Seen ext link", link,
+			if link != rawlink: print "(%s)" % rawlink,
+			print
+		    self.ext[link] = [origin]
+	    elif self.done.has_key(link):
+		if verbose > 3:
+		    print "  Done link", link
+		self.done[link].append(origin)
+	    elif self.todo.has_key(link):
+		if verbose > 3:
+		    print "  Seen todo link", link
+		self.todo[link].append(origin)
+	    else:
+		if verbose > 3:
+		    print "  New todo link", link
+		self.todo[link] = [origin]
+
+    def inroots(self, url):
+	for root in self.roots:
+	    if url[:len(root)] == root:
+		return 1
+	return 0
+
+    def getpage(self, url):
+	ctype, encoding = mimetypes.guess_type(url)
+	if encoding:
+	    if verbose > 2:
+		print "  Won't bother, URL suggests encoding %s" % `encoding`
+	    return None
+	if ctype and ctype != 'text/html':
+	    if verbose > 2:
+		print "  Won't bother, URL suggests mime type %s" % `ctype`
+	    return None
+	try:
+	    f = self.urlopener.open(url)
+	except IOError, msg:
+	    if verbose > 0:
+		print "Error ", msg
+	    if verbose > 0:
+		show(" HREF ", url, "  from", self.todo[url])
+	    self.bad[url] = msg
+	    return None
+	nurl = f.geturl()
+	info = f.info()
+	if info.has_key('content-type'):
+	    ctype = string.lower(info['content-type'])
+	if nurl != url:
+	    if verbose > 1:
+		print "Redirected to", nurl
+	    if not ctype:
+		ctype, encoding = mimetypes.guess_type(nurl)
+	if ctype != 'text/html':
+	    f.close()
+	    if verbose > 2:
+		print "  Not HTML, mime type", ctype
+	    return None
+	text = f.read()
+	f.close()
+	return Page(text, nurl)
+
+
+class Page:  # Text of one HTML page plus the URL it is known by
+
+    def __init__(self, text, url):
+	self.text = text
+	self.url = url
+
+    def getlinkinfos(self):  # Return a list of (absolute link, raw link) pairs
+	size = len(self.text)
+	if size > maxpage:  # too big to parse: report no links at all
+	    if verbose > 0:
+		print "Skip huge file", self.url
+		print "  (%.0f Kbytes)" % (size*0.001)
+	    return []
+	if verbose > 2:
+	    print "  Parsing", self.url, "(%d bytes)" % size
+	parser = MyHTMLParser(formatter.NullFormatter())
+	parser.feed(self.text)
+	parser.close()
+	rawlinks = parser.getlinks()
+	base = urlparse.urljoin(self.url, parser.getbase() or "")
+	infos = []  # links are resolved against base (page URL + <BASE HREF>)
+	for rawlink in rawlinks:
+	    t = urlparse.urlparse(rawlink)
+	    t = t[:-1] + ('',)  # blank the last parsed field (the fragment)
+	    rawlink = urlparse.urlunparse(t)
+	    link = urlparse.urljoin(base, rawlink)
+	    infos.append((link, rawlink))
+	return infos
+
+
+class MyStringIO(StringIO.StringIO):  # StringIO plus info() and geturl()
+
+    def __init__(self, url, info):  # url and info are handed back unchanged
+	self.__url = url
+	self.__info = info
+	StringIO.StringIO.__init__(self)
+
+    def info(self):  # Return the info object given to the constructor
+	return self.__info
+
+    def geturl(self):  # Return the URL given to the constructor
+	return self.__url
+
+
+class MyURLopener(urllib.FancyURLopener):
+
+    http_error_default = urllib.URLopener.http_error_default
+
+    def open_file(self, url):
+	path = urllib.url2pathname(urllib.unquote(url))
+	if path[-1] != os.sep:
+	    url = url + '/'
+	if os.path.isdir(path):
+	    indexpath = os.path.join(path, "index.html")
+	    if os.path.exists(indexpath):
+		return self.open_file(url + "index.html")
+	    try:
+		names = os.listdir(path)
+	    except os.error, msg:
+		raise IOError, msg, sys.exc_traceback
+	    names.sort()
+	    s = MyStringIO("file:"+url, {'content-type': 'text/html'})
+	    s.write('<BASE HREF="file:%s">\n' %
+		    urllib.quote(os.path.join(path, "")))
+	    for name in names:
+		q = urllib.quote(name)
+		s.write('<A HREF="%s">%s</A>\n' % (q, q))
+	    s.seek(0)
+	    return s
+	return urllib.FancyURLopener.open_file(self, path)
+
+
+class MyHTMLParser(htmllib.HTMLParser):
+
+    def __init__(*args):
+	self = args[0]
+	self.base = None
+	self.links = []
+	apply(htmllib.HTMLParser.__init__, args)
+
+    def start_a(self, attributes):
+	for name, value in attributes:
+	    if name == 'href' and value and value not in self.links:
+		self.links.append(string.strip(value))
+
+    def do_base(self, attributes):
+	for name, value in attributes:
+	    if name == 'href' and value:
+		if verbose > 1:
+		    print "  Base", value
+		self.base = value
+
+    def getlinks(self):
+	return self.links
+
+    def getbase(self):
+	return self.base
+
+
+def show(p1, link, p2, origins):  # Print link, then one line per origin
+    print p1, link
+    i = 0
+    for source, rawlink in origins:
+	i = i+1
+	if i == 2:  # only the first origin line carries the p2 label
+	    p2 = ' '*len(p2)
+	print p2, source,
+	if rawlink != link: print "(%s)" % rawlink,  # show the link as written
+	print
+
+
+if __name__ == '__main__':  # run main() only when executed as a script
+    main()