Added robots.txt support, using Skip Montanaro's parser.

Fixed occasional inclusion of unpicklable objects (Message in errors).
Changed indent of a few messages.
Guido van Rossum, 1997-01-30 03:19:41 +00:00
commit 3edbb35023
parent bbf8c2fafd
1 changed file with 38 additions and 3 deletions


@@ -50,8 +50,13 @@ overwritten, but all work done in the current run is lost.
Miscellaneous:
- Webchecker honors the "robots.txt" convention. Thanks to Skip
Montanaro for his robotparser.py module (included in this directory)!
The agent name is hardwired to "webchecker". URLs that are disallowed
by the robots.txt file are reported as external URLs.
- Because the HTML parser is a bit slow, very large HTML files are
skipped. The size limit can be set with the -m option.
- Before fetching a page, it guesses its type based on its extension.
If it is a known extension and the type is not text/html, the page is
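
As a rough illustration of the robots.txt convention described in the bullet above (not the checker's exact code), this is how the robotparser module is typically driven: one parser per server root, queried with the hardwired agent name. The helper name allowed() and the root URL are invented for the example.

import robotparser
import urlparse

AGENTNAME = "webchecker"

def allowed(root, url):
    # Fetch and parse the robots.txt at the server root, then ask
    # whether our agent may retrieve the given URL.
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(root, "/robots.txt"))
    rp.read()
    return rp.can_fetch(AGENTNAME, url)

# A URL for which this returns false is reported as external by the checker.
root = "http://www.python.org/"
print allowed(root, root + "doc/")
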
@@ -103,6 +108,7 @@ import htmllib
import formatter
import mimetypes
import robotparser
# Tunable parameters
@@ -110,6 +116,7 @@ DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
MAXPAGE = 50000 # Ignore files bigger than this
ROUNDSIZE = 50 # Number of links processed per round
DUMPFILE = "@webchecker.pickle" # Pickled checkpoint
AGENTNAME = "webchecker" # Agent name for robots.txt parser
# Global variables
@@ -208,11 +215,32 @@ class Checker:
self.bad = {}
self.urlopener = MyURLopener()
self.round = 0
self.robots = {}
def __getstate__(self):
return (self.roots, self.todo, self.done,
self.ext, self.bad, self.round)
def __setstate__(self, state):
(self.roots, self.todo, self.done,
self.ext, self.bad, self.round) = state
for root in self.roots:
self.addrobot(root)
def addroot(self, root):
if root not in self.roots:
self.roots.append(root)
self.todo[root] = []
self.addrobot(root)
def addrobot(self, root):
self.robots[root] = rp = robotparser.RobotFileParser()
if verbose > 3:
print "Parsing robots.txt file"
rp.debug = 1
url = urlparse.urljoin(root, "/robots.txt")
rp.set_url(url)
rp.read()
def run(self):
while self.todo:
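
A note on the checkpointing above: the robots table is deliberately left out of __getstate__ and rebuilt in __setstate__ via addrobot(), so the RobotFileParser objects never end up in the pickle. Below is a minimal, self-contained sketch of that pattern; the Checkpointed class and the placeholder parser strings are invented for illustration.

import pickle

class Checkpointed:
    # Illustrative miniature: cache objects that must not be pickled
    # are rebuilt after loading instead of being stored.
    def __init__(self, roots):
        self.roots = roots
        self.robots = {}
        for root in roots:
            self.addrobot(root)

    def addrobot(self, root):
        # Stand-in for building a robotparser.RobotFileParser.
        self.robots[root] = "<parser for %s>" % root

    def __getstate__(self):
        # Only plain, picklable data goes into the checkpoint.
        return (self.roots,)

    def __setstate__(self, state):
        (self.roots,) = state
        self.robots = {}
        for root in self.roots:
            self.addrobot(root)

c = pickle.loads(pickle.dumps(Checkpointed(["http://www.python.org/"])))
print c.robots
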
@@ -332,7 +360,7 @@ class Checker:
def inroots(self, url):
for root in self.roots:
if url[:len(root)] == root:
return 1
return self.robots[root].can_fetch(AGENTNAME, url)
return 0
def getpage(self, url):
@@ -348,6 +376,13 @@ class Checker:
try:
f = self.urlopener.open(url)
except IOError, msg:
if (type(msg) == TupleType and
len(msg) >= 4 and
msg[0] == 'http error' and
type(msg[3]) == InstanceType):
# Remove the Message instance -- it may contain
# a file object which prevents pickling.
msg = msg[:3] + msg[4:]
if verbose > 0:
print "Error ", msg
if verbose > 0:
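
The tuple surgery above addresses the second point in the commit message: urllib's HTTP errors carry the response headers as a Message instance, which typically holds an open file object and would prevent the error table from being pickled. A hedged sketch of the same idea as a standalone helper; the function name strip_unpicklable is made up.

from types import TupleType, InstanceType

def strip_unpicklable(msg):
    # If the error looks like urllib's ('http error', errcode, errmsg,
    # headers) tuple, drop the headers instance so the error can be
    # stored safely in the pickled checkpoint.
    if (type(msg) == TupleType and len(msg) >= 4 and
            msg[0] == 'http error' and type(msg[3]) == InstanceType):
        msg = msg[:3] + msg[4:]
    return msg
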
@@ -360,7 +395,7 @@ class Checker:
ctype = string.lower(info['content-type'])
if nurl != url:
if verbose > 1:
print "Redirected to", nurl
print " Redirected to", nurl
if not ctype:
ctype, encoding = mimetypes.guess_type(nurl)
if ctype != 'text/html':
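
For reference, the fallback above uses the standard mimetypes module: when the server supplies no Content-Type, the type is guessed from the URL's extension, and anything that is not text/html is skipped for link checking, as the docstring bullet notes. A tiny illustration; the URL is just an example.

import mimetypes

ctype, encoding = mimetypes.guess_type("http://www.example.com/logo.gif")
print ctype        # prints 'image/gif'
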