Added robots.txt support, using Skip Montanaro's parser.

Fixed occasional inclusion of unpicklable objects (Message in errors).
Changed indent of a few messages.
Guido van Rossum 1997-01-30 03:19:41 +00:00
parent bbf8c2fafd
commit 3edbb35023
1 changed file with 38 additions and 3 deletions


@@ -50,8 +50,13 @@ overwritten, but all work done in the current run is lost.
 Miscellaneous:

+- Webchecker honors the "robots.txt" convention. Thanks to Skip
+  Montanaro for his robotparser.py module (included in this directory)!
+  The agent name is hardwired to "webchecker". URLs that are disallowed
+  by the robots.txt file are reported as external URLs.
+
 - Because the HTML parser is a bit slow, very large HTML files are
   skipped. The size limit can be set with the -m option.

 - Before fetching a page, it guesses its type based on its extension.
   If it is a known extension and the type is not text/http, the page is
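
The bullet added above describes the new behavior: robots.txt is fetched once per root, parsed with robotparser, and any URL it disallows for the "webchecker" agent is treated as external. For reference, a minimal sketch of that check using urllib.robotparser (the Python 3 descendant of the robotparser module bundled with this commit); the root and page URLs are only illustrative:

    import urllib.parse
    import urllib.robotparser

    AGENTNAME = "webchecker"              # agent name, hardwired as in the commit
    root = "http://www.example.com/"      # hypothetical root being checked

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urllib.parse.urljoin(root, "/robots.txt"))
    rp.read()                             # fetch and parse the robots.txt file

    # A URL that robots.txt disallows for our agent is reported as external.
    url = urllib.parse.urljoin(root, "private/index.html")
    if rp.can_fetch(AGENTNAME, url):
        print("allowed:", url)
    else:
        print("disallowed by robots.txt:", url)
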
@@ -103,6 +108,7 @@ import htmllib
 import formatter
 import mimetypes
+import robotparser


 # Tunable parameters
@@ -110,6 +116,7 @@ DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
 MAXPAGE = 50000                         # Ignore files bigger than this
 ROUNDSIZE = 50                          # Number of links processed per round
 DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
+AGENTNAME = "webchecker"                # Agent name for robots.txt parser


 # Global variables
@@ -208,11 +215,32 @@ class Checker:
         self.bad = {}
         self.urlopener = MyURLopener()
         self.round = 0
+        self.robots = {}
+
+    def __getstate__(self):
+        return (self.roots, self.todo, self.done,
+                self.ext, self.bad, self.round)
+
+    def __setstate__(self, state):
+        (self.roots, self.todo, self.done,
+         self.ext, self.bad, self.round) = state
+        for root in self.roots:
+            self.addrobot(root)

     def addroot(self, root):
         if root not in self.roots:
             self.roots.append(root)
             self.todo[root] = []
+            self.addrobot(root)
+
+    def addrobot(self, root):
+        self.robots[root] = rp = robotparser.RobotFileParser()
+        if verbose > 3:
+            print "Parsing robots.txt file"
+            rp.debug = 1
+        url = urlparse.urljoin(root, "/robots.txt")
+        rp.set_url(url)
+        rp.read()

     def run(self):
         while self.todo:
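
The __getstate__/__setstate__ pair added above keeps the robot parsers (and the URL opener) out of the pickled checkpoint and rebuilds them when a dump is reloaded. A self-contained sketch of that pattern, with illustrative class and attribute names rather than the real Checker:

    import pickle
    import urllib.parse
    import urllib.robotparser


    class MiniChecker:
        def __init__(self):
            self.roots = []
            self.robots = {}              # per-root parsers; never pickled

        def addroot(self, root):
            if root not in self.roots:
                self.roots.append(root)
                self.addrobot(root)

        def addrobot(self, root):
            rp = urllib.robotparser.RobotFileParser()
            rp.set_url(urllib.parse.urljoin(root, "/robots.txt"))
            self.robots[root] = rp

        def __getstate__(self):
            return (self.roots,)          # only picklable state is saved

        def __setstate__(self, state):
            (self.roots,) = state
            self.robots = {}
            for root in self.roots:       # re-create the parsers on restore
                self.addrobot(root)


    checker = MiniChecker()
    checker.addroot("http://www.example.com/")
    restored = pickle.loads(pickle.dumps(checker))   # round-trips cleanly
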
@@ -332,7 +360,7 @@ class Checker:
     def inroots(self, url):
         for root in self.roots:
             if url[:len(root)] == root:
-                return 1
+                return self.robots[root].can_fetch(AGENTNAME, url)
         return 0

     def getpage(self, url):
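
The changed return statement above means a URL only counts as "in the roots" when it both lies under a known root and is allowed by that root's robots.txt. A standalone sketch of the same logic (function and parameter names are illustrative):

    def inroots(url, roots, robots, agent="webchecker"):
        """True if url is under a known root and robots.txt permits fetching it."""
        for root in roots:
            if url.startswith(root):
                return robots[root].can_fetch(agent, url)
        return False
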
@@ -348,6 +376,13 @@
         try:
             f = self.urlopener.open(url)
         except IOError, msg:
+            if (type(msg) == TupleType and
+                len(msg) >= 4 and
+                msg[0] == 'http error' and
+                type(msg[3]) == InstanceType):
+                # Remove the Message instance -- it may contain
+                # a file object which prevents pickling.
+                msg = msg[:3] + msg[4:]
             if verbose > 0:
                 print "Error ", msg
                 if verbose > 0:
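
The block added above strips the Message instance (which can hold an open file object) out of the tuple-style IOError that the old urllib raised, so the recorded error can still be pickled into the checkpoint. A more general sketch of the same idea in modern Python, not the code in webchecker.py:

    import pickle


    def picklable_error(err):
        """Return err's args with any unpicklable item replaced by its repr."""
        cleaned = []
        for item in getattr(err, "args", ()):
            try:
                pickle.dumps(item)           # probe: can this piece be pickled?
                cleaned.append(item)
            except Exception:
                cleaned.append(repr(item))   # keep a readable stand-in instead
        return tuple(cleaned)
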
@@ -360,7 +395,7 @@
             ctype = string.lower(info['content-type'])
         if nurl != url:
             if verbose > 1:
-                print "Redirected to", nurl
+                print " Redirected to", nurl
         if not ctype:
             ctype, encoding = mimetypes.guess_type(nurl)
         if ctype != 'text/html':