Added robots.txt support, using Skip Montanaro's parser.
Fixed occasional inclusion of unpicklable objects (Message instances in errors). Changed the indent of a few messages.

commit 3edbb35023 (parent bbf8c2fafd)

@@ -50,8 +50,13 @@ overwritten, but all work done in the current run is lost.
 
 Miscellaneous:
 
+- Webchecker honors the "robots.txt" convention. Thanks to Skip
+Montanaro for his robotparser.py module (included in this directory)!
+The agent name is hardwired to "webchecker". URLs that are disallowed
+by the robots.txt file are reported as external URLs.
+
 - Because the HTML parser is a bit slow, very large HTML files are
 skipped. The size limit can be set with the -m option.
 
 - Before fetching a page, it guesses its type based on its extension.
 If it is a known extension and the type is not text/html, the page is
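
For orientation, a minimal sketch of the robotparser calls the new code
relies on (set_url, read, and can_fetch all appear in the hunks below);
the host name is made up for illustration:

    import robotparser

    rp = robotparser.RobotFileParser()
    rp.set_url("http://www.example.com/robots.txt")  # hypothetical server
    rp.read()                               # fetch and parse the file
    # can_fetch() is true iff the named agent may retrieve the URL
    print rp.can_fetch("webchecker", "http://www.example.com/some/page.html")
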
@@ -103,6 +108,7 @@ import htmllib
 import formatter
 
 import mimetypes
+import robotparser
 
 
 # Tunable parameters

@@ -110,6 +116,7 @@ DEFROOT = "file:/usr/local/etc/httpd/htdocs/" # Default root URL
 MAXPAGE = 50000                         # Ignore files bigger than this
 ROUNDSIZE = 50                          # Number of links processed per round
 DUMPFILE = "@webchecker.pickle"         # Pickled checkpoint
+AGENTNAME = "webchecker"                # Agent name for robots.txt parser
 
 
 # Global variables

@@ -208,11 +215,32 @@ class Checker:
         self.bad = {}
         self.urlopener = MyURLopener()
         self.round = 0
+        self.robots = {}
+
+    def __getstate__(self):
+        return (self.roots, self.todo, self.done,
+                self.ext, self.bad, self.round)
+
+    def __setstate__(self, state):
+        (self.roots, self.todo, self.done,
+         self.ext, self.bad, self.round) = state
+        for root in self.roots:
+            self.addrobot(root)
 
     def addroot(self, root):
         if root not in self.roots:
             self.roots.append(root)
             self.todo[root] = []
+            self.addrobot(root)
+
+    def addrobot(self, root):
+        self.robots[root] = rp = robotparser.RobotFileParser()
+        if verbose > 3:
+            print "Parsing robots.txt file"
+            rp.debug = 1
+        url = urlparse.urljoin(root, "/robots.txt")
+        rp.set_url(url)
+        rp.read()
 
     def run(self):
         while self.todo:
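
The two new special methods keep checkpoints picklable: __getstate__ omits
self.urlopener and self.robots (both hold unpicklable objects), and
__setstate__ re-parses each root's robots.txt when a dump is loaded. A
hedged sketch of the round trip, with a made-up root:

    import pickle

    c = Checker()
    c.addroot("http://www.example.com/")  # hypothetical root; parses its robots.txt
    data = pickle.dumps(c)     # only the six-field tuple from __getstate__ is saved
    c2 = pickle.loads(data)    # __setstate__ re-runs addrobot() for each root
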
@@ -332,7 +360,7 @@ class Checker:
     def inroots(self, url):
         for root in self.roots:
             if url[:len(root)] == root:
-                return 1
+                return self.robots[root].can_fetch(AGENTNAME, url)
         return 0
 
     def getpage(self, url):
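
The upshot: a URL under a known root is now followed only when robots.txt
permits it; otherwise inroots() returns false and the URL is reported as an
external link. A self-contained sketch, assuming the bundled parser's
parse() method accepts a list of lines:

    import robotparser

    AGENTNAME = "webchecker"
    root = "http://www.example.com/"        # hypothetical root
    rp = robotparser.RobotFileParser()
    rp.parse(["User-agent: *", "Disallow: /cgi-bin/"])  # canned robots.txt
    robots = {root: rp}

    url = root + "cgi-bin/script"
    if url[:len(root)] == root:             # same prefix test as inroots()
        print robots[root].can_fetch(AGENTNAME, url)    # 0: disallowed
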
@@ -348,6 +376,13 @@ class Checker:
         try:
             f = self.urlopener.open(url)
         except IOError, msg:
+            if (type(msg) == TupleType and
+                len(msg) >= 4 and
+                msg[0] == 'http error' and
+                type(msg[3]) == InstanceType):
+                # Remove the Message instance -- it may contain
+                # a file object which prevents pickling.
+                msg = msg[:3] + msg[4:]
             if verbose > 0:
                 print "Error ", msg
             if verbose > 0:
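
To see why the slice is needed: an 'http error' IOError argument has the
shape (kind, errcode, errmsg, headers), and the headers object (a Message
instance) holds an open file that pickle rejects. A toy reconstruction with
a stand-in class:

    import sys

    class Message:                  # stand-in for the real headers object
        def __init__(self):
            self.fp = sys.stdin     # an open file object: not picklable

    msg = ('http error', 404, 'Not Found', Message())
    if msg[0] == 'http error' and len(msg) >= 4:
        msg = msg[:3] + msg[4:]     # drop the instance, keep the rest
    print msg                       # ('http error', 404, 'Not Found')
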
@@ -360,7 +395,7 @@ class Checker:
         ctype = string.lower(info['content-type'])
         if nurl != url:
             if verbose > 1:
-                print "Redirected to", nurl
+                print " Redirected to", nurl
         if not ctype:
             ctype, encoding = mimetypes.guess_type(nurl)
         if ctype != 'text/html':
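
The mimetypes fallback covers servers that send no Content-Type header: the
type is guessed from the URL's extension, and anything other than text/html
is skipped. For example:

    import mimetypes

    # guess_type() maps an extension to a (type, encoding) pair
    print mimetypes.guess_type("http://www.example.com/logo.gif")
    # -> ('image/gif', None): not text/html, so the page is not parsed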