Get rid of _test(), _main(), _debug() and _check(). Tests are no longer
needed (better set available in Lib/test/test_robotparser.py).  Clean up a
few PEP 8 nits (compound statements on a single line, whitespace around
operators).
This commit is contained in:
Skip Montanaro 2008-04-28 03:27:53 +00:00
parent dfd982715b
commit b8bdbc04e7
1 changed file with 12 additions and 93 deletions

View File

@ -9,15 +9,11 @@
The robots.txt Exclusion Protocol is implemented as specified in The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
""" """
import urlparse,urllib import urlparse
import urllib
__all__ = ["RobotFileParser"] __all__ = ["RobotFileParser"]
debug = 0
def _debug(msg):
if debug: print msg
class RobotFileParser: class RobotFileParser:
""" This class provides a set of methods to read, parse and answer """ This class provides a set of methods to read, parse and answer
@ -67,12 +63,9 @@ class RobotFileParser:
self.errcode = opener.errcode self.errcode = opener.errcode
if self.errcode in (401, 403): if self.errcode in (401, 403):
self.disallow_all = True self.disallow_all = True
_debug("disallow all")
elif self.errcode >= 400: elif self.errcode >= 400:
self.allow_all = True self.allow_all = True
_debug("allow all")
elif self.errcode == 200 and lines: elif self.errcode == 200 and lines:
_debug("parse lines")
self.parse(lines) self.parse(lines)
def _add_entry(self, entry): def _add_entry(self, entry):
@ -93,19 +86,16 @@ class RobotFileParser:
for line in lines: for line in lines:
linenumber = linenumber + 1 linenumber = linenumber + 1
if not line: if not line:
if state==1: if state == 1:
_debug("line %d: warning: you should insert"
" allow: or disallow: directives below any"
" user-agent: line" % linenumber)
entry = Entry() entry = Entry()
state = 0 state = 0
elif state==2: elif state == 2:
self._add_entry(entry) self._add_entry(entry)
entry = Entry() entry = Entry()
state = 0 state = 0
# remove optional comment and strip line # remove optional comment and strip line
i = line.find('#') i = line.find('#')
if i>=0: if i >= 0:
line = line[:i] line = line[:i]
line = line.strip() line = line.strip()
if not line: if not line:
@ -115,41 +105,24 @@ class RobotFileParser:
line[0] = line[0].strip().lower() line[0] = line[0].strip().lower()
line[1] = urllib.unquote(line[1].strip()) line[1] = urllib.unquote(line[1].strip())
if line[0] == "user-agent": if line[0] == "user-agent":
if state==2: if state == 2:
_debug("line %d: warning: you should insert a blank"
" line before any user-agent"
" directive" % linenumber)
self._add_entry(entry) self._add_entry(entry)
entry = Entry() entry = Entry()
entry.useragents.append(line[1]) entry.useragents.append(line[1])
state = 1 state = 1
elif line[0] == "disallow": elif line[0] == "disallow":
if state==0: if state != 0:
_debug("line %d: error: you must insert a user-agent:"
" directive before this line" % linenumber)
else:
entry.rulelines.append(RuleLine(line[1], False)) entry.rulelines.append(RuleLine(line[1], False))
state = 2 state = 2
elif line[0] == "allow": elif line[0] == "allow":
if state==0: if state != 0:
_debug("line %d: error: you must insert a user-agent:"
" directive before this line" % linenumber)
else:
entry.rulelines.append(RuleLine(line[1], True)) entry.rulelines.append(RuleLine(line[1], True))
else: if state == 2:
_debug("line %d: warning: unknown key %s" % (linenumber,
line[0]))
else:
_debug("line %d: error: malformed line %s"%(linenumber, line))
if state==2:
self.entries.append(entry) self.entries.append(entry)
_debug("Parsed rules:\n%s" % str(self))
def can_fetch(self, useragent, url): def can_fetch(self, useragent, url):
"""using the parsed robots.txt decide if useragent can fetch url""" """using the parsed robots.txt decide if useragent can fetch url"""
_debug("Checking robots.txt allowance for:\n user agent: %s\n url: %s" %
(useragent, url))
if self.disallow_all: if self.disallow_all:
return False return False
if self.allow_all: if self.allow_all:
@ -182,10 +155,10 @@ class RuleLine:
self.allowance = allowance self.allowance = allowance
def applies_to(self, filename): def applies_to(self, filename):
return self.path=="*" or filename.startswith(self.path) return self.path == "*" or filename.startswith(self.path)
def __str__(self): def __str__(self):
return (self.allowance and "Allow" or "Disallow")+": "+self.path return (self.allowance and "Allow" or "Disallow") + ": " + self.path
class Entry: class Entry:
@ -207,7 +180,7 @@ class Entry:
# split the name token and make it lower case # split the name token and make it lower case
useragent = useragent.split("/")[0].lower() useragent = useragent.split("/")[0].lower()
for agent in self.useragents: for agent in self.useragents:
if agent=='*': if agent == '*':
# we have the catch-all agent # we have the catch-all agent
return True return True
agent = agent.lower() agent = agent.lower()
@ -220,7 +193,6 @@ class Entry:
- our agent applies to this entry - our agent applies to this entry
- filename is URL decoded""" - filename is URL decoded"""
for line in self.rulelines: for line in self.rulelines:
_debug((filename, str(line), line.allowance))
if line.applies_to(filename): if line.applies_to(filename):
return line.allowance return line.allowance
return True return True
@ -239,56 +211,3 @@ class URLopener(urllib.FancyURLopener):
self.errcode = errcode self.errcode = errcode
return urllib.FancyURLopener.http_error_default(self, url, fp, errcode, return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
errmsg, headers) errmsg, headers)
def _check(a,b):
if not b:
ac = "access denied"
else:
ac = "access allowed"
if a!=b:
print "failed"
else:
print "ok (%s)" % ac
print
def _test():
global debug
rp = RobotFileParser()
debug = 1
# robots.txt that exists, gotten to by redirection
rp.set_url('http://www.musi-cal.com/robots.txt')
rp.read()
# test for re.escape
_check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
# this should match the first rule, which is a disallow
_check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
# various cherry pickers
_check(rp.can_fetch('CherryPickerSE',
'http://www.musi-cal.com/cgi-bin/event-search'
'?city=San+Francisco'), 0)
_check(rp.can_fetch('CherryPickerSE/1.0',
'http://www.musi-cal.com/cgi-bin/event-search'
'?city=San+Francisco'), 0)
_check(rp.can_fetch('CherryPickerSE/1.5',
'http://www.musi-cal.com/cgi-bin/event-search'
'?city=San+Francisco'), 0)
# case sensitivity
_check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
_check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
# substring test
_check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
# tests for catch-all * agent
_check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
_check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
_check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
_check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
# robots.txt that does not exist
rp.set_url('http://www.lycos.com/robots.txt')
rp.read()
_check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
if __name__ == '__main__':
_test()