Close issue 3437 - missing state change when Allow lines are processed.

Adds test cases which use Allow: as well.
This commit is contained in:
Skip Montanaro 2008-07-27 00:49:02 +00:00
parent 4b99e9b479
commit 1ef19f0de1
2 changed files with 74 additions and 0 deletions

View File

@ -76,6 +76,10 @@ class RobotFileParser:
"""parse the input lines from a robots.txt file.
We allow that a user-agent: line is not preceded by
one or more blank lines."""
# states:
# 0: start state
# 1: saw user-agent line
# 2: saw an allow or disallow line
state = 0
linenumber = 0
entry = Entry()
@ -114,6 +118,7 @@ class RobotFileParser:
elif line[0] == "allow":
if state != 0:
entry.rulelines.append(RuleLine(line[1], True))
state = 2
if state == 2:
self.entries.append(entry)

View File

@ -134,6 +134,75 @@ bad = [] # Bug report says "/" should be denied, but that is not in the RFC
RobotTest(7, doc, good, bad)
# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364
# 8.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""
good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']
RobotTest(8, doc, good, bad, agent="Googlebot")
# 9. This file is incorrect because "Googlebot" is a substring of
# "Googlebot-Mobile", so test 10 works just like test 9.
doc = """
User-agent: Googlebot
Disallow: /
User-agent: Googlebot-Mobile
Allow: /
"""
good = []
bad = ['/something.jpg']
RobotTest(9, doc, good, bad, agent="Googlebot")
good = []
bad = ['/something.jpg']
RobotTest(10, doc, good, bad, agent="Googlebot-Mobile")
# 11. Get the order correct.
doc = """
User-agent: Googlebot-Mobile
Allow: /
User-agent: Googlebot
Disallow: /
"""
good = []
bad = ['/something.jpg']
RobotTest(11, doc, good, bad, agent="Googlebot")
good = ['/something.jpg']
bad = []
RobotTest(12, doc, good, bad, agent="Googlebot-Mobile")
# 13. Google also got the order wrong in #8. You need to specify the
# URLs from more specific to more general.
doc = """
User-agent: Googlebot
Allow: /folder1/myfile.html
Disallow: /folder1/
"""
good = ['/folder1/myfile.html']
bad = ['/folder1/anotherfile.html']
RobotTest(13, doc, good, bad, agent="googlebot")
class TestCase(unittest.TestCase):
def runTest(self):
test_support.requires('network')