bpo-35922: Fix RobotFileParser when robots.txt has no relevant crawl delay or request rate (GH-11791)

Co-Authored-By: Tal Einat <taleinat+github@gmail.com>
Rémi Lapeyre 2019-06-16 08:48:57 +02:00 committed by Tal Einat
parent 3a1d50e7e5
commit 8047e0e1c6
3 changed files with 26 additions and 14 deletions

Lib/test/test_robotparser.py

@@ -97,30 +97,38 @@ Disallow: /
 
 
 class BaseRequestRateTest(BaseRobotTest):
+    request_rate = None
+    crawl_delay = None
 
     def test_request_rate(self):
+        parser = self.parser
         for url in self.good + self.bad:
             agent, url = self.get_agent_and_url(url)
             with self.subTest(url=url, agent=agent):
-                if self.crawl_delay:
-                    self.assertEqual(
-                        self.parser.crawl_delay(agent), self.crawl_delay
-                    )
-                if self.request_rate:
+                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)
+
+                parsed_request_rate = parser.request_rate(agent)
+                self.assertEqual(parsed_request_rate, self.request_rate)
+                if self.request_rate is not None:
                     self.assertIsInstance(
-                        self.parser.request_rate(agent),
+                        parsed_request_rate,
                         urllib.robotparser.RequestRate
                     )
                     self.assertEqual(
-                        self.parser.request_rate(agent).requests,
+                        parsed_request_rate.requests,
                         self.request_rate.requests
                     )
                     self.assertEqual(
-                        self.parser.request_rate(agent).seconds,
+                        parsed_request_rate.seconds,
                         self.request_rate.seconds
                     )
 
 
+class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = ''
+    good = ['/foo']
+
+
 class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = """\
 User-agent: figtree
@@ -141,10 +149,6 @@ Disallow: /%7ejoe/index.html
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
-    # these are not actually tested, but we still need to parse it
-    # in order to accommodate the input parameters
-    request_rate = None
-    crawl_delay = None
 
 
 class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):

Lib/urllib/robotparser.py

@@ -186,7 +186,9 @@ class RobotFileParser:
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return self.default_entry.delay
+        if self.default_entry:
+            return self.default_entry.delay
+        return None
 
     def request_rate(self, useragent):
         if not self.mtime():
@@ -194,7 +196,9 @@ class RobotFileParser:
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return self.default_entry.req_rate
+        if self.default_entry:
+            return self.default_entry.req_rate
+        return None
 
     def site_maps(self):
         if not self.sitemaps:

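The guards added above matter because default_entry is only set when
robots.txt contains a "User-agent: *" group; for an empty or irrelevant
file it stays None, and the old unconditional attribute access raised
AttributeError. A minimal sketch of the failure mode this commit fixes
(the agent name is invented for illustration):

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    # An empty robots.txt produces no per-agent entries and no default
    # "*" entry, so parser.default_entry remains None after parsing.
    parser.parse([])

    # Before this commit both calls raised AttributeError
    # ('NoneType' object has no attribute 'delay' / 'req_rate');
    # with the guards they now return None.
    print(parser.crawl_delay('ExampleBot'))   # None
    print(parser.request_rate('ExampleBot'))  # None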
Misc/NEWS.d news entry (new file)

@@ -0,0 +1,4 @@
+Fix :meth:`RobotFileParser.crawl_delay` and
+:meth:`RobotFileParser.request_rate` to return ``None`` rather than
+raise :exc:`AttributeError` when no relevant rule is defined in the
+robots.txt file. Patch by Rémi Lapeyre.
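For illustration, a short sketch of the fixed behavior from the caller's
side. The robots.txt content echoes the figtree fixture used in the
tests, but the delay/rate values and the unknown agent name are made up
here:

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([
        'User-agent: figtree',
        'Crawl-delay: 3',
        'Request-rate: 9/30',
    ])

    print(parser.crawl_delay('figtree'))       # 3
    rate = parser.request_rate('figtree')      # a RequestRate named tuple
    print(rate.requests, rate.seconds)         # 9 30

    # No matching entry and no "*" default: None instead of AttributeError
    print(parser.crawl_delay('SomeOtherBot'))  # None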