bpo-35922: Fix RobotFileParser when robots.txt has no relevant crawl delay or request rate (GH-11791)
Co-Authored-By: Tal Einat <taleinat+github@gmail.com>
commit 8047e0e1c6
parent 3a1d50e7e5
Lib/test/test_robotparser.py

@@ -97,30 +97,38 @@ Disallow: /
 
 
 class BaseRequestRateTest(BaseRobotTest):
+    request_rate = None
+    crawl_delay = None
 
     def test_request_rate(self):
+        parser = self.parser
         for url in self.good + self.bad:
             agent, url = self.get_agent_and_url(url)
             with self.subTest(url=url, agent=agent):
-                if self.crawl_delay:
-                    self.assertEqual(
-                        self.parser.crawl_delay(agent), self.crawl_delay
-                    )
-                if self.request_rate:
+                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)
+
+                parsed_request_rate = parser.request_rate(agent)
+                self.assertEqual(parsed_request_rate, self.request_rate)
+                if self.request_rate is not None:
                     self.assertIsInstance(
-                        self.parser.request_rate(agent),
+                        parsed_request_rate,
                         urllib.robotparser.RequestRate
                     )
                     self.assertEqual(
-                        self.parser.request_rate(agent).requests,
+                        parsed_request_rate.requests,
                         self.request_rate.requests
                     )
                     self.assertEqual(
-                        self.parser.request_rate(agent).seconds,
+                        parsed_request_rate.seconds,
                         self.request_rate.seconds
                     )
 
 
+class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = ''
+    good = ['/foo']
+
+
 class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = """\
 User-agent: figtree
@@ -141,10 +149,6 @@ Disallow: /%7ejoe/index.html
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
-    # these are not actually tested, but we still need to parse it
-    # in order to accommodate the input parameters
-    request_rate = None
-    crawl_delay = None
 
 
 class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
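For context on the rewritten assertions: urllib.robotparser.RequestRate is a namedtuple with fields `requests` and `seconds`, so a parsed rate supports plain equality against an expected rate, and `None` compares equal to `None` when no rule applies. A minimal sketch (the 9/30 values mirror the figtree Request-rate rule above):

import urllib.robotparser

# RequestRate is namedtuple('RequestRate', 'requests seconds'), so the
# test can compare a parsed rate with a single assertEqual instead of
# checking .requests and .seconds through repeated parser calls:
expected = urllib.robotparser.RequestRate(requests=9, seconds=30)
parsed = urllib.robotparser.RequestRate(9, 30)
assert parsed == expected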
Lib/urllib/robotparser.py

@@ -186,7 +186,9 @@ class RobotFileParser:
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return self.default_entry.delay
+        if self.default_entry:
+            return self.default_entry.delay
+        return None
 
     def request_rate(self, useragent):
         if not self.mtime():
@@ -194,7 +196,9 @@ class RobotFileParser:
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return self.default_entry.req_rate
+        if self.default_entry:
+            return self.default_entry.req_rate
+        return None
 
     def site_maps(self):
         if not self.sitemaps:
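The guarded fallback in action, as a minimal sketch (the 'SomeBot' agent name is illustrative): a robots.txt that defines rules only for a specific agent never creates a `User-agent: *` entry, so `default_entry` stays `None` and the old unconditional `self.default_entry.delay` raised AttributeError:

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.parse("""\
User-agent: figtree
Crawl-delay: 3
""".splitlines())

# 'figtree' matches an explicit entry; any other agent falls through
# to default_entry, which is None without a 'User-agent: *' block.
print(parser.crawl_delay('figtree'))   # 3
print(parser.crawl_delay('SomeBot'))   # None (AttributeError before)
print(parser.request_rate('SomeBot'))  # None (likewise)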
Misc/NEWS.d/next/Library/… (new file)

@@ -0,0 +1,4 @@
+Fix :meth:`RobotFileParser.crawl_delay` and
+:meth:`RobotFileParser.request_rate` to return ``None`` rather than
+raise :exc:`AttributeError` when no relevant rule is defined in the
+robots.txt file. Patch by Rémi Lapeyre.
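A minimal reproducer for the fixed behavior, matching the new EmptyFileTest (the 'ExampleBot' name is illustrative):

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.parse([])  # an empty robots.txt, as in EmptyFileTest

# With no entries at all, both lookups now report "no rule" as None
# instead of raising AttributeError on the missing default entry.
print(parser.crawl_delay('ExampleBot'))   # None
print(parser.request_rate('ExampleBot'))  # None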