From 58a1a76baefc92d9e2392a5dbf65e39e44fb8f55 Mon Sep 17 00:00:00 2001
From: "Miss Islington (bot)" <31488909+miss-islington@users.noreply.github.com>
Date: Sun, 16 Jun 2019 00:07:54 -0700
Subject: [PATCH] bpo-35922: Fix RobotFileParser when robots.txt has no
 relevant crawl delay or request rate (GH-11791)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Tal Einat
(cherry picked from commit 8047e0e1c620f69cc21f9ca48b24bf2cdd5c3668)

Co-authored-by: Rémi Lapeyre
---
 Lib/test/test_robotparser.py                  | 28 +++++++++++--------
 Lib/urllib/robotparser.py                     |  8 ++++--
 .../2019-06-11-19-34-29.bpo-35922.rxpzWr.rst  |  4 +++
 3 files changed, 26 insertions(+), 14 deletions(-)
 create mode 100644 Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst

diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index 84a267ad956..77cd7c4d29d 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -97,30 +97,38 @@ Disallow: /
 
 
 class BaseRequestRateTest(BaseRobotTest):
+    request_rate = None
+    crawl_delay = None
 
     def test_request_rate(self):
+        parser = self.parser
         for url in self.good + self.bad:
             agent, url = self.get_agent_and_url(url)
             with self.subTest(url=url, agent=agent):
-                if self.crawl_delay:
-                    self.assertEqual(
-                        self.parser.crawl_delay(agent), self.crawl_delay
-                    )
-                if self.request_rate:
+                self.assertEqual(parser.crawl_delay(agent), self.crawl_delay)
+
+                parsed_request_rate = parser.request_rate(agent)
+                self.assertEqual(parsed_request_rate, self.request_rate)
+                if self.request_rate is not None:
                     self.assertIsInstance(
-                        self.parser.request_rate(agent),
+                        parsed_request_rate,
                         urllib.robotparser.RequestRate
                     )
                     self.assertEqual(
-                        self.parser.request_rate(agent).requests,
+                        parsed_request_rate.requests,
                         self.request_rate.requests
                     )
                     self.assertEqual(
-                        self.parser.request_rate(agent).seconds,
+                        parsed_request_rate.seconds,
                         self.request_rate.seconds
                     )
 
 
+class EmptyFileTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = ''
+    good = ['/foo']
+
+
 class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = """\
 User-agent: figtree
@@ -141,10 +149,6 @@ Disallow: /%7ejoe/index.html
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
-    # these are not actually tested, but we still need to parse it
-    # in order to accommodate the input parameters
-    request_rate = None
-    crawl_delay = None
 
 
 class InvalidRequestRateTest(BaseRobotTest, unittest.TestCase):
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 7089916a4f8..c58565e3945 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -186,7 +186,9 @@ class RobotFileParser:
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return self.default_entry.delay
+        if self.default_entry:
+            return self.default_entry.delay
+        return None
 
     def request_rate(self, useragent):
         if not self.mtime():
@@ -194,7 +196,9 @@
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return self.default_entry.req_rate
+        if self.default_entry:
+            return self.default_entry.req_rate
+        return None
 
     def site_maps(self):
         if not self.sitemaps:
diff --git a/Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst b/Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst
new file mode 100644
index 00000000000..5271a495624
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-06-11-19-34-29.bpo-35922.rxpzWr.rst
@@ -0,0 +1,4 @@
+Fix :meth:`RobotFileParser.crawl_delay` and
+:meth:`RobotFileParser.request_rate` to return ``None`` rather than
+raise :exc:`AttributeError` when no relevant rule is defined in the
+robots.txt file. Patch by Rémi Lapeyre.
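
A minimal sketch of the fixed behaviour, mirroring the new EmptyFileTest
above (the empty robots.txt content and the '*' user agent here are
illustrative values, not part of the patch):

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    parser.parse([])  # empty robots.txt: no entries and no default entry

    # Before this change both calls raised AttributeError, because
    # default_entry was still None; they now fall through to return None.
    print(parser.crawl_delay('*'))   # None
    print(parser.request_rate('*'))  # None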