Issue #25400: RobotFileParser now correctly returns default values for crawl_delay and request_rate

Initial patch by Peter Wirtz.
Berker Peksag 2016-09-18 20:17:58 +03:00
parent 85c98bf968
commit 9a7bbb2e3f
3 changed files with 46 additions and 21 deletions
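
In practical terms, a parsed robots.txt whose only group is "User-agent: *" now answers crawl_delay() and request_rate() for any agent from that default entry instead of returning None. A minimal sketch of the new behaviour (illustrative only, not part of the commit):

    import urllib.robotparser

    # Hypothetical robots.txt with only a default ("*") group, mirroring the
    # data used by the new DefaultEntryTest below.
    lines = [
        'User-agent: *',
        'Crawl-delay: 1',
        'Request-rate: 3/15',
        'Disallow: /cyberworld/map/',
    ]

    parser = urllib.robotparser.RobotFileParser()
    parser.parse(lines)

    # No group matches "figtree", so the default entry's values are returned
    # (both calls returned None before this change).
    print(parser.crawl_delay('figtree'))   # 1
    rate = parser.request_rate('figtree')
    print(rate.requests, rate.seconds)     # 3 15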

Lib/test/test_robotparser.py

@@ -79,7 +79,28 @@ Disallow: /
     bad = ['/cyberworld/map/index.html', '/', '/tmp/']
 
 
-class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
+class BaseRequestRateTest(BaseRobotTest):
+
+    def test_request_rate(self):
+        for url in self.good + self.bad:
+            agent, url = self.get_agent_and_url(url)
+            with self.subTest(url=url, agent=agent):
+                if self.crawl_delay:
+                    self.assertEqual(
+                        self.parser.crawl_delay(agent), self.crawl_delay
+                    )
+                if self.request_rate:
+                    self.assertEqual(
+                        self.parser.request_rate(agent).requests,
+                        self.request_rate.requests
+                    )
+                    self.assertEqual(
+                        self.parser.request_rate(agent).seconds,
+                        self.request_rate.seconds
+                    )
+
+
+class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = """\
 User-agent: figtree
 Crawl-delay: 3
@@ -96,24 +117,6 @@ Disallow: /%7ejoe/index.html
     bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
            '/a%2fb.html', '/~joe/index.html']
 
-    def test_request_rate(self):
-        for url in self.good:
-            agent, url = self.get_agent_and_url(url)
-            with self.subTest(url=url, agent=agent):
-                if self.crawl_delay:
-                    self.assertEqual(
-                        self.parser.crawl_delay(agent), self.crawl_delay
-                    )
-                if self.request_rate and self.parser.request_rate(agent):
-                    self.assertEqual(
-                        self.parser.request_rate(agent).requests,
-                        self.request_rate.requests
-                    )
-                    self.assertEqual(
-                        self.parser.request_rate(agent).seconds,
-                        self.request_rate.seconds
-                    )
-
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
@@ -230,6 +233,19 @@ Disallow: /another/path?
     bad = ['/another/path?']
 
 
+class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Crawl-delay: 1
+Request-rate: 3/15
+Disallow: /cyberworld/map/
+    """
+    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
+    crawl_delay = 1
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+
+
 class RobotHandler(BaseHTTPRequestHandler):
 
     def do_GET(self):
@@ -309,6 +325,8 @@ class NetworkTestCase(unittest.TestCase):
         self.assertTrue(parser.allow_all)
         self.assertFalse(parser.disallow_all)
         self.assertEqual(parser.mtime(), 0)
+        self.assertIsNone(parser.crawl_delay('*'))
+        self.assertIsNone(parser.request_rate('*'))
 
 if __name__=='__main__':
     unittest.main()

Lib/urllib/robotparser.py

@@ -175,16 +175,20 @@ class RobotFileParser:
         return True
 
     def crawl_delay(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return None
+        return self.default_entry.delay
 
     def request_rate(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return None
+        return self.default_entry.req_rate
 
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
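
Note the new "if not self.mtime()" guard: when no robots.txt has been read yet (or the fetch failed, as in the 404 case covered by the NetworkTestCase assertions above), both methods still return None rather than consulting default_entry. A small sketch of that edge case (illustrative only, not from the commit):

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    # Nothing has been read or parsed, so mtime() is still 0 and both
    # queries short-circuit to None.
    print(parser.crawl_delay('*'))    # None
    print(parser.request_rate('*'))   # None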

Misc/NEWS

@@ -29,6 +29,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #25400: RobotFileParser now correctly returns default values for
+  crawl_delay and request_rate. Initial patch by Peter Wirtz.
+
 - Issue #27932: Prevent memory leak in win32_ver().
 
 - Fix UnboundLocalError in socket._sendfile_use_sendfile.