Issue #25400: Merge from 3.6

commit 76ab4164e5
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -79,7 +79,28 @@ Disallow: /
     bad = ['/cyberworld/map/index.html', '/', '/tmp/']
 
 
-class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
+class BaseRequestRateTest(BaseRobotTest):
+
+    def test_request_rate(self):
+        for url in self.good + self.bad:
+            agent, url = self.get_agent_and_url(url)
+            with self.subTest(url=url, agent=agent):
+                if self.crawl_delay:
+                    self.assertEqual(
+                        self.parser.crawl_delay(agent), self.crawl_delay
+                    )
+                if self.request_rate:
+                    self.assertEqual(
+                        self.parser.request_rate(agent).requests,
+                        self.request_rate.requests
+                    )
+                    self.assertEqual(
+                        self.parser.request_rate(agent).seconds,
+                        self.request_rate.seconds
+                    )
+
+
+class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = """\
 User-agent: figtree
 Crawl-delay: 3
@@ -96,24 +117,6 @@ Disallow: /%7ejoe/index.html
     bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
            '/a%2fb.html', '/~joe/index.html']
 
-    def test_request_rate(self):
-        for url in self.good:
-            agent, url = self.get_agent_and_url(url)
-            with self.subTest(url=url, agent=agent):
-                if self.crawl_delay:
-                    self.assertEqual(
-                        self.parser.crawl_delay(agent), self.crawl_delay
-                    )
-                if self.request_rate and self.parser.request_rate(agent):
-                    self.assertEqual(
-                        self.parser.request_rate(agent).requests,
-                        self.request_rate.requests
-                    )
-                    self.assertEqual(
-                        self.parser.request_rate(agent).seconds,
-                        self.request_rate.seconds
-                    )
-
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
@@ -230,6 +233,19 @@ Disallow: /another/path?
     bad = ['/another/path?']
 
 
+class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
+    robots_txt = """\
+User-agent: *
+Crawl-delay: 1
+Request-rate: 3/15
+Disallow: /cyberworld/map/
+"""
+    request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
+    crawl_delay = 1
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+
+
 class RobotHandler(BaseHTTPRequestHandler):
 
     def do_GET(self):
@@ -309,6 +325,8 @@ class NetworkTestCase(unittest.TestCase):
         self.assertTrue(parser.allow_all)
         self.assertFalse(parser.disallow_all)
         self.assertEqual(parser.mtime(), 0)
+        self.assertIsNone(parser.crawl_delay('*'))
+        self.assertIsNone(parser.request_rate('*'))
 
 if __name__=='__main__':
     unittest.main()
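The refactoring above turns the duplicated rate assertions into a reusable base class: a new fixture only has to declare its robots.txt text and the expected values. A minimal sketch of such a fixture (all names and values here are invented for illustration, assuming the imports and base classes of the test module are in scope):

# Hypothetical fixture built on the new BaseRequestRateTest mixin; the
# agent, delays, and URLs below are illustrative, not part of the commit.
class ExampleBotRateTest(BaseRequestRateTest, unittest.TestCase):
    robots_txt = """\
User-agent: examplebot
Crawl-delay: 2
Request-rate: 5/30
Disallow: /private/
"""
    agent = 'examplebot'
    request_rate = namedtuple('req_rate', 'requests seconds')(5, 30)
    crawl_delay = 2
    good = ['/public.html']
    bad = ['/private/index.html']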
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -175,16 +175,20 @@ class RobotFileParser:
         return True
 
     def crawl_delay(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.delay
-        return None
+        return self.default_entry.delay
 
     def request_rate(self, useragent):
+        if not self.mtime():
+            return None
         for entry in self.entries:
             if entry.applies_to(useragent):
                 return entry.req_rate
-        return None
+        return self.default_entry.req_rate
 
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
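A minimal sketch of the fixed behaviour (assuming Python 3.6+ with this change applied; 'ExampleBot' is an invented user agent with no robots.txt entry of its own):

from urllib.robotparser import RobotFileParser

parser = RobotFileParser()
# Before any robots.txt has been parsed, mtime() is 0, so both methods
# return None (this is what the new NetworkTestCase assertions check).
print(parser.crawl_delay('*'))    # None
print(parser.request_rate('*'))   # None

# parse() records a modification time, so the guard no longer fires.
parser.parse("""\
User-agent: *
Crawl-delay: 1
Request-rate: 3/15
Disallow: /cyberworld/map/
""".splitlines())

# 'ExampleBot' has no entry of its own, so the parser now falls back to
# the default '*' entry instead of returning None as it did before.
print(parser.crawl_delay('ExampleBot'))   # 1
rate = parser.request_rate('ExampleBot')  # RequestRate(requests=3, seconds=15)
print(rate.requests, rate.seconds)        # 3 15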
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -35,6 +35,9 @@ Core and Builtins
 Library
 -------
 
+- Issue #25400: RobotFileParser now correctly returns default values for
+  crawl_delay and request_rate. Initial patch by Peter Wirtz.
+
 - Issue #27932: Prevent memory leak in win32_ver().
 
 - Fix UnboundLocalError in socket._sendfile_use_sendfile.