bpo-31325: Fix usage of namedtuple in RobotFileParser.parse() (#4529)
This commit is contained in:
parent
0858495a50
commit
3df02dbc8e
|
@ -69,10 +69,10 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.
|
||||||
.. method:: request_rate(useragent)
|
.. method:: request_rate(useragent)
|
||||||
|
|
||||||
Returns the contents of the ``Request-rate`` parameter from
|
Returns the contents of the ``Request-rate`` parameter from
|
||||||
``robots.txt`` in the form of a :func:`~collections.namedtuple`
|
``robots.txt`` as a :term:`named tuple` ``RequestRate(requests, seconds)``.
|
||||||
``(requests, seconds)``. If there is no such parameter or it doesn't
|
If there is no such parameter or it doesn't apply to the *useragent*
|
||||||
apply to the *useragent* specified or the ``robots.txt`` entry for this
|
specified or the ``robots.txt`` entry for this parameter has invalid
|
||||||
parameter has invalid syntax, return ``None``.
|
syntax, return ``None``.
|
||||||
|
|
||||||
.. versionadded:: 3.6
|
.. versionadded:: 3.6
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,6 @@ import os
|
||||||
import threading
|
import threading
|
||||||
import unittest
|
import unittest
|
||||||
import urllib.robotparser
|
import urllib.robotparser
|
||||||
from collections import namedtuple
|
|
||||||
from test import support
|
from test import support
|
||||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||||
|
|
||||||
|
@ -87,6 +86,10 @@ class BaseRequestRateTest(BaseRobotTest):
|
||||||
self.parser.crawl_delay(agent), self.crawl_delay
|
self.parser.crawl_delay(agent), self.crawl_delay
|
||||||
)
|
)
|
||||||
if self.request_rate:
|
if self.request_rate:
|
||||||
|
self.assertIsInstance(
|
||||||
|
self.parser.request_rate(agent),
|
||||||
|
urllib.robotparser.RequestRate
|
||||||
|
)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
self.parser.request_rate(agent).requests,
|
self.parser.request_rate(agent).requests,
|
||||||
self.request_rate.requests
|
self.request_rate.requests
|
||||||
|
@ -108,7 +111,7 @@ Disallow: /a%2fb.html
|
||||||
Disallow: /%7ejoe/index.html
|
Disallow: /%7ejoe/index.html
|
||||||
"""
|
"""
|
||||||
agent = 'figtree'
|
agent = 'figtree'
|
||||||
request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
|
request_rate = urllib.robotparser.RequestRate(9, 30)
|
||||||
crawl_delay = 3
|
crawl_delay = 3
|
||||||
good = [('figtree', '/foo.html')]
|
good = [('figtree', '/foo.html')]
|
||||||
bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
|
bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
|
||||||
|
@ -237,7 +240,7 @@ Crawl-delay: 1
|
||||||
Request-rate: 3/15
|
Request-rate: 3/15
|
||||||
Disallow: /cyberworld/map/
|
Disallow: /cyberworld/map/
|
||||||
"""
|
"""
|
||||||
request_rate = namedtuple('req_rate', 'requests seconds')(3, 15)
|
request_rate = urllib.robotparser.RequestRate(3, 15)
|
||||||
crawl_delay = 1
|
crawl_delay = 1
|
||||||
good = ['/', '/test.html']
|
good = ['/', '/test.html']
|
||||||
bad = ['/cyberworld/map/index.html']
|
bad = ['/cyberworld/map/index.html']
|
||||||
|
|
|
@ -16,6 +16,9 @@ import urllib.request
|
||||||
|
|
||||||
__all__ = ["RobotFileParser"]
|
__all__ = ["RobotFileParser"]
|
||||||
|
|
||||||
|
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
|
||||||
|
|
||||||
|
|
||||||
class RobotFileParser:
|
class RobotFileParser:
|
||||||
""" This class provides a set of methods to read, parse and answer
|
""" This class provides a set of methods to read, parse and answer
|
||||||
questions about a single robots.txt file.
|
questions about a single robots.txt file.
|
||||||
|
@ -136,11 +139,7 @@ class RobotFileParser:
|
||||||
# check if all values are sane
|
# check if all values are sane
|
||||||
if (len(numbers) == 2 and numbers[0].strip().isdigit()
|
if (len(numbers) == 2 and numbers[0].strip().isdigit()
|
||||||
and numbers[1].strip().isdigit()):
|
and numbers[1].strip().isdigit()):
|
||||||
req_rate = collections.namedtuple('req_rate',
|
entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
|
||||||
'requests seconds')
|
|
||||||
entry.req_rate = req_rate
|
|
||||||
entry.req_rate.requests = int(numbers[0])
|
|
||||||
entry.req_rate.seconds = int(numbers[1])
|
|
||||||
state = 2
|
state = 2
|
||||||
if state == 2:
|
if state == 2:
|
||||||
self._add_entry(entry)
|
self._add_entry(entry)
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
Fix wrong usage of :func:`collections.namedtuple` in
|
||||||
|
the :meth:`RobotFileParser.parse() <urllib.robotparser.RobotFileParser.parse>`
|
||||||
|
method.
|
||||||
|
|
||||||
|
Initial patch by Robin Wellner.
|
Loading…
Reference in New Issue