bpo-31325: Fix usage of namedtuple in RobotFileParser.parse() (#4529)

This commit is contained in:
Berker Peksag 2017-11-24 02:40:26 +03:00 committed by Raymond Hettinger
parent 0858495a50
commit 3df02dbc8e
4 changed files with 19 additions and 12 deletions

View File

@ -69,10 +69,10 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.
.. method:: request_rate(useragent) .. method:: request_rate(useragent)
Returns the contents of the ``Request-rate`` parameter from Returns the contents of the ``Request-rate`` parameter from
``robots.txt`` in the form of a :func:`~collections.namedtuple` ``robots.txt`` as a :term:`named tuple` ``RequestRate(requests, seconds)``.
``(requests, seconds)``. If there is no such parameter or it doesn't If there is no such parameter or it doesn't apply to the *useragent*
apply to the *useragent* specified or the ``robots.txt`` entry for this specified or the ``robots.txt`` entry for this parameter has invalid
parameter has invalid syntax, return ``None``. syntax, return ``None``.
.. versionadded:: 3.6 .. versionadded:: 3.6

View File

@ -3,7 +3,6 @@ import os
import threading import threading
import unittest import unittest
import urllib.robotparser import urllib.robotparser
from collections import namedtuple
from test import support from test import support
from http.server import BaseHTTPRequestHandler, HTTPServer from http.server import BaseHTTPRequestHandler, HTTPServer
@ -87,6 +86,10 @@ class BaseRequestRateTest(BaseRobotTest):
self.parser.crawl_delay(agent), self.crawl_delay self.parser.crawl_delay(agent), self.crawl_delay
) )
if self.request_rate: if self.request_rate:
self.assertIsInstance(
self.parser.request_rate(agent),
urllib.robotparser.RequestRate
)
self.assertEqual( self.assertEqual(
self.parser.request_rate(agent).requests, self.parser.request_rate(agent).requests,
self.request_rate.requests self.request_rate.requests
@ -108,7 +111,7 @@ Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html Disallow: /%7ejoe/index.html
""" """
agent = 'figtree' agent = 'figtree'
request_rate = namedtuple('req_rate', 'requests seconds')(9, 30) request_rate = urllib.robotparser.RequestRate(9, 30)
crawl_delay = 3 crawl_delay = 3
good = [('figtree', '/foo.html')] good = [('figtree', '/foo.html')]
bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html', bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
@ -237,7 +240,7 @@ Crawl-delay: 1
Request-rate: 3/15 Request-rate: 3/15
Disallow: /cyberworld/map/ Disallow: /cyberworld/map/
""" """
request_rate = namedtuple('req_rate', 'requests seconds')(3, 15) request_rate = urllib.robotparser.RequestRate(3, 15)
crawl_delay = 1 crawl_delay = 1
good = ['/', '/test.html'] good = ['/', '/test.html']
bad = ['/cyberworld/map/index.html'] bad = ['/cyberworld/map/index.html']

View File

@ -16,6 +16,9 @@ import urllib.request
__all__ = ["RobotFileParser"] __all__ = ["RobotFileParser"]
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
class RobotFileParser: class RobotFileParser:
""" This class provides a set of methods to read, parse and answer """ This class provides a set of methods to read, parse and answer
questions about a single robots.txt file. questions about a single robots.txt file.
@ -136,11 +139,7 @@ class RobotFileParser:
# check if all values are sane # check if all values are sane
if (len(numbers) == 2 and numbers[0].strip().isdigit() if (len(numbers) == 2 and numbers[0].strip().isdigit()
and numbers[1].strip().isdigit()): and numbers[1].strip().isdigit()):
req_rate = collections.namedtuple('req_rate', entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
'requests seconds')
entry.req_rate = req_rate
entry.req_rate.requests = int(numbers[0])
entry.req_rate.seconds = int(numbers[1])
state = 2 state = 2
if state == 2: if state == 2:
self._add_entry(entry) self._add_entry(entry)

View File

@ -0,0 +1,5 @@
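Fix incorrect usage of :func:`collections.namedtuple` in
the :meth:`RobotFileParser.parse() <urllib.robotparser.RobotFileParser.parse>`
method: the ``Request-rate`` values were assigned as attributes on a newly
created named tuple class instead of being stored in an instance of it.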
Initial patch by Robin Wellner.