bpo-21475: Support the Sitemap extension in robotparser (GH-6883)

Christopher Beacham 2018-05-16 07:52:07 -07:00 committed by Ned Deily
parent 7a1c027501
commit 5db5c0669e
5 changed files with 47 additions and 0 deletions

Doc/library/urllib.robotparser.rst

@@ -76,6 +76,15 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.
      .. versionadded:: 3.6

   .. method:: site_maps()

      Returns the contents of the ``Sitemap`` parameter from
      ``robots.txt`` in the form of a :func:`list`. If there is no such
      parameter or the ``robots.txt`` entry for this parameter has
      invalid syntax, return ``None``.

      .. versionadded:: 3.8
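      A minimal sketch of the new method in use (the URL and the returned
      sitemap address here are hypothetical)::

         >>> import urllib.robotparser
         >>> rp = urllib.robotparser.RobotFileParser()
         >>> rp.set_url("http://www.example.com/robots.txt")
         >>> rp.read()
         >>> rp.site_maps()
         ['http://www.example.com/sitemap.xml']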
The following example demonstrates basic use of the :class:`RobotFileParser`
class::

Lib/test/test_robotparser.py

@@ -12,6 +12,7 @@ class BaseRobotTest:
    agent = 'test_robotparser'
    good = []
    bad = []
    site_maps = None

    def setUp(self):
        lines = io.StringIO(self.robots_txt).readlines()
@@ -36,6 +37,9 @@ class BaseRobotTest:
            with self.subTest(url=url, agent=agent):
                self.assertFalse(self.parser.can_fetch(agent, url))

    def test_site_maps(self):
        self.assertEqual(self.parser.site_maps(), self.site_maps)


class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
@@ -65,6 +69,23 @@ Disallow:
    bad = ['/cyberworld/map/index.html']


class SitemapTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# robots.txt for http://www.example.com/

User-agent: *
Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
Sitemap: http://www.google.com/hostednews/sitemap_index.xml
Request-rate: 3/15
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
    """
    good = ['/', '/test.html']
    bad = ['/cyberworld/map/index.html']
    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
                 'http://www.google.com/hostednews/sitemap_index.xml']
class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
    robots_txt = """\
# go away
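The new test case can be run on its own with, for example,
``./python -m unittest test.test_robotparser.SitemapTest`` (assuming a built
CPython checkout).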

Lib/urllib/robotparser.py

@@ -27,6 +27,7 @@ class RobotFileParser:
    def __init__(self, url=''):
        self.entries = []
        self.sitemaps = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
@@ -141,6 +142,12 @@ class RobotFileParser:
                            and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                        state = 2
                elif line[0] == "sitemap":
                    # According to http://www.sitemaps.org/protocol.html
                    # "This directive is independent of the user-agent line,
                    #  so it doesn't matter where you place it in your file."
                    #  Therefore we do not change the state of the parser.
                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)
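Since ``Sitemap`` lines are appended without touching the parser state, they
are picked up wherever they appear, even before any ``User-agent`` group. A
minimal sketch of that behavior (URLs hypothetical):

    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    # The Sitemap line precedes the User-agent group; parsing is unaffected.
    rp.parse([
        "Sitemap: http://www.example.com/sitemap.xml",
        "User-agent: *",
        "Disallow: /cyberworld/map/",
    ])
    print(rp.site_maps())
    # -> ['http://www.example.com/sitemap.xml']
    print(rp.can_fetch("*", "http://www.example.com/cyberworld/map/index.html"))
    # -> False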
@@ -189,6 +196,11 @@ class RobotFileParser:
                return entry.req_rate
        return self.default_entry.req_rate

    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
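When no ``Sitemap`` directive was seen, the accessor returns ``None`` rather
than an empty list, matching the documented behavior. A quick sketch
(hypothetical ``no_maps`` parser, continuing the example above):

    no_maps = urllib.robotparser.RobotFileParser()
    no_maps.parse(["User-agent: *", "Disallow: /tmp/"])
    print(no_maps.site_maps())
    # -> None, since no Sitemap lines were present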

Misc/ACKS

@@ -109,6 +109,7 @@ Anthony Baxter
Mike Bayer
Samuel L. Bayer
Bo Bayles
Christopher Beacham AKA Lady Red
Tommy Beadle
Donald Beaudry
David Beazley
@@ -1760,6 +1761,7 @@ Dik Winter
Blake Winton
Jean-Claude Wippler
Stéphane Wirtel
Peter Wirtz
Lars Wirzenius
John Wiseman
Chris Withers

Misc/NEWS.d/next/Library/…

@@ -0,0 +1,3 @@
Added support for Site Maps to urllib's ``RobotFileParser`` as
:meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`.
Patch by Lady Red, based on patch by Peter Wirtz.