From 5db5c0669e624767375593cc1a01f32092c91c58 Mon Sep 17 00:00:00 2001
From: Christopher Beacham
Date: Wed, 16 May 2018 07:52:07 -0700
Subject: [PATCH] bpo-21475: Support the Sitemap extension in robotparser
 (GH-6883)

---
 Doc/library/urllib.robotparser.rst            |  9 ++++++++
 Lib/test/test_robotparser.py                  | 21 +++++++++++++++++++
 Lib/urllib/robotparser.py                     | 12 +++++++++++
 Misc/ACKS                                     |  2 ++
 .../2018-05-15-15-03-48.bpo-28612.E9dz39.rst  |  3 +++
 5 files changed, 47 insertions(+)
 create mode 100644 Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst

diff --git a/Doc/library/urllib.robotparser.rst b/Doc/library/urllib.robotparser.rst
index e3b90e673ca..544f50273dd 100644
--- a/Doc/library/urllib.robotparser.rst
+++ b/Doc/library/urllib.robotparser.rst
@@ -76,6 +76,15 @@ structure of :file:`robots.txt` files, see http://www.robotstxt.org/orig.html.
 
    .. versionadded:: 3.6
 
+   .. method:: site_maps()
+
+      Returns the contents of the ``Sitemap`` parameter from
+      ``robots.txt`` in the form of a :func:`list`. If there is no such
+      parameter or the ``robots.txt`` entry for this parameter has
+      invalid syntax, returns ``None``.
+
+      .. versionadded:: 3.8
+
 The following example demonstrates basic use of the :class:`RobotFileParser`
 class::
 
diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
index bee8d238be6..84a267ad956 100644
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -12,6 +12,7 @@ class BaseRobotTest:
     agent = 'test_robotparser'
     good = []
     bad = []
+    site_maps = None
 
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
@@ -36,6 +37,9 @@ class BaseRobotTest:
             with self.subTest(url=url, agent=agent):
                 self.assertFalse(self.parser.can_fetch(agent, url))
 
+    def test_site_maps(self):
+        self.assertEqual(self.parser.site_maps(), self.site_maps)
+
 
 class UserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
@@ -65,6 +69,23 @@ Disallow:
     bad = ['/cyberworld/map/index.html']
 
 
+class SitemapTest(BaseRobotTest, unittest.TestCase):
+    robots_txt = """\
+# robots.txt for http://www.example.com/
+
+User-agent: *
+Sitemap: http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml
+Sitemap: http://www.google.com/hostednews/sitemap_index.xml
+Request-rate: 3/15
+Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+    """
+    good = ['/', '/test.html']
+    bad = ['/cyberworld/map/index.html']
+    site_maps = ['http://www.gstatic.com/s2/sitemaps/profiles-sitemap.xml',
+                 'http://www.google.com/hostednews/sitemap_index.xml']
+
+
 class RejectAllRobotsTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 # go away
diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py
index 92e4efe6865..7089916a4f8 100644
--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@@ -27,6 +27,7 @@ class RobotFileParser:
 
     def __init__(self, url=''):
         self.entries = []
+        self.sitemaps = []
         self.default_entry = None
         self.disallow_all = False
         self.allow_all = False
@@ -141,6 +142,12 @@ class RobotFileParser:
                             and numbers[1].strip().isdigit()):
                         entry.req_rate = RequestRate(int(numbers[0]), int(numbers[1]))
                         state = 2
+                elif line[0] == "sitemap":
+                    # According to http://www.sitemaps.org/protocol.html
+                    # "This directive is independent of the user-agent line,
+                    #  so it doesn't matter where you place it in your file."
+                    # Therefore we do not change the state of the parser.
+                    self.sitemaps.append(line[1])
         if state == 2:
             self._add_entry(entry)
 
@@ -189,6 +196,11 @@ class RobotFileParser:
                 return entry.req_rate
         return self.default_entry.req_rate
 
+    def site_maps(self):
+        if not self.sitemaps:
+            return None
+        return self.sitemaps
+
     def __str__(self):
         entries = self.entries
         if self.default_entry is not None:
diff --git a/Misc/ACKS b/Misc/ACKS
index 665b4dd7f43..5c05ee7d5aa 100644
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -109,6 +109,7 @@ Anthony Baxter
 Mike Bayer
 Samuel L. Bayer
 Bo Bayles
+Christopher Beacham AKA Lady Red
 Tommy Beadle
 Donald Beaudry
 David Beazley
@@ -1760,6 +1761,7 @@ Dik Winter
 Blake Winton
 Jean-Claude Wippler
 Stéphane Wirtel
+Peter Wirtz
 Lars Wirzenius
 John Wiseman
 Chris Withers
diff --git a/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst
new file mode 100644
index 00000000000..e3e8f16eef0
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2018-05-15-15-03-48.bpo-28612.E9dz39.rst
@@ -0,0 +1,3 @@
+Added support for Site Maps to urllib's ``RobotFileParser`` as
+:meth:`RobotFileParser.site_maps() <urllib.robotparser.RobotFileParser.site_maps>`.
+Patch by Lady Red, based on patch by Peter Wirtz.
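
For reviewers, a minimal usage sketch of the new method (this note is not part
of the patch; the www.example.com URL is a placeholder for any site whose
robots.txt publishes Sitemap lines):

    import urllib.robotparser

    parser = urllib.robotparser.RobotFileParser()
    # Placeholder URL; substitute any robots.txt that carries Sitemap lines.
    parser.set_url('http://www.example.com/robots.txt')
    parser.read()

    # site_maps() returns the collected Sitemap URLs as a list,
    # or None if the file declared no Sitemap directive.
    sitemaps = parser.site_maps()
    if sitemaps is None:
        print('no Sitemap directives')
    else:
        for url in sitemaps:
            print(url)

Note that, per the comment added in parse(), Sitemap lines are collected
wherever they appear in the file, independent of any User-agent group.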