#17403: urllib.robotparser normalizes the urls before adding to ruleline.

This helps in handling certain types of invalid urls in a conservative manner.
2013-05-29 05:54:31 -07:00 · 2013-05-29 05:54:31 -07:00 · c70a6ae49b
parent eb4c9c77b8
commit c70a6ae49b
3 changed files with 17 additions and 0 deletions
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@ -234,6 +234,18 @@ bad = ['/some/path']

 RobotTest(15, doc, good, bad)

+# 16. Empty query (issue #17403). Normalizing the url first.
+doc = """
+User-agent: *
+Allow: /some/path?
+Disallow: /another/path?
+"""
+
+good = ['/some/path?']
+bad = ['/another/path?']
+
+RobotTest(16, doc, good, bad)
+

 class NetworkTestCase(unittest.TestCase):

--- a/Lib/urllib/robotparser.py
+++ b/Lib/urllib/robotparser.py
@ -157,6 +157,7 @@ class RuleLine:
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
+        path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

--- a/Misc/NEWS
+++ b/Misc/NEWS
@ -24,6 +24,10 @@ Core and Builtins
 Library
 -------

+- Issue #17403: urllib.robotparser normalizes the urls before adding to
+  ruleline. This helps in handling certain types of invalid urls in a
+  conservative manner.
+
 - Issue #18025: Fixed a segfault in io.BufferedIOBase.readinto() when raw
  stream's read() returns more bytes than requested.