From c70a6ae49bd162af06130e48a45579d445e058a8 Mon Sep 17 00:00:00 2001 From: Senthil Kumaran Date: Wed, 29 May 2013 05:54:31 -0700 Subject: [PATCH] #17403: urllib.robotparser normalizes the urls before adding to ruleline. This helps in handling certain types of invalid urls in a conservative manner. --- Lib/test/test_robotparser.py | 12 ++++++++++++ Lib/urllib/robotparser.py | 1 + Misc/NEWS | 4 ++++ 3 files changed, 17 insertions(+) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index 8c09e7452c5..d1dfd9eeec0 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -234,6 +234,18 @@ bad = ['/some/path'] RobotTest(15, doc, good, bad) +# 16. Empty query (issue #17403). Normalizing the url first. +doc = """ +User-agent: * +Allow: /some/path? +Disallow: /another/path? +""" + +good = ['/some/path?'] +bad = ['/another/path?'] + +RobotTest(16, doc, good, bad) + class NetworkTestCase(unittest.TestCase): diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index 75be4af4091..978ba58d84a 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -157,6 +157,7 @@ class RuleLine: if path == '' and not allowance: # an empty value means allow all allowance = True + path = urllib.parse.urlunparse(urllib.parse.urlparse(path)) self.path = urllib.parse.quote(path) self.allowance = allowance diff --git a/Misc/NEWS b/Misc/NEWS index 828e240c5be..be6fd578db2 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -24,6 +24,10 @@ Core and Builtins Library ------- +- Issue #17403: urllib.parse.robotparser normalizes the urls before adding to + ruleline. This helps in handling certain types invalid urls in a conservative + manner. + - Issue #18025: Fixed a segfault in io.BufferedIOBase.readinto() when raw stream's read() returns more bytes than requested.