From 73fd46d24e45c34f0fb87261e5471584a7c273df Mon Sep 17 00:00:00 2001 From: Jeremy Hylton Date: Fri, 18 Jul 2008 20:59:44 +0000 Subject: [PATCH] Bug 3347: robotparser failed because it didn't convert bytes to string. The solution is to convert bytes to text via utf-8. I'm not entirely sure if this is safe, but it looks like robots.txt is expected to be ascii. --- Lib/test/test_robotparser.py | 15 ++++++++++++--- Lib/urllib/robotparser.py | 8 ++++++-- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py index fbb02bcbc2f..f02f9866044 100644 --- a/Lib/test/test_robotparser.py +++ b/Lib/test/test_robotparser.py @@ -136,8 +136,9 @@ bad = [] # Bug report says "/" should be denied, but that is not in the RFC RobotTest(7, doc, good, bad) -class TestCase(unittest.TestCase): - def runTest(self): +class NetworkTestCase(unittest.TestCase): + + def testPasswordProtectedSite(self): support.requires('network') # whole site is password-protected. url = 'http://mueblesmoraleda.com' @@ -146,9 +147,17 @@ class TestCase(unittest.TestCase): parser.read() self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False) + def testPythonOrg(self): + support.requires('network') + parser = urllib.robotparser.RobotFileParser( + "http://www.python.org/robots.txt") + parser.read() + self.assertTrue(parser.can_fetch("*", + "http://www.python.org/robots.txt")) + def test_main(): + support.run_unittest(NetworkTestCase) support.run_unittest(tests) - TestCase().run() if __name__=='__main__': support.Verbose = 1 diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py index a91df8d815b..c55fb5082f6 100644 --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -60,7 +60,8 @@ class RobotFileParser: elif err.code >= 400: self.allow_all = True else: - self.parse(f.read().splitlines()) + raw = f.read() + self.parse(raw.decode("utf-8").splitlines()) def _add_entry(self, entry): if "*" in entry.useragents: @@ -123,7 +124,10 @@ class RobotFileParser: return True # search for given user agent matches # the first match counts - url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/" + url = urllib.parse.quote( + urllib.parse.urlparse(urllib.parse.unquote(url))[2]) + if not url: + url = "/" for entry in self.entries: if entry.applies_to(useragent): return entry.allowance(url)