Issue #17214: Percent-encode non-ASCII bytes in redirect targets

Some servers send Location header fields with non-ASCII bytes, but
"http.client" requires the request target to be ASCII-encodable; otherwise a
UnicodeEncodeError is raised. Based on patch by Christian Heimes.

Python 2 does not suffer any problem because it allows non-ASCII bytes in the
HTTP request target.
This commit is contained in:
Martin Panter 2016-05-16 01:14:20 +00:00
parent ce6e06874b
commit e6f060903c
3 changed files with 52 additions and 1 deletions

View File

@ -1224,6 +1224,41 @@ class HandlerTests(unittest.TestCase):
fp = urllib.request.urlopen("http://python.org/path") fp = urllib.request.urlopen("http://python.org/path")
self.assertEqual(fp.geturl(), "http://python.org/path?query") self.assertEqual(fp.geturl(), "http://python.org/path?query")
def test_redirect_encoding(self):
    """Check the request line sent when following a redirect.

    Each case feeds a canned 302 response whose Location value is the
    first element of the pair, then verifies that the follow-up request
    starts with ``GET <second element> ``.  The cases cover non-ASCII
    bytes in the path and in the query, an already percent-encoded
    space, and a literal space.
    """
    # Some characters in the redirect target may need special handling,
    # but most ASCII characters should be treated as already encoded.
    class Handler(urllib.request.HTTPHandler):
        def http_open(self, req):
            # Answer with whatever fake connection is currently installed
            # and keep its buffer; the test later inspects that buffer as
            # the outgoing request.
            reply = self.do_open(self.connection, req)
            self.last_buf = self.connection.buf
            # Install a plain 200 response so the next request (the one
            # that follows the redirect) completes normally.
            ok_response = (
                b'HTTP/1.1 200 OK\r\n'
                b'Content-Length: 3\r\n'
                b'\r\n'
                b'123'
            )
            self.connection = test_urllib.fakehttp(ok_response)
            return reply

    handler = Handler()
    opener = urllib.request.build_opener(handler)

    # (raw Location value, expected request target)
    tests = (
        (b'/p\xC3\xA5-dansk/', b'/p%C3%A5-dansk/'),
        (b'/spaced%20path/', b'/spaced%20path/'),
        (b'/spaced path/', b'/spaced%20path/'),
        (b'/?p\xC3\xA5-dansk', b'/?p%C3%A5-dansk'),
    )
    for location, result in tests:
        with self.subTest(repr(location)):
            redirect_response = b''.join([
                b'HTTP/1.1 302 Redirect\r\n',
                b'Location: ', location, b'\r\n',
                b'\r\n',
            ])
            handler.connection = test_urllib.fakehttp(redirect_response)
            response = opener.open('http://example.com/')
            # last_buf was captured by the handler on the most recent
            # http_open() call, i.e. the request made after the redirect.
            request = handler.last_buf
            expected = b'GET ' + result + b' '
            self.assertTrue(request.startswith(expected), repr(request))
def test_proxy(self): def test_proxy(self):
o = OpenerDirector() o = OpenerDirector()
ph = urllib.request.ProxyHandler(dict(http="proxy.example.com:3128")) ph = urllib.request.ProxyHandler(dict(http="proxy.example.com:3128"))

View File

@ -91,6 +91,7 @@ import os
import posixpath import posixpath
import re import re
import socket import socket
import string
import sys import sys
import time import time
import collections import collections
@ -616,8 +617,12 @@ class HTTPRedirectHandler(BaseHandler):
# from the user (of urllib.request, in this case). In practice, # from the user (of urllib.request, in this case). In practice,
# essentially all clients do redirect in this case, so we do # essentially all clients do redirect in this case, so we do
# the same. # the same.
# be conciliant with URIs containing a space
# Be conciliant with URIs containing a space. This is mainly
# redundant with the more complete encoding done in http_error_302(),
# but it is kept for compatibility with other callers.
newurl = newurl.replace(' ', '%20') newurl = newurl.replace(' ', '%20')
CONTENT_HEADERS = ("content-length", "content-type") CONTENT_HEADERS = ("content-length", "content-type")
newheaders = dict((k, v) for k, v in req.headers.items() newheaders = dict((k, v) for k, v in req.headers.items()
if k.lower() not in CONTENT_HEADERS) if k.lower() not in CONTENT_HEADERS)
@ -657,6 +662,11 @@ class HTTPRedirectHandler(BaseHandler):
urlparts[2] = "/" urlparts[2] = "/"
newurl = urlunparse(urlparts) newurl = urlunparse(urlparts)
# http.client.parse_headers() decodes as ISO-8859-1. Recover the
# original bytes and percent-encode non-ASCII bytes, and any special
# characters such as the space.
newurl = quote(
newurl, encoding="iso-8859-1", safe=string.punctuation)
newurl = urljoin(req.full_url, newurl) newurl = urljoin(req.full_url, newurl)
# XXX Probably want to forget about the state of the current # XXX Probably want to forget about the state of the current

View File

@ -121,6 +121,12 @@ Library
- Issue #14132: Fix urllib.request redirect handling when the target only has - Issue #14132: Fix urllib.request redirect handling when the target only has
a query string. Original fix by Ján Janech. a query string. Original fix by Ján Janech.
- Issue #17214: The "urllib.request" module now percent-encodes non-ASCII
bytes found in redirect target URLs. Some servers send Location header
fields with non-ASCII bytes, but "http.client" requires the request target
to be ASCII-encodable, otherwise a UnicodeEncodeError is raised. Based on
patch by Christian Heimes.
- Issue #26892: Honor debuglevel flag in urllib.request.HTTPHandler. Patch - Issue #26892: Honor debuglevel flag in urllib.request.HTTPHandler. Patch
contributed by Chi Hsuan Yen. contributed by Chi Hsuan Yen.