From c4e671eec20dfcb29b18596a89ef075f826c9f96 Mon Sep 17 00:00:00 2001 From: "Gregory P. Smith" Date: Tue, 30 Apr 2019 19:12:21 -0700 Subject: [PATCH] bpo-30458: Disallow control chars in http URLs. (GH-12755) Disallow control chars in http URLs in urllib.urlopen. This addresses a potential security problem for applications that do not sanity check their URLs where http request headers could be injected. --- Lib/http/client.py | 14 ++++++ Lib/test/test_urllib.py | 49 +++++++++++++++++++ Lib/test/test_xmlrpc.py | 9 +++- .../2019-04-10-08-53-30.bpo-36276.51E-DA.rst | 1 + 4 files changed, 71 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst diff --git a/Lib/http/client.py b/Lib/http/client.py index 5a2225276b1..99d6a68cf42 100644 --- a/Lib/http/client.py +++ b/Lib/http/client.py @@ -137,6 +137,16 @@ _MAXHEADERS = 100 _is_legal_header_name = re.compile(rb'[^:\s][^:\r\n]*').fullmatch _is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search +# These characters are not allowed within HTTP URL paths. +# See https://tools.ietf.org/html/rfc3986#section-3.3 and the +# https://tools.ietf.org/html/rfc3986#appendix-A pchar definition. +# Prevents CVE-2019-9740. Includes control characters such as \r\n. +# We don't restrict chars above \x7f as putrequest() limits us to ASCII. +_contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f]') +# Arguably only these _should_ allowed: +# _is_allowed_url_pchars_re = re.compile(r"^[/!$&'()*+,;=:@%a-zA-Z0-9._~-]+$") +# We are more lenient for assumed real world compatibility purposes. + # We always set the Content-Length header for these methods because some # servers will otherwise respond with a 411 _METHODS_EXPECTING_BODY = {'PATCH', 'POST', 'PUT'} @@ -1079,6 +1089,10 @@ class HTTPConnection: self._method = method if not url: url = '/' + # Prevent CVE-2019-9740. + if match := _contains_disallowed_url_pchar_re.search(url): + raise ValueError(f"URL can't contain control characters. {url!r} " + f"(found at least {match.group()!r})") request = '%s %s %s' % (method, url, self._http_vsn_str) # Non-ASCII characters should have been eliminated earlier diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 2ac73b58d83..e87c85b9287 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -329,6 +329,55 @@ class urlopen_HttpTests(unittest.TestCase, FakeHTTPMixin, FakeFTPMixin): finally: self.unfakehttp() + def test_url_with_control_char_rejected(self): + for char_no in list(range(0, 0x21)) + [0x7f]: + char = chr(char_no) + schemeless_url = f"//localhost:7777/test{char}/" + self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.") + try: + # We explicitly test urllib.request.urlopen() instead of the top + # level 'def urlopen()' function defined in this... (quite ugly) + # test suite. They use different url opening codepaths. Plain + # urlopen uses FancyURLOpener which goes via a codepath that + # calls urllib.parse.quote() on the URL which makes all of the + # above attempts at injection within the url _path_ safe. + escaped_char_repr = repr(char).replace('\\', r'\\') + with self.assertRaisesRegex( + ValueError, f"contain control.*{escaped_char_repr}"): + urllib.request.urlopen(f"http:{schemeless_url}") + with self.assertRaisesRegex( + ValueError, f"contain control.*{escaped_char_repr}"): + urllib.request.urlopen(f"https:{schemeless_url}") + # This code path quotes the URL so there is no injection. + resp = urlopen(f"http:{schemeless_url}") + self.assertNotIn(char, resp.geturl()) + finally: + self.unfakehttp() + + def test_url_with_newline_header_injection_rejected(self): + self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.") + host = "localhost:7777?a=1 HTTP/1.1\r\nX-injected: header\r\nTEST: 123" + schemeless_url = "//" + host + ":8080/test/?test=a" + try: + # We explicitly test urllib.request.urlopen() instead of the top + # level 'def urlopen()' function defined in this... (quite ugly) + # test suite. They use different url opening codepaths. Plain + # urlopen uses FancyURLOpener which goes via a codepath that + # calls urllib.parse.quote() on the URL which makes all of the + # above attempts at injection within the url _path_ safe. + with self.assertRaisesRegex( + ValueError, r"contain control.*\\r.*(found at least . .)"): + urllib.request.urlopen(f"http:{schemeless_url}") + with self.assertRaisesRegex(ValueError, r"contain control.*\\n"): + urllib.request.urlopen(f"https:{schemeless_url}") + # This code path quotes the URL so there is no injection. + resp = urlopen(f"http:{schemeless_url}") + self.assertNotIn(' ', resp.geturl()) + self.assertNotIn('\r', resp.geturl()) + self.assertNotIn('\n', resp.geturl()) + finally: + self.unfakehttp() + def test_read_0_9(self): # "0.9" response accepted (but not "simple responses" without # a status line) diff --git a/Lib/test/test_xmlrpc.py b/Lib/test/test_xmlrpc.py index 9c8b6958c62..52bacc1eafa 100644 --- a/Lib/test/test_xmlrpc.py +++ b/Lib/test/test_xmlrpc.py @@ -943,8 +943,13 @@ class SimpleServerTestCase(BaseServerTestCase): def test_partial_post(self): # Check that a partial POST doesn't make the server loop: issue #14001. - with contextlib.closing(http.client.HTTPConnection(ADDR, PORT)) as conn: - conn.request('POST', '/RPC2 HTTP/1.0\r\nContent-Length: 100\r\n\r\nbye') + with contextlib.closing(socket.create_connection((ADDR, PORT))) as conn: + conn.send('POST /RPC2 HTTP/1.0\r\n' + 'Content-Length: 100\r\n\r\n' + 'bye HTTP/1.1\r\n' + f'Host: {ADDR}:{PORT}\r\n' + 'Accept-Encoding: identity\r\n' + 'Content-Length: 0\r\n\r\n'.encode('ascii')) def test_context_manager(self): with xmlrpclib.ServerProxy(URL) as server: diff --git a/Misc/NEWS.d/next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst b/Misc/NEWS.d/next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst new file mode 100644 index 00000000000..4fed4d54504 --- /dev/null +++ b/Misc/NEWS.d/next/Security/2019-04-10-08-53-30.bpo-36276.51E-DA.rst @@ -0,0 +1 @@ +Address CVE-2019-9740 by disallowing URL paths with embedded whitespace or control characters through into the underlying http client request. Such potentially malicious header injection URLs now cause a ValueError to be raised. \ No newline at end of file