Issue #16423: urllib.request now has support for ``data:`` URLs.

Patch by Mathias Panzenböck.
This commit is contained in:
Antoine Pitrou 2012-11-24 17:59:08 +01:00
parent a833e0d8ae
commit df204be922
5 changed files with 137 additions and 5 deletions

View File

@ -121,7 +121,7 @@ The :mod:`urllib.request` module defines the following functions:
instances of them or subclasses of them: :class:`ProxyHandler`,
:class:`UnknownHandler`, :class:`HTTPHandler`, :class:`HTTPDefaultErrorHandler`,
:class:`HTTPRedirectHandler`, :class:`FTPHandler`, :class:`FileHandler`,
:class:`HTTPErrorProcessor`.
:class:`HTTPErrorProcessor`, :class:`DataHandler`.
If the Python installation has SSL support (i.e., if the :mod:`ssl` module
can be imported), :class:`HTTPSHandler` will also be added.
@ -346,6 +346,11 @@ The following classes are provided:
Open local files.
.. class:: DataHandler()
Open data URLs.
.. versionadded:: 3.4
.. class:: FTPHandler()
@ -972,6 +977,21 @@ FileHandler Objects
hostname is given, an :exc:`URLError` is raised.
.. _data-handler-objects:
DataHandler Objects
-------------------
.. method:: DataHandler.data_open(req)
Read a data URL. This kind of URL contains the content encoded in the URL
itself. The data URL syntax is specified in :rfc:`2397`. This implementation
ignores whitespace in base64-encoded data URLs, so the URL may be wrapped
in whatever source file it comes from. However, although some browsers
tolerate missing padding at the end of a base64-encoded data URL, this
implementation will raise a :exc:`ValueError` in that case.
.. _ftp-handler-objects:
FTPHandler Objects
@ -1374,7 +1394,9 @@ some point in the future.
pair: FTP; protocol
* Currently, only the following protocols are supported: HTTP (versions 0.9 and
1.0), FTP, and local files.
1.0), FTP, local files, and data URLs.
.. versionchanged:: 3.4 Added support for data URLs.
* The caching feature of :func:`urlretrieve` has been disabled until someone
finds the time to hack proper processing of Expiration time headers.

View File

@ -337,6 +337,79 @@ Content-Type: text/html; charset=iso-8859-1
with support.check_warnings(('',DeprecationWarning)):
urllib.request.URLopener()
class urlopen_DataTests(unittest.TestCase):
    """Test urlopen() opening a data URL."""

    def _open(self, url):
        """Open *url* with urlopen() and close the response at test teardown."""
        resp = urllib.request.urlopen(url)
        # Responses were previously left open; make sure each one is closed
        # even when the test that uses it fails.
        self.addCleanup(resp.close)
        return resp

    def setUp(self):
        # text containing URL special- and unicode-characters
        self.text = "test data URLs :;,%=& \u00f6 \u00c4 "
        # 2x1 pixel RGB PNG image with one black and one white pixel
        self.image = (
            b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x02\x00\x00\x00'
            b'\x01\x08\x02\x00\x00\x00{@\xe8\xdd\x00\x00\x00\x01sRGB\x00\xae'
            b'\xce\x1c\xe9\x00\x00\x00\x0fIDAT\x08\xd7c```\xf8\xff\xff?\x00'
            b'\x06\x01\x02\xfe\no/\x1e\x00\x00\x00\x00IEND\xaeB`\x82')

        # self.text percent-encoded as UTF-8
        self.text_url = (
            "data:text/plain;charset=UTF-8,test%20data%20URLs%20%3A%3B%2C%25%3"
            "D%26%20%C3%B6%20%C3%84%20")
        # self.text encoded as ISO-8859-1, then base64, with quoted padding
        self.text_url_base64 = (
            "data:text/plain;charset=ISO-8859-1;base64,dGVzdCBkYXRhIFVSTHMgOjs"
            "sJT0mIPYgxCA%3D")
        # base64 encoded data URL that contains ignorable spaces,
        # such as "\n", " ", "%0A", and "%20".
        self.image_url = (
            "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAIAAAABCAIAAAB7\n"
            "QOjdAAAAAXNSR0IArs4c6QAAAA9JREFUCNdj%0AYGBg%2BP//PwAGAQL%2BCm8 "
            "vHgAAAABJRU5ErkJggg%3D%3D%0A%20")

        self.text_url_resp = self._open(self.text_url)
        self.text_url_base64_resp = self._open(self.text_url_base64)
        self.image_url_resp = self._open(self.image_url)

    def test_interface(self):
        # Make sure object returned by urlopen() has the specified methods
        for attr in ("read", "readline", "readlines",
                     "close", "info", "geturl", "getcode", "__iter__"):
            self.assertTrue(hasattr(self.text_url_resp, attr),
                            "object returned by urlopen() lacks %s attribute" %
                            attr)

    def test_info(self):
        self.assertIsInstance(self.text_url_resp.info(), email.message.Message)
        self.assertEqual(self.text_url_base64_resp.info().get_params(),
                         [('text/plain', ''), ('charset', 'ISO-8859-1')])
        self.assertEqual(self.image_url_resp.info()['content-length'],
                         str(len(self.image)))
        # An empty media type falls back to the RFC 2397 default.
        self.assertEqual(self._open("data:,").info().get_params(),
                         [('text/plain', ''), ('charset', 'US-ASCII')])

    def test_geturl(self):
        self.assertEqual(self.text_url_resp.geturl(), self.text_url)
        self.assertEqual(self.text_url_base64_resp.geturl(),
                         self.text_url_base64)
        self.assertEqual(self.image_url_resp.geturl(), self.image_url)

    def test_read_text(self):
        # Decode the payload with the charset announced in the headers.
        charset = dict(self.text_url_resp.info().get_params())['charset']
        self.assertEqual(self.text_url_resp.read().decode(charset), self.text)

    def test_read_text_base64(self):
        charset = dict(
            self.text_url_base64_resp.info().get_params())['charset']
        self.assertEqual(self.text_url_base64_resp.read().decode(charset),
                         self.text)

    def test_read_image(self):
        self.assertEqual(self.image_url_resp.read(), self.image)

    def test_missing_comma(self):
        # The comma separating media type and data is mandatory.
        self.assertRaises(ValueError, urllib.request.urlopen,
                          'data:text/plain')

    def test_invalid_base64_data(self):
        # missing padding character
        self.assertRaises(ValueError, urllib.request.urlopen,
                          'data:;base64,Cg=')
class urlretrieve_FileTests(unittest.TestCase):
"""Test urllib.urlretrieve() on local files"""
@ -1313,6 +1386,7 @@ def test_main():
support.run_unittest(
urlopen_FileTests,
urlopen_HttpTests,
urlopen_DataTests,
urlretrieve_FileTests,
urlretrieve_HttpTests,
ProxyTests,

View File

@ -103,7 +103,8 @@ from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
urlparse, urlsplit, urljoin, unwrap, quote, unquote,
splittype, splithost, splitport, splituser, splitpasswd,
splitattr, splitquery, splitvalue, splittag, to_bytes, urlunparse)
splitattr, splitquery, splitvalue, splittag, to_bytes,
unquote_to_bytes, urlunparse)
from urllib.response import addinfourl, addclosehook
# check for SSL
@ -121,7 +122,7 @@ __all__ = [
'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm',
'AbstractBasicAuthHandler', 'HTTPBasicAuthHandler', 'ProxyBasicAuthHandler',
'AbstractDigestAuthHandler', 'HTTPDigestAuthHandler', 'ProxyDigestAuthHandler',
'HTTPHandler', 'FileHandler', 'FTPHandler', 'CacheFTPHandler',
'HTTPHandler', 'FileHandler', 'FTPHandler', 'CacheFTPHandler', 'DataHandler',
'UnknownHandler', 'HTTPErrorProcessor',
# Functions
'urlopen', 'install_opener', 'build_opener',
@ -535,7 +536,8 @@ def build_opener(*handlers):
opener = OpenerDirector()
default_classes = [ProxyHandler, UnknownHandler, HTTPHandler,
HTTPDefaultErrorHandler, HTTPRedirectHandler,
FTPHandler, FileHandler, HTTPErrorProcessor]
FTPHandler, FileHandler, HTTPErrorProcessor,
DataHandler]
if hasattr(http.client, "HTTPSConnection"):
default_classes.append(HTTPSHandler)
skip = set()
@ -1541,6 +1543,36 @@ class CacheFTPHandler(FTPHandler):
self.cache.clear()
self.timeout.clear()
class DataHandler(BaseHandler):
    """Handler that opens ``data:`` URLs as specified in RFC 2397."""

    def data_open(self, req):
        """Decode the data URL carried by *req* and return a response object.

        Any data POSTed with *req* is ignored.

        Syntax:
            dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            mediatype := [ type "/" subtype ] *( ";" parameter )
            data      := *urlchar
            parameter := attribute "=" value

        The payload is always percent-unquoted; if the media type ends with
        ``;base64`` it is then base64-decoded as well.  The returned
        addinfourl wraps the decoded bytes and carries ``Content-type`` and
        ``Content-length`` headers plus the original URL.

        Raises ValueError if the URL has no "," separating the media type
        from the data, or if the base64 payload cannot be decoded (for
        example because of missing padding).
        """
        url = req.full_url

        # Tuple unpacking (rather than indexing) keeps the ValueError that
        # callers get when the "," separator is missing.
        _scheme, data = url.split(":", 1)
        mediatype, data = data.split(",", 1)

        # even base64 encoded data URLs might be quoted so unquote in any case:
        data = unquote_to_bytes(data)
        if mediatype.endswith(";base64"):
            data = base64.decodebytes(data)
            mediatype = mediatype[:-len(";base64")]

        if not mediatype:
            mediatype = "text/plain;charset=US-ASCII"
        elif mediatype.startswith(";"):
            # RFC 2397 shorthand: "text/plain" may be omitted while parameters
            # such as ";charset=utf-8" are still supplied.  Without this the
            # Content-type header would start with a bare ";".
            mediatype = "text/plain" + mediatype

        headers = email.message_from_string(
            "Content-type: %s\nContent-length: %d\n" % (mediatype, len(data)))

        return addinfourl(io.BytesIO(data), headers, url)
# Code move from the old urllib module

View File

@ -884,6 +884,7 @@ Mike Pall
Todd R. Palmer
Juan David Ibáñez Palomar
Jan Palus
Mathias Panzenböck
M. Papillon
Peter Parente
Alexandre Parenteau

View File

@ -138,6 +138,9 @@ Core and Builtins
Library
-------
- Issue #16423: urllib.request now has support for ``data:`` URLs. Patch by
Mathias Panzenböck.
- Issue #4473: Add a POP3.stls() to switch a clear-text POP3 session into
an encrypted POP3 session, on supported servers. Patch by Lorenzo Catucci.