Issue #10050 - urlretrieve uses the newer urlopen. The reporthook of urlretrieve takes the block number, block read size, and file size.

This commit is contained in:
Senthil Kumaran 2012-03-13 19:29:33 -07:00
parent a2251aadaa
commit e24f96a059
5 changed files with 119 additions and 42 deletions

View File

@ -56,6 +56,13 @@ The simplest way to use urllib.request is as follows::
response = urllib.request.urlopen('http://python.org/')
html = response.read()
If you wish to retrieve a resource via URL and store it in a temporary location,
you can do so via the :func:`urlretrieve` function::
import urllib.request
local_filename, headers = urllib.request.urlretrieve('http://python.org/')
html = open(local_filename)
Many uses of urllib will be that simple (note that instead of an 'http:' URL we
could have used a URL starting with 'ftp:', 'file:', etc.). However, it's the
purpose of this tutorial to explain the more complicated cases, concentrating on

View File

@ -1124,16 +1124,14 @@ The following functions and classes are ported from the Python 2 module
``urllib`` (as opposed to ``urllib2``). They might become deprecated at
some point in the future.
.. function:: urlretrieve(url, filename=None, reporthook=None, data=None)
Copy a network object denoted by a URL to a local file, if necessary. If the URL
points to a local file, or a valid cached copy of the object exists, the object
is not copied. Return a tuple ``(filename, headers)`` where *filename* is the
Copy a network object denoted by a URL to a local file. If the URL
points to a local file, the object will not be copied unless *filename* is supplied.
Return a tuple ``(filename, headers)`` where *filename* is the
local file name under which the object can be found, and *headers* is whatever
the :meth:`info` method of the object returned by :func:`urlopen` returned (for
a remote object, possibly cached). Exceptions are the same as for
:func:`urlopen`.
a remote object). Exceptions are the same as for :func:`urlopen`.
The second argument, if present, specifies the file location to copy to (if
absent, the location will be a tempfile with a generated name). The third
@ -1144,11 +1142,18 @@ some point in the future.
third argument may be ``-1`` on older FTP servers which do not return a file
size in response to a retrieval request.
The following example illustrates the most common usage scenario::
>>> import urllib.request
>>> local_filename, headers = urllib.request.urlretrieve('http://python.org/')
>>> html = open(local_filename)
>>> html.close()
If the *url* uses the :file:`http:` scheme identifier, the optional *data*
argument may be given to specify a ``POST`` request (normally the request type
is ``GET``). The *data* argument must be in standard
:mimetype:`application/x-www-form-urlencoded` format; see the :func:`urlencode`
function below.
argument may be given to specify a ``POST`` request (normally the request
type is ``GET``). The *data* argument must be in standard
:mimetype:`application/x-www-form-urlencoded` format; see the
:func:`urlencode` function below.
:func:`urlretrieve` will raise :exc:`ContentTooShortError` when it detects that
the amount of data available was less than the expected amount (which is the
@ -1156,20 +1161,20 @@ some point in the future.
the download is interrupted.
The *Content-Length* is treated as a lower bound: if there's more data to read,
:func:`urlretrieve` reads more data, but if less data is available, it raises
the exception.
urlretrieve reads more data, but if less data is available, it raises the
exception.
You can still retrieve the downloaded data in this case; it is stored in the
:attr:`content` attribute of the exception instance.
If no *Content-Length* header was supplied, :func:`urlretrieve` cannot check
the size of the data it has downloaded, and just returns it. In this case
you just have to assume that the download was successful.
If no *Content-Length* header was supplied, urlretrieve cannot check the size
of the data it has downloaded, and just returns it. In this case you just have
to assume that the download was successful.
.. function:: urlcleanup()
Clear the cache that may have been built up by previous calls to
:func:`urlretrieve`.
Cleans up temporary files that may have been left behind by previous
calls to :func:`urlretrieve`.
.. class:: URLopener(proxies=None, **x509)

View File

@ -384,11 +384,11 @@ class urlretrieve_FileTests(unittest.TestCase):
def test_reporthook(self):
# Make sure that the reporthook works.
def hooktester(count, block_size, total_size, count_holder=[0]):
self.assertIsInstance(count, int)
self.assertIsInstance(block_size, int)
self.assertIsInstance(total_size, int)
self.assertEqual(count, count_holder[0])
def hooktester(block_count, block_read_size, file_size, count_holder=[0]):
self.assertIsInstance(block_count, int)
self.assertIsInstance(block_read_size, int)
self.assertIsInstance(file_size, int)
self.assertEqual(block_count, count_holder[0])
count_holder[0] = count_holder[0] + 1
second_temp = "%s.2" % support.TESTFN
self.registerFileForCleanUp(second_temp)
@ -399,8 +399,8 @@ class urlretrieve_FileTests(unittest.TestCase):
def test_reporthook_0_bytes(self):
# Test on zero length file. Should call reporthook only 1 time.
report = []
def hooktester(count, block_size, total_size, _report=report):
_report.append((count, block_size, total_size))
def hooktester(block_count, block_read_size, file_size, _report=report):
_report.append((block_count, block_read_size, file_size))
srcFileName = self.createNewTempFile()
urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
support.TESTFN, hooktester)
@ -410,31 +410,31 @@ class urlretrieve_FileTests(unittest.TestCase):
def test_reporthook_5_bytes(self):
# Test on 5 byte file. Should call reporthook only 2 times (once when
# the "network connection" is established and once when the block is
# read). Since the block size is 8192 bytes, only one block read is
# required to read the entire file.
# read).
report = []
def hooktester(count, block_size, total_size, _report=report):
_report.append((count, block_size, total_size))
def hooktester(block_count, block_read_size, file_size, _report=report):
_report.append((block_count, block_read_size, file_size))
srcFileName = self.createNewTempFile(b"x" * 5)
urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
support.TESTFN, hooktester)
self.assertEqual(len(report), 2)
self.assertEqual(report[0][1], 8192)
self.assertEqual(report[0][2], 5)
self.assertEqual(report[0][1], 0)
self.assertEqual(report[1][1], 5)
def test_reporthook_8193_bytes(self):
# Test on 8193 byte file. Should call reporthook only 3 times (once
# when the "network connection" is established, once for the next 8192
# bytes, and once for the last byte).
report = []
def hooktester(count, block_size, total_size, _report=report):
_report.append((count, block_size, total_size))
def hooktester(block_count, block_read_size, file_size, _report=report):
_report.append((block_count, block_read_size, file_size))
srcFileName = self.createNewTempFile(b"x" * 8193)
urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
support.TESTFN, hooktester)
self.assertEqual(len(report), 3)
self.assertEqual(report[0][1], 8192)
self.assertEqual(report[0][2], 8193)
self.assertEqual(report[0][1], 0)
self.assertEqual(report[1][1], 8192)
self.assertEqual(report[2][1], 1)
class urlretrieve_HttpTests(unittest.TestCase, FakeHTTPMixin):

View File

@ -94,6 +94,9 @@ import socket
import sys
import time
import collections
import tempfile
import contextlib
from urllib.error import URLError, HTTPError, ContentTooShortError
from urllib.parse import (
@ -156,17 +159,78 @@ def install_opener(opener):
global _opener
_opener = opener
# TODO(jhylton): Make this work with the same global opener.
_urlopener = None
_url_tempfiles = []
def urlretrieve(url, filename=None, reporthook=None, data=None):
global _urlopener
if not _urlopener:
_urlopener = FancyURLopener()
return _urlopener.retrieve(url, filename, reporthook, data)
"""
Retrieve a URL into a temporary location on disk.
Requires a URL argument. If a filename is passed, it is used as
the temporary file location. The reporthook argument should be
a callable that accepts a block number, a read size, and the
total file size of the URL target. The data argument should be
valid URL encoded data.
If a filename is passed and the URL points to a local resource,
the result is a copy from local file to new file.
Returns a tuple containing the path to the newly created
data file as well as the resulting HTTPMessage object.
"""
url_type, path = splittype(url)
with contextlib.closing(urlopen(url, data)) as fp:
headers = fp.info()
# Just return the local path and the "headers" for file://
# URLs. No sense in performing a copy unless requested.
if url_type == "file" and not filename:
return os.path.normpath(path), headers
# Handle temporary file setup.
if filename:
tfp = open(filename, 'wb')
else:
tfp = tempfile.NamedTemporaryFile(delete=False)
filename = tfp.name
_url_tempfiles.append(filename)
with tfp:
result = filename, headers
bs = 1024*8
size = -1
read = 0
blocknum = 0
if "content-length" in headers:
size = int(headers["Content-Length"])
if reporthook:
reporthook(blocknum, 0, size)
while True:
block = fp.read(bs)
if not block:
break
read += len(block)
tfp.write(block)
blocknum += 1
if reporthook:
reporthook(blocknum, len(block), size)
if size >= 0 and read < size:
raise ContentTooShortError(
"retrieval incomplete: got only %i out of %i bytes"
% (read, size), result)
return result
def urlcleanup():
if _urlopener:
_urlopener.cleanup()
for temp_file in _url_tempfiles:
try:
os.unlink(temp_file)
except EnvironmentError:
pass
del _url_tempfiles[:]
global _opener
if _opener:
_opener = None

View File

@ -288,6 +288,7 @@ Julien Élie
Lance Ellinghaus
David Ely
Jeff Epler
Jeff McNeil
Tom Epperly
Stoffel Erasmus
Jürgen A. Erhard