Issue10050 - urlretrieve uses newer urlopen. The reporthook of urlretrieve takes the block number, the block read size, and the file size.
This commit is contained in:
parent
a2251aadaa
commit
e24f96a059
|
@ -56,6 +56,13 @@ The simplest way to use urllib.request is as follows::
|
|||
response = urllib.request.urlopen('http://python.org/')
|
||||
html = response.read()
|
||||
|
||||
If you wish to retrieve a resource via URL and store it in a temporary location,
|
||||
you can do so via the :func:`urlretrieve` function::
|
||||
|
||||
import urllib.request
|
||||
local_filename, headers = urllib.request.urlretrieve('http://python.org/')
|
||||
html = open(local_filename)
|
||||
|
||||
Many uses of urllib will be that simple (note that instead of an 'http:' URL we
|
||||
could have used a URL starting with 'ftp:', 'file:', etc.). However, it's the
|
||||
purpose of this tutorial to explain the more complicated cases, concentrating on
|
||||
|
|
|
@ -1124,16 +1124,14 @@ The following functions and classes are ported from the Python 2 module
|
|||
``urllib`` (as opposed to ``urllib2``). They might become deprecated at
|
||||
some point in the future.
|
||||
|
||||
|
||||
.. function:: urlretrieve(url, filename=None, reporthook=None, data=None)
|
||||
|
||||
Copy a network object denoted by a URL to a local file, if necessary. If the URL
|
||||
points to a local file, or a valid cached copy of the object exists, the object
|
||||
is not copied. Return a tuple ``(filename, headers)`` where *filename* is the
|
||||
Copy a network object denoted by a URL to a local file. If the URL
|
||||
points to a local file, the object will not be copied unless *filename* is supplied.
|
||||
Return a tuple ``(filename, headers)`` where *filename* is the
|
||||
local file name under which the object can be found, and *headers* is whatever
|
||||
the :meth:`info` method of the object returned by :func:`urlopen` returned (for
|
||||
a remote object, possibly cached). Exceptions are the same as for
|
||||
:func:`urlopen`.
|
||||
a remote object). Exceptions are the same as for :func:`urlopen`.
|
||||
|
||||
The second argument, if present, specifies the file location to copy to (if
|
||||
absent, the location will be a tempfile with a generated name). The third
|
||||
|
@ -1144,11 +1142,18 @@ some point in the future.
|
|||
third argument may be ``-1`` on older FTP servers which do not return a file
|
||||
size in response to a retrieval request.
|
||||
|
||||
The following example illustrates the most common usage scenario::
|
||||
|
||||
>>> import urllib.request
|
||||
>>> local_filename, headers = urllib.request.urlretrieve('http://python.org/')
|
||||
>>> html = open(local_filename)
|
||||
>>> html.close()
|
||||
|
||||
If the *url* uses the :file:`http:` scheme identifier, the optional *data*
|
||||
argument may be given to specify a ``POST`` request (normally the request type
|
||||
is ``GET``). The *data* argument must be in standard
|
||||
:mimetype:`application/x-www-form-urlencoded` format; see the :func:`urlencode`
|
||||
function below.
|
||||
argument may be given to specify a ``POST`` request (normally the request
|
||||
type is ``GET``). The *data* argument must be in standard
|
||||
:mimetype:`application/x-www-form-urlencoded` format; see the
|
||||
:func:`urlencode` function below.
|
||||
|
||||
:func:`urlretrieve` will raise :exc:`ContentTooShortError` when it detects that
|
||||
the amount of data available was less than the expected amount (which is the
|
||||
|
@ -1156,20 +1161,20 @@ some point in the future.
|
|||
the download is interrupted.
|
||||
|
||||
The *Content-Length* is treated as a lower bound: if there's more data to read,
|
||||
:func:`urlretrieve` reads more data, but if less data is available, it raises
|
||||
the exception.
|
||||
urlretrieve reads more data, but if less data is available, it raises the
|
||||
exception.
|
||||
|
||||
You can still retrieve the downloaded data in this case; it is stored in the
|
||||
:attr:`content` attribute of the exception instance.
|
||||
|
||||
If no *Content-Length* header was supplied, :func:`urlretrieve` can not check
|
||||
the size of the data it has downloaded, and just returns it. In this case
|
||||
you just have to assume that the download was successful.
|
||||
If no *Content-Length* header was supplied, urlretrieve cannot check the size
|
||||
of the data it has downloaded, and just returns it. In this case you just have
|
||||
to assume that the download was successful.
|
||||
|
||||
.. function:: urlcleanup()
|
||||
|
||||
Clear the cache that may have been built up by previous calls to
|
||||
:func:`urlretrieve`.
|
||||
Cleans up temporary files that may have been left behind by previous
|
||||
calls to :func:`urlretrieve`.
|
||||
|
||||
.. class:: URLopener(proxies=None, **x509)
|
||||
|
||||
|
|
|
@ -384,11 +384,11 @@ class urlretrieve_FileTests(unittest.TestCase):
|
|||
|
||||
def test_reporthook(self):
|
||||
# Make sure that the reporthook works.
|
||||
def hooktester(count, block_size, total_size, count_holder=[0]):
|
||||
self.assertIsInstance(count, int)
|
||||
self.assertIsInstance(block_size, int)
|
||||
self.assertIsInstance(total_size, int)
|
||||
self.assertEqual(count, count_holder[0])
|
||||
def hooktester(block_count, block_read_size, file_size, count_holder=[0]):
|
||||
self.assertIsInstance(block_count, int)
|
||||
self.assertIsInstance(block_read_size, int)
|
||||
self.assertIsInstance(file_size, int)
|
||||
self.assertEqual(block_count, count_holder[0])
|
||||
count_holder[0] = count_holder[0] + 1
|
||||
second_temp = "%s.2" % support.TESTFN
|
||||
self.registerFileForCleanUp(second_temp)
|
||||
|
@ -399,8 +399,8 @@ class urlretrieve_FileTests(unittest.TestCase):
|
|||
def test_reporthook_0_bytes(self):
|
||||
# Test on zero length file. Should call reporthook only 1 time.
|
||||
report = []
|
||||
def hooktester(count, block_size, total_size, _report=report):
|
||||
_report.append((count, block_size, total_size))
|
||||
def hooktester(block_count, block_read_size, file_size, _report=report):
|
||||
_report.append((block_count, block_read_size, file_size))
|
||||
srcFileName = self.createNewTempFile()
|
||||
urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
|
||||
support.TESTFN, hooktester)
|
||||
|
@ -410,31 +410,31 @@ class urlretrieve_FileTests(unittest.TestCase):
|
|||
def test_reporthook_5_bytes(self):
|
||||
# Test on 5 byte file. Should call reporthook only 2 times (once when
|
||||
# the "network connection" is established and once when the block is
|
||||
# read). Since the block size is 8192 bytes, only one block read is
|
||||
# required to read the entire file.
|
||||
# read).
|
||||
report = []
|
||||
def hooktester(count, block_size, total_size, _report=report):
|
||||
_report.append((count, block_size, total_size))
|
||||
def hooktester(block_count, block_read_size, file_size, _report=report):
|
||||
_report.append((block_count, block_read_size, file_size))
|
||||
srcFileName = self.createNewTempFile(b"x" * 5)
|
||||
urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
|
||||
support.TESTFN, hooktester)
|
||||
self.assertEqual(len(report), 2)
|
||||
self.assertEqual(report[0][1], 8192)
|
||||
self.assertEqual(report[0][2], 5)
|
||||
self.assertEqual(report[0][1], 0)
|
||||
self.assertEqual(report[1][1], 5)
|
||||
|
||||
def test_reporthook_8193_bytes(self):
|
||||
# Test on 8193 byte file. Should call reporthook only 3 times (once
|
||||
# when the "network connection" is established, once for the next 8192
|
||||
# bytes, and once for the last byte).
|
||||
report = []
|
||||
def hooktester(count, block_size, total_size, _report=report):
|
||||
_report.append((count, block_size, total_size))
|
||||
def hooktester(block_count, block_read_size, file_size, _report=report):
|
||||
_report.append((block_count, block_read_size, file_size))
|
||||
srcFileName = self.createNewTempFile(b"x" * 8193)
|
||||
urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
|
||||
support.TESTFN, hooktester)
|
||||
self.assertEqual(len(report), 3)
|
||||
self.assertEqual(report[0][1], 8192)
|
||||
self.assertEqual(report[0][2], 8193)
|
||||
self.assertEqual(report[0][1], 0)
|
||||
self.assertEqual(report[1][1], 8192)
|
||||
self.assertEqual(report[2][1], 1)
|
||||
|
||||
|
||||
class urlretrieve_HttpTests(unittest.TestCase, FakeHTTPMixin):
|
||||
|
|
|
@ -94,6 +94,9 @@ import socket
|
|||
import sys
|
||||
import time
|
||||
import collections
|
||||
import tempfile
|
||||
import contextlib
|
||||
|
||||
|
||||
from urllib.error import URLError, HTTPError, ContentTooShortError
|
||||
from urllib.parse import (
|
||||
|
@ -156,17 +159,78 @@ def install_opener(opener):
|
|||
global _opener
|
||||
_opener = opener
|
||||
|
||||
# TODO(jhylton): Make this work with the same global opener.
|
||||
_urlopener = None
|
||||
_url_tempfiles = []
|
||||
def urlretrieve(url, filename=None, reporthook=None, data=None):
|
||||
global _urlopener
|
||||
if not _urlopener:
|
||||
_urlopener = FancyURLopener()
|
||||
return _urlopener.retrieve(url, filename, reporthook, data)
|
||||
"""
|
||||
Retrieve a URL into a temporary location on disk.
|
||||
|
||||
Requires a URL argument. If a filename is passed, it is used as
|
||||
the temporary file location. The reporthook argument should be
|
||||
a callable that accepts a block number, a read size, and the
|
||||
total file size of the URL target. The data argument should be
|
||||
valid URL encoded data.
|
||||
|
||||
If a filename is passed and the URL points to a local resource,
|
||||
the result is a copy from local file to new file.
|
||||
|
||||
Returns a tuple containing the path to the newly created
|
||||
data file as well as the resulting HTTPMessage object.
|
||||
"""
|
||||
url_type, path = splittype(url)
|
||||
|
||||
with contextlib.closing(urlopen(url, data)) as fp:
|
||||
headers = fp.info()
|
||||
|
||||
# Just return the local path and the "headers" for file://
|
||||
# URLs. No sense in performing a copy unless requested.
|
||||
if url_type == "file" and not filename:
|
||||
return os.path.normpath(path), headers
|
||||
|
||||
# Handle temporary file setup.
|
||||
if filename:
|
||||
tfp = open(filename, 'wb')
|
||||
else:
|
||||
tfp = tempfile.NamedTemporaryFile(delete=False)
|
||||
filename = tfp.name
|
||||
_url_tempfiles.append(filename)
|
||||
|
||||
with tfp:
|
||||
result = filename, headers
|
||||
bs = 1024*8
|
||||
size = -1
|
||||
read = 0
|
||||
blocknum = 0
|
||||
if "content-length" in headers:
|
||||
size = int(headers["Content-Length"])
|
||||
|
||||
if reporthook:
|
||||
reporthook(blocknum, 0, size)
|
||||
|
||||
while True:
|
||||
block = fp.read(bs)
|
||||
if not block:
|
||||
break
|
||||
read += len(block)
|
||||
tfp.write(block)
|
||||
blocknum += 1
|
||||
if reporthook:
|
||||
reporthook(blocknum, len(block), size)
|
||||
|
||||
if size >= 0 and read < size:
|
||||
raise ContentTooShortError(
|
||||
"retrieval incomplete: got only %i out of %i bytes"
|
||||
% (read, size), result)
|
||||
|
||||
return result
|
||||
|
||||
def urlcleanup():
|
||||
if _urlopener:
|
||||
_urlopener.cleanup()
|
||||
for temp_file in _url_tempfiles:
|
||||
try:
|
||||
os.unlink(temp_file)
|
||||
except EnvironmentError:
|
||||
pass
|
||||
|
||||
del _url_tempfiles[:]
|
||||
global _opener
|
||||
if _opener:
|
||||
_opener = None
|
||||
|
|
Loading…
Reference in New Issue