diff -r cfcff2683c71 Doc/howto/urllib2.rst
--- a/Doc/howto/urllib2.rst	Fri Mar 18 15:09:10 2011 -0700
+++ b/Doc/howto/urllib2.rst	Sat Mar 19 23:02:15 2011 -0400
@@ -56,6 +56,13 @@
     response = urllib.request.urlopen('http://python.org/')
     html = response.read()
 
+If you wish to retrieve a resource via URL and store it in a temporary location,
+you can do so via the :func:`urlretrieve` function::
+
+    import urllib.request
+    local_filename, headers = urllib.request.urlretrieve('http://python.org/')
+    html = open(local_filename)
+
 Many uses of urllib will be that simple (note that instead of an 'http:' URL we
 could have used an URL starting with 'ftp:', 'file:', etc.).  However, it's the
 purpose of this tutorial to explain the more complicated cases, concentrating on
diff -r cfcff2683c71 Doc/library/urllib.request.rst
--- a/Doc/library/urllib.request.rst	Fri Mar 18 15:09:10 2011 -0700
+++ b/Doc/library/urllib.request.rst	Sat Mar 19 23:02:15 2011 -0400
@@ -80,6 +80,58 @@
    .. versionadded:: 3.2
       *data* can be an iterable object.
 
+.. function:: urlretrieve(url, filename=None, reporthook=None, data=None)
+
+   Copy a network object denoted by a URL to a local file. If the URL
+   points to a local file, the object will not be copied unless filename is supplied.
+   Return a tuple ``(filename, headers)`` where *filename* is the
+   local file name under which the object can be found, and *headers* is whatever
+   the :meth:`info` method of the object returned by :func:`urlopen` returned (for
+   a remote object). Exceptions are the same as for :func:`urlopen`.
+
+   The second argument, if present, specifies the file location to copy to (if
+   absent, the location will be a tempfile with a generated name). The third
+   argument, if present, is a hook function that will be called once on
+   establishment of the network connection and once after each block read
+   thereafter.  The hook will be passed three arguments; a count of blocks
+   transferred so far, a block size in bytes, and the total size of the file.  The
+   third argument may be ``-1`` on older FTP servers which do not return a file
+   size in response to a retrieval request.
+
+   The following example illustrates the most common usage scenario::
+
+      >>> import urllib.request
+      >>> local_filename, headers = urllib.request.urlretrieve('http://python.org/')
+      >>> html = open(local_filename)
+      >>> html.close()
+
+   If the *url* uses the :file:`http:` scheme identifier, the optional *data*
+   argument may be given to specify a ``POST`` request (normally the request type
+   is ``GET``).  The *data* argument must be in standard
+   :mimetype:`application/x-www-form-urlencoded` format; see the :func:`urlencode`
+   function below.
+
+   :func:`urlretrieve` will raise :exc:`ContentTooShortError` when it detects that
+   the amount of data available was less than the expected amount (which is the
+   size reported by a *Content-Length* header). This can occur, for example, when
+   the download is interrupted.
+
+   The *Content-Length* is treated as a lower bound: if there's more data to read,
+   urlretrieve reads more data, but if less data is available, it raises the
+   exception.
+
+   You can still retrieve the downloaded data in this case, it is stored in the
+   :attr:`content` attribute of the exception instance.
+
+   If no *Content-Length* header was supplied, urlretrieve can not check the size
+   of the data it has downloaded, and just returns it.  In this case you just have
+   to assume that the download was successful.
+
+.. function:: urlcleanup()
+
+   Cleans up temporary files that may have been left behind by previous
+   calls to :func:`urlretrieve`.
+
 .. function:: install_opener(opener)
 
    Install an :class:`OpenerDirector` instance as the default global opener.
@@ -1078,51 +1130,6 @@
    some point in the future.
 
 
-.. function:: urlretrieve(url, filename=None, reporthook=None, data=None)
-
-   Copy a network object denoted by a URL to a local file, if necessary. If the URL
-   points to a local file, or a valid cached copy of the object exists, the object
-   is not copied. Return a tuple ``(filename, headers)`` where *filename* is the
-   local file name under which the object can be found, and *headers* is whatever
-   the :meth:`info` method of the object returned by :func:`urlopen` returned (for
-   a remote object, possibly cached). Exceptions are the same as for
-   :func:`urlopen`.
-
-   The second argument, if present, specifies the file location to copy to (if
-   absent, the location will be a tempfile with a generated name). The third
-   argument, if present, is a hook function that will be called once on
-   establishment of the network connection and once after each block read
-   thereafter.  The hook will be passed three arguments; a count of blocks
-   transferred so far, a block size in bytes, and the total size of the file.  The
-   third argument may be ``-1`` on older FTP servers which do not return a file
-   size in response to a retrieval request.
-
-   If the *url* uses the :file:`http:` scheme identifier, the optional *data*
-   argument may be given to specify a ``POST`` request (normally the request type
-   is ``GET``).  The *data* argument must in standard
-   :mimetype:`application/x-www-form-urlencoded` format; see the :func:`urlencode`
-   function below.
-
-   :func:`urlretrieve` will raise :exc:`ContentTooShortError` when it detects that
-   the amount of data available was less than the expected amount (which is the
-   size reported by a *Content-Length* header). This can occur, for example, when
-   the download is interrupted.
-
-   The *Content-Length* is treated as a lower bound: if there's more data to read,
-   urlretrieve reads more data, but if less data is available, it raises the
-   exception.
-
-   You can still retrieve the downloaded data in this case, it is stored in the
-   :attr:`content` attribute of the exception instance.
-
-   If no *Content-Length* header was supplied, urlretrieve can not check the size
-   of the data it has downloaded, and just returns it.  In this case you just have
-   to assume that the download was successful.
-
-.. function:: urlcleanup()
-
-   Clear the cache that may have been built up by previous calls to
-   :func:`urlretrieve`.
 
 
 .. class:: URLopener(proxies=None, **x509)
diff -r cfcff2683c71 Lib/test/test_urllib.py
--- a/Lib/test/test_urllib.py	Fri Mar 18 15:09:10 2011 -0700
+++ b/Lib/test/test_urllib.py	Sat Mar 19 23:02:15 2011 -0400
@@ -339,7 +339,7 @@
         urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
             support.TESTFN, hooktester)
         self.assertEqual(len(report), 2)
-        self.assertEqual(report[0][1], 8192)
+        self.assertEqual(report[0][1], 0)
         self.assertEqual(report[0][2], 5)
 
     def test_reporthook_8193_bytes(self):
@@ -353,7 +353,7 @@
         urllib.request.urlretrieve(self.constructLocalFileUrl(srcFileName),
             support.TESTFN, hooktester)
         self.assertEqual(len(report), 3)
-        self.assertEqual(report[0][1], 8192)
+        self.assertEqual(report[0][1], 0)
         self.assertEqual(report[0][2], 8193)
 
 class QuotingTests(unittest.TestCase):
diff -r cfcff2683c71 Lib/urllib/request.py
--- a/Lib/urllib/request.py	Fri Mar 18 15:09:10 2011 -0700
+++ b/Lib/urllib/request.py	Sat Mar 19 23:02:15 2011 -0400
@@ -93,8 +93,10 @@
 import re
 import socket
 import sys
+import tempfile
 import time
 import collections
+import contextlib
 
 from urllib.error import URLError, HTTPError, ContentTooShortError
 from urllib.parse import (
@@ -141,17 +143,78 @@
     global _opener
     _opener = opener
 
-# TODO(jhylton): Make this work with the same global opener.
-_urlopener = None
+_url_tempfiles = []
 def urlretrieve(url, filename=None, reporthook=None, data=None):
-    global _urlopener
-    if not _urlopener:
-        _urlopener = FancyURLopener()
-    return _urlopener.retrieve(url, filename, reporthook, data)
+    """
+    Retrieve a URL into a temporary location on disk.
+
+    Requires a URL argument. If a filename is passed, it is used as
+    the temporary file location. The reporthook argument should be
+    a callable that accepts a block number, a read size, and the
+    total file size of the URL target. The data argument should be
+    valid URL encoded data.
+
+    If a filename is passed and the URL points to a local resource,
+    the result is a copy from local file to new file.
+
+    Returns a tuple containing the path to the newly created
+    data file as well as the resulting HTTPMessage object.
+    """
+    url_type, path = splittype(url)
+
+    with contextlib.closing(urlopen(url, data)) as fp:
+        headers = fp.info()
+
+        # Just return the local path and the "headers" for file://
+        # URLs. No sense in performing a copy unless requested.
+        if url_type == "file" and not filename:
+            return os.path.normpath(path), headers
+
+        # Handle temporary file setup.
+        if filename:
+            tfp = open(filename, 'wb')
+        else:
+            tfp = tempfile.NamedTemporaryFile(delete=False)
+            filename = tfp.name
+            _url_tempfiles.append(filename)
+
+        with tfp:
+            result = filename, headers
+            bs = 1024*8
+            size = -1
+            read = 0
+            blocknum = 0
+            if "content-length" in headers:
+                size = int(headers["Content-Length"])
+
+            if reporthook:
+                reporthook(blocknum, 0, size)
+
+            while True:
+                block = fp.read(bs)
+                if not block:
+                    break
+                read += len(block)
+                tfp.write(block)
+                blocknum += 1
+                if reporthook:
+                    reporthook(blocknum, len(block), size)
+
+    if size >= 0 and read < size:
+        raise ContentTooShortError(
+            "retrieval incomplete: got only %i out of %i bytes"
+            % (read, size), result)
+
+    return result
 
 def urlcleanup():
-    if _urlopener:
-        _urlopener.cleanup()
+    for temp_file in _url_tempfiles:
+        try:
+            os.unlink(temp_file)
+        except EnvironmentError:
+            pass
+
+    del _url_tempfiles[:]
     global _opener
     if _opener:
         _opener = None
diff -r cfcff2683c71 Misc/ACKS
--- a/Misc/ACKS	Fri Mar 18 15:09:10 2011 -0700
+++ b/Misc/ACKS	Sat Mar 19 23:02:15 2011 -0400
@@ -576,6 +576,7 @@
 Gordon McMillan
 Caolan McNamara
 Andrew McNamara
+Jeff McNeil
 Craig McPheeters
 Lambert Meertens
 Bill van Melle
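
For reviewers, a minimal usage sketch of the interface this patch introduces. It is
not part of the patch; it assumes the patched ``urllib.request`` is importable, uses
``http://python.org/`` purely as an example URL, and the hook's print format is
illustrative::

    import urllib.request

    def reporthook(blocknum, blocksize, totalsize):
        # With the patched urlretrieve() the hook is called once with a read
        # size of 0 when the connection is established, then once per block
        # with the number of bytes just read; totalsize is -1 when no
        # Content-Length header was supplied.
        print("block %d: %d bytes read, total size %s"
              % (blocknum, blocksize,
                 totalsize if totalsize >= 0 else "unknown"))

    # Without a filename argument, the data lands in a NamedTemporaryFile
    # created with delete=False and tracked in _url_tempfiles.
    filename, headers = urllib.request.urlretrieve('http://python.org/',
                                                    reporthook=reporthook)
    print(filename, headers.get('Content-Type'))

    urllib.request.urlcleanup()  # unlink the temporary file(s) created above

Because the temporary file is created with ``delete=False``, callers that do not pass
*filename* are responsible for calling ``urlcleanup()`` (or unlinking the returned
path) once they are done with the download.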