diff -r 1902fe7e5542 Doc/howto/urllib2.rst --- a/Doc/howto/urllib2.rst Fri Mar 18 02:22:15 2011 -0700 +++ b/Doc/howto/urllib2.rst Fri Mar 18 21:13:34 2011 -0400 @@ -56,6 +56,13 @@ response = urllib.request.urlopen('http://python.org/') html = response.read() +If you wish to retrieve a resource via URL and store it in a temporary location, +you can do so via the :func:`urlretrieve` function:: + + import urllib.request + local_filename, headers = urllib.request.urlretrieve('http://python.org/') + html = open(local_filename) + Many uses of urllib will be that simple (note that instead of an 'http:' URL we could have used an URL starting with 'ftp:', 'file:', etc.). However, it's the purpose of this tutorial to explain the more complicated cases, concentrating on diff -r 1902fe7e5542 Doc/library/urllib.request.rst --- a/Doc/library/urllib.request.rst Fri Mar 18 02:22:15 2011 -0700 +++ b/Doc/library/urllib.request.rst Fri Mar 18 21:13:34 2011 -0400 @@ -80,6 +80,58 @@ .. versionadded:: 3.2 *data* can be an iterable object. +.. function:: urlretrieve(url, filename=None, reporthook=None, data=None) + + Copy a network object denoted by a URL to a local file. If the URL + points to a local file, the object will not be copied unless filename is supplied. + Return a tuple ``(filename, headers)`` where *filename* is the + local file name under which the object can be found, and *headers* is whatever + the :meth:`info` method of the object returned by :func:`urlopen` returned (for + a remote object). Exceptions are the same as for :func:`urlopen`. + + The second argument, if present, specifies the file location to copy to (if + absent, the location will be a tempfile with a generated name). The third + argument, if present, is a hook function that will be called once on + establishment of the network connection and once after each block read + thereafter. 
The hook will be passed three arguments; a count of blocks + transferred so far, a block size in bytes, and the total size of the file. The + third argument may be ``-1`` on older FTP servers which do not return a file + size in response to a retrieval request. + + The following example illustrates the most common usage scenario:: + + >>> import urllib.request + >>> local_filename, headers = urllib.request.urlretrieve('http://python.org/') + >>> html = open(local_filename) + >>> html.close() + + If the *url* uses the :file:`http:` scheme identifier, the optional *data* + argument may be given to specify a ``POST`` request (normally the request type + is ``GET``). The *data* argument must be in standard + :mimetype:`application/x-www-form-urlencoded` format; see the :func:`urlencode` + function below. + + :func:`urlretrieve` will raise :exc:`ContentTooShortError` when it detects that + the amount of data available was less than the expected amount (which is the + size reported by a *Content-Length* header). This can occur, for example, when + the download is interrupted. + + The *Content-Length* is treated as a lower bound: if there's more data to read, + urlretrieve reads more data, but if less data is available, it raises the + exception. + + You can still retrieve the downloaded data in this case, it is stored in the + :attr:`content` attribute of the exception instance. + + If no *Content-Length* header was supplied, urlretrieve can not check the size + of the data it has downloaded, and just returns it. In this case you just have + to assume that the download was successful. + +.. function:: urlcleanup() + + Cleans up temporary files that may have been left behind by previous + calls to :func:`urlretrieve`. + .. function:: install_opener(opener) Install an :class:`OpenerDirector` instance as the default global opener. @@ -1078,51 +1130,6 @@ some point in the future. -.. 
function:: urlretrieve(url, filename=None, reporthook=None, data=None) - - Copy a network object denoted by a URL to a local file, if necessary. If the URL - points to a local file, or a valid cached copy of the object exists, the object - is not copied. Return a tuple ``(filename, headers)`` where *filename* is the - local file name under which the object can be found, and *headers* is whatever - the :meth:`info` method of the object returned by :func:`urlopen` returned (for - a remote object, possibly cached). Exceptions are the same as for - :func:`urlopen`. - - The second argument, if present, specifies the file location to copy to (if - absent, the location will be a tempfile with a generated name). The third - argument, if present, is a hook function that will be called once on - establishment of the network connection and once after each block read - thereafter. The hook will be passed three arguments; a count of blocks - transferred so far, a block size in bytes, and the total size of the file. The - third argument may be ``-1`` on older FTP servers which do not return a file - size in response to a retrieval request. - - If the *url* uses the :file:`http:` scheme identifier, the optional *data* - argument may be given to specify a ``POST`` request (normally the request type - is ``GET``). The *data* argument must in standard - :mimetype:`application/x-www-form-urlencoded` format; see the :func:`urlencode` - function below. - - :func:`urlretrieve` will raise :exc:`ContentTooShortError` when it detects that - the amount of data available was less than the expected amount (which is the - size reported by a *Content-Length* header). This can occur, for example, when - the download is interrupted. - - The *Content-Length* is treated as a lower bound: if there's more data to read, - urlretrieve reads more data, but if less data is available, it raises the - exception. 
- - You can still retrieve the downloaded data in this case, it is stored in the - :attr:`content` attribute of the exception instance. - - If no *Content-Length* header was supplied, urlretrieve can not check the size - of the data it has downloaded, and just returns it. In this case you just have - to assume that the download was successful. - -.. function:: urlcleanup() - - Clear the cache that may have been built up by previous calls to - :func:`urlretrieve`. .. class:: URLopener(proxies=None, **x509) diff -r 1902fe7e5542 Lib/test/regrtest.py --- a/Lib/test/regrtest.py Fri Mar 18 02:22:15 2011 -0700 +++ b/Lib/test/regrtest.py Fri Mar 18 21:13:34 2011 -0400 @@ -1386,7 +1386,6 @@ test_ttk_guionly test_ttk_textonly test_timeout - test_urllibnet test_multiprocessing """, 'aix5': diff -r 1902fe7e5542 Lib/test/test_urllib2net.py --- a/Lib/test/test_urllib2net.py Fri Mar 18 02:22:15 2011 -0700 +++ b/Lib/test/test_urllib2net.py Fri Mar 18 21:13:34 2011 -0400 @@ -8,6 +8,8 @@ import socket import urllib.error import urllib.request +import email.message +import time import sys try: import ssl @@ -319,6 +321,162 @@ self.assertIn(b"Unfortunately", contents) +class UrlretrieveNetworkTests(unittest.TestCase): + """Tests urllib.request.urlretrieve using the network.""" + + def urlretrieve(self, *args): + resource = args[0] + with support.transient_internet(resource): + return urllib.request.urlretrieve(*args) + + def test_basic(self): + # test basic functionality. + file_location,info = self.urlretrieve("http://www.python.org/") + self.assertTrue(os.path.exists(file_location), "file location returned by" + " urlretrieve is not a valid path") + fd = open(file_location, encoding='utf-8') + try: + self.assertTrue(fd.read(), "reading from the file location returned" + " by urlretrieve failed") + finally: + fd.close() + os.unlink(file_location) + + def test_specified_path(self): + # make sure that specifying the location of the file to write to works. 
+ file_location,info = self.urlretrieve("http://www.python.org/", + support.TESTFN) + self.assertEqual(file_location, support.TESTFN) + self.assertTrue(os.path.exists(file_location)) + fd = open(file_location, encoding='utf-8') + try: + self.assertTrue(fd.read(), "reading from temporary file failed") + finally: + fd.close() + os.unlink(file_location) + + def test_header(self): + # make sure header returned as 2nd value from urlretrieve is good. + file_location, header = self.urlretrieve("http://www.python.org/") + os.unlink(file_location) + self.assertIsInstance(header, email.message.Message, + "header is not an instance of email.message.Message") + + def test_data_header(self): + logo = "http://www.python.org/community/logos/python-logo-master-v3-TM.png" + file_location, fileheaders = self.urlretrieve(logo) + os.unlink(file_location) + datevalue = fileheaders.get('Date') + dateformat = '%a, %d %b %Y %H:%M:%S GMT' + try: + time.strptime(datevalue, dateformat) + except ValueError: + self.fail('date value not in %r format', dateformat) + +class UrlopenNetworkTests(unittest.TestCase): + """Tests urllib.request.urlopen using the network. + + These tests are not exhaustive. Assuming that testing using files does a + good job overall of some of the basic interface features. There are no + tests exercising the optional 'data' and 'proxies' arguments. No tests + for transparent redirection have been written. + + setUp is not used for always constructing a connection to + http://www.python.org/ since there are a few tests that don't use that address + and making a connection is expensive enough to warrant minimizing unneeded + connections. + + """ + + def urlopen(self, *args, **kwargs): + resource = args[0] + with support.transient_internet(resource): + return urllib.request.urlopen(*args, **kwargs) + + def test_basic(self): + # simple test expected to pass. 
+ open_url = self.urlopen("http://www.python.org/") + for attr in ("read", "readline", "readlines", "fileno", "close", + "info", "geturl"): + self.assertTrue(hasattr(open_url, attr), "object returned from " + "urlopen lacks the %s attribute" % attr) + try: + self.assertTrue(open_url.read(), "calling 'read' failed") + finally: + open_url.close() + + def test_readlines(self): + # test both readline and readlines. + open_url = self.urlopen("http://www.python.org/") + try: + self.assertIsInstance(open_url.readline(), bytes, + "readline did not return a string") + self.assertIsInstance(open_url.readlines(), list, + "readlines did not return a list") + finally: + open_url.close() + + def test_info(self): + # test 'info'. + open_url = self.urlopen("http://www.python.org/") + try: + info_obj = open_url.info() + finally: + open_url.close() + self.assertIsInstance(info_obj, email.message.Message, + "object returned by 'info' is not an " + "instance of email.message.Message") + self.assertEqual(info_obj.get_content_subtype(), "html") + + def test_geturl(self): + # make sure same URL as opened is returned by geturl. + URL = "http://www.python.org/" + open_url = self.urlopen(URL) + try: + gotten_url = open_url.geturl() + finally: + open_url.close() + self.assertEqual(gotten_url, URL) + + def test_getcode(self): + # test getcode() with the fancy opener to get 404 error codes + URL = "http://www.python.org/XXXinvalidXXX" + open_url = urllib.request.FancyURLopener().open(URL) + try: + code = open_url.getcode() + finally: + open_url.close() + self.assertEqual(code, 404) + + def test_fileno(self): + if sys.platform in ('win32',): + # on Windows, socket handles are not file descriptors; this + # test can't pass on Windows. + return + # Make sure fd returned by fileno is valid. 
+ open_url = self.urlopen("http://www.python.org/", timeout=None) + fd = open_url.fileno() + fd_file = os.fdopen(fd, encoding='utf-8') + try: + self.assertTrue(fd_file.read(), "reading from file created using fd " + "returned by fileno failed") + finally: + fd_file.close() + + def test_bad_address(self): + # make sure proper exception is raised. + self.assertRaises(IOError, + # SF patch 809915: In Sep 2003, VeriSign started + # highjacking invalid .com and .net addresses to + # boost traffic to their own site. This test + # started failing then. One hopes the .invalid + # domain will be spared to serve its defined + # purpose. + # urllib.urlopen, "http://www.sadflkjsasadf.com/") + urllib.request.urlopen, + "http://sadflkjsasf.i.nvali.d/") + + def test_main(): support.requires("network") support.run_unittest(AuthTests, @@ -326,6 +484,8 @@ OtherNetworkTests, CloseSocketTest, TimeoutTest, + UrlretrieveNetworkTests, + UrlopenNetworkTests ) if __name__ == "__main__": diff -r 1902fe7e5542 Lib/test/test_urllibnet.py --- a/Lib/test/test_urllibnet.py Fri Mar 18 02:22:15 2011 -0700 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,193 +0,0 @@ -#!/usr/bin/env python3 - -import unittest -from test import support - -import socket -import urllib.request -import sys -import os -import email.message -import time - - -class URLTimeoutTest(unittest.TestCase): - - TIMEOUT = 30.0 - - def setUp(self): - socket.setdefaulttimeout(self.TIMEOUT) - - def tearDown(self): - socket.setdefaulttimeout(None) - - def testURLread(self): - with support.transient_internet("www.python.org"): - f = urllib.request.urlopen("http://www.python.org/") - x = f.read() - -class urlopenNetworkTests(unittest.TestCase): - """Tests urllib.reqest.urlopen using the network. - - These tests are not exhaustive. Assuming that testing using files does a - good job overall of some of the basic interface features. There are no - tests exercising the optional 'data' and 'proxies' arguments. 
No tests - for transparent redirection have been written. - - setUp is not used for always constructing a connection to - http://www.python.org/ since there a few tests that don't use that address - and making a connection is expensive enough to warrant minimizing unneeded - connections. - - """ - - def urlopen(self, *args, **kwargs): - resource = args[0] - with support.transient_internet(resource): - return urllib.request.urlopen(*args, **kwargs) - - def test_basic(self): - # Simple test expected to pass. - open_url = self.urlopen("http://www.python.org/") - for attr in ("read", "readline", "readlines", "fileno", "close", - "info", "geturl"): - self.assertTrue(hasattr(open_url, attr), "object returned from " - "urlopen lacks the %s attribute" % attr) - try: - self.assertTrue(open_url.read(), "calling 'read' failed") - finally: - open_url.close() - - def test_readlines(self): - # Test both readline and readlines. - open_url = self.urlopen("http://www.python.org/") - try: - self.assertIsInstance(open_url.readline(), bytes, - "readline did not return a string") - self.assertIsInstance(open_url.readlines(), list, - "readlines did not return a list") - finally: - open_url.close() - - def test_info(self): - # Test 'info'. - open_url = self.urlopen("http://www.python.org/") - try: - info_obj = open_url.info() - finally: - open_url.close() - self.assertIsInstance(info_obj, email.message.Message, - "object returned by 'info' is not an " - "instance of email.message.Message") - self.assertEqual(info_obj.get_content_subtype(), "html") - - def test_geturl(self): - # Make sure same URL as opened is returned by geturl. 
- URL = "http://www.python.org/" - open_url = self.urlopen(URL) - try: - gotten_url = open_url.geturl() - finally: - open_url.close() - self.assertEqual(gotten_url, URL) - - def test_getcode(self): - # test getcode() with the fancy opener to get 404 error codes - URL = "http://www.python.org/XXXinvalidXXX" - open_url = urllib.request.FancyURLopener().open(URL) - try: - code = open_url.getcode() - finally: - open_url.close() - self.assertEqual(code, 404) - - def test_fileno(self): - if sys.platform in ('win32',): - # On Windows, socket handles are not file descriptors; this - # test can't pass on Windows. - return - # Make sure fd returned by fileno is valid. - open_url = self.urlopen("http://www.python.org/", timeout=None) - fd = open_url.fileno() - FILE = os.fdopen(fd, encoding='utf-8') - try: - self.assertTrue(FILE.read(), "reading from file created using fd " - "returned by fileno failed") - finally: - FILE.close() - - def test_bad_address(self): - # Make sure proper exception is raised when connecting to a bogus - # address. - self.assertRaises(IOError, - # SF patch 809915: In Sep 2003, VeriSign started - # highjacking invalid .com and .net addresses to - # boost traffic to their own site. This test - # started failing then. One hopes the .invalid - # domain will be spared to serve its defined - # purpose. - # urllib.urlopen, "http://www.sadflkjsasadf.com/") - urllib.request.urlopen, - "http://sadflkjsasf.i.nvali.d/") - -class urlretrieveNetworkTests(unittest.TestCase): - """Tests urllib.request.urlretrieve using the network.""" - - def urlretrieve(self, *args): - resource = args[0] - with support.transient_internet(resource): - return urllib.request.urlretrieve(*args) - - def test_basic(self): - # Test basic functionality. 
- file_location,info = self.urlretrieve("http://www.python.org/") - self.assertTrue(os.path.exists(file_location), "file location returned by" - " urlretrieve is not a valid path") - FILE = open(file_location, encoding='utf-8') - try: - self.assertTrue(FILE.read(), "reading from the file location returned" - " by urlretrieve failed") - finally: - FILE.close() - os.unlink(file_location) - - def test_specified_path(self): - # Make sure that specifying the location of the file to write to works. - file_location,info = self.urlretrieve("http://www.python.org/", - support.TESTFN) - self.assertEqual(file_location, support.TESTFN) - self.assertTrue(os.path.exists(file_location)) - FILE = open(file_location, encoding='utf-8') - try: - self.assertTrue(FILE.read(), "reading from temporary file failed") - finally: - FILE.close() - os.unlink(file_location) - - def test_header(self): - # Make sure header returned as 2nd value from urlretrieve is good. - file_location, header = self.urlretrieve("http://www.python.org/") - os.unlink(file_location) - self.assertIsInstance(header, email.message.Message, - "header is not an instance of email.message.Message") - - def test_data_header(self): - logo = "http://www.python.org/community/logos/python-logo-master-v3-TM.png" - file_location, fileheaders = self.urlretrieve(logo) - os.unlink(file_location) - datevalue = fileheaders.get('Date') - dateformat = '%a, %d %b %Y %H:%M:%S GMT' - try: - time.strptime(datevalue, dateformat) - except ValueError: - self.fail('Date value not in %r format', dateformat) - - -def test_main(): - support.requires('network') - support.run_unittest(URLTimeoutTest, - urlopenNetworkTests, - urlretrieveNetworkTests) - -if __name__ == "__main__": - test_main() diff -r 1902fe7e5542 Lib/urllib/request.py --- a/Lib/urllib/request.py Fri Mar 18 02:22:15 2011 -0700 +++ b/Lib/urllib/request.py Fri Mar 18 21:13:34 2011 -0400 @@ -95,6 +95,7 @@ import sys import time import collections +import contextlib from urllib.error 
import URLError, HTTPError, ContentTooShortError from urllib.parse import ( @@ -141,17 +142,80 @@ global _opener _opener = opener -# TODO(jhylton): Make this work with the same global opener. -_urlopener = None +_url_tempfiles = [] def urlretrieve(url, filename=None, reporthook=None, data=None): - global _urlopener - if not _urlopener: - _urlopener = FancyURLopener() - return _urlopener.retrieve(url, filename, reporthook, data) + """ + Retrieve a URL into a temporary location on disk. + + Requires a URL argument. If a filename is passed, it is used as + the temporary file location. The reporthook argument should be + a callable that accepts a block number, a block size, and the + total file size of the URL target. The data argument should be + valid URL encoded data. + + If a filename is passed and the URL points to a local resource, + the result is a copy from local file to new file. + + Returns a tuple containing the path to the newly created + data file as well as the resulting HTTPMessage object. + """ + url_type, path = splittype(url) + + with contextlib.closing(urlopen(url, data)) as fp: + headers = fp.info() + + # Just return the local path and the "headers" for file:// + # URLs. No sense in performing a copy unless requested. + if url_type == "file" and not filename: + return os.path.normpath(path), headers + + # Handle temporary file setup. 
+ if filename: + tfp = open(filename, 'wb') + else: + import tempfile + tfp = tempfile.NamedTemporaryFile(delete=False) + filename = tfp.name + _url_tempfiles.append(filename) + + try: + result = filename, headers + bs = 1024*8 + size = -1 + read = 0 + blocknum = 0 + if reporthook: + if "content-length" in headers: + size = int(headers["Content-Length"]) + reporthook(blocknum, bs, size) + + while True: + block = fp.read(bs) + if not block: + break + read += len(block) + tfp.write(block) + blocknum += 1 + if reporthook: + reporthook(blocknum, bs, size) + finally: + tfp.close() + + if size >= 0 and read < size: + raise ContentTooShortError( + "retrieval incomplete: got only %i out of %i bytes" + % (read, size), result) + + return result def urlcleanup(): - if _urlopener: - _urlopener.cleanup() + for temp_file in _url_tempfiles: + try: + os.unlink(temp_file) + except EnvironmentError: + pass + + del _url_tempfiles[:] global _opener if _opener: _opener = None diff -r 1902fe7e5542 Misc/ACKS --- a/Misc/ACKS Fri Mar 18 02:22:15 2011 -0700 +++ b/Misc/ACKS Fri Mar 18 21:13:34 2011 -0400 @@ -576,6 +576,7 @@ Gordon McMillan Caolan McNamara Andrew McNamara +Jeff McNeil Craig McPheeters Lambert Meertens Bill van Melle diff -r 1902fe7e5542 Misc/NEWS --- a/Misc/NEWS Fri Mar 18 02:22:15 2011 -0700 +++ b/Misc/NEWS Fri Mar 18 21:13:34 2011 -0400 @@ -9,7 +9,6 @@ Core and Builtins ----------------- - - Issue #11320: fix bogus memory management in Modules/getpath.c, leading to a possible crash when calling Py_SetPath(). @@ -74,6 +73,8 @@ Library ------- +- Issue #10050: Update urlretrieve to simply rely on urlopen. This function is + no longer dependent on FancyURLOpener. - Issue #5421: Fix misleading error message when one of socket.sendto()'s arguments has the wrong type. Patch by Nikita Vetoshkin.