Index: Doc/library/urllib.rst =================================================================== --- Doc/library/urllib.rst (revision 78945) +++ Doc/library/urllib.rst (working copy) @@ -223,11 +223,25 @@ .. function:: unquote(string) - Replace ``%xx`` escapes by their single-character equivalent. + Replace ``%xx`` escapes by their single-byte equivalent. + *string* may be either a :class:`str` or a :class:`unicode`. + + If it is a :class:`unicode`, unescaped non-ASCII characters in *string* + are encoded into UTF-8 bytes. The result is a :class:`str` regardless of + the type of *string*. + + Example: ``unquote('/%7Econnolly/')`` yields ``'/~connolly/'``. + Example: ``unquote(u'/El%20Ni%C3%B1o/')`` yields ``'/El Ni\xc3\xb1o/'``, + which decodes with UTF-8 as ``u'/El Niño/'``. + .. versionchanged:: 2.7 + :func:`unquote` now returns a :class:`str` on :class:`unicode` input, + where it used to return a :class:`unicode`. Unicode strings are encoded + with ``'utf-8'``. + .. function:: unquote_plus(string) Like :func:`unquote`, but also replaces plus signs by spaces, as required for Index: Lib/urllib.py =================================================================== --- Lib/urllib.py (revision 78945) +++ Lib/urllib.py (working copy) @@ -1163,6 +1163,10 @@ def unquote(s): """unquote('abc%20def') -> 'abc def'.""" + # Note: strings are encoded as UTF-8. This is only an issue if it contains + # unescaped non-ASCII characters, which URIs should not. 
+ if isinstance(s, unicode): + s = s.encode('utf-8') res = s.split('%') for i in xrange(1, len(res)): item = res[i] Index: Lib/test/test_urllib.py =================================================================== --- Lib/test/test_urllib.py (revision 78945) +++ Lib/test/test_urllib.py (working copy) @@ -463,9 +463,39 @@ self.assertEqual(expect, result, "using unquote_plus(): %s != %s" % (expect, result)) + def test_unquote_nonascii_bytes(self): + # Test on escaped non-ASCII bytes + given = '%A2%D8ab%FF' + expect = b'\xa2\xd8ab\xff' + result = urllib.unquote(given) + self.assertEqual(expect, result, "using unquote(): %r != %r" + % (expect, result)) + # Test on unescaped non-ASCII bytes + # (Technically an invalid URI; expect those bytes to be preserved) + given = b'%A2\xd8ab%FF' + expect = b'\xa2\xd8ab\xff' + result = urllib.unquote(given) + self.assertEqual(expect, result, "using unquote(): %r != %r" + % (expect, result)) + def test_unquote_with_unicode(self): r = urllib.unquote(u'br%C3%BCckner_sapporo_20050930.doc') - self.assertEqual(r, u'br\xc3\xbcckner_sapporo_20050930.doc') + self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc') + # Test on a string with unescaped non-ASCII characters + # (Technically an invalid URI; expect those characters to be UTF-8 + # encoded). + given = u'\u6f22%C3%BC' + result = urllib.unquote(given) + expect = b'\xe6\xbc\xa2\xc3\xbc' # UTF-8 for "\u6f22\u00fc" + self.assertEqual(expect, result, + "using unquote(): %r != %r" % (expect, result)) + # Test that the result of unquoting an ASCII string is the same as the + # result of unquoting the same string as unicode + given = "%C3%BC" + result = urllib.unquote(given) + expect = urllib.unquote(given.decode('ascii')) + self.assertEqual(expect, result, "using unquote(): %r != %r" + % (expect, result)) class urlencode_Tests(unittest.TestCase): """Tests for urlencode()"""