Index: Doc/library/urllib.rst =================================================================== --- Doc/library/urllib.rst (revision 78945) +++ Doc/library/urllib.rst (working copy) @@ -202,25 +202,41 @@ Utility functions ----------------- -.. function:: quote(string[, safe]) +.. function:: quote(string[, safe[, encoding[, errors]]]) Replace special characters in *string* using the ``%xx`` escape. Letters, digits, and the characters ``'_.-'`` are never quoted. By default, this - function is intended for quoting the path section of the URL.The optional + function is intended for quoting the path section of the URL. The optional *safe* parameter specifies additional characters that should not be quoted --- its default value is ``'/'``. + *string* may be either a :class:`str` or a :class:`unicode`. + + The optional *encoding* and *errors* parameters specify how to deal with + non-ASCII characters, as accepted by the :meth:`unicode.encode` method. + *encoding* defaults to ``'utf-8'``. + *errors* defaults to ``'strict'``, meaning unsupported characters raise a + :class:`UnicodeEncodeError`. + Non-Unicode strings are not encoded by default, and all bytes are allowed. + Example: ``quote('/~connolly/')`` yields ``'/%7econnolly/'``. + Example: ``quote(u'/El Niño/')`` yields ``'/El%20Ni%C3%B1o/'``. -.. function:: quote_plus(string[, safe]) + .. versionchanged:: 2.7 + Added *encoding* and *errors* parameters. + +.. function:: quote_plus(string[, safe[, encoding[, errors]]]) + Like :func:`quote`, but also replaces spaces by plus signs, as required for quoting HTML form values when building up a query string to go into a URL. Plus signs in the original string are escaped unless they are included in *safe*. It also does not have *safe* default to ``'/'``. + Example: ``quote_plus(u'/El Niño/')`` yields ``'%2FEl+Ni%C3%B1o%2F'``. + .. function:: unquote(string) Replace ``%xx`` escapes by their single-character equivalent. 
Index: Lib/urllib.py =================================================================== --- Lib/urllib.py (revision 78945) +++ Lib/urllib.py (working copy) @@ -1184,7 +1184,7 @@ '0123456789' '_.-') _safemaps = {} -def quote(s, safe = '/'): +def quote(s, safe = '/', encoding=None, errors=None): """quote('abc def') -> 'abc%20def' Each part of a URL, e.g. the path info, the query, etc., has a @@ -1204,8 +1204,26 @@ is reserved, but in typical usage the quote function is being called on a path where the existing slash characters are used as reserved characters. + + string and safe may be either str or unicode objects. + + The optional encoding and errors parameters specify how to deal with + non-ASCII characters, as accepted by the unicode.encode method. + By default, encoding='utf-8' (characters are encoded with UTF-8), and + errors='strict' (unsupported characters raise a UnicodeEncodeError). """ - cachekey = (safe, always_safe) + if encoding is not None or isinstance(s, unicode): + if encoding is None: + encoding = 'utf-8' + if errors is None: + errors = 'strict' + s = s.encode(encoding, errors) + if isinstance(safe, unicode): + # Normalize 'safe' by converting to str and removing non-ASCII chars + safe = safe.encode('ascii', 'ignore') + # (Note that if 'safe' is already a str, non-ASCII bytes are allowed, + # keeping with historical Python behaviour) + cachekey = safe try: safe_map = _safemaps[cachekey] except KeyError: @@ -1218,12 +1236,12 @@ res = map(safe_map.__getitem__, s) return ''.join(res) -def quote_plus(s, safe = ''): +def quote_plus(s, safe = '', encoding=None, errors=None): """Quote the query fragment of a URL; replacing ' ' with '+'""" if ' ' in s: - s = quote(s, safe + ' ') + s = quote(s, safe + ' ', encoding, errors) return s.replace(' ', '+') - return quote(s, safe) + return quote(s, safe, encoding, errors) def urlencode(query,doseq=0): """Encode a sequence of two-element tuples or dictionary into a URL query string. 
Index: Lib/test/test_urllib.py =================================================================== --- Lib/test/test_urllib.py (revision 78945) +++ Lib/test/test_urllib.py (working copy) @@ -355,6 +355,38 @@ self.assertEqual(quote_by_default, result, "using quote_plus(): %s != %s" % (quote_by_default, result)) + # Safe expressed as unicode rather than str + result = urllib.quote(quote_by_default, safe=u"<>") + self.assertEqual(quote_by_default, result, + "using quote(): %r != %r" % (quote_by_default, result)) + # "Safe" non-ASCII bytes should still work + # (Technically disallowed by the URI standard, but allowed for + # backwards compatibility with previous versions of Python) + result = urllib.quote(b"a\xfcb", safe=b"\xfc") + expect = b"a\xfcb" + self.assertEqual(expect, result, + "using quote(): %r != %r" % + (expect, result)) + # Same as above, but with 'safe' as a unicode rather than str + # "Safe" non-ASCII unicode characters should have no effect + # (Since URIs are not allowed to have non-ASCII characters) + result = urllib.quote(b"a\xfcb", safe=u"\xfc") + expect = urllib.quote(b"a\xfcb", safe="") + self.assertEqual(expect, result, + "using quote(): %r != %r" % + (expect, result)) + # Same as above, but quoting a unicode rather than a str + result = urllib.quote(u"a\xfcb", encoding="latin-1", safe=b"\xfc") + expect = b"a\xfcb" + self.assertEqual(expect, result, + "using quote(): %r != %r" % + (expect, result)) + # Same as above, but with both the quoted value and 'safe' as unicode + result = urllib.quote(u"a\xfcb", encoding="latin-1", safe=u"\xfc") + expect = urllib.quote(u"a\xfcb", encoding="latin-1", safe="") + self.assertEqual(expect, result, + "using quote(): %r != %r" % + (expect, result)) def test_default_quoting(self): # Make sure all characters that should be quoted are by default sans @@ -406,7 +438,82 @@ 'alpha%2Bbeta+gamma') self.assertEqual(urllib.quote_plus('alpha+beta gamma', '+'), 'alpha+beta+gamma') + # Test with unicode + 
self.assertEqual(urllib.quote_plus(u'alpha+beta gamma'), + 'alpha%2Bbeta+gamma') + # Test with safe unicode + self.assertEqual(urllib.quote_plus('alpha+beta gamma', u'+'), + 'alpha+beta+gamma') + def test_quote_bytes(self): + # Non-ASCII bytes should quote directly to percent-encoded values + given = b"\xa2\xd8ab\xff" + expect = "%A2%D8ab%FF" + result = urllib.quote(given) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Encoding argument should raise UnicodeDecodeError on bytes input + # with non-ASCII characters (just as with str.encode). + self.assertRaises(UnicodeDecodeError, urllib.quote, given, + encoding="latin-1") + + def test_quote_with_unicode(self): + # Characters in Latin-1 range, encoded by default in UTF-8 + given = u"\xa2\xd8ab\xff" + expect = "%C2%A2%C3%98ab%C3%BF" + result = urllib.quote(given) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in Latin-1 range, encoded with None (default) + result = urllib.quote(given, encoding=None, errors=None) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in Latin-1 range, encoded with Latin-1 + given = u"\xa2\xd8ab\xff" + expect = "%A2%D8ab%FF" + result = urllib.quote(given, encoding="latin-1") + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in BMP, encoded by default in UTF-8 + given = u"\u6f22\u5b57" # "Kanji" + expect = "%E6%BC%A2%E5%AD%97" + result = urllib.quote(given) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in BMP, encoded with Latin-1 + given = u"\u6f22\u5b57" + self.assertRaises(UnicodeEncodeError, urllib.quote, given, + encoding="latin-1") + # Characters in BMP, encoded with Latin-1, with replace error handling + given = u"\u6f22\u5b57" + expect = "%3F%3F" # "??" 
+ result = urllib.quote(given, encoding="latin-1", + errors="replace") + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in BMP, Latin-1, with xmlcharrefreplace error handling + given = u"\u6f22\u5b57" + expect = "%26%2328450%3B%26%2323383%3B" # "&#28450;&#23383;" + result = urllib.quote(given, encoding="latin-1", + errors="xmlcharrefreplace") + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + + def test_quote_plus_with_unicode(self): + # Encoding (latin-1) test for quote_plus + given = u"\xa2\xd8 \xff" + expect = "%A2%D8+%FF" + result = urllib.quote_plus(given, encoding="latin-1") + self.assertEqual(expect, result, + "using quote_plus(): %r != %r" % (expect, result)) + # Errors test for quote_plus + given = u"ab\u6f22\u5b57 cd" + expect = "ab%3F%3F+cd" + result = urllib.quote_plus(given, encoding="latin-1", + errors="replace") + self.assertEqual(expect, result, + "using quote_plus(): %r != %r" % (expect, result)) + class UnquotingTests(unittest.TestCase): """Tests for unquote() and unquote_plus()