Index: Doc/library/urllib.parse.rst =================================================================== --- Doc/library/urllib.parse.rst (revision 65605) +++ Doc/library/urllib.parse.rst (working copy) @@ -182,37 +182,81 @@ string. If there is no fragment identifier in *url*, return *url* unmodified and an empty string. -.. function:: quote(string[, safe]) +.. function:: quote(string[, safe[, encoding[, errors]]]) Replace special characters in *string* using the ``%xx`` escape. Letters, digits, and the characters ``'_.-'`` are never quoted. The optional *safe* parameter specifies additional characters that should not be quoted --- its default value is ``'/'``. - Example: ``quote('/~connolly/')`` yields ``'/%7econnolly/'``. + *string* may be either a :class:`str` or a :class:`bytes`. + The optional *encoding* and *errors* parameters specify how to deal with + non-ASCII characters, as accepted by the :meth:`str.encode` method. + *encoding* defaults to ``'utf-8'``. + *errors* defaults to ``'strict'``, meaning unsupported characters raise a + :class:`UnicodeEncodeError`. + *encoding* and *errors* are ignored if *string* is a :class:`bytes`. -.. function:: quote_plus(string[, safe]) + Example: ``quote('/El Niño/')`` yields ``'/El%20Ni%C3%B1o/'``. + +.. function:: quote_plus(string[, safe[, encoding[, errors]]]) + Like :func:`quote`, but also replace spaces by plus signs, as required for quoting HTML form values. Plus signs in the original string are escaped unless they are included in *safe*. It also does not have *safe* default to ``'/'``. + Example: ``quote_plus('/El Niño/')`` yields ``'%2FEl+Ni%C3%B1o%2F'``. -.. function:: unquote(string) +.. function:: quote_from_bytes(bytes[, safe]) + Like :func:`quote`, but accepts a :class:`bytes` object rather than a + :class:`str`, and does not perform string-to-bytes encoding. + + Example: ``quote_from_bytes(b'/El Ni\xc3\xb1o/')`` yields + ``'/El%20Ni%C3%B1o/'``. + +.. 
function:: unquote(string[, encoding[, errors]]) + Replace ``%xx`` escapes by their single-character equivalent. + The optional *encoding* and *errors* parameters specify how to decode + percent-encoded sequences into Unicode characters, as accepted by the + :meth:`bytes.decode` method. - Example: ``unquote('/%7Econnolly/')`` yields ``'/~connolly/'``. + *string* must be a :class:`str`. + *encoding* defaults to ``'utf-8'``. + *errors* defaults to ``'strict'``, meaning invalid sequences raise + a :class:`UnicodeDecodeError`. -.. function:: unquote_plus(string) + Example: ``unquote('/El%20Ni%C3%B1o/')`` yields ``'/El Niño/'``. + +.. function:: unquote_plus(string[, encoding[, errors]]) + Like :func:`unquote`, but also replace plus signs by spaces, as required for unquoting HTML form values. + *string* must be a :class:`str`. + Example: ``unquote_plus('/El+Ni%C3%B1o/')`` yields ``'/El Niño/'``. + +.. function:: unquote_to_bytes(string) + + Replace ``%xx`` escapes by their single-octet equivalent, and return a + :class:`bytes` object. + + *string* must be a :class:`str`. + + Unescaped non-ASCII characters in the input string are encoded into UTF-8 + bytes. + + Example: ``unquote_to_bytes('/El%20Ni%C3%B1o/')`` yields + ``b'/El Ni\xc3\xb1o/'``. + + .. 
function:: urlencode(query[, doseq]) Convert a mapping object or a sequence of two-element tuples to a "url-encoded" Index: Lib/urllib/parse.py =================================================================== --- Lib/urllib/parse.py (revision 65605) +++ Lib/urllib/parse.py (working copy) @@ -7,7 +7,9 @@ import sys __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", - "urlsplit", "urlunsplit"] + "urlsplit", "urlunsplit", + "quote", "quote_plus", "quote_from_bytes", + "unquote", "unquote_plus", "unquote_to_bytes"] # A classification of schemes ('' means apply by default) uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', @@ -260,50 +262,119 @@ else: return url, '' +def unquote_to_bytes(s): + """unquote_to_bytes('abc%20def') -> b'abc def'.""" + # Note: strings are encoded as UTF-8. This is only an issue if it contains + # unescaped non-ASCII characters, which URIs should not. + res = s.split('%') + res[0] = res[0].encode('utf-8') + for i in range(1, len(res)): + item = res[i] + try: + if item == '': raise ValueError + res[i] = bytes.fromhex(item[:2]) + item[2:].encode('utf-8') + except ValueError: + res[i] = b'%' + item.encode('utf-8') + return b"".join(res) -_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) -_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) +def unquote(s, encoding='utf-8', errors='strict'): + """Replace %xx escapes by their single-character equivalent. The optional + encoding and errors parameters specify how to decode percent-encoded + sequences into Unicode characters, as accepted by the bytes.decode() + method. + By default, percent-encoded sequences are decoded with UTF-8, and invalid + sequences raise a UnicodeDecodeError. -def unquote(s): - """unquote('abc%20def') -> 'abc def'.""" + unquote('abc%20def') -> 'abc def'. 
+ """ + if encoding is None: encoding = 'utf-8' + if errors is None: errors = 'strict' + # pct_sequence: contiguous sequence of percent-encoded bytes, decoded + # (list of single-byte bytes objects) + pct_sequence = [] res = s.split('%') for i in range(1, len(res)): item = res[i] try: - res[i] = _hextochr[item[:2]] + item[2:] - except KeyError: - res[i] = '%' + item - except UnicodeDecodeError: - res[i] = chr(int(item[:2], 16)) + item[2:] - return "".join(res) + if item == '': raise ValueError + pct_sequence.append(bytes.fromhex(item[:2])) + rest = item[2:] + except ValueError: + rest = '%' + item + if len(rest) == 0: + # This segment was just a single percent-encoded character. + # May be part of a sequence of code units, so delay decoding. + # (Stored in pct_sequence). + res[i] = '' + else: + # Encountered non-percent-encoded characters. Flush the current + # pct_sequence. + res[i] = b''.join(pct_sequence).decode(encoding, errors) + rest + pct_sequence = [] + if len(pct_sequence) > 0: + # Flush the final pct_sequence + # res[-1] will always be empty if pct_sequence != [] + res[-1] = b''.join(pct_sequence).decode(encoding, errors) + return ''.join(res) -def unquote_plus(s): - """unquote('%7e/abc+def') -> '~/abc def'""" +def unquote_plus(s, encoding='utf-8', errors='strict'): + """Like unquote(), but also replace plus signs by spaces, as required for + unquoting HTML form values. 
+ + unquote_plus('%7e/abc+def') -> '~/abc def' + """ + s = s.replace('+', ' ') - return unquote(s) + return unquote(s, encoding, errors) -always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' +always_safe = frozenset( + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' '0123456789' '_.-') +safe_bytes = frozenset( + b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + b'abcdefghijklmnopqrstuvwxyz' + b'0123456789' b'_.-') _safe_quoters= {} class Quoter: - def __init__(self, safe): + def __init__(self, safe, encoding, errors): + """safe: str object.""" self.cache = {} - self.safe = safe + always_safe + self.bytecache = {} + # safe is a str object + self.safe = always_safe.union(safe) + self.encoding = encoding + self.errors = errors def __call__(self, c): + """ + c: A str, a single character to be quoted. + Returns a str. + """ try: return self.cache[c] except KeyError: - if ord(c) < 256: - res = (c in self.safe) and c or ('%%%02X' % ord(c)) - self.cache[c] = res - return res + if c in self.safe: + res = c else: - return "".join(['%%%02X' % i for i in c.encode("utf-8")]) + res = ''.join(self.__encodebyte(b) for b in + c.encode(self.encoding, self.errors)) + self.cache[c] = res + return res -def quote(s, safe = '/'): + def __encodebyte(self, b): + """ + b: An int, representing a byte to be encoded. Must have range(0,256). + Returns a str. + """ + try: + return self.bytecache[b] + except KeyError: + res = b in safe_bytes and chr(b) or ('%%%02X' % b) + self.bytecache[b] = res + return res + +def quote(s, safe='/', encoding='utf-8', errors='strict'): """quote('abc def') -> 'abc%20def' Each part of a URL, e.g. the path info, the query, etc., has a @@ -323,23 +394,60 @@ is reserved, but in typical usage the quote function is being called on a path where the existing slash characters are used as reserved characters. + + The optional encoding and errors parameters specify how to deal with + non-ASCII characters, as accepted by the str.encode method.
+ By default, characters are encoded with UTF-8, and unsupported characters + raise a UnicodeEncodeError. """ - cachekey = (safe, always_safe) + if encoding is None: encoding = 'utf-8' + if errors is None: errors = 'strict' + if not isinstance(safe, str): + # Safe bytes are treated as code points + safe = safe.decode('latin-1') + if not isinstance(s, str): + # bytes or bytearray: treat as a Latin-1 string (so each byte is + # directly percent-encoded) + s = s.decode('latin-1') + encoding = 'latin-1' + cachekey = (safe, always_safe, encoding, errors) try: quoter = _safe_quoters[cachekey] except KeyError: - quoter = Quoter(safe) + quoter = Quoter(safe, encoding, errors) _safe_quoters[cachekey] = quoter res = map(quoter, s) return ''.join(res) -def quote_plus(s, safe = ''): - """Quote the query fragment of a URL; replacing ' ' with '+'""" - if ' ' in s: - s = quote(s, safe + ' ') +def quote_plus(s, safe='', encoding='utf-8', errors='strict'): + """Like quote(), but also replace ' ' with '+', as required for quoting + HTML form values. Plus signs in the original string are escaped unless + they are included in safe. It also does not have safe default to '/'.
+ """ + # Check if ' ' in s, where s may either be a str or bytes + if ' ' in s if isinstance(s, str) else b' ' in s: + s = quote(s, safe + ' ' if isinstance(safe, str) else safe + b' ', encoding, errors) return s.replace(' ', '+') - return quote(s, safe) + return quote(s, safe, encoding, errors) +def quote_from_bytes(s, safe='/'): + if not isinstance(safe, str): + # Safe bytes are treated as code points + safe = safe.decode('latin-1') + if not isinstance(s, (bytes, bytearray)): + raise TypeError("quote_from_bytes() expected a bytes") + # bytes or bytearray: treat as a Latin-1 string (so each byte is + # directly percent-encoded) + s = s.decode('latin-1') + cachekey = (safe, always_safe, 'latin-1', 'strict') + try: + quoter = _safe_quoters[cachekey] + except KeyError: + quoter = Quoter(safe, 'latin-1', 'strict') + _safe_quoters[cachekey] = quoter + res = map(quoter, s) + return ''.join(res) + def urlencode(query,doseq=0): """Encode a sequence of two-element tuples or dictionary into a URL query string. Index: Lib/email/utils.py =================================================================== --- Lib/email/utils.py (revision 65605) +++ Lib/email/utils.py (working copy) @@ -219,7 +219,7 @@ charset is given but not language, the string is encoded using the empty string for language. """ - s = urllib.parse.quote(s, safe='') + s = urllib.parse.quote(s, safe='', encoding=charset) if charset is None and language is None: return s if language is None: @@ -271,7 +271,10 @@ # language specifiers at the beginning of the string. for num, s, encoded in continuations: if encoded: - s = urllib.parse.unquote(s) + # Decode as "latin-1", so the characters in s directly + # represent the percent-encoded octet values. + # collapse_rfc2231_value treats this as an octet sequence.
+ s = urllib.parse.unquote(s, encoding="latin-1") extended = True value.append(s) value = quote(EMPTYSTRING.join(value)) Index: Lib/test/test_http_cookiejar.py =================================================================== --- Lib/test/test_http_cookiejar.py (revision 65605) +++ Lib/test/test_http_cookiejar.py (working copy) @@ -539,6 +539,8 @@ # unquoted unsafe ("/foo\031/bar", "/foo%19/bar"), ("/\175foo/bar", "/%7Dfoo/bar"), + # unicode, latin-1 range + ("/foo/bar\u00fc", "/foo/bar%C3%BC"), # UTF-8 encoded # unicode ("/foo/bar\uabcd", "/foo/bar%EA%AF%8D"), # UTF-8 encoded ] @@ -1444,7 +1446,8 @@ # Try some URL encodings of the PATHs. # (the behaviour here has changed from libwww-perl) c = CookieJar(DefaultCookiePolicy(rfc2965=True)) - interact_2965(c, "http://www.acme.com/foo%2f%25/%3c%3c%0Anew%E5/%E5", + interact_2965(c, "http://www.acme.com/foo%2f%25/" + "%3c%3c%0Anew%C3%A5/%C3%A5", "foo = bar; version = 1") cookie = interact_2965( Index: Lib/test/test_cgi.py =================================================================== --- Lib/test/test_cgi.py (revision 65605) +++ Lib/test/test_cgi.py (working copy) @@ -68,6 +68,8 @@ ("&a=b", [('a', 'b')]), ("a=a+b&b=b+c", [('a', 'a b'), ('b', 'b c')]), ("a=1&a=2", [('a', '1'), ('a', '2')]), + ("a=%26&b=%3D", [('a', '&'), ('b', '=')]), + ("a=%C3%BC&b=%CA%83", [('a', '\xfc'), ('b', '\u0283')]), ] parse_strict_test_cases = [ Index: Lib/test/test_wsgiref.py =================================================================== --- Lib/test/test_wsgiref.py (revision 65605) +++ Lib/test/test_wsgiref.py (working copy) @@ -291,6 +291,7 @@ def testAppURIs(self): self.checkAppURI("http://127.0.0.1/") self.checkAppURI("http://127.0.0.1/spam", SCRIPT_NAME="/spam") + self.checkAppURI("http://127.0.0.1/sp%C3%A4m", SCRIPT_NAME="/späm") self.checkAppURI("http://spam.example.com:2071/", HTTP_HOST="spam.example.com:2071", SERVER_PORT="2071") self.checkAppURI("http://spam.example.com/", @@ -304,6 +305,7 @@ def testReqURIs(self): 
self.checkReqURI("http://127.0.0.1/") self.checkReqURI("http://127.0.0.1/spam", SCRIPT_NAME="/spam") + self.checkReqURI("http://127.0.0.1/sp%C3%A4m", SCRIPT_NAME="/späm") self.checkReqURI("http://127.0.0.1/spammity/spam", SCRIPT_NAME="/spammity", PATH_INFO="/spam") self.checkReqURI("http://127.0.0.1/spammity/spam?say=ni", Index: Lib/test/test_urllib.py =================================================================== --- Lib/test/test_urllib.py (revision 65605) +++ Lib/test/test_urllib.py (working copy) @@ -336,10 +336,10 @@ "_.-"]) result = urllib.parse.quote(do_not_quote) self.assertEqual(do_not_quote, result, - "using quote(): %s != %s" % (do_not_quote, result)) + "using quote(): %r != %r" % (do_not_quote, result)) result = urllib.parse.quote_plus(do_not_quote) self.assertEqual(do_not_quote, result, - "using quote_plus(): %s != %s" % (do_not_quote, result)) + "using quote_plus(): %r != %r" % (do_not_quote, result)) def test_default_safe(self): # Test '/' is default value for 'safe' parameter @@ -350,12 +350,45 @@ quote_by_default = "<>" result = urllib.parse.quote(quote_by_default, safe=quote_by_default) self.assertEqual(quote_by_default, result, - "using quote(): %s != %s" % (quote_by_default, result)) + "using quote(): %r != %r" % (quote_by_default, result)) result = urllib.parse.quote_plus(quote_by_default, safe=quote_by_default) self.assertEqual(quote_by_default, result, - "using quote_plus(): %s != %s" % + "using quote_plus(): %r != %r" % (quote_by_default, result)) + # Safe expressed as bytes rather than str + result = urllib.parse.quote(quote_by_default, safe=b"<>") + self.assertEqual(quote_by_default, result, + "using quote(): %r != %r" % (quote_by_default, result)) + # "Safe" non-ASCII characters + # (Since URIs are not allowed to have non-ASCII characters) + given = "a\u00fcb" + expect = given + result = urllib.parse.quote(given, safe="\u00fc") + self.assertEqual(expect, result, "using quote(): %r != %r" % + (expect, result)) + given = "a\u0123b" + 
expect = given + result = urllib.parse.quote(given, safe="\u0123") + self.assertEqual(expect, result, "using quote(): %r != %r" % + (expect, result)) + + # Using bytes (both quote and quote_as_bytes) + given = b"a\xfcb" + expect = str(given, "latin-1") + result = urllib.parse.quote(given, safe="\xfc") + self.assertEqual(expect, result, "using quote(): %r != %r" % + (expect, result)) + result = urllib.parse.quote(given, safe=b"\xfc") + self.assertEqual(expect, result, "using quote(): %r != %r" % + (expect, result)) + result = urllib.parse.quote_from_bytes(given, safe="\xfc") + self.assertEqual(expect, result, "using quote_from_bytes(): %r != %r"% + (expect, result)) + result = urllib.parse.quote_from_bytes(given, safe=b"\xfc") + self.assertEqual(expect, result, "using quote_from_bytes(): %r != %r"% + (expect, result)) + def test_default_quoting(self): # Make sure all characters that should be quoted are by default sans # space (separate test for that). @@ -378,35 +411,100 @@ expected = "ab%5B%5Dcd" result = urllib.parse.quote(partial_quote) self.assertEqual(expected, result, - "using quote(): %s != %s" % (expected, result)) + "using quote(): %r != %r" % (expected, result)) self.assertEqual(expected, result, - "using quote_plus(): %s != %s" % (expected, result)) + "using quote_plus(): %r != %r" % (expected, result)) def test_quoting_space(self): # Make sure quote() and quote_plus() handle spaces as specified in # their unique way result = urllib.parse.quote(' ') self.assertEqual(result, hexescape(' '), - "using quote(): %s != %s" % (result, hexescape(' '))) + "using quote(): %r != %r" % (result, hexescape(' '))) result = urllib.parse.quote_plus(' ') self.assertEqual(result, '+', - "using quote_plus(): %s != +" % result) + "using quote_plus(): %r != +" % result) given = "a b cd e f" expect = given.replace(' ', hexescape(' ')) result = urllib.parse.quote(given) self.assertEqual(expect, result, - "using quote(): %s != %s" % (expect, result)) + "using quote(): %r != %r" 
% (expect, result)) expect = given.replace(' ', '+') result = urllib.parse.quote_plus(given) self.assertEqual(expect, result, - "using quote_plus(): %s != %s" % (expect, result)) + "using quote_plus(): %r != %r" % (expect, result)) def test_quoting_plus(self): self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma'), 'alpha%2Bbeta+gamma') self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma', '+'), 'alpha+beta+gamma') + # Test with bytes + self.assertEqual(urllib.parse.quote_plus(b'alpha+beta gamma'), + 'alpha%2Bbeta+gamma') + # Test with safe bytes + self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma', b'+'), + 'alpha+beta+gamma') + def test_quote_bytes(self): + # Bytes should quote directly to percent-encoded values + given = b"\xa2\xd8ab\xff" + expect = "%A2%D8ab%FF" + result = urllib.parse.quote(given) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Encoding argument should have no effect + result = urllib.parse.quote(given, encoding="latin-1") + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # quote_from_bytes should work the same + result = urllib.parse.quote_from_bytes(given) + self.assertEqual(expect, result, + "using quote_from_bytes(): %r != %r" + % (expect, result)) + + def test_quote_with_unicode(self): + # Characters in Latin-1 range, encoded by default in UTF-8 + given = "\xa2\xd8ab\xff" + expect = "%C2%A2%C3%98ab%C3%BF" + result = urllib.parse.quote(given) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in Latin-1 range, encoded by with None (default) + result = urllib.parse.quote(given, encoding=None, errors=None) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in Latin-1 range, encoded with Latin-1 + given = "\xa2\xd8ab\xff" + expect = "%A2%D8ab%FF" + result = urllib.parse.quote(given, encoding="latin-1") + self.assertEqual(expect, result, + "using quote(): 
%r != %r" % (expect, result)) + # Characters in BMP, encoded by default in UTF-8 + given = "\u6f22\u5b57" # "Kanji" + expect = "%E6%BC%A2%E5%AD%97" + result = urllib.parse.quote(given) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in BMP, encoded with Latin-1 + given = "\u6f22\u5b57" + self.assertRaises(UnicodeEncodeError, urllib.parse.quote, given, + encoding="latin-1") + # Characters in BMP, encoded with Latin-1, with replace error handling + given = "\u6f22\u5b57" + expect = "%3F%3F" # "??" + result = urllib.parse.quote(given, encoding="latin-1", + errors="replace") + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in BMP, Latin-1, with xmlcharref error handling + given = "\u6f22\u5b57" + expect = "%26%2328450%3B%26%2323383%3B" # "漢字" + result = urllib.parse.quote(given, encoding="latin-1", + errors="xmlcharrefreplace") + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + class UnquotingTests(unittest.TestCase): """Tests for unquote() and unquote_plus() @@ -422,23 +520,62 @@ expect = chr(num) result = urllib.parse.unquote(given) self.assertEqual(expect, result, - "using unquote(): %s != %s" % (expect, result)) + "using unquote(): %r != %r" % (expect, result)) result = urllib.parse.unquote_plus(given) self.assertEqual(expect, result, - "using unquote_plus(): %s != %s" % + "using unquote_plus(): %r != %r" % (expect, result)) escape_list.append(given) escape_string = ''.join(escape_list) del escape_list result = urllib.parse.unquote(escape_string) self.assertEqual(result.count('%'), 1, - "using quote(): not all characters escaped; %s" % - result) - result = urllib.parse.unquote(escape_string) - self.assertEqual(result.count('%'), 1, "using unquote(): not all characters escaped: " "%s" % result) + def test_unquoting_badpercent(self): + # Test unquoting on bad percent-escapes + given = '%xab' + expect = given + result = 
urllib.parse.unquote(given) + self.assertEqual(expect, result, "using unquote(): %r != %r" + % (expect, result)) + given = '%x' + expect = given + result = urllib.parse.unquote(given) + self.assertEqual(expect, result, "using unquote(): %r != %r" + % (expect, result)) + given = '%' + expect = given + result = urllib.parse.unquote(given) + self.assertEqual(expect, result, "using unquote(): %r != %r" + % (expect, result)) + # unquote_to_bytes + given = '%xab' + expect = bytes(given, 'ascii') + result = urllib.parse.unquote_to_bytes(given) + self.assertEqual(expect, result, "using unquote_to_bytes(): %r != %r" + % (expect, result)) + given = '%x' + expect = bytes(given, 'ascii') + result = urllib.parse.unquote_to_bytes(given) + self.assertEqual(expect, result, "using unquote_to_bytes(): %r != %r" + % (expect, result)) + given = '%' + expect = bytes(given, 'ascii') + result = urllib.parse.unquote_to_bytes(given) + self.assertEqual(expect, result, "using unquote_to_bytes(): %r != %r" + % (expect, result)) + + def test_unquoting_mixed_case(self): + # Test unquoting on mixed-case hex digits in the percent-escapes + given = '%Ab%eA' + expect = b'\xab\xea' + result = urllib.parse.unquote_to_bytes(given) + self.assertEqual(expect, result, + "using unquote_to_bytes(): %r != %r" + % (expect, result)) + def test_unquoting_parts(self): # Make sure unquoting works when have non-quoted characters # interspersed @@ -446,10 +583,10 @@ expect = "abcd" result = urllib.parse.unquote(given) self.assertEqual(expect, result, - "using quote(): %s != %s" % (expect, result)) + "using quote(): %r != %r" % (expect, result)) result = urllib.parse.unquote_plus(given) self.assertEqual(expect, result, - "using unquote_plus(): %s != %s" % (expect, result)) + "using unquote_plus(): %r != %r" % (expect, result)) def test_unquoting_plus(self): # Test difference between unquote() and unquote_plus() @@ -457,16 +594,85 @@ expect = given result = urllib.parse.unquote(given) self.assertEqual(expect, 
result, - "using unquote(): %s != %s" % (expect, result)) + "using unquote(): %r != %r" % (expect, result)) expect = given.replace('+', ' ') result = urllib.parse.unquote_plus(given) self.assertEqual(expect, result, - "using unquote_plus(): %s != %s" % (expect, result)) + "using unquote_plus(): %r != %r" % (expect, result)) + def test_unquote_to_bytes(self): + given = 'br%C3%BCckner_sapporo_20050930.doc' + expect = b'br\xc3\xbcckner_sapporo_20050930.doc' + result = urllib.parse.unquote_to_bytes(given) + self.assertEqual(expect, result, + "using unquote_to_bytes(): %r != %r" + % (expect, result)) + # Test on a string with unescaped non-ASCII characters + # (Technically an invalid URI; expect those characters to be UTF-8 + # encoded). + result = urllib.parse.unquote_to_bytes("\u6f22%C3%BC") + expect = b'\xe6\xbc\xa2\xc3\xbc' # UTF-8 for "\u6f22\u00fc" + self.assertEqual(expect, result, + "using unquote_to_bytes(): %r != %r" + % (expect, result)) + def test_unquote_with_unicode(self): - r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc') - self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc') + # Characters in the Latin-1 range, encoded with UTF-8 + given = 'br%C3%BCckner_sapporo_20050930.doc' + expect = 'br\u00fcckner_sapporo_20050930.doc' + result = urllib.parse.unquote(given) + self.assertEqual(expect, result, + "using unquote(): %r != %r" % (expect, result)) + # Characters in the Latin-1 range, encoded with None (default) + result = urllib.parse.unquote(given, encoding=None, errors=None) + self.assertEqual(expect, result, + "using unquote(): %r != %r" % (expect, result)) + # Characters in the Latin-1 range, encoded with Latin-1 + result = urllib.parse.unquote('br%FCckner_sapporo_20050930.doc', + encoding="latin-1") + expect = 'br\u00fcckner_sapporo_20050930.doc' + self.assertEqual(expect, result, + "using unquote(): %r != %r" % (expect, result)) + + # Characters in BMP, encoded with UTF-8 + given = "%E6%BC%A2%E5%AD%97" + expect = "\u6f22\u5b57" # 
"Kanji" + result = urllib.parse.unquote(given) + self.assertEqual(expect, result, + "using unquote(): %r != %r" % (expect, result)) + + # Decode with UTF-8, invalid sequence + given = "%F3%B1" + self.assertRaises(UnicodeDecodeError, urllib.parse.unquote, given) + + # Decode with UTF-8, invalid sequence, replace errors + given = "%F3%B1" + expect = "\ufffd" # Replacement character + result = urllib.parse.unquote(given, errors="replace") + self.assertEqual(expect, result, + "using unquote(): %r != %r" % (expect, result)) + + # Decode with UTF-8, invalid sequence, ignoring errors + given = "%F3%B1" + expect = "" + result = urllib.parse.unquote(given, errors="ignore") + self.assertEqual(expect, result, + "using unquote(): %r != %r" % (expect, result)) + + # A mix of non-ASCII and percent-encoded characters, UTF-8 + result = urllib.parse.unquote("\u6f22%C3%BC") + expect = '\u6f22\u00fc' + self.assertEqual(expect, result, + "using unquote(): %r != %r" % (expect, result)) + + # A mix of non-ASCII and percent-encoded characters, Latin-1 + # (Note, the string contains non-Latin-1-representable characters) + result = urllib.parse.unquote("\u6f22%FC", encoding="latin-1") + expect = '\u6f22\u00fc' + self.assertEqual(expect, result, + "using unquote(): %r != %r" % (expect, result)) + class urlencode_Tests(unittest.TestCase): """Tests for urlencode()"""