--- parse.py.patch7 2008-08-01 00:46:11.000000000 +1000 +++ parse.py.patch8 2008-08-08 00:48:37.000000000 +1000 @@ -1,8 +1,8 @@ Index: Doc/library/urllib.parse.rst =================================================================== ---- Doc/library/urllib.parse.rst (revision 65324) +--- Doc/library/urllib.parse.rst (revision 65574) +++ Doc/library/urllib.parse.rst (working copy) -@@ -182,37 +182,75 @@ +@@ -182,37 +182,81 @@ string. If there is no fragment identifier in *url*, return *url* unmodified and an empty string. @@ -20,8 +20,8 @@ + The optional *encoding* and *errors* parameters specify how to deal with + non-ASCII characters, as accepted by the :meth:`str.encode` method. + *encoding* defaults to ``'utf-8'``. -+ *errors* defaults to ``'replace'``, meaning unsupported characters are -+ replaced by a placeholder character. ++ *errors* defaults to ``'strict'``, meaning unsupported characters raise a ++ :class:`UnicodeEncodeError`. + *encoding* and *errors* are ignored if *string* is a :class:`bytes`. -.. function:: quote_plus(string[, safe]) @@ -40,8 +40,8 @@ -.. function:: unquote(string) +.. function:: quote_from_bytes(bytes[, safe]) -+ An alias for :func:`quote`, intended for use with a :class:`bytes` object -+ rather than a :class:`str`. ++ Like :func:`quote`, but accepts a :class:`bytes` object rather than a ++ :class:`str`, and does not perform string-to-bytes encoding. + + Example: ``quote_from_bytes(b'/El Ni\xc3\xb1o/')`` yields + ``'/El%20Ni%C3%B1o/'``. @@ -54,26 +54,32 @@ + :meth:`bytes.decode` method. - Example: ``unquote('/%7Econnolly/')`` yields ``'/~connolly/'``. -+ *encoding* defaults to ``'utf-8'``. -+ *errors* defaults to ``'replace'``, meaning invalid sequences are -+ replaced by a placeholder character. ++ *string* must be a :class:`str`. -+ Example: ``unquote('/El%20Ni%C3%B1o/')`` yields ``'/El Niño/'``. ++ *encoding* defaults to ``'utf-8'``. ++ *errors* defaults to ``'strict'``, meaning invalid sequences raise ++ a :class:`UnicodeDecodeError`. -.. function:: unquote_plus(string) ++ Example: ``unquote('/El%20Ni%C3%B1o/')`` yields ``'/El Niño/'``. ++ +.. function:: unquote_plus(string[, encoding[, errors]]) + Like :func:`unquote`, but also replace plus signs by spaces, as required for unquoting HTML form values. -+ Example: ``unquote_plus('/El+Ni%C3%B1o/')`` yields ``'/El Niño/'``. ++ *string* must be a :class:`str`. ++ Example: ``unquote_plus('/El+Ni%C3%B1o/')`` yields ``'/El Niño/'``. ++ +.. function:: unquote_to_bytes(string) + + Replace ``%xx`` escapes by their single-octet equivalent, and return a + :class:`bytes` object. + ++ *string* must be a :class:`str`. ++ + Unescaped non-ASCII characters in the input string are encoded into UTF-8 + bytes. + @@ -86,20 +92,23 @@ Convert a mapping object or a sequence of two-element tuples to a "url-encoded" Index: Lib/urllib/parse.py =================================================================== ---- Lib/urllib/parse.py (revision 65324) +--- Lib/urllib/parse.py (revision 65574) +++ Lib/urllib/parse.py (working copy) -@@ -260,50 +260,98 @@ +@@ -7,7 +7,9 @@ + import sys + + __all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag", +- "urlsplit", "urlunsplit"] ++ "urlsplit", "urlunsplit", ++ "quote", "quote_plus", "quote_from_bytes", ++ "unquote", "unquote_plus", "unquote_to_bytes"] + + # A classification of schemes ('' means apply by default) + uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap', +@@ -260,50 +262,94 @@ else: return url, '' -+# _hextochr maps 2-hex-digit strings onto single bytes -+# eg. _hextochr['2f'] = b'\x2f' -+# Maps lowercase and uppercase variants (but not mixed case). -+_hextochr = dict(('%02x' % i, bytes([i])) for i in range(256)) -+_hextochr.update(('%02X' % i, bytes([i])) for i in range(256)) - --_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) --_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) +def unquote_to_bytes(s): + """unquote_to_bytes('abc%20def') -> b'abc def'.""" + # Note: strings are encoded as UTF-8. This is only an issue if it contains @@ -109,23 +118,27 @@ + for i in range(1, len(res)): + item = res[i] + try: -+ res[i] = _hextochr[item[:2]] + item[2:].encode('utf-8') ++ res[i] = bytes.fromhex(item[:2]) + item[2:].encode('utf-8') + except KeyError: + res[i] = b'%' + item.encode('utf-8') + return b"".join(res) --def unquote(s): -- """unquote('abc%20def') -> 'abc def'.""" -+def unquote(s, encoding = "utf-8", errors = "replace"): +-_hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) +-_hextochr.update(('%02X' % i, chr(i)) for i in range(256)) ++def unquote(s, encoding='utf-8', errors='strict'): + """Replace %xx escapes by their single-character equivalent. The optional + encoding and errors parameters specify how to decode percent-encoded + sequences into Unicode characters, as accepted by the bytes.decode() + method. + By default, percent-encoded sequences are decoded with UTF-8, and invalid -+ sequences are replaced by a placeholder character. -+ ++ sequences raise a UnicodeDecodeError. + +-def unquote(s): +- """unquote('abc%20def') -> 'abc def'.""" + unquote('abc%20def') -> 'abc def'. + """ ++ if encoding is None: encoding = 'utf-8' ++ if errors is None: errors = 'strict' + # pct_sequence: contiguous sequence of percent-encoded bytes, decoded + # (list of single-byte bytes objects) + pct_sequence = [] @@ -134,12 +147,13 @@ item = res[i] try: - res[i] = _hextochr[item[:2]] + item[2:] -+ pct_sequence.append(_hextochr[item[:2]]) ++ pct_sequence.append(bytes.fromhex(item[:2])) + rest = item[2:] except KeyError: - res[i] = '%' + item - except UnicodeDecodeError: - res[i] = chr(int(item[:2], 16)) + item[2:] +- return "".join(res) + rest = '%' + item + if len(rest) == 0: + # This segment was just a single percent-encoded character. @@ -155,11 +169,11 @@ + # Flush the final pct_sequence + # res[-1] will always be empty if pct_sequence != [] + res[-1] = b''.join(pct_sequence).decode(encoding, errors) - return "".join(res) ++ return ''.join(res) -def unquote_plus(s): - """unquote('%7e/abc+def') -> '~/abc def'""" -+def unquote_plus(s, encoding = "utf-8", errors = "replace"): ++def unquote_plus(s, encoding='utf-8', errors='strict'): + """Like unquote(), but also replace plus signs by spaces, as required for + unquoting HTML form values. + @@ -205,11 +219,11 @@ + return res -def quote(s, safe = '/'): -+def quote(s, safe = '/', encoding = "utf-8", errors = "replace"): ++def quote(s, safe='/', encoding='utf-8', errors='strict'): """quote('abc def') -> 'abc%20def' Each part of a URL, e.g. the path info, the query, etc., has a -@@ -323,8 +371,18 @@ +@@ -323,8 +369,20 @@ is reserved, but in typical usage the quote function is being called on a path where the existing slash characters are used as reserved characters. @@ -217,8 +231,10 @@ + The optional encoding and errors parameters specify how to deal with + non-ASCII characters, as accepted by the str.encode method. + By default, characters are encoded with UTF-8, and unsupported characters -+ are replaced by a placeholder character. ++ raise a UnicodeEncodeError. """ ++ if encoding is None: encoding = 'utf-8' ++ if errors is None: errors = 'strict' + if isinstance(safe, str): + # Normalize 'safe' by converting to bytes and removing non-ASCII chars + safe = safe.encode('ascii', 'ignore') @@ -228,46 +244,58 @@ try: quoter = _safe_quoters[cachekey] except KeyError: -@@ -333,13 +391,19 @@ +@@ -333,13 +391,32 @@ res = map(quoter, s) return ''.join(res) -def quote_plus(s, safe = ''): - """Quote the query fragment of a URL; replacing ' ' with '+'""" -+def quote_plus(s, safe = '', encoding = "utf-8", errors = "replace"): +- if ' ' in s: +- s = quote(s, safe + ' ') ++def quote_plus(s, safe='', encoding='utf-8', errors='strict'): + """Like quote(), but also replace ' ' with '+', as required for quoting + HTML form values. Plus signs in the original string are escaped unless + they are included in safe. It also does not have safe default to '/'. + """ - if ' ' in s: - s = quote(s, safe + ' ') ++ # Check if ' ' in s, where s may either be a str or bytes ++ if ' ' in s if isinstance(s, str) else b' ' in s: ++ s = quote(s, safe + ' ' if isinstance(safe, str) else safe + b' ') return s.replace(' ', '+') - return quote(s, safe) + return quote(s, safe, encoding, errors) -+# quote accepts either bytes or strings, so quote_from_bytes is just an alias -+quote_from_bytes = quote ++def quote_from_bytes(s, safe='/'): ++ if isinstance(safe, str): ++ # Normalize 'safe' by converting to bytes and removing non-ASCII chars ++ safe = safe.encode('ascii', 'ignore') ++ cachekey = (safe, always_safe) ++ if not isinstance(s, bytes) or isinstance(s, bytearray): ++ raise TypeError("quote_from_bytes() expected a bytes") ++ try: ++ quoter = _safe_quoters[cachekey] ++ except KeyError: ++ quoter = Quoter(safe) ++ _safe_quoters[cachekey] = quoter ++ res = map(quoter, s) ++ return ''.join(res) + def urlencode(query,doseq=0): """Encode a sequence of two-element tuples or dictionary into a URL query string. Index: Lib/email/utils.py =================================================================== ---- Lib/email/utils.py (revision 65324) +--- Lib/email/utils.py (revision 65574) +++ Lib/email/utils.py (working copy) -@@ -219,7 +219,10 @@ +@@ -219,7 +219,7 @@ charset is given but not language, the string is encoded using the empty string for language. """ - s = urllib.parse.quote(s, safe='') -+ try: -+ s = urllib.parse.quote(s, safe='', encoding=charset) -+ except: -+ s = urllib.parse.quote(s, safe='') ++ s = urllib.parse.quote(s, safe='', encoding=charset) if charset is None and language is None: return s if language is None: -@@ -271,7 +274,10 @@ +@@ -271,7 +271,10 @@ # language specifiers at the beginning of the string. for num, s, encoded in continuations: if encoded: @@ -281,7 +309,7 @@ value = quote(EMPTYSTRING.join(value)) Index: Lib/test/test_http_cookiejar.py =================================================================== ---- Lib/test/test_http_cookiejar.py (revision 65324) +--- Lib/test/test_http_cookiejar.py (revision 65574) +++ Lib/test/test_http_cookiejar.py (working copy) @@ -539,6 +539,8 @@ # unquoted unsafe @@ -304,7 +332,7 @@ cookie = interact_2965( Index: Lib/test/test_cgi.py =================================================================== ---- Lib/test/test_cgi.py (revision 65324) +--- Lib/test/test_cgi.py (revision 65574) +++ Lib/test/test_cgi.py (working copy) @@ -68,6 +68,8 @@ ("&a=b", [('a', 'b')]), @@ -317,7 +345,7 @@ parse_strict_test_cases = [ Index: Lib/test/test_wsgiref.py =================================================================== ---- Lib/test/test_wsgiref.py (revision 65324) +--- Lib/test/test_wsgiref.py (revision 65574) +++ Lib/test/test_wsgiref.py (working copy) @@ -291,6 +291,7 @@ def testAppURIs(self): @@ -337,16 +365,36 @@ self.checkReqURI("http://127.0.0.1/spammity/spam?say=ni", Index: Lib/test/test_urllib.py =================================================================== ---- Lib/test/test_urllib.py (revision 65324) +--- Lib/test/test_urllib.py (revision 65574) +++ Lib/test/test_urllib.py (working copy) -@@ -355,6 +355,23 @@ +@@ -336,10 +336,10 @@ + "_.-"]) + result = urllib.parse.quote(do_not_quote) + self.assertEqual(do_not_quote, result, +- "using quote(): %s != %s" % (do_not_quote, result)) ++ "using quote(): %r != %r" % (do_not_quote, result)) + result = urllib.parse.quote_plus(do_not_quote) + self.assertEqual(do_not_quote, result, +- "using quote_plus(): %s != %s" % (do_not_quote, result)) ++ "using quote_plus(): %r != %r" % (do_not_quote, result)) + + def test_default_safe(self): + # Test '/' is default value for 'safe' parameter +@@ -350,11 +350,28 @@ + quote_by_default = "<>" + result = urllib.parse.quote(quote_by_default, safe=quote_by_default) self.assertEqual(quote_by_default, result, - "using quote_plus(): %s != %s" % +- "using quote(): %s != %s" % (quote_by_default, result)) ++ "using quote(): %r != %r" % (quote_by_default, result)) + result = urllib.parse.quote_plus(quote_by_default, safe=quote_by_default) + self.assertEqual(quote_by_default, result, +- "using quote_plus(): %s != %s" % ++ "using quote_plus(): %r != %r" % (quote_by_default, result)) + # Safe expressed as bytes rather than str + result = urllib.parse.quote(quote_by_default, safe=b"<>") + self.assertEqual(quote_by_default, result, -+ "using quote(): %s != %s" % (quote_by_default, result)) ++ "using quote(): %r != %r" % (quote_by_default, result)) + # "Safe" non-ASCII characters should have no effect + # (Since URIs are not allowed to have non-ASCII characters) + result = urllib.parse.quote("a\xfcb", encoding="latin-1", safe="\xfc") @@ -363,9 +411,50 @@ def test_default_quoting(self): # Make sure all characters that should be quoted are by default sans -@@ -407,6 +424,56 @@ +@@ -378,35 +395,100 @@ + expected = "ab%5B%5Dcd" + result = urllib.parse.quote(partial_quote) + self.assertEqual(expected, result, +- "using quote(): %s != %s" % (expected, result)) ++ "using quote(): %r != %r" % (expected, result)) + self.assertEqual(expected, result, +- "using quote_plus(): %s != %s" % (expected, result)) ++ "using quote_plus(): %r != %r" % (expected, result)) + + def test_quoting_space(self): + # Make sure quote() and quote_plus() handle spaces as specified in + # their unique way + result = urllib.parse.quote(' ') + self.assertEqual(result, hexescape(' '), +- "using quote(): %s != %s" % (result, hexescape(' '))) ++ "using quote(): %r != %r" % (result, hexescape(' '))) + result = urllib.parse.quote_plus(' ') + self.assertEqual(result, '+', +- "using quote_plus(): %s != +" % result) ++ "using quote_plus(): %r != +" % result) + given = "a b cd e f" + expect = given.replace(' ', hexescape(' ')) + result = urllib.parse.quote(given) + self.assertEqual(expect, result, +- "using quote(): %s != %s" % (expect, result)) ++ "using quote(): %r != %r" % (expect, result)) + expect = given.replace(' ', '+') + result = urllib.parse.quote_plus(given) + self.assertEqual(expect, result, +- "using quote_plus(): %s != %s" % (expect, result)) ++ "using quote_plus(): %r != %r" % (expect, result)) + + def test_quoting_plus(self): + self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma'), + 'alpha%2Bbeta+gamma') self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma', '+'), 'alpha+beta+gamma') ++ # Test with bytes ++ self.assertEqual(urllib.parse.quote_plus(b'alpha+beta gamma'), ++ 'alpha%2Bbeta+gamma') ++ # Test with safe bytes ++ self.assertEqual(urllib.parse.quote_plus('alpha+beta gamma', b'+'), ++ 'alpha+beta+gamma') + def test_quote_bytes(self): + # Bytes should quote directly to percent-encoded values @@ -391,6 +480,10 @@ + result = urllib.parse.quote(given) + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) ++ # Characters in Latin-1 range, encoded by with None (default) ++ result = urllib.parse.quote(given, encoding=None, errors=None) ++ self.assertEqual(expect, result, ++ "using quote(): %r != %r" % (expect, result)) + # Characters in Latin-1 range, encoded with Latin-1 + given = "\xa2\xd8ab\xff" + expect = "%A2%D8ab%FF" @@ -405,8 +498,13 @@ + "using quote(): %r != %r" % (expect, result)) + # Characters in BMP, encoded with Latin-1 + given = "\u6f22\u5b57" ++ self.assertRaises(UnicodeEncodeError, urllib.parse.quote, given, ++ encoding="latin-1") ++ # Characters in BMP, encoded with Latin-1, with replace error handling ++ given = "\u6f22\u5b57" + expect = "%3F%3F" # "??" -+ result = urllib.parse.quote(given, encoding="latin-1") ++ result = urllib.parse.quote(given, encoding="latin-1", ++ errors="replace") + self.assertEqual(expect, result, + "using quote(): %r != %r" % (expect, result)) + # Characters in BMP, Latin-1, with xmlcharref error handling @@ -420,9 +518,65 @@ class UnquotingTests(unittest.TestCase): """Tests for unquote() and unquote_plus() -@@ -463,10 +530,71 @@ +@@ -422,23 +504,28 @@ + expect = chr(num) + result = urllib.parse.unquote(given) + self.assertEqual(expect, result, +- "using unquote(): %s != %s" % (expect, result)) ++ "using unquote(): %r != %r" % (expect, result)) + result = urllib.parse.unquote_plus(given) + self.assertEqual(expect, result, +- "using unquote_plus(): %s != %s" % ++ "using unquote_plus(): %r != %r" % + (expect, result)) + escape_list.append(given) + escape_string = ''.join(escape_list) + del escape_list + result = urllib.parse.unquote(escape_string) + self.assertEqual(result.count('%'), 1, +- "using quote(): not all characters escaped; %s" % +- result) +- result = urllib.parse.unquote(escape_string) +- self.assertEqual(result.count('%'), 1, + "using unquote(): not all characters escaped: " + "%s" % result) + ++ def test_unquoting_mixed_case(self): ++ # Test unquoting on mixed-case hex digits in the percent-escapes ++ given = '%Ab%eA' ++ expect = b'\xab\xea' ++ result = urllib.parse.unquote_to_bytes(given) ++ self.assertEqual(expect, result, ++ "using unquote_to_bytes(): %r != %r" ++ % (expect, result)) ++ + def test_unquoting_parts(self): + # Make sure unquoting works when have non-quoted characters + # interspersed +@@ -446,10 +533,10 @@ + expect = "abcd" + result = urllib.parse.unquote(given) + self.assertEqual(expect, result, +- "using quote(): %s != %s" % (expect, result)) ++ "using quote(): %r != %r" % (expect, result)) + result = urllib.parse.unquote_plus(given) + self.assertEqual(expect, result, +- "using unquote_plus(): %s != %s" % (expect, result)) ++ "using unquote_plus(): %r != %r" % (expect, result)) + + def test_unquoting_plus(self): + # Test difference between unquote() and unquote_plus() +@@ -457,16 +544,85 @@ + expect = given + result = urllib.parse.unquote(given) self.assertEqual(expect, result, - "using unquote_plus(): %s != %s" % (expect, result)) +- "using unquote(): %s != %s" % (expect, result)) ++ "using unquote(): %r != %r" % (expect, result)) + expect = given.replace('+', ' ') + result = urllib.parse.unquote_plus(given) + self.assertEqual(expect, result, +- "using unquote_plus(): %s != %s" % (expect, result)) ++ "using unquote_plus(): %r != %r" % (expect, result)) + def test_unquote_to_bytes(self): + given = 'br%C3%BCckner_sapporo_20050930.doc' @@ -449,6 +603,10 @@ + result = urllib.parse.unquote(given) + self.assertEqual(expect, result, + "using unquote(): %r != %r" % (expect, result)) ++ # Characters in the Latin-1 range, encoded with None (default) ++ result = urllib.parse.unquote(given, encoding=None, errors=None) ++ self.assertEqual(expect, result, ++ "using unquote(): %r != %r" % (expect, result)) + # Characters in the Latin-1 range, encoded with Latin-1 + result = urllib.parse.unquote('br%FCckner_sapporo_20050930.doc', @@ -466,8 +624,12 @@ + + # Decode with UTF-8, invalid sequence + given = "%F3%B1" ++ self.assertRaises(UnicodeDecodeError, urllib.parse.unquote, given) ++ ++ # Decode with UTF-8, invalid sequence, replace errors ++ given = "%F3%B1" + expect = "\ufffd" # Replacement character -+ result = urllib.parse.unquote(given) ++ result = urllib.parse.unquote(given, errors="replace") + self.assertEqual(expect, result, + "using unquote(): %r != %r" % (expect, result)) +