Index: Lib/urllib/parse.py
===================================================================
--- Lib/urllib/parse.py (revision 64818)
+++ Lib/urllib/parse.py (working copy)
@@ -261,26 +261,29 @@
         return url, ''
 
 
-_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
-_hextochr.update(('%02X' % i, chr(i)) for i in range(256))
+_hextochr = dict((('%02x' % i).encode('ascii'), bytes([i]))
+                 for i in range(256))
+_hextochr.update((('%02X' % i).encode('ascii'), bytes([i]))
+                 for i in range(256))
 
-def unquote(s):
+def unquote(s, encoding = "utf-8", errors = "replace"):
     """unquote('abc%20def') -> 'abc def'."""
-    res = s.split('%')
+    # Manipulate as bytes rather than str, so the unquoted octets
+    # can be decoded using `encoding` at the end.
+    res = s.encode("utf-8").split(b'%')
     for i in range(1, len(res)):
         item = res[i]
         try:
             res[i] = _hextochr[item[:2]] + item[2:]
         except KeyError:
-            res[i] = '%' + item
-        except UnicodeDecodeError:
-            res[i] = chr(int(item[:2], 16)) + item[2:]
-    return "".join(res)
+            res[i] = b'%' + item
+    # With errors="replace", malformed sequences become '\ufffd'.
+    return b"".join(res).decode(encoding, errors)
 
-def unquote_plus(s):
-    """unquote('%7e/abc+def') -> '~/abc def'"""
+def unquote_plus(s, encoding = "utf-8", errors = "replace"):
+    """unquote_plus('%7e/abc+def') -> '~/abc def'"""
     s = s.replace('+', ' ')
-    return unquote(s)
+    return unquote(s, encoding, errors)
 
 always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                'abcdefghijklmnopqrstuvwxyz'
@@ -288,22 +291,25 @@
 _safe_quoters= {}
 
 class Quoter:
-    def __init__(self, safe):
+    def __init__(self, safe, encoding = "utf-8", errors = "replace"):
         self.cache = {}
         self.safe = safe + always_safe
+        self.encoding = encoding
+        self.errors = errors
 
     def __call__(self, c):
         try:
             return self.cache[c]
         except KeyError:
-            if ord(c) < 256:
-                res = (c in self.safe) and c or ('%%%02X' % ord(c))
-                self.cache[c] = res
-                return res
+            if c in self.safe:
+                res = c
             else:
-                return "".join(['%%%02X' % i for i in c.encode("utf-8")])
+                res = "".join(['%%%02X' % i for i in
+                               c.encode(self.encoding, self.errors)])
+            self.cache[c] = res
+            return res
 
-def quote(s, safe = '/'):
+def quote(s, safe = '/', encoding = "utf-8", errors = "replace"):
     """quote('abc def') -> 'abc%20def'
 
     Each part of a URL, e.g. the path info, the query, etc., has a
@@ -324,21 +330,21 @@
     called on a path where the existing slash characters are used as
     reserved characters.
     """
-    cachekey = (safe, always_safe)
+    cachekey = (safe, always_safe, encoding, errors)
     try:
         quoter = _safe_quoters[cachekey]
     except KeyError:
-        quoter = Quoter(safe)
+        quoter = Quoter(safe, encoding, errors)
         _safe_quoters[cachekey] = quoter
     res = map(quoter, s)
     return ''.join(res)
 
-def quote_plus(s, safe = ''):
+def quote_plus(s, safe = '', encoding = "utf-8", errors = "replace"):
     """Quote the query fragment of a URL; replacing ' ' with '+'"""
     if ' ' in s:
-        s = quote(s, safe + ' ')
+        s = quote(s, safe + ' ', encoding, errors)
         return s.replace(' ', '+')
-    return quote(s, safe)
+    return quote(s, safe, encoding, errors)
 
 def urlencode(query,doseq=0):
     """Encode a sequence of two-element tuples or dictionary into a URL query string.
Index: Lib/email/utils.py
===================================================================
--- Lib/email/utils.py (revision 64818)
+++ Lib/email/utils.py (working copy)
@@ -219,7 +219,9 @@
     charset is given but not language, the string is encoded using the empty
     string for language.
     """
-    s = urllib.parse.quote(s, safe='')
+    # charset defaults to None, which str.encode() rejects; fall back to
+    # quote()'s own default encoding in that case.
+    s = urllib.parse.quote(s, safe='', encoding=charset or 'utf-8')
     if charset is None and language is None:
         return s
     if language is None:
@@ -271,7 +273,10 @@
             # language specifiers at the beginning of the string.
             for num, s, encoded in continuations:
                 if encoded:
-                    s = urllib.parse.unquote(s)
+                    # Decode as "latin-1", so the characters in s directly
+                    # represent the percent-encoded octet values.
+                    # collapse_rfc2231_value treats this as an octet sequence.
+                    s = urllib.parse.unquote(s, encoding="latin-1")
                     extended = True
                 value.append(s)
             value = quote(EMPTYSTRING.join(value))
Index: Lib/test/test_http_cookiejar.py
===================================================================
--- Lib/test/test_http_cookiejar.py (revision 64818)
+++ Lib/test/test_http_cookiejar.py (working copy)
@@ -1444,7 +1444,8 @@
         # Try some URL encodings of the PATHs.
         # (the behaviour here has changed from libwww-perl)
         c = CookieJar(DefaultCookiePolicy(rfc2965=True))
-        interact_2965(c, "http://www.acme.com/foo%2f%25/%3c%3c%0Anew%E5/%E5",
+        interact_2965(c, "http://www.acme.com/foo%2f%25/"
+                         "%3c%3c%0Anew%C3%A5/%C3%A5",
                       "foo = bar; version = 1")
 
         cookie = interact_2965(
Index: Lib/test/test_urllib.py
===================================================================
--- Lib/test/test_urllib.py (revision 64818)
+++ Lib/test/test_urllib.py (working copy)
@@ -465,7 +465,7 @@
 
     def test_unquote_with_unicode(self):
         r = urllib.parse.unquote('br%C3%BCckner_sapporo_20050930.doc')
-        self.assertEqual(r, 'br\xc3\xbcckner_sapporo_20050930.doc')
+        self.assertEqual(r, 'br\u00fcckner_sapporo_20050930.doc')
 
 class urlencode_Tests(unittest.TestCase):
     """Tests for urlencode()"""