Index: Doc/library/urllib.parse.rst
===================================================================
--- Doc/library/urllib.parse.rst	(revision 74563)
+++ Doc/library/urllib.parse.rst	(working copy)
@@ -89,7 +89,7 @@
    object.
 
 
-.. function:: parse_qs(qs[, keep_blank_values[, strict_parsing]])
+.. function:: parse_qs(qs[, keep_blank_values[, strict_parsing[, encoding[, errors]]]])
 
    Parse a query string given as a string argument (data of type
    :mimetype:`application/x-www-form-urlencoded`).  Data are returned as a
@@ -106,11 +106,15 @@
    parsing errors.  If false (the default), errors are silently ignored.  If true,
    errors raise a :exc:`ValueError` exception.
 
+   The optional arguments *encoding* and *errors* determine how percent-encoded
+   sequences in the query string are decoded into Unicode characters.  They
+   default to ``'utf-8'`` and ``'replace'``, respectively.
+
    Use the :func:`urllib.parse.urlencode` function to convert such dictionaries into
   query strings.
 
 
-.. function:: parse_qsl(qs[, keep_blank_values[, strict_parsing]])
+.. function:: parse_qsl(qs[, keep_blank_values[, strict_parsing[, encoding[, errors]]]])
 
    Parse a query string given as a string argument (data of type
    :mimetype:`application/x-www-form-urlencoded`).  Data are returned as a list of
@@ -126,6 +130,10 @@
    parsing errors.  If false (the default), errors are silently ignored.  If true,
    errors raise a :exc:`ValueError` exception.
 
+   The optional arguments *encoding* and *errors* determine how percent-encoded
+   sequences in the query string are decoded into Unicode characters.  They
+   default to ``'utf-8'`` and ``'replace'``, respectively.
+
    Use the :func:`urllib.parse.urlencode` function to convert such lists of pairs into
   query strings.
 
@@ -302,7 +310,7 @@
    ``b'a&\xef'``.
 
 
-.. function:: urlencode(query[, doseq])
+.. function:: urlencode(query[, doseq[, safe[, encoding[, errors]]]])
 
    Convert a mapping object or a sequence of two-element tuples to a "url-encoded"
    string, suitable to pass to :func:`urlopen` above as the optional *data*
@@ -318,6 +326,9 @@
    :func:`parse_qs` and :func:`parse_qsl` which are used to parse query strings
    into Python data structures.
 
+   The optional *encoding* and *errors* parameters specify how strings should
+   be encoded into bytes before being percent-escaped.  If not specified, they
+   default to ``'utf-8'`` and ``'strict'``, respectively.
 
 .. seealso::
 
Index: Lib/urllib/parse.py
===================================================================
--- Lib/urllib/parse.py	(revision 74563)
+++ Lib/urllib/parse.py	(working copy)
@@ -329,33 +329,41 @@
             res[-1] = b''.join(pct_sequence).decode(encoding, errors)
     return ''.join(res)
 
-def parse_qs(qs, keep_blank_values=0, strict_parsing=0):
+def parse_qs(qs, keep_blank_values=0, strict_parsing=0, encoding='utf-8',
+             errors='replace'):
     """Parse a query given as a string argument.
 
-        Arguments:
+    Arguments:
 
-        qs: URL-encoded query string to be parsed
+    qs: URL-encoded query string to be parsed
 
-        keep_blank_values: flag indicating whether blank values in
-            URL encoded queries should be treated as blank strings.
-            A true value indicates that blanks should be retained as
-            blank strings.  The default false value indicates that
-            blank values are to be ignored and treated as if they were
-            not included.
+    keep_blank_values: flag indicating whether blank values in
+        URL encoded queries should be treated as blank strings.
+        A true value indicates that blanks should be retained as
+        blank strings.  The default false value indicates that
+        blank values are to be ignored and treated as if they were
+        not included.
 
-        strict_parsing: flag indicating what to do with parsing errors.
-            If false (the default), errors are silently ignored.
-            If true, errors raise a ValueError exception.
+    strict_parsing: flag indicating what to do with parsing errors.
+        If false (the default), errors are silently ignored.
+        If true, errors raise a ValueError exception.
+
+    encoding: character encoding used to decode percent-encoded sequences.
+        Defaults to UTF-8.
+
+    errors: error handling scheme used when decoding; defaults to 'replace'.
     """
     dict = {}
-    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing):
+    for name, value in parse_qsl(qs, keep_blank_values, strict_parsing,
+                                 encoding, errors):
         if name in dict:
             dict[name].append(value)
         else:
             dict[name] = [value]
     return dict
 
-def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
+def parse_qsl(qs, keep_blank_values=0, strict_parsing=0, encoding='utf-8',
+              errors='replace'):
     """Parse a query given as a string argument.
 
     Arguments:
@@ -372,7 +380,12 @@
         false (the default), errors are silently ignored. If true,
         errors raise a ValueError exception.
 
-    Returns a list, as G-d intended.
+    encoding: character encoding used to decode percent-encoded sequences.
+        Defaults to UTF-8.
+
+    errors: error handling scheme used when decoding; defaults to 'replace'.
+
+    Returns a list of (key, value) tuples.
     """
     pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
     r = []
@@ -389,8 +402,8 @@
         else:
             continue
         if len(nv[1]) or keep_blank_values:
-            name = unquote(nv[0].replace('+', ' '))
-            value = unquote(nv[1].replace('+', ' '))
+            name = unquote(nv[0].replace('+', ' '), encoding, errors)
+            value = unquote(nv[1].replace('+', ' '), encoding, errors)
             r.append((name, value))
     return r
 
@@ -509,7 +522,7 @@
         _safe_quoters[cachekey] = quoter
     return ''.join([quoter[char] for char in bs])
 
-def urlencode(query, doseq=0):
+def urlencode(query, doseq=0, safe='', encoding=None, errors=None):
     """Encode a sequence of two-element tuples or dictionary into a URL query string.
 
     If any values in the query arg are sequences and doseq is true, each
@@ -539,30 +552,21 @@
         raise TypeError("not a valid non-string sequence "
                         "or mapping object").with_traceback(tb)
 
+    def universal_quote(obj):
+        """Turn an object into a urlquoted string, handling bytes sanely"""
+        if isinstance(obj, bytes):
+            return quote_plus(obj, safe)
+        else:
+            return quote_plus(str(obj), safe, encoding, errors)
+
     l = []
-    if not doseq:
-        for k, v in query:
-            k = quote_plus(str(k))
-            v = quote_plus(str(v))
-            l.append(k + '=' + v)
-    else:
-        for k, v in query:
-            k = quote_plus(str(k))
-            if isinstance(v, str):
-                v = quote_plus(v)
-                l.append(k + '=' + v)
-            else:
-                try:
-                    # Is this a sufficient test for sequence-ness?
-                    x = len(v)
-                except TypeError:
-                    # not a sequence
-                    v = quote_plus(str(v))
-                    l.append(k + '=' + v)
-                else:
-                    # loop over the sequence
-                    for elt in v:
-                        l.append(k + '=' + quote_plus(str(elt)))
+    for k, v in query:
+        k = universal_quote(k)
+        if doseq and not isinstance(v, (str, bytes)) and isinstance(v, collections.Sized):
+            for elt in v:
+                l.append(k + '=' + universal_quote(elt))
+        else:
+            l.append(k + '=' + universal_quote(v))
     return '&'.join(l)
 
 # Utilities to parse URLs (most of these return None for missing parts):
Index: Lib/test/test_urllib.py
===================================================================
--- Lib/test/test_urllib.py	(revision 74563)
+++ Lib/test/test_urllib.py	(working copy)
@@ -786,6 +786,10 @@
     def test_nonstring_values(self):
         self.assertEqual("a=1", urllib.parse.urlencode({"a": 1}))
         self.assertEqual("a=None", urllib.parse.urlencode({"a": None}))
+        self.assertEqual("a=1%FF", urllib.parse.urlencode({b"a": b"1\xff"}))
+        # Make sure we don't treat bytes as a sequence
+        self.assertEqual("a=b%FF",
+                         urllib.parse.urlencode({b"a": b"b\xff"}, True))
 
     def test_nonstring_seq_values(self):
         self.assertEqual("a=1&a=2", urllib.parse.urlencode({"a": [1, 2]}, True))
@@ -793,7 +797,32 @@
                          urllib.parse.urlencode({"a": [None, "a"]}, True))
         self.assertEqual("a=a&a=b",
                          urllib.parse.urlencode({"a": {"a": 1, "b": 1}}, True))
+        self.assertEqual("a=b&a=%FF",
+                         urllib.parse.urlencode({b"a": ["b", b"\xff"]}, True))
+
+    def test_encodings(self):
+        given = [("\u00a0", "\u00c1")]
+        expect = "%A0=%C1"
+        result = urllib.parse.urlencode(given, encoding='latin-1')
+        self.assertEqual(expect, result)
+        given = [("\u00a0", "\u0100")]
+        expect = "%A0=%3F"
+        result = urllib.parse.urlencode(given, encoding='latin-1', errors='replace')
+        self.assertEqual(expect, result)
+        expect = "%A0="
+        result = urllib.parse.urlencode(given, encoding='latin-1', errors='ignore')
+        self.assertEqual(expect, result)
+        self.assertRaises(UnicodeEncodeError, urllib.parse.urlencode,
+                          given, encoding='latin-1')
+        expect = "%3F=%3F"
+        result = urllib.parse.urlencode(given, encoding='ascii', errors='replace')
+        self.assertEqual(expect, result)
+        expect = "%C2%A0=%C4%80"
+        result = urllib.parse.urlencode(given)
+        self.assertEqual(expect, result)
+        result = urllib.parse.urlencode(given, encoding='utf-8')
+        self.assertEqual(expect, result)
 
 
 class Pathname_Tests(unittest.TestCase):
     """Test pathname2url() and url2pathname()"""
Index: Lib/test/test_urlparse.py
===================================================================
--- Lib/test/test_urlparse.py	(revision 74563)
+++ Lib/test/test_urlparse.py	(working copy)
@@ -23,6 +23,10 @@
     ("&a=b", [('a', 'b')]),
     ("a=a+b&b=b+c", [('a', 'a b'), ('b', 'b c')]),
     ("a=1&a=2", [('a', '1'), ('a', '2')]),
+    ("\u1000=\u1001&\u1002=\u1003",
+     [('\u1000', '\u1001'), ('\u1002', '\u1003')]),
+    ("a=%C4%80&%e1%80%80=%f0%90%80%80",
+     [('a', '\u0100'), ('\u1000', '\U00010000')]),
 ]
 
 class UrlParseTestCase(unittest.TestCase):
@@ -83,6 +87,24 @@
             result = urllib.parse.parse_qsl(orig, keep_blank_values=True)
             self.assertEqual(result, expect, "Error parsing %s" % repr(orig))
+        # Test encoding parameters
+        given = "%A0=%FF"
+        expect = [("\xa0", "\xff")]
+        result = urllib.parse.parse_qsl(given, encoding='latin-1')
+        self.assertEqual(result, expect)
+        expect = [("\ufffd", "\ufffd")]
+        result = urllib.parse.parse_qsl(given, encoding='ascii')
+        self.assertEqual(result, expect)
+        result = urllib.parse.parse_qsl(given, encoding='ascii', errors='replace')
+        self.assertEqual(result, expect)
+        result = urllib.parse.parse_qsl(given)  # use UTF-8 by default
+        self.assertEqual(result, expect)
+        self.assertRaises(UnicodeDecodeError, urllib.parse.parse_qsl,
+                          given, encoding='utf-8', errors='strict')
+        # pair is kept: the blank-value check happens before percent-decoding
+        expect = [("", "")]
+        result = urllib.parse.parse_qsl(given, encoding='ascii', errors='ignore')
+        self.assertEqual(result, expect)
 
     def test_roundtrips(self):
         testcases = [
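
For review purposes, a quick usage sketch (not part of the diff itself) of how the new *encoding* and *errors* parameters are expected to behave once the patch is applied; the values mirror the test cases above:

    from urllib.parse import parse_qsl, urlencode

    # parse_qs()/parse_qsl() decode percent-escapes with the given encoding;
    # the defaults are UTF-8 with the 'replace' handler, so undecodable bytes
    # come back as U+FFFD instead of raising.
    print(parse_qsl("%A0=%FF", encoding='latin-1'))  # [('\xa0', '\xff')]
    print(parse_qsl("%A0=%FF"))                      # [('\ufffd', '\ufffd')]

    # urlencode() encodes str objects to bytes with the given encoding
    # (defaults: UTF-8, 'strict') before percent-escaping; bytes values are
    # percent-escaped as-is.
    print(urlencode([('\u00a0', '\u00c1')], encoding='latin-1'))  # %A0=%C1
    print(urlencode([('\u00a0', '\u00c1')]))                      # %C2%A0=%C3%81
    print(urlencode({b'a': b'1\xff'}))                            # a=1%FF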