From e62f4e83abc1f467110ace7337e65dab320c4e76 Mon Sep 17 00:00:00 2001
From: Tim Burke <tim.burke@gmail.com>
Date: Fri, 20 Sep 2019 15:55:03 -0700
Subject: [PATCH] bpo-38216: Only forbid CR, LF, and SP in http URLs

---
 Lib/http/client.py      |  7 +++---
 Lib/test/test_urllib.py | 47 ++++++++++++++++++++++++++++++++---------
 2 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/Lib/http/client.py b/Lib/http/client.py
index dd23edcd59..aff8fe7f81 100644
--- a/Lib/http/client.py
+++ b/Lib/http/client.py
@@ -145,10 +145,9 @@ _is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search
 #  https://tools.ietf.org/html/rfc3986#appendix-A pchar definition.
 # Prevents CVE-2019-9740.  Includes control characters such as \r\n.
 # We don't restrict chars above \x7f as putrequest() limits us to ASCII.
-_contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f]')
-# Arguably only these _should_ allowed:
-#  _is_allowed_url_pchars_re = re.compile(r"^[/!$&'()*+,;=:@%a-zA-Z0-9._~-]+$")
-# We are more lenient for assumed real world compatibility purposes.
+_contains_disallowed_url_pchar_re = re.compile('[\r\n ]')
+# Note that this is much more lenient than what is allowed on master, to
+# ensure smoother upgrades within a minor release.
 
 # We always set the Content-Length header for these methods because some
 # servers will otherwise respond with a 411
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py
index 7ec365b928..33c26c476c 100644
--- a/Lib/test/test_urllib.py
+++ b/Lib/test/test_urllib.py
@@ -332,9 +332,11 @@ class urlopen_HttpTests(unittest.TestCase, FakeHTTPMixin, FakeFTPMixin):
 
     @unittest.skipUnless(ssl, "ssl module required")
     def test_url_with_control_char_rejected(self):
-        for char_no in list(range(0, 0x21)) + [0x7f]:
+        for char_no in range(129):
             char = chr(char_no)
+            quoted_char = urllib.parse.quote(char)
             schemeless_url = f"//localhost:7777/test{char}/"
+            escaped_url = f"//localhost:7777/test{quoted_char}/"
             self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.")
             try:
                 # We explicitly test urllib.request.urlopen() instead of the top
@@ -345,15 +347,40 @@ class urlopen_HttpTests(unittest.TestCase, FakeHTTPMixin, FakeFTPMixin):
                 # above attempts at injection within the url _path_ safe.
                 escaped_char_repr = repr(char).replace('\\', r'\\')
                 InvalidURL = http.client.InvalidURL
-                with self.assertRaisesRegex(
-                    InvalidURL, f"contain control.*{escaped_char_repr}"):
-                    urllib.request.urlopen(f"http:{schemeless_url}")
-                with self.assertRaisesRegex(
-                    InvalidURL, f"contain control.*{escaped_char_repr}"):
-                    urllib.request.urlopen(f"https:{schemeless_url}")
-                # This code path quotes the URL so there is no injection.
-                resp = urlopen(f"http:{schemeless_url}")
-                self.assertNotIn(char, resp.geturl())
+                if char in '\r\n ':
+                    with self.assertRaisesRegex(
+                        InvalidURL, f"contain control.*{escaped_char_repr}"):
+                        urllib.request.urlopen(f"http:{schemeless_url}")
+                    with self.assertRaisesRegex(
+                        InvalidURL, f"contain control.*{escaped_char_repr}"):
+                        urllib.request.urlopen(f"https:{schemeless_url}")
+                    # This code path quotes the URL so there is no injection.
+                    resp = urlopen(f"http:{schemeless_url}")
+                    self.assertNotEqual(f"http:{schemeless_url}", resp.geturl())
+                    self.assertEqual(f"http:{escaped_url}", resp.geturl())
+                    self.assertEqual(f"GET /test{quoted_char}/ HTTP/1.1\r".encode('ascii'),
+                                     http.client.HTTPConnection.buf.split(b'\n', 1)[0])
+                elif char == '\x80':
+                    with self.assertRaises(UnicodeEncodeError):
+                        urllib.request.urlopen(f"http:{schemeless_url}")
+                    with self.assertRaises(UnicodeEncodeError):
+                        urllib.request.urlopen(f"https:{schemeless_url}")
+                    with self.assertRaisesRegex(
+                        UnicodeError, r"URL \'.*\' contains non-ASCII characters"):
+                        urlopen(f"http:{schemeless_url}")
+                elif char == '#':
+                    resp = urllib.request.urlopen(f"http:{schemeless_url}")
+                    self.assertEqual(f"http:{schemeless_url}", resp.geturl())
+                    # Note that the number sign is used to delimit
+                    # fragments, which should only be used by the client.
+                    # As such, it does not show up on the wire.
+                    self.assertEqual(f"GET /test HTTP/1.1\r".encode('ascii'),
+                                     http.client.HTTPConnection.buf.split(b'\n', 1)[0])
+                else:
+                    resp = urllib.request.urlopen(f"http:{schemeless_url}")
+                    self.assertEqual(f"http:{schemeless_url}", resp.geturl())
+                    self.assertEqual(f"GET /test{char}/ HTTP/1.1\r".encode('ascii'),
+                                     http.client.HTTPConnection.buf.split(b'\n', 1)[0])
             finally:
                 self.unfakehttp()
 
-- 
2.21.0