# HG changeset patch # Parent 74a9f0d242e9069e62fd6156ea9fc3159f278ea4 Issue #25184: Handle filename encoding issues in pydoc This supports undecodable bytes in file paths by: * Percent-encoding the bytes in file: URLs and leaving other non-ASCII characters unencoded (technically making them IRIs rather than URLs) * Encoding HTTP URLs with UTF-8 and surrogatepass * Substituting question marks in the displayed link text Some missing html.escape() calls were added as well. Tests based on patch from Serhiy Storchaka. diff -r 74a9f0d242e9 Lib/pydoc.py --- a/Lib/pydoc.py Mon Sep 28 16:53:44 2015 -0700 +++ b/Lib/pydoc.py Thu Oct 01 00:22:10 2015 +0000 @@ -221,6 +221,32 @@ keyfunc = lambda attr: (field_order.get(attr[0], 0), attr[0]) attrs.sort(key=keyfunc) +class _IriQuoter(dict): + """Mapping to percent encode an IRI (URL that uses Unicode)""" + # Inspired by the URL quoting implementation in "urllib.parse" + + _safe = urllib.parse._ALWAYS_SAFE | {ord("/")} + _escapes = range(0xDC00, 0xDD00) + + def __missing__(self, ordinal): + if ordinal in self._safe: + result = ordinal + elif ordinal < 128: # Reserved ASCII character + result = "%{:02X}".format(ordinal) + elif ordinal in self._escapes: # Surrogate-escaped byte + result = "%{:02X}".format(self._escapes.index(ordinal)) + else: + result = ordinal + self[ordinal] = result + return result + +def _displayable_path(path): + """Avoid surrogate-escaped bytes""" + displayable = path.encode('utf-8', 'replace').decode('utf-8') + if displayable != path: + displayable += " (invalid filename encoding)" + return displayable + # ----------------------------------------------------- module manipulation def ispackage(path): @@ -476,14 +502,16 @@ escape = _repr_instance.escape def page(self, title, contents): - """Format an HTML page.""" + """Format an HTML page. + + "title" is plain text, without any markup.""" return '''\ Python: %s %s -''' % (title, contents) +''' % (html.escape(title), contents) def heading(self, title, fgcol, bgcol, extras=''): """Format a page heading.""" @@ -519,8 +547,11 @@ return result + '\n%s' % contents def bigsection(self, title, *args): - """Format a section with a big heading.""" - title = '%s' % title + """Format a section with a big heading. + + "title" is plain text, without any markup.""" + + title = '%s' % html.escape(title) return self.section(title, *args) def preformat(self, text): @@ -577,9 +608,9 @@ text = name return '%s' % (url, text) - def filelink(self, url, path): - """Make a link to source file.""" - return '%s' % (url, path) + def filelink(self, path): + """Make a URL or IRI to source file.""" + return "file:" + path.translate(_IriQuoter()) def markup(self, text, escape=None, funcs={}, classes={}, methods={}): """Mark up some plain text, given a context of symbols to look for. @@ -660,8 +691,9 @@ head = '%s' % linkedname try: path = inspect.getabsfile(object) - url = urllib.parse.quote(path) - filelink = self.filelink(url, path) + url = self.filelink(path) + path = _displayable_path(path) + filelink = '%s' % (url, html.escape(path)) except TypeError: filelink = '(built-in)' info = [] @@ -1016,6 +1048,7 @@ modpkgs.sort() contents = self.multicolumn(modpkgs, self.modpkglink) + dir = _displayable_path(dir) return self.bigsection(dir, '#ffffff', '#ee77aa', contents) # -------------------------------------------- text documentation generator @@ -2298,10 +2331,12 @@ Pydoc: %s %s%s
%s
-''' % (title, css_link, html_navbar(), contents) +''' % (html.escape(title), css_link, html_navbar(), contents) - def filelink(self, url, path): - return '%s' % (url, path) + def filelink(self, path): + url = urllib.parse.quote( + path, encoding='utf-8', errors='surrogatepass') + return "getfile?key=" + url html = _HTMLDoc() @@ -2388,13 +2423,14 @@ def html_getfile(path): """Get and display a source file listing safely.""" - path = urllib.parse.unquote(path) + path = urllib.parse.unquote(path, 'utf-8', 'surrogatepass') with tokenize.open(path) as fp: lines = html.escape(fp.read()) body = '
%s
' % lines heading = html.heading( 'File Listing', '#ffffff', '#7799ee') + path = _displayable_path(path) contents = heading + html.bigsection( 'File: %s' % path, '#ffffff', '#ee77aa', body) return 'getfile %s' % path, contents diff -r 74a9f0d242e9 Lib/test/test_pydoc.py --- a/Lib/test/test_pydoc.py Mon Sep 28 16:53:44 2015 -0700 +++ b/Lib/test/test_pydoc.py Thu Oct 01 00:22:10 2015 +0000 @@ -10,6 +10,7 @@ import _pickle import pkgutil import re +import shutil import stat import string import test.support @@ -23,10 +24,11 @@ from collections import namedtuple from test.support.script_helper import assert_python_ok from test.support import ( - TESTFN, rmtree, + TESTFN, TESTFN_UNDECODABLE, TESTFN_UNENCODABLE, rmtree, reap_children, reap_threads, captured_output, captured_stdout, - captured_stderr, unlink, requires_docstrings + captured_stderr, unlink, requires_docstrings, temp_dir ) +from unittest.mock import patch from test import pydoc_mod try: @@ -347,7 +349,8 @@ "Returns pydoc generated output as html" doc = pydoc.HTMLDoc() output = doc.docmodule(module) - loc = doc.getdocloc(pydoc_mod) or "" + output.encode("utf-8", "strict") # Should not have any surrogate escapes + loc = doc.getdocloc(module) or "" if loc: loc = "
Module Docs" return output.strip(), loc @@ -355,7 +358,7 @@ def get_pydoc_text(module): "Returns pydoc generated output as text" doc = pydoc.TextDoc() - loc = doc.getdocloc(pydoc_mod) or "" + loc = doc.getdocloc(module) or "" if loc: loc = "\nMODULE DOCS\n " + loc + "\n" @@ -413,6 +416,30 @@ expected_html_data_docstrings) self.assertEqual(result, expected_html) + def test_html_path_encoding(self): + for path in (TESTFN_UNDECODABLE, TESTFN_UNENCODABLE): + if not path: + continue + with self.subTest(repr(path)): + path = os.fsdecode(path) + path = os.path.join(path, 'pydoc_awkward_path.py') + mod = types.ModuleType('test.pydoc_awkward_path') + mod.__file__ = os.path.abspath(path) + result, _ = get_pydoc_html(mod) + + # Extract the link URL and displayed filename + result, _ = result.split( + ' (invalid filename encoding)', 2) + link, _, text = result.rpartition('">') + _, _, link = link.rpartition('= 2, "Docstrings are omitted with -O2 and above") @unittest.skipIf(hasattr(sys, 'gettrace') and sys.gettrace(), @@ -870,6 +897,14 @@ class PydocUrlHandlerTest(PydocBaseTest): """Tests for pydoc._url_handler""" + def call_url_handler(self, url, expected_title): + text = pydoc._url_handler(url, "text/html") + result = get_html_title(text) + # Checking the title can ensure an error page was not returned + self.assertEqual(result, expected_title, text) + text.encode("utf-8", "strict") # Should have no surrogate escapes + return text + def test_content_type_err(self): f = pydoc._url_handler self.assertRaises(TypeError, f, 'A', '') @@ -896,16 +931,49 @@ with self.restrict_walk_packages(): for url, title in requests: - text = pydoc._url_handler(url, "text/html") - result = get_html_title(text) - self.assertEqual(result, title, text) + self.call_url_handler(url, title) path = string.__file__ title = "Pydoc: getfile " + path url = "getfile?key=" + path - text = pydoc._url_handler(url, "text/html") - result = get_html_title(text) - self.assertEqual(result, title) + self.call_url_handler(url, title) + + def test_path_encoding(self): + # Test handling of tricky paths + for dir in (TESTFN_UNDECODABLE, TESTFN_UNENCODABLE): + if not dir: + continue + with self.subTest(repr(dir)), temp_dir(dir): + dir = os.fsdecode(dir) + path = os.path.join(dir, "test_mod_awkward_path.py") + shutil.copyfile(pydoc_mod.__file__, path) + saved_paths = tuple(sys.path) + sys.path.append(dir) + try: + text = self.call_url_handler( + "index", "Pydoc: Index of Modules") + self.assertIn("invalid filename encoding", text) + + text = self.call_url_handler( + "test_mod_awkward_path", + "Pydoc: module test_mod_awkward_path") + import test_mod_awkward_path + self.assertEqual( + test_mod_awkward_path.__file__, path, + "Should not be a cached path") + # Extract the link URL and displayed filename + text, _ = text.split( + ' (invalid filename encoding)', 2) + link, _, text = text.rpartition('">') + _, _, link = link.rpartition('