# HG changeset patch
# Parent 74a9f0d242e9069e62fd6156ea9fc3159f278ea4
Issue #25184: Handle filename encoding issues in pydoc

This supports undecodable bytes in file paths by:

* Percent-encoding the bytes in file: URLs and leaving other non-ASCII
  characters unencoded (technically making them IRIs rather than URLs)
* Encoding HTTP URLs with UTF-8 and surrogatepass
* Substituting question marks in the displayed link text

Some missing html.escape() calls were added as well. Tests based on patch
from Serhiy Storchaka.

diff -r 74a9f0d242e9 Lib/pydoc.py
--- a/Lib/pydoc.py Mon Sep 28 16:53:44 2015 -0700
+++ b/Lib/pydoc.py Thu Oct 01 00:22:10 2015 +0000
@@ -221,6 +221,32 @@
     keyfunc = lambda attr: (field_order.get(attr[0], 0), attr[0])
     attrs.sort(key=keyfunc)
 
+class _IriQuoter(dict):
+    """Mapping to percent encode an IRI (URL that uses Unicode)"""
+    # Inspired by the URL quoting implementation in "urllib.parse"
+
+    _safe = urllib.parse._ALWAYS_SAFE | {ord("/")}
+    _escapes = range(0xDC00, 0xDD00)
+
+    def __missing__(self, ordinal):
+        if ordinal in self._safe:
+            result = ordinal
+        elif ordinal < 128:  # Reserved ASCII character
+            result = "%{:02X}".format(ordinal)
+        elif ordinal in self._escapes:  # Surrogate-escaped byte
+            result = "%{:02X}".format(self._escapes.index(ordinal))
+        else:
+            result = ordinal
+        self[ordinal] = result
+        return result
+
+def _displayable_path(path):
+    """Avoid surrogate-escaped bytes"""
+    displayable = path.encode('utf-8', 'replace').decode('utf-8')
+    if displayable != path:
+        displayable += " (invalid filename encoding)"
+    return displayable
+
 # ----------------------------------------------------- module manipulation
 
 def ispackage(path):
@@ -476,14 +502,16 @@
     escape = _repr_instance.escape
 
     def page(self, title, contents):
-        """Format an HTML page."""
+        """Format an HTML page.
+
+        "title" is plain text, without any markup."""
         return '''\