diff --git a/Doc/library/difflib.rst b/Doc/library/difflib.rst --- a/Doc/library/difflib.rst +++ b/Doc/library/difflib.rst @@ -99,17 +99,18 @@ diffs. For comparing directories and fil broken and wrapped, defaults to ``None`` where lines are not wrapped. *linejunk* and *charjunk* are optional keyword arguments passed into ``ndiff()`` (used by :class:`HtmlDiff` to generate the side by side HTML differences). See ``ndiff()`` documentation for argument default values and descriptions. The following methods are public: - .. method:: make_file(fromlines, tolines, fromdesc='', todesc='', context=False, numlines=5) + .. method:: make_file(fromlines, tolines, fromdesc='', todesc='', context=False, \ + numlines=5, *, charset='utf-8') Compares *fromlines* and *tolines* (lists of strings) and returns a string which is a complete HTML file containing a table showing line by line differences with inter-line and intra-line changes highlighted. *fromdesc* and *todesc* are optional keyword arguments to specify from/to file column header strings (both default to an empty string). @@ -118,16 +119,20 @@ diffs. For comparing directories and fil ``False`` to show the full files. *numlines* defaults to ``5``. When *context* is ``True`` *numlines* controls the number of context lines which surround the difference highlights. When *context* is ``False`` *numlines* controls the number of lines which are shown before a difference highlight when using the "next" hyperlinks (setting to zero would cause the "next" hyperlinks to place the next difference highlight at the top of the browser without any leading context). + .. versionchanged:: 3.5 + *charset* keyword-only argument was added. The default charset of HTML document + changed from ``'ISO-8859-1'`` to ``'utf-8'``. + .. method:: make_table(fromlines, tolines, fromdesc='', todesc='', context=False, numlines=5) Compares *fromlines* and *tolines* (lists of strings) and returns a string which is a complete HTML table showing line by line differences with inter-line and intra-line changes highlighted. The arguments for this method are the same as those for the :meth:`make_file` method. diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst --- a/Doc/whatsnew/3.5.rst +++ b/Doc/whatsnew/3.5.rst @@ -220,16 +220,24 @@ contextlib ---------- * The new :func:`contextlib.redirect_stderr` context manager(similar to :func:`contextlib.redirect_stdout`) makes it easier for utility scripts to handle inflexible APIs that write their output to :data:`sys.stderr` and don't provide any options to redirect it. (Contributed by Berker Peksag in :issue:`22389`.) +difflib +------- + +* The charset of the HTML document generated by :meth:`difflib.HtmlDiff.make_file` + can now be customized by using *charset* keyword-only parameter. The default + charset of HTML document changed from ``'ISO-8859-1'`` to ``'utf-8'``. + (Contributed by Berker Peksag in :issue:`2052`.) + distutils --------- * The ``build`` and ``build_ext`` commands now accept a ``-j`` option to enable parallel building of extension modules. (Contributed by Antoine Pitrou in :issue:`5309`.) doctest diff --git a/Lib/difflib.py b/Lib/difflib.py --- a/Lib/difflib.py +++ b/Lib/difflib.py @@ -1593,17 +1593,17 @@ def _mdiff(fromlines, tolines, context=N _file_template = """ + content="text/html; charset=%(charset)s" /> %(table)s%(legend)s @@ -1680,39 +1680,47 @@ class HtmlDiff(object): HtmlDiff() to generate the side by side HTML differences). See ndiff() documentation for argument default values and descriptions. """ self._tabsize = tabsize self._wrapcolumn = wrapcolumn self._linejunk = linejunk self._charjunk = charjunk - def make_file(self,fromlines,tolines,fromdesc='',todesc='',context=False, - numlines=5): + def make_file(self, fromlines, tolines, fromdesc='', todesc='', + context=False, numlines=5, *, charset='utf-8'): """Returns HTML file of side by side comparison with change highlights Arguments: fromlines -- list of "from" lines tolines -- list of "to" lines fromdesc -- "from" file column header string todesc -- "to" file column header string context -- set to True for contextual differences (defaults to False which shows full differences). numlines -- number of context lines. When context is set True, controls number of lines displayed before and after the change. When context is False, controls the number of lines to place the "next" link anchors before the next change (so click of "next" link jumps to just before the change). + charset -- charset of the HTML document """ + table = self.make_table(fromlines, tolines, fromdesc, todesc, + context=context, numlines=numlines) + styles = self._styles.encode(charset, + 'xmlcharrefreplace').decode(charset) + legend = self._legend.encode(charset, + 'xmlcharrefreplace').decode(charset) return self._file_template % dict( - styles = self._styles, - legend = self._legend, - table = self.make_table(fromlines,tolines,fromdesc,todesc, - context=context,numlines=numlines)) + styles=styles, + legend=legend, + table=table.encode(charset, 'xmlcharrefreplace').decode(charset), + charset=charset + ) def _tab_newline_replace(self,fromlines,tolines): """Returns from/to line lists with tabs expanded and newlines removed. Instead of tab characters being replaced by the number of spaces needed to fill in to the next tab stop, this function will fill the space with tab characters. This is done so that the difference algorithms can identify changes in a file when tabs are replaced by diff --git a/Lib/test/test_difflib.py b/Lib/test/test_difflib.py --- a/Lib/test/test_difflib.py +++ b/Lib/test/test_difflib.py @@ -102,16 +102,30 @@ patch914575_from1 = """ patch914575_to1 = """ 1. Beautiful is better than ugly. 3. Simple is better than complex. 4. Complicated is better than complex. 5. Flat is better than nested. """ +patch914575_nonascii_from1 = """ + 1. Beautiful is beTTer than ugly. + 2. Explicit is better than ımplıcıt. + 3. Simple is better than complex. + 4. Complex is better than complicated. +""" + +patch914575_nonascii_to1 = """ + 1. Beautiful is better than ügly. + 3. Sımple is better than complex. + 4. Complicated is better than cömplex. + 5. Flat is better than nested. +""" + patch914575_from2 = """ \t\tLine 1: preceeded by from:[tt] to:[ssss] \t\tLine 2: preceeded by from:[sstt] to:[sssst] \t \tLine 3: preceeded by from:[sstst] to:[ssssss] Line 4: \thas from:[sst] to:[sss] after : Line 5: has from:[t] to:[ss] at end\t """ @@ -218,16 +232,37 @@ class TestSFpatches(unittest.TestCase): def test_recursion_limit(self): # Check if the problem described in patch #1413711 exists. limit = sys.getrecursionlimit() old = [(i%2 and "K:%d" or "V:A:%d") % i for i in range(limit*2)] new = [(i%2 and "K:%d" or "V:B:%d") % i for i in range(limit*2)] difflib.SequenceMatcher(None, old, new).get_opcodes() + def test_make_file_default_charset(self): + html_diff = difflib.HtmlDiff() + output = html_diff.make_file(patch914575_from1.splitlines(), + patch914575_to1.splitlines()) + self.assertIn('content="text/html; charset=utf-8"', output) + + def test_make_file_iso88591_charset(self): + html_diff = difflib.HtmlDiff() + output = html_diff.make_file(patch914575_from1.splitlines(), + patch914575_to1.splitlines(), + charset='iso-8859-1') + self.assertIn('content="text/html; charset=iso-8859-1"', output) + + def test_make_file_usascii_charset_with_nonascii_input(self): + html_diff = difflib.HtmlDiff() + output = html_diff.make_file(patch914575_nonascii_from1.splitlines(), + patch914575_nonascii_to1.splitlines(), + charset='us-ascii') + self.assertIn('content="text/html; charset=us-ascii"', output) + self.assertIn('ımplıcıt', output) + class TestOutputFormat(unittest.TestCase): def test_tab_delimiter(self): args = ['one', 'two', 'Original', 'Current', '2005-01-26 23:30:50', '2010-04-02 10:20:52'] ud = difflib.unified_diff(*args, lineterm='') self.assertEqual(list(ud)[0:2], [ "--- Original\t2005-01-26 23:30:50", diff --git a/Lib/test/test_difflib_expect.html b/Lib/test/test_difflib_expect.html --- a/Lib/test/test_difflib_expect.html +++ b/Lib/test/test_difflib_expect.html @@ -1,17 +1,17 @@ + content="text/html; charset=utf-8" />