diff -r e532937914fc -r fa4c6160c518 Doc/library/difflib.rst --- a/Doc/library/difflib.rst Thu Apr 16 18:54:56 2015 -0400 +++ b/Doc/library/difflib.rst Thu Apr 16 21:19:10 2015 -0400 @@ -315,6 +315,20 @@ See :ref:`difflib-interface` for a more detailed example. +.. function:: diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'', fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\\n') + + Compare *a* and *b* (lists of bytes objects) using *dfunc*; yield a + sequence of delta lines (also bytes) in the format returned by *dfunc*. + *dfunc* must be a callable, typically either :func:`unified_diff` or + :func:`context_diff`. + + Allows you to compare data with unknown or inconsistent encoding. All + inputs except *n* must be bytes objects, not str. Works by losslessly + converting all inputs (except *n*) to str, and calling *dfunc(a, b, + fromfile, tofile, fromfiledate, tofiledate, n, lineterm)*. The output of + *dfunc* is then converted back to bytes, so the delta lines that you + receive have the same unknown/inconsistent encodings as *a* and *b*. + .. function:: IS_LINE_JUNK(line) diff -r e532937914fc -r fa4c6160c518 Doc/whatsnew/3.5.rst --- a/Doc/whatsnew/3.5.rst Thu Apr 16 18:54:56 2015 -0400 +++ b/Doc/whatsnew/3.5.rst Thu Apr 16 21:19:10 2015 -0400 @@ -302,6 +302,9 @@ charset of HTML document changed from ``'ISO-8859-1'`` to ``'utf-8'``. (Contributed by Berker Peksag in :issue:`2052`.) +* It's now possible to compare lists of byte strings with + :func:`difflib.diff_bytes` (fixes a regression from Python 2). 
+ distutils --------- diff -r e532937914fc -r fa4c6160c518 Lib/difflib.py --- a/Lib/difflib.py Thu Apr 16 18:54:56 2015 -0400 +++ b/Lib/difflib.py Thu Apr 16 21:19:10 2015 -0400 @@ -28,7 +28,7 @@ __all__ = ['get_close_matches', 'ndiff', 'restore', 'SequenceMatcher', 'Differ','IS_CHARACTER_JUNK', 'IS_LINE_JUNK', 'context_diff', - 'unified_diff', 'HtmlDiff', 'Match'] + 'unified_diff', 'diff_bytes', 'HtmlDiff', 'Match'] from heapq import nlargest as _nlargest from collections import namedtuple as _namedtuple @@ -1174,6 +1174,7 @@ four """ + _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm) started = False for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): if not started: @@ -1261,6 +1262,7 @@ four """ + _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm) prefix = dict(insert='+ ', delete='- ', replace='! ', equal=' ') started = False for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n): @@ -1292,6 +1294,53 @@ for line in b[j1:j2]: yield prefix[tag] + line +def _check_types(a, b, *args): + # Checking types is weird, but the alternative is garbled output when + # someone passes mixed bytes and str to {unified,context}_diff(). E.g. + # without this check, passing filenames as bytes results in output like + # --- b'oldfile.txt' + # +++ b'newfile.txt' + # because of how str.format() incorporates bytes objects. + if a and not isinstance(a[0], str): + raise TypeError('lines to compare must be str, not %s (%r)' % + (type(a[0]).__name__, a[0])) + if b and not isinstance(b[0], str): + raise TypeError('lines to compare must be str, not %s (%r)' % + (type(b[0]).__name__, b[0])) + for arg in args: + if not isinstance(arg, str): + raise TypeError('all arguments must be str, not: %r' % (arg,)) + +def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'', + fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'): + r""" + Compare `a` and `b`, two sequences of lines represented as bytes rather + than str. 
This is a wrapper for `dfunc`, which is typically either + unified_diff() or context_diff(). Inputs are losslessly converted to + strings so that `dfunc` only has to worry about strings, and encoded + back to bytes on return. This is necessary to compare files with + unknown or inconsistent encoding. All other inputs (except `n`) must be + bytes rather than str. + """ + def decode(s): + try: + return s.decode('ascii', 'surrogateescape') + except AttributeError as err: + msg = ('all arguments must be bytes, not %s (%r)' % + (type(s).__name__, s)) + raise TypeError(msg) from err + a = list(map(decode, a)) + b = list(map(decode, b)) + fromfile = decode(fromfile) + tofile = decode(tofile) + fromfiledate = decode(fromfiledate) + tofiledate = decode(tofiledate) + lineterm = decode(lineterm) + + lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm) + for line in lines: + yield line.encode('ascii', 'surrogateescape') + def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK): r""" Compare `a` and `b` (lists of strings); return a `Differ`-style delta. 
diff -r e532937914fc -r fa4c6160c518 Lib/test/test_difflib.py --- a/Lib/test/test_difflib.py Thu Apr 16 18:54:56 2015 -0400 +++ b/Lib/test/test_difflib.py Thu Apr 16 21:19:10 2015 -0400 @@ -322,12 +322,157 @@ self.assertEqual(fmt(0,0), '0') +class TestBytes(unittest.TestCase): + # don't really care about the content of the output, just the fact + # that it's bytes and we don't crash + def check(self, diff): + diff = list(diff) # trigger exceptions first + for line in diff: + self.assertIsInstance( + line, bytes, + "all lines of diff should be bytes, but got: %r" % line) + + def test_byte_content(self): + # if we receive byte strings, we return byte strings + a = [b'hello', b'andr\xe9'] # iso-8859-1 bytes + b = [b'hello', b'andr\xc3\xa9'] # utf-8 bytes + + unified = difflib.unified_diff + context = difflib.context_diff + + check = self.check + check(difflib.diff_bytes(unified, a, a)) + check(difflib.diff_bytes(unified, a, b)) + + # now with filenames (content and filenames are all bytes!) 
+ check(difflib.diff_bytes(unified, a, a, b'a', b'a')) + check(difflib.diff_bytes(unified, a, b, b'a', b'b')) + + # and with filenames and dates + check(difflib.diff_bytes(unified, a, a, b'a', b'a', b'2005', b'2013')) + check(difflib.diff_bytes(unified, a, b, b'a', b'b', b'2005', b'2013')) + + # same all over again, with context diff + check(difflib.diff_bytes(context, a, a)) + check(difflib.diff_bytes(context, a, b)) + check(difflib.diff_bytes(context, a, a, b'a', b'a')) + check(difflib.diff_bytes(context, a, b, b'a', b'b')) + check(difflib.diff_bytes(context, a, a, b'a', b'a', b'2005', b'2013')) + check(difflib.diff_bytes(context, a, b, b'a', b'b', b'2005', b'2013')) + + def test_byte_filenames(self): + # somebody renamed a file from ISO-8859-2 to UTF-8 + fna = b'\xb3odz.txt' # "łodz.txt" + fnb = b'\xc5\x82odz.txt' + + # they transcoded the content at the same time + a = [b'\xa3odz is a city in Poland.'] + b = [b'\xc5\x81odz is a city in Poland.'] + + check = self.check + unified = difflib.unified_diff + context = difflib.context_diff + check(difflib.diff_bytes(unified, a, b, fna, fnb)) + check(difflib.diff_bytes(context, a, b, fna, fnb)) + + def assertDiff(expect, actual): + # do not compare expect and actual as lists, because unittest + # uses difflib to report difference between lists + actual = list(actual) + self.assertEqual(len(expect), len(actual)) + for e, a in zip(expect, actual): + self.assertEqual(e, a) + + expect = [ + b'--- \xb3odz.txt', + b'+++ \xc5\x82odz.txt', + b'@@ -1 +1 @@', + b'-\xa3odz is a city in Poland.', + b'+\xc5\x81odz is a city in Poland.', + ] + actual = difflib.diff_bytes(unified, a, b, fna, fnb, lineterm=b'') + assertDiff(expect, actual) + + # with dates (plain ASCII) + datea = b'2005-03-18' + dateb = b'2005-03-19' + check(difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb)) + check(difflib.diff_bytes(context, a, b, fna, fnb, datea, dateb)) + + expect = [ + # note the mixed encodings here: this is deeply wrong by every + # 
tenet of Unicode, but it doesn't crash, it's parseable by + # patch, and it's how UNIX(tm) diff behaves + b'--- \xb3odz.txt\t2005-03-18', + b'+++ \xc5\x82odz.txt\t2005-03-19', + b'@@ -1 +1 @@', + b'-\xa3odz is a city in Poland.', + b'+\xc5\x81odz is a city in Poland.', + ] + actual = difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb, + lineterm=b'') + assertDiff(expect, actual) + + def test_mixed_types_content(self): + # type of input content must be consistent: all str or all bytes + a = [b'hello'] + b = ['hello'] + + unified = difflib.unified_diff + context = difflib.context_diff + + expect = "lines to compare must be str, not bytes (b'hello')" + self._assert_type_error(expect, unified, a, b) + self._assert_type_error(expect, unified, b, a) + self._assert_type_error(expect, context, a, b) + self._assert_type_error(expect, context, b, a) + + expect = "all arguments must be bytes, not str ('hello')" + self._assert_type_error(expect, difflib.diff_bytes, unified, a, b) + self._assert_type_error(expect, difflib.diff_bytes, unified, b, a) + self._assert_type_error(expect, difflib.diff_bytes, context, a, b) + self._assert_type_error(expect, difflib.diff_bytes, context, b, a) + + def test_mixed_types_filenames(self): + # cannot pass filenames as bytes if content is str (this may not be + # the right behaviour, but at least the test demonstrates how + # things work) + a = ['hello\n'] + b = ['ohell\n'] + fna = b'ol\xe9.txt' # filename transcoded from ISO-8859-1 + fnb = b'ol\xc3a9.txt' # to UTF-8 + self._assert_type_error( + "all arguments must be str, not: b'ol\\xe9.txt'", + difflib.unified_diff, a, b, fna, fnb) + + def test_mixed_types_dates(self): + # type of dates must be consistent with type of contents + a = [b'foo\n'] + b = [b'bar\n'] + datea = '1 fév' + dateb = '3 fév' + self._assert_type_error( + "all arguments must be bytes, not str ('1 fév')", + difflib.diff_bytes, difflib.unified_diff, + a, b, b'a', b'b', datea, dateb) + + # if input is str, non-ASCII 
dates are fine + a = ['foo\n'] + b = ['bar\n'] + list(difflib.unified_diff(a, b, 'a', 'b', datea, dateb)) + + def _assert_type_error(self, msg, generator, *args): + with self.assertRaises(TypeError) as ctx: + list(generator(*args)) + self.assertEqual(msg, str(ctx.exception)) + + def test_main(): difflib.HtmlDiff._default_prefix = 0 Doctests = doctest.DocTestSuite(difflib) run_unittest( TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs, - TestOutputFormat, Doctests) + TestOutputFormat, TestBytes, Doctests) if __name__ == '__main__': test_main() diff -r e532937914fc -r fa4c6160c518 Misc/NEWS --- a/Misc/NEWS Thu Apr 16 18:54:56 2015 -0400 +++ b/Misc/NEWS Thu Apr 16 21:19:10 2015 -0400 @@ -173,6 +173,10 @@ - Issue #23310: Fix MagicMock's initializer to work with __methods__, just like configure_mock(). Patch by Kasia Jachim. +- Issue #17445: add difflib.diff_bytes() to support comparison of + byte strings (fixes a regression from Python 2). + + Build -----