diff -r e55cc0834e9c -r 13161c1d9c5f Lib/difflib.py
--- a/Lib/difflib.py	Wed Apr 15 17:08:45 2015 -0400
+++ b/Lib/difflib.py	Wed Apr 15 09:26:22 2015 -0400
@@ -1173,7 +1173,7 @@
     +tree
      four
     """
-
+    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
     started = False
     for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
         if not started:
@@ -1260,7 +1260,7 @@
     ! tree
       four
     """
-
+    _check_types(a, b, fromfile, tofile, fromfiledate, tofiledate, lineterm)
     prefix = dict(insert='+ ', delete='- ', replace='! ', equal='  ')
     started = False
     for group in SequenceMatcher(None,a,b).get_grouped_opcodes(n):
@@ -1292,6 +1292,49 @@
                     for line in b[j1:j2]:
                         yield prefix[tag] + line
 
+def _check_types(a, b, *args):
+    # Checking types is weird, but it's better than producing garbled
+    # output.  E.g. if someone passes filenames as bytes to unified_diff(),
+    # without this check we would produce output lines like:
+    #   --- b'oldfile.txt'
+    #   +++ b'newfile.txt'
+    # (because that's how str.format() incorporates bytes objects).
+    if a and not isinstance(a[0], str):
+        raise TypeError('lines to compare must be str, not %r' % (a[0],))
+    if b and not isinstance(b[0], str):
+        raise TypeError('lines to compare must be str, not %r' % (b[0],))
+    for arg in args:
+        if not isinstance(arg, str):
+            raise TypeError('all arguments must be str, not: %r' % (arg,))
+
+def diff_bytes(dfunc, a, b, fromfile=b'', tofile=b'',
+               fromfiledate=b'', tofiledate=b'', n=3, lineterm=b'\n'):
+    r"""
+    Compare `a` and `b`, two sequences of lines represented as bytes rather
+    than strings.  This is a wrapper for `dfunc`, which is typically either
+    unified_diff() or context_diff() (the function object, not the name).
+    All inputs are losslessly converted to strings so that `dfunc` only has
+    to worry about strings, and encoded back to bytes on return.  This is
+    necessary to compare files with unknown or inconsistent encoding.  All
+    other inputs (except `n`) must be bytes rather than str.
+    """
+    def decode(s):
+        try:
+            return s.decode('ascii', 'surrogateescape')
+        except AttributeError:
+            raise TypeError('all arguments must be bytes, not %r' % (s,))
+    a = list(map(decode, a))
+    b = list(map(decode, b))
+    fromfile = decode(fromfile)
+    tofile = decode(tofile)
+    fromfiledate = decode(fromfiledate)
+    tofiledate = decode(tofiledate)
+    lineterm = decode(lineterm)
+
+    lines = dfunc(a, b, fromfile, tofile, fromfiledate, tofiledate, n, lineterm)
+    for line in lines:
+        yield line.encode('ascii', 'surrogateescape')
+
 def ndiff(a, b, linejunk=None, charjunk=IS_CHARACTER_JUNK):
     r"""
     Compare `a` and `b` (lists of strings); return a `Differ`-style delta.
diff -r e55cc0834e9c -r 13161c1d9c5f Lib/test/test_difflib.py
--- a/Lib/test/test_difflib.py	Wed Apr 15 17:08:45 2015 -0400
+++ b/Lib/test/test_difflib.py	Wed Apr 15 09:26:22 2015 -0400
@@ -322,12 +322,160 @@
         self.assertEqual(fmt(0,0), '0')
 
 
+class TestBytes(unittest.TestCase):
+    # don't really care about the content of the output, just the fact
+    # that it's bytes and we don't crash
+    def check(self, diff):
+        diff = list(diff)   # trigger exceptions first
+        for line in diff:
+            self.assertTrue(
+                isinstance(line, bytes),
+                "all lines of diff should be bytes, but got: %r" % line)
+
+    def test_byte_content(self):
+        "if we receive byte strings, we return byte strings"
+        a = [b'hello', b'andrew']
+        b = [b'hello', b'andr\xe9']     # latin-1 bytes
+
+        unified = difflib.unified_diff
+        context = difflib.context_diff
+
+        check = self.check
+        check(difflib.diff_bytes(unified, a, a))
+        check(difflib.diff_bytes(unified, a, b))
+
+        # now with filenames (content and filenames are all bytes!)
+        check(difflib.diff_bytes(unified, a, a, b'a', b'a'))
+        check(difflib.diff_bytes(unified, a, b, b'a', b'b'))
+
+        # and with filenames and dates
+        check(difflib.diff_bytes(unified, a, a, b'a', b'a', b'2005', b'2013'))
+        check(difflib.diff_bytes(unified, a, b, b'a', b'b', b'2005', b'2013'))
+
+        # same all over again, with context diff
+        check(difflib.diff_bytes(context, a, a))
+        check(difflib.diff_bytes(context, a, b))
+        check(difflib.diff_bytes(context, a, a, b'a', b'a'))
+        check(difflib.diff_bytes(context, a, b, b'a', b'b'))
+        check(difflib.diff_bytes(context, a, a, b'a', b'a', b'2005', b'2013'))
+        check(difflib.diff_bytes(context, a, b, b'a', b'b', b'2005', b'2013'))
+
+    def test_byte_filenames(self):
+        # somebody renamed a file from ISO-8859-2 to UTF-8
+        fna = b'\xb3odz.txt'    # "łodz.txt"
+        fnb = b'\xc5\x82odz.txt'
+
+        # they transcoded the content at the same time
+        a = [b'\xa3odz is a city in Poland.']
+        b = [b'\xc5\x81odz is a city in Poland.']
+
+        check = self.check
+        unified = difflib.unified_diff
+        context = difflib.context_diff
+        check(difflib.diff_bytes(unified, a, b, fna, fnb))
+        check(difflib.diff_bytes(context, a, b, fna, fnb))
+
+        def assertDiff(expect, actual):
+            # do not compare expect and actual as lists, because unittest
+            # uses difflib to report difference between lists
+            actual = list(actual)
+            self.assertEqual(len(expect), len(actual))
+            for e, a in zip(expect, actual):
+                self.assertEqual(e, a)
+
+        expect = [
+            b'--- \xb3odz.txt',
+            b'+++ \xc5\x82odz.txt',
+            b'@@ -1 +1 @@',
+            b'-\xa3odz is a city in Poland.',
+            b'+\xc5\x81odz is a city in Poland.',
+        ]
+        actual = difflib.diff_bytes(unified, a, b, fna, fnb, lineterm=b'')
+        assertDiff(expect, actual)
+
+        # with dates (plain ASCII)
+        datea = b'2005-03-18'
+        dateb = b'2005-03-19'
+        check(difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb))
+        check(difflib.diff_bytes(context, a, b, fna, fnb, datea, dateb))
+
+        expect = [
+            # note the mixed encodings here: this is deeply wrong by every
+            # tenet of Unicode, but it doesn't crash, it's parseable by
+            # patch, and it's how UNIX(tm) diff behaves
+            b'--- \xb3odz.txt\t2005-03-18',
+            b'+++ \xc5\x82odz.txt\t2005-03-19',
+            b'@@ -1 +1 @@',
+            b'-\xa3odz is a city in Poland.',
+            b'+\xc5\x81odz is a city in Poland.',
+        ]
+        actual = difflib.diff_bytes(unified, a, b, fna, fnb, datea, dateb,
+                                    lineterm=b'')
+        assertDiff(expect, actual)
+
+    def test_mixed_types_content(self):
+        'type of input content must be consistent: all str or all bytes'
+        a = [b'hello']
+        b = ['hello']
+
+        def assertTypeError(generator, *args):
+            try:
+                # the diff functions are lazy: list() forces the type checks
+                list(generator(*args))
+                self.fail('expected TypeError')
+            except TypeError:
+                pass
+
+        assertTypeError(difflib.unified_diff, a, b)
+        assertTypeError(difflib.unified_diff, b, a)
+        assertTypeError(difflib.context_diff, a, b)
+        assertTypeError(difflib.context_diff, b, a)
+
+        assertTypeError(difflib.diff_bytes, difflib.unified_diff, a, b)
+        assertTypeError(difflib.diff_bytes, difflib.unified_diff, b, a)
+        assertTypeError(difflib.diff_bytes, difflib.context_diff, a, b)
+        assertTypeError(difflib.diff_bytes, difflib.context_diff, b, a)
+
+    def test_mixed_types_filenames(self):
+        'cannot pass filenames as bytes if content is str'
+        # this may not be the right behaviour, but at least the test
+        # demonstrates how things work
+        a = ['hello\n']
+        b = ['ohell\n']
+        fna = b'ol\xe9.txt'     # filename transcoded from ISO-8859-1
+        fnb = b'ol\xc3\xa9.txt' # to UTF-8
+        try:
+            list(difflib.unified_diff(a, b, fna, fnb))
+            self.fail('expected TypeError')
+        except TypeError:
+            pass
+
+    def test_mixed_types_dates(self):
+        'type of dates must be consistent with type of contents'
+        a = [b'foo\n']
+        b = [b'bar\n']
+        datea = '1 fév'
+        dateb = '3 fév'
+        try:
+            # filenames are bytes, so only the str dates can raise TypeError
+            list(difflib.diff_bytes(difflib.unified_diff, a, b, b'a', b'b',
+                                    datea, dateb))
+            self.fail('expected TypeError')
+        except TypeError:
+            pass
+
+        # if input is str, non-ASCII dates are fine
+        a = ['foo\n']
+        b = ['bar\n']
+        list(difflib.unified_diff(a, b, 'a', 'b', datea, dateb))
+
+
 def test_main():
     difflib.HtmlDiff._default_prefix = 0
     Doctests = doctest.DocTestSuite(difflib)
     run_unittest(
         TestWithAscii, TestAutojunk, TestSFpatches, TestSFbugs,
-        TestOutputFormat, Doctests)
+        TestOutputFormat, TestBytes, Doctests)
 
 if __name__ == '__main__':
     test_main()