diff -r cf70f030a744 Lib/tarfile.py --- a/Lib/tarfile.py Wed Jun 18 23:07:46 2014 -0400 +++ b/Lib/tarfile.py Mon Jun 23 21:40:37 2014 +0300 @@ -156,9 +156,22 @@ #--------------------------------------------------------- # initialization #--------------------------------------------------------- -ENCODING = sys.getfilesystemencoding() -if ENCODING is None: - ENCODING = sys.getdefaultencoding() +try: + _unicode = unicode + _decode = unicode + _have_unicode = True + ENCODING = sys.getfilesystemencoding() + if ENCODING is None: + ENCODING = sys.getdefaultencoding() +except NameError: + # If Python is built without Unicode support, the unicode type + # will not exist. Fake one. + class _unicode(str): + pass + def _decode(s, encoding, errors=None): + return s + _have_unicode = False + ENCODING = None #--------------------------------------------------------- # Some useful functions @@ -242,6 +255,10 @@ else: return s.encode(encoding, errors) +if not _have_unicode: + def uts(s, encoding, errors): + return s + def calc_chksums(buf): """Calculate the checksum for a member's header by summing up all characters except for the chksum field which is treated as if @@ -453,7 +470,7 @@ 0) timestamp = struct.pack(" length: pax_headers[hname] = val @@ -1065,7 +1083,7 @@ val = info[name] if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float): - pax_headers[name] = unicode(val) + pax_headers[name] = _unicode(val) info[name] = 0 # Create a pax extended header if necessary. @@ -1160,8 +1178,9 @@ """ records = [] for keyword, value in pax_headers.iteritems(): - keyword = keyword.encode("utf8") - value = value.encode("utf8") + if _have_unicode: + keyword = keyword.encode("utf8") + value = value.encode("utf8") l = len(keyword) + len(value) + 3 # ' ' + '=' + '\n' n = p = 0 while True: @@ -1389,8 +1408,8 @@ length = int(length) value = buf[match.end(2) + 1:match.start(1) + length - 1] - keyword = keyword.decode("utf8") - value = value.decode("utf8") + keyword = _decode(keyword, "utf8") + value = _decode(value, "utf8") pax_headers[keyword] = value pos += length diff -r cf70f030a744 Lib/test/test_tarfile.py --- a/Lib/test/test_tarfile.py Wed Jun 18 23:07:46 2014 -0400 +++ b/Lib/test/test_tarfile.py Mon Jun 23 21:40:37 2014 +0300 @@ -615,7 +615,7 @@ self._test_member(tarinfo, size=86016, chksum=md5_sparse) def test_find_umlauts(self): - tarinfo = self.tar.getmember("ustar/umlauts-ÄÖÜäöüß") + tarinfo = self.tar.getmember('ustar/umlauts-\xc4\xd6\xdc\xe4\xf6\xfc\xdf') self._test_member(tarinfo, size=7011, chksum=md5_regtype) def test_find_ustar_longname(self): @@ -628,7 +628,9 @@ def test_find_pax_umlauts(self): self.tar = tarfile.open(self.tarname, mode=self.mode, encoding="iso8859-1") - tarinfo = self.tar.getmember("pax/umlauts-ÄÖÜäöüß") + name = ("pax/umlauts-\xc4\xd6\xdc\xe4\xf6\xfc\xdf" if test_support.have_unicode + else 'pax/umlauts-\xc3\x84\xc3\x96\xc3\x9c\xc3\xa4\xc3\xb6\xc3\xbc\xc3\x9f') + tarinfo = self.tar.getmember(name) self._test_member(tarinfo, size=7011, chksum=md5_regtype) @@ -696,17 +698,23 @@ tarinfo = tar.getmember("pax/regtype1") self.assertEqual(tarinfo.uname, "foo") self.assertEqual(tarinfo.gname, "bar") - self.assertEqual(tarinfo.pax_headers.get("VENDOR.umlauts"), u"ÄÖÜäöüß") + self.assertEqual(tarinfo.pax_headers.get("VENDOR.umlauts"), + u"\xc4\xd6\xdc\xe4\xf6\xfc\xdf" if test_support.have_unicode + else '\xc3\x84\xc3\x96\xc3\x9c\xc3\xa4\xc3\xb6\xc3\xbc\xc3\x9f') tarinfo = tar.getmember("pax/regtype2") self.assertEqual(tarinfo.uname, "") self.assertEqual(tarinfo.gname, "bar") - self.assertEqual(tarinfo.pax_headers.get("VENDOR.umlauts"), u"ÄÖÜäöüß") + self.assertEqual(tarinfo.pax_headers.get("VENDOR.umlauts"), + u"\xc4\xd6\xdc\xe4\xf6\xfc\xdf" if test_support.have_unicode + else '\xc3\x84\xc3\x96\xc3\x9c\xc3\xa4\xc3\xb6\xc3\xbc\xc3\x9f') tarinfo = tar.getmember("pax/regtype3") self.assertEqual(tarinfo.uname, "tarfile") self.assertEqual(tarinfo.gname, "tarfile") - self.assertEqual(tarinfo.pax_headers.get("VENDOR.umlauts"), u"ÄÖÜäöüß") + self.assertEqual(tarinfo.pax_headers.get("VENDOR.umlauts"), + u"\xc4\xd6\xdc\xe4\xf6\xfc\xdf" if test_support.have_unicode + else '\xc3\x84\xc3\x96\xc3\x9c\xc3\xa4\xc3\xb6\xc3\xbc\xc3\x9f') def test_pax_number_fields(self): # All following number fields are read from the pax header. @@ -1113,6 +1121,7 @@ finally: os.umask(original_umask) + @test_support.requires_unicode def test_issue13639(self): try: with tarfile.open(unicode(tmpname, sys.getfilesystemencoding()), self.mode): @@ -1267,8 +1276,8 @@ u"foo": u"bar", u"uid": u"0", u"mtime": u"1.23", - u"test": u"äöü", - u"äöü": u"test"} + u"test": u"\xe4\xf6\xfc", + u"\xe4\xf6\xfc": u"test"} tar = tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT, pax_headers=pax_headers) @@ -1282,8 +1291,9 @@ # Test if all the fields are unicode. for key, val in tar.pax_headers.iteritems(): - self.assertTrue(type(key) is unicode) - self.assertTrue(type(val) is unicode) + if test_support.have_unicode: + self.assertIs(type(key), unicode) + self.assertIs(type(val), unicode) if key in tarfile.PAX_NUMBER_FIELDS: try: tarfile.PAX_NUMBER_FIELDS[key](val) @@ -1297,7 +1307,7 @@ tar = tarfile.open(tmpname, "w", format=tarfile.PAX_FORMAT, encoding="iso8859-1") t = tarfile.TarInfo() - t.name = u"äöü" # non-ASCII + t.name = u"\xe4\xf6\xfc" # non-ASCII t.uid = 8**8 # too large t.pax_headers = pax_headers tar.addfile(t) @@ -1315,18 +1325,21 @@ format = tarfile.USTAR_FORMAT + @test_support.requires_unicode def test_iso8859_1_filename(self): self._test_unicode_filename("iso8859-1") + @test_support.requires_unicode def test_utf7_filename(self): self._test_unicode_filename("utf7") + @test_support.requires_unicode def test_utf8_filename(self): self._test_unicode_filename("utf8") def _test_unicode_filename(self, encoding): tar = tarfile.open(tmpname, "w", format=self.format, encoding=encoding, errors="strict") - name = u"äöü" + name = u"\xe4\xf6\xfc" tar.addfile(tarfile.TarInfo(name)) tar.close() @@ -1335,21 +1348,22 @@ self.assertEqual(tar.getmembers()[0].name, name.encode(encoding)) tar.close() + @test_support.requires_unicode def test_unicode_filename_error(self): tar = tarfile.open(tmpname, "w", format=self.format, encoding="ascii", errors="strict") tarinfo = tarfile.TarInfo() - tarinfo.name = "äöü" + tarinfo.name = "\xe4\xf6\xfc" if self.format == tarfile.PAX_FORMAT: self.assertRaises(UnicodeError, tar.addfile, tarinfo) else: tar.addfile(tarinfo) - tarinfo.name = u"äöü" + tarinfo.name = u"\xe4\xf6\xfc" self.assertRaises(UnicodeError, tar.addfile, tarinfo) tarinfo.name = "foo" - tarinfo.uname = u"äöü" + tarinfo.uname = u"\xe4\xf6\xfc" self.assertRaises(UnicodeError, tar.addfile, tarinfo) def test_unicode_argument(self): @@ -1362,7 +1376,7 @@ tar.close() def test_uname_unicode(self): - for name in (u"äöü", "äöü"): + for name in (u"\xe4\xf6\xfc", "\xe4\xf6\xfc"): t = tarfile.TarInfo("foo") t.uname = name t.gname = name @@ -1375,8 +1389,8 @@ tar = tarfile.open("foo.tar", fileobj=fobj, encoding="iso8859-1") t = tar.getmember("foo") - self.assertEqual(t.uname, "äöü") - self.assertEqual(t.gname, "äöü") + self.assertEqual(t.uname, "\xe4\xf6\xfc") + self.assertEqual(t.gname, "\xe4\xf6\xfc") class GNUUnicodeTest(UstarUnicodeTest): @@ -1395,12 +1409,13 @@ tar.addfile(t) tar.close() + @test_support.requires_unicode def test_error_handlers(self): # Test if the unicode error handlers work correctly for characters # that cannot be expressed in a given encoding. - self._create_unicode_name(u"äöü") + self._create_unicode_name(u"\xe4\xf6\xfc") - for handler, name in (("utf-8", u"äöü".encode("utf8")), + for handler, name in (("utf-8", u"\xe4\xf6\xfc".encode("utf8")), ("replace", "???"), ("ignore", "")): tar = tarfile.open(tmpname, format=self.format, encoding="ascii", errors=handler) @@ -1409,14 +1424,15 @@ self.assertRaises(UnicodeError, tarfile.open, tmpname, encoding="ascii", errors="strict") + @test_support.requires_unicode def test_error_handler_utf8(self): # Create a pathname that has one component representable using # iso8859-1 and the other only in iso8859-15. - self._create_unicode_name(u"äöü/¤") + self._create_unicode_name(u"\xe4\xf6\xfc/" + unichr(0x20ac)) tar = tarfile.open(tmpname, format=self.format, encoding="iso8859-1", errors="utf-8") - self.assertEqual(tar.getnames()[0], "äöü/" + u"¤".encode("utf8")) + self.assertEqual(tar.getnames()[0], "\xe4\xf6\xfc/" + unichr(0x20ac).encode("utf8")) class AppendTest(unittest.TestCase):