Fix GNU and USTAR formats to properly handle paths with special characters
that are encoded with more than one byte each.

GNU and USTAR formats use a special case if the file path is longer than 100
bytes. The detection for this, though, incorrectly checked for 100 characters
rather than 100 bytes. So, if the length was close to but not exceeding 100
characters and included special characters such that the encoded length is
greater than 100 bytes, the encoded string was truncated to 100 bytes and thus
the resulting file name was truncated within the tar file.

The byte lengths are measured with the same encoding and error handler that
are passed on to the header-creation helpers, so that a name which can be
written with the caller's error handler is not rejected with a
UnicodeEncodeError by the length check itself.

Index: python3.4-3.4.3/Lib/tarfile.py
===================================================================
--- python3.4-3.4.3.orig/Lib/tarfile.py 2015-06-17 15:52:49.155621849 -0700
+++ python3.4-3.4.3/Lib/tarfile.py 2015-06-17 15:52:49.155621849 -0700
@@ -814,11 +814,11 @@
         """
         info["magic"] = POSIX_MAGIC
 
-        if len(info["linkname"]) > LENGTH_LINK:
+        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
             raise ValueError("linkname is too long")
 
-        if len(info["name"]) > LENGTH_NAME:
-            info["prefix"], info["name"] = self._posix_split_name(info["name"])
+        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
+            info["prefix"], info["name"] = self._posix_split_name(info["name"], encoding, errors)
 
         return self._create_header(info, USTAR_FORMAT, encoding, errors)
 
@@ -828,10 +828,10 @@
         info["magic"] = GNU_MAGIC
 
         buf = b""
-        if len(info["linkname"]) > LENGTH_LINK:
+        if len(info["linkname"].encode(encoding, errors)) > LENGTH_LINK:
             buf += self._create_gnu_long_header(info["linkname"], GNUTYPE_LONGLINK, encoding, errors)
 
-        if len(info["name"]) > LENGTH_NAME:
+        if len(info["name"].encode(encoding, errors)) > LENGTH_NAME:
             buf += self._create_gnu_long_header(info["name"], GNUTYPE_LONGNAME, encoding, errors)
 
         return buf + self._create_header(info, GNU_FORMAT, encoding, errors)
@@ -891,17 +891,19 @@
         """
         return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")
 
-    def _posix_split_name(self, name):
-        """Split a name longer than 100 chars into a prefix
+    def _posix_split_name(self, name, encoding, errors):
+        """Split a name longer than 100 encoded bytes into a prefix
            and a name part.
         """
         prefix = name[:LENGTH_PREFIX + 1]
+        while prefix and len(prefix[:-1].encode(encoding, errors)) > LENGTH_PREFIX:
+            prefix = prefix[:-1]
         while prefix and prefix[-1] != "/":
             prefix = prefix[:-1]
 
         name = name[len(prefix):]
         prefix = prefix[:-1]
 
-        if not prefix or len(name) > LENGTH_NAME:
+        if not prefix or len(name.encode(encoding, errors)) > LENGTH_NAME:
             raise ValueError("name is too long")
         return prefix, name