diff --git a/Doc/library/base64.rst b/Doc/library/base64.rst --- a/Doc/library/base64.rst +++ b/Doc/library/base64.rst @@ -132,6 +132,76 @@ The modern interface provides: string. +.. function:: a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False) + + Encode a byte string using Ascii85. + + *b* is the byte string to encode. The encoded byte string is returned. + + *foldspaces* is an optional flag that uses the special short sequence 'y' + instead of 4 consecutive spaces (ASCII 0x20) as supported by ``btoa``. This + feature is not supported by the "standard" Ascii85 encoding. + + *wrapcol* controls whether the output should have newline ('\n') + characters added to it. If this is non-zero, each output line will be + at most this many characters long. + + *pad* controls whether the input string is padded to a multiple of 4 + before encoding. Note that the ``btoa`` implementation always pads. + + *adobe* controls whether the encoded byte sequence is framed with ``<~`` + and ``~>``, which is used by the Adobe implementation. + + .. versionadded:: 3.4 + + +.. function:: a85decode(b, *, foldspaces=False, adobe=False, ignorechars=b' \t\n\r\v') + + Decode an Ascii85 encoded byte string. + + *b* is the byte string to decode. + + *foldspaces* is a flag that specifies whether the 'y' short sequence + should be accepted as shorthand for 4 consecutive spaces (ASCII 0x20). + This feature is not supported by the "standard" Ascii85 encoding. + + *adobe* controls whether the input sequence is in Adobe Ascii85 format + (i.e. is framed with ``<~`` and ``~>``). + + *ignorechars* should be a byte string containing characters to ignore + from the input. This should only contain whitespace characters, and by + default contains all whitespace characters in ASCII. + + .. versionadded:: 3.4 + + +.. function:: b85encode(b, pad=False) + + Encode a byte string using base85, as used in e.g. git-style binary + diffs. 
+ + If *pad* is true, the input is padded with "\\0" so its length is a + multiple of 4 characters before encoding. + + .. versionadded:: 3.4 + + +.. function:: b85decode(b) + + Decode a base85-encoded byte string. Padding is implicitly removed, if + necessary. + + .. versionadded:: 3.4 + + +.. note:: + Both Base85 and Ascii85 have an expansion factor of 5 to 4 (5 Base85 or + Ascii85 characters can encode 4 binary bytes), while the better-known + Base64 has an expansion factor of 6 to 4. They are therefore more + efficient when space is expensive. They differ by details such as the + character map used for encoding. + + The legacy interface: .. function:: decode(input, output) diff --git a/Lib/base64.py b/Lib/base64.py --- a/Lib/base64.py +++ b/Lib/base64.py @@ -1,6 +1,6 @@ #! /usr/bin/env python3 -"""RFC 3548: Base16, Base32, Base64 Data Encodings""" +"""Base16, Base32, Base64 (RFC 3548), Base85 and Ascii85 data encodings""" # Modified 04-Oct-1995 by Jack Jansen to use binascii module # Modified 30-Dec-2003 by Barry Warsaw to add full RFC 3548 support @@ -9,6 +9,7 @@ import re import struct import binascii +import itertools __all__ = [ @@ -17,6 +18,8 @@ import binascii # Generalized interface for other encodings 'b64encode', 'b64decode', 'b32encode', 'b32decode', 'b16encode', 'b16decode', + # Base85 and Ascii85 encodings + 'b85encode', 'b85decode', 'a85encode', 'a85decode', # Standard Base64 encoding 'standard_b64encode', 'standard_b64decode', # Some common Base64 alternatives. 
#
# Ascii85 encoding/decoding
#

def _85encode(b, chars, chars2, pad=False, foldnuls=False, foldspaces=False):
    """Shared encoder behind a85encode() and b85encode().

    b is the data to encode; anything that is not already one of
    bytes_types is copied to bytes through memoryview().

    chars is the 85-entry table of single-digit byte strings and chars2 the
    85*85-entry table of two-digit byte strings for the chosen alphabet.

    pad keeps the output for the zero-filled final word at full length
    instead of truncating it.

    foldnuls emits b'z' for an all-zero word; foldspaces emits b'y' for a
    word made of four ASCII spaces.

    The encoded byte string is returned.
    """
    if not isinstance(b, bytes_types):
        b = memoryview(b).tobytes()

    # Zero-fill the input up to a whole number of 32-bit words.
    padding = (-len(b)) % 4
    if padding:
        b = b + b'\0' * padding
    words = struct.Struct('!%dI' % (len(b) // 4)).unpack(b)

    # Each word becomes five base-85 digits, looked up as 2 + 2 + 1 digits
    # (614125 == 85 ** 3, 7225 == 85 ** 2).
    chunks = [b'z' if foldnuls and not word else
              b'y' if foldspaces and word == 0x20202020 else
              (chars2[word // 614125] +
               chars2[word // 85 % 7225] +
               chars[word % 85])
              for word in words]

    if padding and not pad:
        # The final group has to be truncated, so a folded b'z' must first
        # be expanded back to five explicit zero digits.
        if chunks[-1] == b'z':
            chunks[-1] = chars[0] * 5
        chunks[-1] = chunks[-1][:-padding]

    return b''.join(chunks)

_A85START = b"<~"
_A85END = b"~>"
_a85chars = [bytes([i]) for i in range(33, 118)]
_a85chars2 = [(a + b) for a in _a85chars for b in _a85chars]

def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=False):
    r"""Encode a byte string using Ascii85.

    b is the byte string to encode. The encoded byte string is returned.

    foldspaces is an optional flag that uses the special short sequence 'y'
    instead of 4 consecutive spaces (ASCII 0x20) as supported by 'btoa'. This
    feature is not supported by the "standard" Adobe encoding.

    wrapcol controls whether the output should have newline (b'\n') characters
    added to it. If this is non-zero, each output line will be at most this
    many characters long.

    pad controls whether the input string is padded to a multiple of 4 before
    encoding. Note that the btoa implementation always pads.

    adobe controls whether the encoded byte sequence is framed with <~ and ~>,
    which is used by the Adobe implementation.
    """
    result = _85encode(b, _a85chars, _a85chars2, pad, True, foldspaces)

    if adobe:
        result = _A85START + result
    if wrapcol:
        # A line must be able to hold at least one digit, or the two-byte
        # frame marker when Adobe framing is requested.
        wrapcol = max(2 if adobe else 1, wrapcol)
        chunks = [result[i: i + wrapcol]
                  for i in range(0, len(result), wrapcol)]
        if adobe:
            # Put the end marker on a line of its own when it would push the
            # last line past wrapcol.
            if len(chunks[-1]) + 2 > wrapcol:
                chunks.append(b'')
        result = b'\n'.join(chunks)
    if adobe:
        result += _A85END

    return result

def a85decode(b, *, foldspaces=False, adobe=False, ignorechars=b' \t\n\r\v'):
    """Decode an Ascii85 encoded byte string.

    b is the byte string to decode.

    foldspaces is a flag that specifies whether the 'y' short sequence should be
    accepted as shorthand for 4 consecutive spaces (ASCII 0x20). This feature is
    not supported by the "standard" Adobe encoding.

    adobe controls whether the input sequence is in Adobe Ascii85 format (i.e.
    is framed with <~ and ~>).

    ignorechars should be a byte string containing characters to ignore from the
    input. This should only contain whitespace characters, and by default
    contains all whitespace characters in ASCII.

    The decoded byte string is returned. ValueError is raised for missing
    Adobe framing, for characters outside the alphabet, for a 'z' or 'y'
    inside a 5-character group, and for a group that overflows 32 bits.
    """
    b = _bytes_from_decode_data(b)
    if adobe:
        if not (b.startswith(_A85START) and b.endswith(_A85END)):
            # {!r} rather than {}: formatting bytes with {} goes through
            # str() and triggers BytesWarning under -b / -bb.
            raise ValueError("Ascii85 encoded byte sequences must be bracketed "
                             "by {!r} and {!r}".format(_A85START, _A85END))
        b = b[2:-2]  # Strip off start/end markers
    #
    # We have to go through this stepwise, so as to ignore spaces and handle
    # special short sequences
    #
    packI = struct.Struct('!I').pack
    decoded = []
    decoded_append = decoded.append
    curr = []
    curr_append = curr.append
    curr_clear = curr.clear
    # Four trailing 'u' digits (the highest digit) complete a partial final
    # group; the bytes they contribute are trimmed again below.
    for x in b + b'u' * 4:
        if b'!'[0] <= x <= b'u'[0]:
            curr_append(x)
            if len(curr) == 5:
                acc = 0
                for digit in curr:
                    acc = 85 * acc + (digit - 33)
                try:
                    decoded_append(packI(acc))
                except struct.error:
                    raise ValueError('Ascii85 overflow') from None
                curr_clear()
        elif x == b'z'[0]:
            if curr:
                raise ValueError('z inside Ascii85 5-tuple')
            decoded_append(b'\0\0\0\0')
        elif foldspaces and x == b'y'[0]:
            if curr:
                raise ValueError('y inside Ascii85 5-tuple')
            decoded_append(b'\x20\x20\x20\x20')
        elif x in ignorechars:
            # Skip whitespace
            continue
        else:
            raise ValueError('Non-Ascii85 digit found: %c' % x)

    result = b''.join(decoded)
    # Whatever is left in curr are the 'u' fillers that did not complete a
    # group; the remaining fillers were decoded and must be dropped.
    padding = 4 - len(curr)
    if padding:
        # Throw away the extra padding
        result = result[:-padding]
    return result

# The following code is originally taken (with permission) from Mercurial

_b85chars = b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
            b"abcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"
_b85chars = [bytes([i]) for i in _b85chars]
_b85chars2 = [(a + b) for a in _b85chars for b in _b85chars]
_b85dec = None

def b85encode(b, pad=False):
    r"""Encode a byte string in base85 format.

    b is the byte string to encode. The encoded byte string is returned.

    If pad is true, the input is padded with b"\0" so its length is a multiple
    of 4 bytes before encoding.
    """
    return _85encode(b, _b85chars, _b85chars2, pad)

def b85decode(b):
    """Decode a base85-encoded byte string.

    b is the byte string (or ASCII-only str) to decode. The decoded byte
    string is returned; padding is implicitly removed, if necessary.

    ValueError is raised for a character outside the base85 alphabet and for
    a 5-character group whose value does not fit in 32 bits.
    """
    b = _bytes_from_decode_data(b)
    global _b85dec
    if _b85dec is None:
        # Build the byte value -> digit table lazily on first use; entries
        # for bytes outside the alphabet stay None.
        _b85dec = [None] * 256
        for i, c in enumerate(_b85chars):
            _b85dec[c[0]] = i

    # Complete a partial final group with '~' (the highest digit); the bytes
    # it contributes are trimmed again below.
    padding = (-len(b)) % 5
    b = b + b'~' * padding
    out = []
    packI = struct.Struct('!I').pack
    for i in range(0, len(b), 5):
        chunk = b[i:i + 5]
        acc = 0
        try:
            for c in chunk:
                acc = acc * 85 + _b85dec[c]
        except TypeError:
            # Adding None means an invalid character; locate it to report
            # its position.
            for j, c in enumerate(chunk):
                if _b85dec[c] is None:
                    raise ValueError('bad base85 character at position %d'
                                     % (i + j)) from None
            raise
        try:
            out.append(packI(acc))
        except struct.error:
            raise ValueError('base85 overflow in hunk starting at byte %d'
                             % i) from None

    result = b''.join(out)
    if padding:
        result = result[:-padding]
    return result
self.check_nonbyte_element_format(base64.b64encode, bytes_data) self.check_multidimensional(base64.b64encode, bytes_data) @@ -359,12 +363,258 @@ class BaseXYTestCase(unittest.TestCase): eq(base64.b16decode(array('B', b"0102abcdef"), True), b'\x01\x02\xab\xcd\xef') + def test_a85encode(self): + eq = self.assertEqual + + tests = { + b'': b'', + b"www.python.org": b'GB\\6`E-ZP=Df.1GEb>', + bytes(range(255)): b"""!!*-'"9eu7#RLhG$k3[W&.oNg'GVB"(`=52*$$""" + b"""(B+<_pR,UFcb-n-Vr/1iJ-0JP==1c70M3&s#]4?Ykm5X@_(6q'R884cE""" + b"""H9MJ8X:f1+h<)lt#=BSg3>[:ZC?t!MSA7]@cBPD3sCi+'.E,fo>FEMbN""" + b"""G^4U^I!pHnJ:W<)KS>/9Ll%"IN/`jYOHG]iPa.Q$R$jD4S=Q7DTV8*TU""" + b"""nsrdW2ZetXKAY/Yd(L?['d?O\\@K2_]Y2%o^qmn*`5Ta:aN;TJbg"GZd""" + b"""*^:jeCE.%f\\,!5gtgiEi8N\\UjQ5OekiqBum-X60nF?)@o_%qPq"ad`""" + b"""r;HT""", + b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + b"0123456789!@#0^&*();:<>,. []{}": + b'@:E_WAS,RgBkhF"D/O92EH6,BF`qtRH$VbC6UX@47n?3D92&&T' + b":Jand;cHat='/U/0JP==1c70M3&r-I,;,E,oN2F(oQ1z', + b"zero compression\0\0\0": b'H=_,8+Cf>,E,oN2F(oQ1!!!!', + b"Boundary:\0\0\0\0": b'6>q!aA79M(3WK-[!!', + b"Space compr: ": b';fH/TAKYK$D/aMV+', data) + + self.check_other_types(base64.a85encode, b"www.python.org", + b'GB\\6`E-ZP=Df.1GEb>') + + self.assertRaises(TypeError, base64.a85encode, "") + + eq(base64.a85encode(b"www.python.org", wrapcol=7, adobe=False), + b'GB\\6`E-\nZP=Df.1\nGEb>') + eq(base64.a85encode(b"\0\0\0\0www.python.org", wrapcol=7, adobe=False), + b'zGB\\6`E\n-ZP=Df.\n1GEb>') + eq(base64.a85encode(b"www.python.org", wrapcol=7, adobe=True), + b'<~GB\\6`\nE-ZP=Df\n.1GEb>\n~>') + + eq(base64.a85encode(b' '*8, foldspaces=True, adobe=False), b'yy') + eq(base64.a85encode(b' '*7, foldspaces=True, adobe=False), b'y+}2SXo+ITwPvYU}0ioWMyV&XlZI|Y;A6DaB*^Tbai%j""" + b"""czJqze0_d@fPsR8goTEOh>41ejE#,. 
[]{}""": + b"""VPa!sWoBn+X=-b1ZEkOHadLBXb#`}nd3r%YLqtVJM@UIZOH55pPf$@(""" + b"""Q&d$}S6EqEFflSSG&MFiI5{CeBQRbjDkv#CIy^osE+AW7dwl""", + b'no padding..': b'Zf_uPVPs@!Zf7no', + b'zero compression\x00\x00\x00\x00': b'dS!BNAY*TBaB^jHb7^mG00000', + b'zero compression\x00\x00\x00': b'dS!BNAY*TBaB^jHb7^mG0000', + b"""Boundary:\x00\x00\x00\x00""": b"""LT`0$WMOi7IsgCw00""", + b'Space compr: ': b'Q*dEpWgug3ZE$irARr(h', + b'\xff': b'{{', + b'\xff'*2: b'|Nj', + b'\xff'*3: b'|Ns9', + b'\xff'*4: b'|NsC0', + } + + for data, res in tests.items(): + eq(base64.b85encode(data), res) + + self.check_other_types(base64.b85encode, b"www.python.org", + b'cXxL#aCvlSZ*DGca%T') + + def test_a85decode(self): + eq = self.assertEqual + + tests = { + b'': b'', + b'GB\\6`E-ZP=Df.1GEb>': b'www.python.org', + b"""! ! * -'"\n\t\t9eu\r\n7# RL\vhG$k3[W&.oNg'GVB"(`=52*$$""" + b"""(B+<_pR,UFcb-n-Vr/1iJ-0JP==1c70M3&s#]4?Ykm5X@_(6q'R884cE""" + b"""H9MJ8X:f1+h<)lt#=BSg3>[:ZC?t!MSA7]@cBPD3sCi+'.E,fo>FEMbN""" + b"""G^4U^I!pHnJ:W<)KS>/9Ll%"IN/`jYOHG]iPa.Q$R$jD4S=Q7DTV8*TU""" + b"""nsrdW2ZetXKAY/Yd(L?['d?O\\@K2_]Y2%o^qmn*`5Ta:aN;TJbg"GZd""" + b"""*^:jeCE.%f\\,!5gtgiEi8N\\UjQ5OekiqBum-X60nF?)@o_%qPq"ad`""" + b"""r;HT""": bytes(range(255)), + b"""@:E_WAS,RgBkhF"D/O92EH6,BF`qtRH$VbC6UX@47n?3D92&&T:Jand;c""" + b"""Hat='/U/0JP==1c70M3&r-I,;,. 
[]{}', + b'DJpY:@:Wn_DJ(RS': b'no padding..', + b'H=_,8+Cf>,E,oN2F(oQ1z': b'zero compression\x00\x00\x00\x00', + b'H=_,8+Cf>,E,oN2F(oQ1!!!!': b'zero compression\x00\x00\x00', + b'6>q!aA79M(3WK-[!!': b"Boundary:\x00\x00\x00\x00", + b';fH/TAKYK$D/aMV+', adobe=True), res, data) + eq(base64.a85decode('<~%s~>' % data.decode("ascii"), adobe=True), + res, data) + + eq(base64.a85decode(b'yy', foldspaces=True, adobe=False), b' '*8) + eq(base64.a85decode(b'y+', + b"www.python.org") + + def test_b85decode(self): + eq = self.assertEqual + + tests = { + b'': b'', + b'cXxL#aCvlSZ*DGca%T': b'www.python.org', + b"""009C61O)~M2nh-c3=Iws5D^j+6crX17#SKH9337X""" + b"""AR!_nBqb&%C@Cr{EG;fCFflSSG&MFiI5|2yJUu=?KtV!7L`6nNNJ&ad""" + b"""OifNtP*GA-R8>}2SXo+ITwPvYU}0ioWMyV&XlZI|Y;A6DaB*^Tbai%j""" + b"""czJqze0_d@fPsR8goTEOh>41ejE#,. []{}""", + b'Zf_uPVPs@!Zf7no': b'no padding..', + b'dS!BNAY*TBaB^jHb7^mG00000': b'zero compression\x00\x00\x00\x00', + b'dS!BNAY*TBaB^jHb7^mG0000': b'zero compression\x00\x00\x00', + b"""LT`0$WMOi7IsgCw00""": b"""Boundary:\x00\x00\x00\x00""", + b'Q*dEpWgug3ZE$irARr(h': b'Space compr: ', + b'{{': b'\xff', + b'|Nj': b'\xff'*2, + b'|Ns9': b'\xff'*3, + b'|NsC0': b'\xff'*4, + } + + for data, res in tests.items(): + eq(base64.b85decode(data), res) + eq(base64.b85decode(data.decode("ascii")), res) + + self.check_other_types(base64.b85decode, b'cXxL#aCvlSZ*DGca%T', + b"www.python.org") + + def test_a85_padding(self): + eq = self.assertEqual + + eq(base64.a85encode(b"x", pad=True), b'GQ7^D') + eq(base64.a85encode(b"xx", pad=True), b"G^'2g") + eq(base64.a85encode(b"xxx", pad=True), b'G^+H5') + eq(base64.a85encode(b"xxxx", pad=True), b'G^+IX') + eq(base64.a85encode(b"xxxxx", pad=True), b'G^+IXGQ7^D') + + eq(base64.a85decode(b'GQ7^D'), b"x\x00\x00\x00") + eq(base64.a85decode(b"G^'2g"), b"xx\x00\x00") + eq(base64.a85decode(b'G^+H5'), b"xxx\x00") + eq(base64.a85decode(b'G^+IX'), b"xxxx") + eq(base64.a85decode(b'G^+IXGQ7^D'), b"xxxxx\x00\x00\x00") + + def 
test_b85_padding(self): + eq = self.assertEqual + + eq(base64.b85encode(b"x", pad=True), b'cmMzZ') + eq(base64.b85encode(b"xx", pad=True), b'cz6H+') + eq(base64.b85encode(b"xxx", pad=True), b'czAdK') + eq(base64.b85encode(b"xxxx", pad=True), b'czAet') + eq(base64.b85encode(b"xxxxx", pad=True), b'czAetcmMzZ') + + eq(base64.b85decode(b'cmMzZ'), b"x\x00\x00\x00") + eq(base64.b85decode(b'cz6H+'), b"xx\x00\x00") + eq(base64.b85decode(b'czAdK'), b"xxx\x00") + eq(base64.b85decode(b'czAet'), b"xxxx") + eq(base64.b85decode(b'czAetcmMzZ'), b"xxxxx\x00\x00\x00") + + def test_a85decode_errors(self): + illegal = (set(range(32)) | set(range(118, 256))) - set(b' \t\n\r\v') + for c in illegal: + with self.assertRaises(ValueError, msg=bytes([c])): + base64.a85decode(b'!!!!' + bytes([c])) + with self.assertRaises(ValueError, msg=bytes([c])): + base64.a85decode(b'!!!!' + bytes([c]), adobe=False) + with self.assertRaises(ValueError, msg=bytes([c])): + base64.a85decode(b'<~!!!!' + bytes([c]) + b'~>', adobe=True) + + self.assertRaises(ValueError, base64.a85decode, + b"malformed", adobe=True) + self.assertRaises(ValueError, base64.a85decode, + b"<~still malformed", adobe=True) + self.assertRaises(ValueError, base64.a85decode, + b"also malformed~>", adobe=True) + + # With adobe=False (the default), Adobe framing markers are disallowed + self.assertRaises(ValueError, base64.a85decode, + b"<~~>") + self.assertRaises(ValueError, base64.a85decode, + b"<~~>", adobe=False) + base64.a85decode(b"<~~>", adobe=True) # sanity check + + self.assertRaises(ValueError, base64.a85decode, + b"abcx", adobe=False) + self.assertRaises(ValueError, base64.a85decode, + b"abcdey", adobe=False) + self.assertRaises(ValueError, base64.a85decode, + b"a b\nc", adobe=False, ignorechars=b"") + + self.assertRaises(ValueError, base64.a85decode, b's', adobe=False) + self.assertRaises(ValueError, base64.a85decode, b's8', adobe=False) + self.assertRaises(ValueError, base64.a85decode, b's8W', adobe=False) + 
self.assertRaises(ValueError, base64.a85decode, b's8W-', adobe=False) + self.assertRaises(ValueError, base64.a85decode, b's8W-"', adobe=False) + + def test_b85decode_errors(self): + illegal = list(range(33)) + \ + list(b'"\',./:[\\]') + \ + list(range(128, 256)) + for c in illegal: + with self.assertRaises(ValueError, msg=bytes([c])): + base64.b85decode(b'0000' + bytes([c])) + + self.assertRaises(ValueError, base64.b85decode, b'|') + self.assertRaises(ValueError, base64.b85decode, b'|N') + self.assertRaises(ValueError, base64.b85decode, b'|Ns') + self.assertRaises(ValueError, base64.b85decode, b'|NsC') + self.assertRaises(ValueError, base64.b85decode, b'|NsC1') + def test_decode_nonascii_str(self): decode_funcs = (base64.b64decode, base64.standard_b64decode, base64.urlsafe_b64decode, base64.b32decode, - base64.b16decode) + base64.b16decode, + base64.b85decode, + base64.a85decode) for f in decode_funcs: self.assertRaises(ValueError, f, 'with non-ascii \xcb')