#! /usr/bin/env python3

"""RFC 3548: Base16, Base32, Base64; and Base85, Ascii85 Data Encodings"""

# Modified 04-Oct-1995 by Jack Jansen to use binascii module
# Modified 30-Dec-2003 by Barry Warsaw to add full RFC 3548 support
# Modified 22-May-2007 by Guido van Rossum to use bytes everywhere
# Modified 14-Apr-2013 by Martin Morrison to add Base85/Ascii85 support

import re
import struct
import binascii
import itertools

# NOTE: reconstructed from a whitespace-collapsed patch
#   diff -r 937ba7bb2285 Lib/base64.py
#   --- a/Lib/base64.py Fri Apr 19 07:10:45 2013 +0300
#   +++ b/Lib/base64.py Fri Apr 19 23:35:26 2013 +0300
# Besides the header above, the patch (hunk @@ -17,6 +19,8 @@) extends the
# module's __all__ list with
#     # Base85 and Ascii85 encodings
#     'b85encode', 'b85decode', 'a85encode', 'a85decode', 'btoa85', 'atob85',
# and (hunk @@ -302,7 +306,241 @@) inserts the section below right after the
# Base16 decoder, whose tail "return binascii.unhexlify(s)" is hunk context.
# bytes_types and _bytes_from_decode_data are not part of the patch; they are
# expected to be provided by the rest of base64.py.

#
# Ascii85 encoding/decoding
#

_B85START = b"<~"
_B85END = b"~>"
_B85FOLDNUL = b"z"
_B85FOLDSPACE = b"y"
_B85CHARPAD = b"u"


def _a85_encode_word(word):
    """Return the five Ascii85 digits (as bytes) encoding one 32-bit word."""
    digits = []
    for _ in range(5):
        word, digit = divmod(word, 85)
        digits.append(digit + 33)
    # Digits were produced least-significant first.
    return bytes(reversed(digits))


def _a85_decode_group(fivechars):
    """Return the 32-bit value encoded by a sequence of five Ascii85 digits.

    Raises TypeError if a byte lies outside the range '!'..'u', and
    binascii.Error if the group encodes a value that needs more than 32 bits.
    """
    val = 0
    for char in fivechars:
        if not 0 <= (char - 33) < 85:
            raise TypeError("Non-base85 digit found: {}".format(chr(char)))
        val = val * 85 + (char - 33)
    if val > 0xFFFFFFFF:
        # Without this check the value would only fail later, inside
        # struct.pack, as an unrelated-looking struct.error.
        raise binascii.Error("Ascii85 overflow")
    return val


def a85encode(b, *, foldspaces=False, wrapcol=0, pad=False, adobe=True):
    """Encode a byte string using Ascii85.

    b is the byte string to encode.  The encoded byte string is returned.

    foldspaces is an optional flag that uses the special short sequence 'y'
    instead of 4 consecutive spaces (ASCII 0x20) as supported by 'btoa'.  This
    feature is not supported by the "standard" Adobe encoding.

    wrapcol controls whether the output should have newline ('\\n')
    characters added to it.  If this is non-zero, it is rounded down to a
    multiple of 5 (with a minimum of 5) and each output line will be at most
    that many characters long.  Lines are only broken between encoded groups,
    and the <~ and ~> framing markers are not counted towards the length.

    pad controls whether the input string is padded to a multiple of 4 before
    encoding.  Note that the btoa implementation always pads.

    adobe controls whether the encoded byte sequence is framed with <~ and ~>,
    which is used by the Adobe implementation.

    Raises TypeError if b is not one of bytes_types.
    """
    if not isinstance(b, bytes_types):
        raise TypeError("expected bytes, not {}".format(b.__class__.__name__))

    quanta, leftover = divmod(len(b), 4)
    # quanta big-endian 32-bit words followed by the leftover single bytes.
    words = struct.Struct("!{}I{}B".format(quanta, leftover)).unpack(b)

    # Normalise wrapcol.  We're lazy and don't wrap inside a block of 5 chars.
    if wrapcol:
        wrapcol = max(5, wrapcol - wrapcol % 5)

    encoded = [_B85START] if adobe else []
    linelen = 0

    def emit(token):
        # Track the real length of the current line.  A running total taken
        # modulo wrapcol drifts as soon as a 1-char 'z'/'y' is written, after
        # which lines would grow without bound.
        nonlocal linelen
        if wrapcol and linelen + len(token) > wrapcol:
            encoded.append(b'\n')
            linelen = 0
        encoded.append(token)
        linelen += len(token)

    for word in itertools.islice(words, quanta):
        if word == 0:
            emit(_B85FOLDNUL)
        elif foldspaces and word == 0x20202020:
            emit(_B85FOLDSPACE)
        else:
            emit(_a85_encode_word(word))
        # A line that is exactly full is terminated straight away.
        if wrapcol and linelen == wrapcol:
            encoded.append(b'\n')
            linelen = 0

    if leftover:
        # Zero-pad the trailing bytes to one full word.  This word is never
        # folded to 'z'/'y'.
        last, = struct.unpack(
            "!I", bytes(words[quanta:] + (0,) * (4 - leftover)))
        tail = _a85_encode_word(last)
        # Unpadded output keeps only the digits that carry real data.
        emit(tail if pad else tail[:leftover + 1])

    if adobe:
        encoded.append(_B85END)

    return b"".join(encoded)


def a85decode(b, *, foldspaces=False, adobe=True, ignorechars=b' \t\n\r\v'):
    """Decode an Ascii85 encoded byte string.

    b is the byte string to decode.

    foldspaces is a flag that specifies whether the 'y' short sequence should
    be accepted as shorthand for 4 consecutive spaces (ASCII 0x20).  This
    feature is not supported by the "standard" Adobe encoding.

    adobe controls whether the input sequence is in Adobe Ascii85 format (i.e.
    is framed with <~ and ~>).

    ignorechars should be a byte string containing characters to ignore from
    the input.  This should only contain whitespace characters, and by default
    contains all whitespace characters in ASCII.

    Raises binascii.Error if adobe is true and the framing is missing, if a
    'z' (or a folded 'y') occurs in the middle of a 5-character group, or if a
    group encodes a value larger than 32 bits.  Raises TypeError if a
    character outside the Ascii85 alphabet is found.
    """
    b = _bytes_from_decode_data(b)
    if adobe and (not b.startswith(_B85START) or not b.endswith(_B85END)):
        raise binascii.Error("Ascii85 encoded byte sequences must be bracketed "
                             "by {} and {}".format(_B85START, _B85END))
    if adobe:
        b = b[2:-2]  # Strip off start/end markers

    pack_word = struct.Struct("!I").pack
    decoded = []
    curr = []
    # We have to go through this stepwise, so as to ignore spaces and handle
    # special short sequences.
    for char in b:
        if char in ignorechars:
            continue
        if char == _B85FOLDNUL[0]:
            folded = 0
        elif foldspaces and char == _B85FOLDSPACE[0]:
            folded = 0x20202020
        else:
            curr.append(char)
            if len(curr) == 5:
                decoded.append(pack_word(_a85_decode_group(curr)))
                curr = []
            continue
        # A short sequence stands for a whole group, so it may not appear
        # while another group is only partly read; accepting it would
        # silently drop the characters collected so far.
        if curr:
            raise binascii.Error(
                "'{}' inside Ascii85 5-tuple".format(chr(char)))
        decoded.append(pack_word(folded))

    if curr:
        # Stragglers: pad with the highest digit, decode, then throw away
        # the bytes that only exist because of the padding.
        npad = 5 - len(curr)
        padded = curr + [_B85CHARPAD[0]] * npad
        decoded.append(pack_word(_a85_decode_group(padded))[:-npad])

    return b"".join(decoded)


def btoa85(b, *, wrapcol=0):
    """Encode a byte string in base85 using the 'btoa' format.

    This is a convenience wrapper around a85encode passing arguments to
    emulate the behaviour of the legacy btoa program: spaces are folded, the
    input is padded and no <~ ~> framing is added.  wrapcol is passed through
    unchanged.
    """
    return a85encode(b, foldspaces=True, wrapcol=wrapcol, pad=True,
                     adobe=False)


def atob85(b):
    """Decode a base85 string encoded using the 'btoa' format.

    This is a convenience wrapper around a85decode passing arguments to
    emulate the behaviour of the legacy atob program: 'y' is accepted, no
    framing is expected and only newlines are ignored.
    """
    return a85decode(b, foldspaces=True, adobe=False, ignorechars=b'\n')


# The following code is originally taken (with permission) from Mercurial

_b85chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
            "abcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"
# Every two-digit combination, so a word is encoded with three lookups.
_b85chars2 = [(a + b) for a in _b85chars for b in _b85chars]
# Byte value -> digit value; filled lazily by _mkb85dec().
_b85dec = {}


def _mkb85dec():
    """Populate the _b85dec reverse lookup table from _b85chars."""
    for i, c in enumerate(_b85chars):
        _b85dec[ord(c)] = i


def b85encode(b, pad=False):
    """Encode a byte string in base85 format.

    If pad is true, the input is padded with "\\0" so its length is a
    multiple of 4 characters before encoding.  The argument itself is never
    modified.

    Raises TypeError if b is not one of bytes_types.
    """
    if not isinstance(b, bytes_types):
        raise TypeError("expected bytes, not %s" % b.__class__.__name__)

    length = len(b)
    # Build a new object for the padded data: "b += ..." would extend a
    # caller-supplied bytearray in place.
    padded = bytes(b) + b'\0' * (-length % 4)
    words = struct.Struct('>%dL' % (len(padded) // 4)).unpack(padded)

    # 52200625 == 85**4 and 7225 == 85**2: one leading digit followed by two
    # digit pairs.
    out = "".join(_b85chars[(word // 52200625) % 85] +
                  _b85chars2[(word // 7225) % 7225] +
                  _b85chars2[word % 7225]
                  for word in words).encode("ascii")

    if pad:
        return out

    # Trim padding: n leftover bytes are represented by n + 1 digits.
    full, rest = divmod(length, 4)
    olen = full * 5
    if rest:
        olen += rest + 1
    return out[:olen]


def b85decode(b):
    """Decode a base85-encoded byte string.

    Raises ValueError if the input contains a character outside the base85
    alphabet or if a 5-character chunk (including the padded final chunk)
    encodes a value larger than 32 bits.
    """
    text = _bytes_from_decode_data(b)
    if not _b85dec:
        _mkb85dec()

    length = len(text)
    tail = length % 5  # number of digits in a short final chunk, if any
    out = []
    for i in range(0, length, 5):
        chunk = text[i:i + 5]
        acc = 0
        for j, c in enumerate(chunk):
            try:
                acc = acc * 85 + _b85dec[c]
            except KeyError:
                raise ValueError('bad base85 character at position %d'
                                 % (i + j))
        if len(chunk) < 5:
            # Pad final chunk if necessary.
            acc *= 85 ** (5 - tail)
            if tail > 1:
                acc += 0xffffff >> (tail - 2) * 8
        # Checked after the padding above so that an over-range final chunk
        # is reported here instead of failing inside struct.pack.
        if acc > 4294967295:
            raise ValueError('Base85 overflow in hunk starting at byte %d' % i)
        out.append(acc)

    out = struct.Struct('>%dL' % (len(out))).pack(*out)
    if tail:
        # Drop the bytes that only exist because of the padding.
        out = out[:-(5 - tail)]

    return out

# Legacy interface.
This code could be cleaned up since I don't believe # binascii has any line length limitations. It just doesn't seem worth it diff -r 937ba7bb2285 Lib/test/test_base64.py --- a/Lib/test/test_base64.py Fri Apr 19 07:10:45 2013 +0300 +++ b/Lib/test/test_base64.py Fri Apr 19 23:35:26 2013 +0300 @@ -260,12 +260,182 @@ eq(base64.b16decode(b'0102abcdef', True), b'\x01\x02\xab\xcd\xef') eq(base64.b16decode('0102abcdef', True), b'\x01\x02\xab\xcd\xef') + def test_a85encode(self): + eq = self.assertEqual + + tests = { + b"www.python.org": b'<~GB\\6`E-ZP=Df.1GEb>~>', + b"no padding..": b'<~DJpY:@:Wn_DJ(RS~>', + b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + b"0123456789!@#0^&*();:<>,. []{}": + b'<~@:E_WAS,RgBkhF"D/O92EH6,BF`qtRH$VbC6UX@47n?3D92&&T' + b":Jand;cHat='/U/0JP==1c70M3&r-I,;", + b"zero compression\0\0\0\0": b'<~H=_,8+Cf>,E,oN2F(oQ1z~>', + b"Boundary:\0\0\0\0": b'<~6>q!aA79M(3WK-[!!~>', + b"Space compr: ": b'<~;fH/TAKYK$D/aMV+', + bytes(range(255)): b"""<~!!*-'"9eu7#RLhG$k3[W&.oNg'GVB"(`=52*$$""" + b"""(B+<_pR,UFcb-n-Vr/1iJ-0JP==1c70M3&s#]4?Ykm5X@_(6q'R884cE""" + b"""H9MJ8X:f1+h<)lt#=BSg3>[:ZC?t!MSA7]@cBPD3sCi+'.E,fo>FEMbN""" + b"""G^4U^I!pHnJ:W<)KS>/9Ll%"IN/`jYOHG]iPa.Q$R$jD4S=Q7DTV8*TU""" + b"""nsrdW2ZetXKAY/Yd(L?['d?O\\@K2_]Y2%o^qmn*`5Ta:aN;TJbg"GZd""" + b"""*^:jeCE.%f\\,!5gtgiEi8N\\UjQ5OekiqBum-X60nF?)@o_%qPq"ad`""" + b"""r;HT~>""", + } + + for data, res in tests.items(): + eq(base64.a85encode(data), res) + + self.assertRaises(TypeError, base64.a85encode, "") + + def test_a85encode_btoa(self): + eq = self.assertEqual + + tests = { + b'zero compression\x00\x00\x00\x00': b'H=_,8+Cf>,E,oN2F(oQ1z', + b'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' + b'0123456789!@#0^&*();:<>,. 
[]{}': + b'@:E_WAS,RgBkhF"D/O92EH6,BF`qtRH$VbC6UX@47n?3D92&&T' + b":Jand;cHat='/U/0JP==1c70M3&r-I,;@P', + b'Boundary:\x00\x00\x00\x00': b'6>q!aA79M(3WK-[!!!!!', + bytes(range(255)): b"""!!*-'"9eu7#RLhG$k3[W&.oNg'GVB"(`=52*$$""" + b"""(B+<_pR,UFcb-n-Vr/1iJ-0JP==1c70M3&s#]4?Ykm5X@_(6q'R884cE""" + b"""H9MJ8X:f1+h<)lt#=BSg3>[:ZC?t!MSA7]@cBPD3sCi+'.E,fo>FEMbN""" + b"""G^4U^I!pHnJ:W<)KS>/9Ll%"IN/`jYOHG]iPa.Q$R$jD4S=Q7DTV8*TU""" + b"""nsrdW2ZetXKAY/Yd(L?['d?O\\@K2_]Y2%o^qmn*`5Ta:aN;TJbg"GZd""" + b"""*^:jeCE.%f\\,!5gtgiEi8N\\UjQ5OekiqBum-X60nF?)@o_%qPq"ad`""" + b"""r;HTp""", + } + + for data, res in tests.items(): + eq(base64.btoa85(data), res) + + eq(base64.btoa85(b"www.python.org", wrapcol=7), + b'GB\\6`\nE-ZP=\nDf.1G\nEb>@P') + eq(base64.btoa85(b"no padding..", wrapcol=10), b'DJpY:@:Wn_\nDJ(RS') + + def test_b85encode(self): + eq = self.assertEqual + + tests = { + bytes(range(255)): b"""009C61O)~M2nh-c3=Iws5D^j+6crX17#SKH9337X""" + b"""AR!_nBqb&%C@Cr{EG;fCFflSSG&MFiI5|2yJUu=?KtV!7L`6nNNJ&ad""" + b"""OifNtP*GA-R8>}2SXo+ITwPvYU}0ioWMyV&XlZI|Y;A6DaB*^Tbai%j""" + b"""czJqze0_d@fPsR8goTEOh>41ejE#,. []{}""": + b"""VPa!sWoBn+X=-b1ZEkOHadLBXb#`}nd3r%YLqtVJM@UIZOH55pPf$@(""" + b"""Q&d$}S6EqEFflSSG&MFiI5{CeBQRbjDkv#CIy^osE+AW7dwl""", + b"""Boundary:\x00\x00\x00\x00""": b"""LT`0$WMOi7IsgCw00""", + b'zero compression\x00\x00\x00\x00': b'dS!BNAY*TBaB^jHb7^mG00000', + b'www.python.org': b'cXxL#aCvlSZ*DGca%T', + b'no padding..': b'Zf_uPVPs@!Zf7no', + b'Space compr: ': b'Q*dEpWgug3ZE$irARr(h', + } + + for data, res in tests.items(): + eq(base64.b85encode(data), res) + + def test_a85decode(self): + eq = self.assertEqual + + tests = { + b'<~GB\\6`E-ZP=Df.1GEb>~>': b'www.python.org', + b'<~H=_,8+Cf>,E,oN2F(oQ1z~>': b'zero compression\x00\x00\x00\x00', + b"""<~! ! 
* -'"\n\t\t9eu\r\n7# RL\vhG$k3[W&.oNg'GVB"(`=52*$$""" + b"""(B+<_pR,UFcb-n-Vr/1iJ-0JP==1c70M3&s#]4?Ykm5X@_(6q'R884cE""" + b"""H9MJ8X:f1+h<)lt#=BSg3>[:ZC?t!MSA7]@cBPD3sCi+'.E,fo>FEMbN""" + b"""G^4U^I!pHnJ:W<)KS>/9Ll%"IN/`jYOHG]iPa.Q$R$jD4S=Q7DTV8*TU""" + b"""nsrdW2ZetXKAY/Yd(L?['d?O\\@K2_]Y2%o^qmn*`5Ta:aN;TJbg"GZd""" + b"""*^:jeCE.%f\\,!5gtgiEi8N\\UjQ5OekiqBum-X60nF?)@o_%qPq"ad`""" + b"""r;HT~>""": bytes(range(255)), + b'<~DJpY:@:Wn_DJ(RS~>': b'no padding..', + b"""<~@:E_WAS,RgBkhF"D/O92EH6,BF`qtRH$VbC6UX@47n?3D92&&T:Jand;c""" + b"""Hat='/U/0JP==1c70M3&r-I,;""": + b'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234' + b'56789!@#0^&*();:<>,. []{}', + b'<~6>q!aA79M(3WK-[!!~>': b"Boundary:\x00\x00\x00\x00", + b'<~;fH/TAKYK$D/aMV+': b'Space compr: ', + } + + for data, res in tests.items(): + eq(base64.a85decode(data), res) + eq(base64.a85decode(data.decode("ascii")), res) + + def test_a85decode_atob(self): + eq = self.assertEqual + + tests = { + b'GB\\6`E-ZP=Df.1GEb>@P': b'www.python.org\x00\x00', + b'H=_,8+Cf>,E,oN2F(oQ1z': b'zero compression\x00\x00\x00\x00', + b"""!!*-'"9eu7#RLhG$k3[W&.oNg'GVB"(`=52*$$""" + b"""(B+<_pR,UFcb-n-Vr/1iJ-0JP==1c70M3&s#]4?Ykm5X@_(6q'R884cE""" + b"""H9MJ8X:f1+h<)lt#=BSg3>[:ZC?t!MSA7]@cBPD3sCi+'.E,fo>FEMbN""" + b"""G^4U^I!pHnJ:W<)KS>/9Ll%"IN/`jYOHG]iPa.Q$R$jD4S=Q7DTV8*TU""" + b"""nsrdW2ZetXKAY/Yd(L?['d?O\\@K2_]Y2%o^qmn*`5Ta:aN;TJbg"GZd""" + b"""*^:jeCE.%f\\,!5gtgiEi8N\\UjQ5OekiqBum-X60nF?)@o_%qPq"ad`""" + b"""r;HTp""": bytes(range(255)) + b"\x00", + b'DJpY:@:Wn_DJ(RS': b'no padding..', + b"""@:E_WAS,RgBkhF"D/O92EH6,BF`qtRH$VbC6UX@47n?3D92&&T:Jand;c""" + b"""Hat='/U/0JP==1c70M3&r-I,;,. 
[]{}\x00\x00', + b'6>q!aA79M(3WK-[!!': b'Boundary:\x00\x00\x00\x00', + b';fH/TAKYK$D/aMVy': b'Space compr: ' + } + + for data, res in tests.items(): + eq(base64.atob85(data), res) + eq(base64.atob85(data.decode("ascii")), res) + + def test_b85decode(self): + eq = self.assertEqual + + tests = { + b"""009C61O)~M2nh-c3=Iws5D^j+6crX17#SKH9337X""" + b"""AR!_nBqb&%C@Cr{EG;fCFflSSG&MFiI5|2yJUu=?KtV!7L`6nNNJ&ad""" + b"""OifNtP*GA-R8>}2SXo+ITwPvYU}0ioWMyV&XlZI|Y;A6DaB*^Tbai%j""" + b"""czJqze0_d@fPsR8goTEOh>41ejE#,. []{}""", + b"""LT`0$WMOi7IsgCw00""": b"""Boundary:\x00\x00\x00\x00""", + b'dS!BNAY*TBaB^jHb7^mG00000': b'zero compression\x00\x00\x00\x00', + b'cXxL#aCvlSZ*DGca%T': b'www.python.org', + b'Zf_uPVPs@!Zf7no': b'no padding..', + b'Q*dEpWgug3ZE$irARr(h': b'Space compr: ', + } + + for data, res in tests.items(): + eq(base64.b85decode(data), res) + eq(base64.b85decode(data.decode("ascii")), res) + + def test_a85decode_errors(self): + self.assertRaises(binascii.Error, base64.a85decode, "malformed") + self.assertRaises(binascii.Error, base64.a85decode, "<~still malformed") + self.assertRaises(binascii.Error, base64.a85decode, "also malformed~>") + self.assertRaises(TypeError, base64.a85decode, "<~abcx~>") + self.assertRaises(TypeError, base64.a85decode, "<~abcdey~>") + self.assertRaises(TypeError, base64.a85decode, "<~a b\nc~>", + ignorechars=b"") + def test_decode_nonascii_str(self): decode_funcs = (base64.b64decode, base64.standard_b64decode, base64.urlsafe_b64decode, base64.b32decode, - base64.b16decode) + base64.b16decode, + base64.b85decode, + base64.a85decode) for f in decode_funcs: self.assertRaises(ValueError, f, 'with non-ascii \xcb')