diff -r 723b2abd94f9 Lib/base64.py
--- a/Lib/base64.py	Sat Apr 06 18:28:19 2013 +0100
+++ b/Lib/base64.py	Sun Apr 14 16:43:14 2013 +0100
@@ -1,14 +1,16 @@
 #! /usr/bin/env python3
 
-"""RFC 3548: Base16, Base32, Base64 Data Encodings"""
+"""RFC 3548: Base16, Base32, Base64; and Base85, Ascii85 Data Encodings"""
 
 # Modified 04-Oct-1995 by Jack Jansen to use binascii module
 # Modified 30-Dec-2003 by Barry Warsaw to add full RFC 3548 support
 # Modified 22-May-2007 by Guido van Rossum to use bytes everywhere
+# Modified 14-Apr-2013 by Martin Morrison to add Base85/Ascii85 support
 
 import re
 import struct
 import binascii
+import itertools
 
 
 __all__ = [
@@ -17,6 +19,8 @@
     # Generalized interface for other encodings
     'b64encode', 'b64decode', 'b32encode', 'b32decode',
     'b16encode', 'b16decode',
+    # Base85 and Ascii85 encodings
+    'b85encode', 'b85decode', 'a85encode', 'a85decode',
     # Standard Base64 encoding
     'standard_b64encode', 'standard_b64decode',
     # Some common Base64 alternatives.  As referenced by RFC 3458, see thread
@@ -302,7 +306,217 @@
         raise binascii.Error('Non-base16 digit found')
     return binascii.unhexlify(s)
 
+#
+# Ascii85 encoding/decoding
+#
+
+_B85START = b"<~"
+_B85END = b"~>"
+_B85FOLDNUL = b"z"
+_B85FOLDSPACE = b"y"
+
+def a85encode(b, *, foldspaces=False):
+    """Encode a byte string using Ascii85.
+
+    b is the byte string to encode.  The encoded byte string is returned,
+    bracketed by the '<~' and '~>' markers.
+
+    foldspaces is an optional flag that uses the special short sequence 'y'
+    instead of 4 consecutive spaces (ASCII 0x20) as supported by 'btoa'.  This
+    feature is not supported by the "standard" Adobe encoding.
+
+    TypeError is raised if b is not an instance of bytes_types.
+    """
+    if not isinstance(b, bytes_types):
+        raise TypeError("expected bytes, not {}".format(b.__class__.__name__))
+
+    quanta, leftover = divmod(len(b), 4)
+    words = struct.unpack("!{}I{}B".format(quanta, leftover), b)
+    encoded = [_B85START]
+
+    def encode(word):
+        # Return the five base-85 digits of a 32-bit word, most significant
+        # first, each offset by 33 so that digit 0 maps to '!'.
+        digits = []
+        for _ in range(5):
+            word, num = divmod(word, 85)
+            digits.append(num + 33)
+        return bytes(reversed(digits))
+
+    for word in itertools.islice(words, quanta):
+        if word == 0:
+            encoded.append(_B85FOLDNUL)
+        elif foldspaces and word == 0x20202020:
+            encoded.append(_B85FOLDSPACE)
+        else:
+            encoded.append(encode(word))
+
+    if leftover:
+        # Zero-pad the trailing bytes to a full word, encode it, and keep
+        # only the first leftover + 1 digits.
+        last, = struct.unpack("!I", bytes(words[quanta:] + (0,)*(4-leftover)))
+        encoded.append(encode(last)[:leftover + 1])
+
+    encoded.append(_B85END)
+
+    return b"".join(encoded)
+
+def a85decode(b, *, foldspaces=False):
+    """Decode an Ascii85 encoded byte string.
+
+    b is the byte string to decode.  It must be bracketed by the '<~' and
+    '~>' markers; whitespace between the markers is ignored.
+
+    foldspaces is a flag that specifies whether the 'y' short sequence should
+    be accepted as shorthand for 4 consecutive spaces (ASCII 0x20).  This
+    feature is not supported by the "standard" Adobe encoding.
+
+    binascii.Error is raised if the markers are missing, if a short sequence
+    appears in the middle of a 5-character group, or if a group encodes a
+    value that does not fit in 32 bits.  TypeError is raised if a character
+    outside the '!'..'u' digit range is found.
+    """
+    b = _bytes_from_decode_data(b)
+    # The length check rejects b"<~>", where the two markers overlap.
+    if (len(b) < 4 or not b.startswith(_B85START)
+            or not b.endswith(_B85END)):
+        raise binascii.Error("Ascii85 encoded byte sequences must be "
+                             "bracketed by <~ and ~>")
+    b = b[2:-2]  # Strip off start/end markers
+
+    def decode(fivechars):
+        # Convert a group of five Ascii85 digits to its integer value.
+        val = 0
+        for c in fivechars:
+            # Valid digits are '!' (33) through 'u' (117).
+            if not 33 <= c <= 117:
+                raise TypeError("Non-base85 digit found: {}"
+                                .format(chr(c)))
+            val = val * 85 + (c - 33)
+        if val > 0xFFFFFFFF:
+            raise binascii.Error("Ascii85 overflow")
+        return val
+
+    decoded = []    # Values of the complete 32-bit words
+    curr = []       # Digits collected so far for the current group
+    for c in b:
+        # Skip whitespace
+        if c in b' \t\n\r\v':
+            continue
+        # Handle special short sequences
+        if c == _B85FOLDNUL[0] or (foldspaces and c == _B85FOLDSPACE[0]):
+            if curr:
+                raise binascii.Error("short sequence {!r} inside an Ascii85 "
+                                     "5-tuple".format(chr(c)))
+            decoded.append(0 if c == _B85FOLDNUL[0] else 0x20202020)
+            continue
+        curr.append(c)
+        if len(curr) == 5:
+            decoded.append(decode(curr))
+            curr = []
+
+    last = []
+    if curr:
+        # Pad the trailing short group with 'u' (the largest digit), then
+        # throw away the bytes that came from the padding.
+        padding = 5 - len(curr)
+        word = decode(curr + [b'u'[0]] * padding)
+        last = list(struct.pack("!I", word)[:-padding])
+
+    return struct.pack("!{}I{}B".format(len(decoded), len(last)),
+                       *itertools.chain(decoded, last))
+
+# The following code is originally taken (with permission) from Mercurial
+
+_b85chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
+            "abcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~"
+_b85chars2 = [(a + b) for a in _b85chars for b in _b85chars]
+_b85dec = {}
+
+def _mkb85dec():
+    # Populate the byte value -> digit lookup table used by b85decode.
+    for i, c in enumerate(_b85chars):
+        _b85dec[ord(c)] = i
+
+def b85encode(b, pad=False):
+    """Encode a byte string in base85 format.
+
+    b is the byte string to encode.  The encoded byte string is returned.
+
+    If pad is true, the input is padded with NUL bytes so its length is a
+    multiple of 4 bytes before encoding, and the encoded padding is kept in
+    the result.
+
+    TypeError is raised if b is not an instance of bytes_types.
+    """
+    if not isinstance(b, bytes_types):
+        raise TypeError("expected bytes, not %s" % b.__class__.__name__)
+
+    length = len(b)
+    remainder = length % 4
+    if remainder:
+        # Use "+" rather than "+=" so that a caller's bytearray is not
+        # extended in place.
+        b = b + b'\0' * (4 - remainder)
+    words = struct.unpack('>%dL' % (len(b) // 4), b)
+
+    out = "".join(_b85chars[(word // 52200625) % 85] +
+                  _b85chars2[(word // 7225) % 7225] +
+                  _b85chars2[word % 7225]
+                  for word in words).encode("ascii")
+
+    if pad:
+        return out
+
+    # Trim padding
+    olen = remainder
+    if olen:
+        olen += 1
+    olen += length // 4 * 5
+    return out[:olen]
+
+def b85decode(b):
+    """Decode a base85-encoded byte string.
+
+    b is the byte string to decode.  The decoded byte string is returned.
+
+    ValueError is raised if a character outside the base85 alphabet is found
+    or if a chunk encodes a value that does not fit in 32 bits.
+    """
+    text = _bytes_from_decode_data(b)
+    if not _b85dec:
+        _mkb85dec()
+
+    out = []
+    for i in range(0, len(text), 5):
+        chunk = text[i:i + 5]
+        acc = 0
+        for j, c in enumerate(chunk):
+            try:
+                acc = acc * 85 + _b85dec[c]
+            except KeyError:
+                raise ValueError('bad base85 character at position %d'
+                                 % (i + j))
+        missing = 5 - len(chunk)
+        if missing:
+            # Only the final chunk can be short: scale it up to a full word
+            # and fill the low bytes, which are trimmed again below.
+            acc *= 85 ** missing
+            if missing < 4:
+                acc += 0xffffff >> (3 - missing) * 8
+        if acc > 4294967295:
+            raise ValueError('Base85 overflow in hunk starting at byte %d'
+                             % i)
+        out.append(acc)
+
+    out = struct.pack('>%dL' % len(out), *out)
+    # Drop the bytes that came from padding a short final chunk
+    cl = len(text) % 5
+    if cl:
+        out = out[:-(5 - cl)]
+
+    return out
 
 
 # Legacy interface.  This code could be cleaned up since I don't believe
 # binascii has any line length limitations.  It just doesn't seem worth it
diff -r 723b2abd94f9 Lib/test/test_base64.py
--- a/Lib/test/test_base64.py	Sat Apr 06 18:28:19 2013 +0100
+++ b/Lib/test/test_base64.py	Sun Apr 14 16:43:14 2013 +0100
@@ -260,12 +260,136 @@
         eq(base64.b16decode(b'0102abcdef', True), b'\x01\x02\xab\xcd\xef')
         eq(base64.b16decode('0102abcdef', True), b'\x01\x02\xab\xcd\xef')
 
+    def test_a85encode(self):
+        eq = self.assertEqual
+
+        tests = {
+            b"www.python.org": b'<~GB\\6`E-ZP=Df.1GEb>~>',
+            b"no padding..": b'<~DJpY:@:Wn_DJ(RS~>',
+            b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+            b"0123456789!@#0^&*();:<>,. []{}":
+                b'<~@:E_WAS,RgBkhF"D/O92EH6,BF`qtRH$VbC6UX@47n?3D92&&T'
+                b":Jand;cHat='/U/0JP==1c70M3&r-I,;<FN.OZ`-3]oSW/g+A(H[P~>",
+            b"zero compression\0\0\0\0": b'<~H=_,8+Cf>,E,oN2F(oQ1z~>',
+            b"Boundary:\0\0\0\0": b'<~6>q!aA79M(3WK-[!!~>',
+            b"Space compr:    ": b'<~;fH/TAKYK$D/aMV+<VdL~>',
+            bytes(range(255)): b"""<~!!*-'"9eu7#RLhG$k3[W&.oNg'GVB"(`=52*$$"""
+               b"""(B+<_pR,UFcb-n-Vr/1iJ-0JP==1c70M3&s#]4?Ykm5X@_(6q'R884cE"""
+               b"""H9MJ8X:f1+h<)lt#=BSg3>[:ZC?t!MSA7]@cBPD3sCi+'.E,fo>FEMbN"""
+               b"""G^4U^I!pHnJ:W<)KS>/9Ll%"IN/`jYOHG]iPa.Q$R$jD4S=Q7DTV8*TU"""
+               b"""nsrdW2ZetXKAY/Yd(L?['d?O\\@K2_]Y2%o^qmn*`5Ta:aN;TJbg"GZd"""
+               b"""*^:jeCE.%f\\,!5gtgiEi8N\\UjQ5OekiqBum-X60nF?)@o_%qPq"ad`"""
+               b"""r;HT~>""",
+        }
+
+        for data, res in tests.items():
+            eq(base64.a85encode(data), res)
+
+        self.assertRaises(TypeError, base64.a85encode, "")
+
+    def test_a85encode_foldspaces(self):
+        eq = self.assertEqual
+
+        tests = {
+            b"www.python.org": b'<~GB\\6`E-ZP=Df.1GEb>~>',
+            b"no padding..": b'<~DJpY:@:Wn_DJ(RS~>',
+            b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+            b"0123456789!@#0^&*();:<>,. []{}":
+                b'<~@:E_WAS,RgBkhF"D/O92EH6,BF`qtRH$VbC6UX@47n?3D92&&T'
+                b":Jand;cHat='/U/0JP==1c70M3&r-I,;<FN.OZ`-3]oSW/g+A(H[P~>",
+            b"zero compression\0\0\0\0": b'<~H=_,8+Cf>,E,oN2F(oQ1z~>',
+            b"Boundary:\x00\x00\x00\x00": b'<~6>q!aA79M(3WK-[!!~>',
+            b"Space compr:    ": b'<~;fH/TAKYK$D/aMVy~>',
+            bytes(range(255)): b"""<~!!*-'"9eu7#RLhG$k3[W&.oNg'GVB"(`=52*$$"""
+               b"""(B+<_pR,UFcb-n-Vr/1iJ-0JP==1c70M3&s#]4?Ykm5X@_(6q'R884cE"""
+               b"""H9MJ8X:f1+h<)lt#=BSg3>[:ZC?t!MSA7]@cBPD3sCi+'.E,fo>FEMbN"""
+               b"""G^4U^I!pHnJ:W<)KS>/9Ll%"IN/`jYOHG]iPa.Q$R$jD4S=Q7DTV8*TU"""
+               b"""nsrdW2ZetXKAY/Yd(L?['d?O\\@K2_]Y2%o^qmn*`5Ta:aN;TJbg"GZd"""
+               b"""*^:jeCE.%f\\,!5gtgiEi8N\\UjQ5OekiqBum-X60nF?)@o_%qPq"ad`"""
+               b"""r;HT~>""",
+        }
+
+        for data, res in tests.items():
+            eq(base64.a85encode(data, foldspaces=True), res)
+
+    def test_a85decode(self):
+        eq = self.assertEqual
+
+        tests = {
+            b'<~GB\\6`E-ZP=Df.1GEb>~>': b'www.python.org',
+            b'<~H=_,8+Cf>,E,oN2F(oQ1z~>': b'zero compression\x00\x00\x00\x00',
+            b"""<~!!*-'"9eu7#RLhG$k3[W&.oNg'GVB"(`=52*$$"""
+               b"""(B+<_pR,UFcb-n-Vr/1iJ-0JP==1c70M3&s#]4?Ykm5X@_(6q'R884cE"""
+               b"""H9MJ8X:f1+h<)lt#=BSg3>[:ZC?t!MSA7]@cBPD3sCi+'.E,fo>FEMbN"""
+               b"""G^4U^I!pHnJ:W<)KS>/9Ll%"IN/`jYOHG]iPa.Q$R$jD4S=Q7DTV8*TU"""
+               b"""nsrdW2ZetXKAY/Yd(L?['d?O\\@K2_]Y2%o^qmn*`5Ta:aN;TJbg"GZd"""
+               b"""*^:jeCE.%f\\,!5gtgiEi8N\\UjQ5OekiqBum-X60nF?)@o_%qPq"ad`"""
+               b"""r;HT~>""": bytes(range(255)),
+            b'<~DJpY:@:Wn_DJ(RS~>': b'no padding..',
+            b"""<~@:E_WAS,RgBkhF"D/O92EH6,BF`qtRH$VbC6UX@47n?3D92&&T:Jand;c"""
+            b"""Hat='/U/0JP==1c70M3&r-I,;<FN.OZ`-3]oSW/g+A(H[P~>""":
+                b'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234'
+                b'56789!@#0^&*();:<>,. []{}',
+            b'<~6>q!aA79M(3WK-[!!~>': b"Boundary:\x00\x00\x00\x00",
+            b'<~;fH/TAKYK$D/aMV+<VdL~>': b'Space compr:    '
+        }
+
+        for data, res in tests.items():
+            eq(base64.a85decode(data), res)
+            eq(base64.a85decode(data.decode("ascii")), res)
+
+    def test_a85decode_foldspaces(self):
+        eq = self.assertEqual
+
+        tests = {
+            b'<~GB\\6`E-ZP=Df.1GEb>~>': b'www.python.org',
+            b'<~H=_,8+Cf>,E,oN2F(oQ1z~>': b'zero compression\x00\x00\x00\x00',
+            b"""<~!!*-'"9eu7#RLhG$k3[W&.oNg'GVB"(`=52*$$"""
+               b"""(B+<_pR,UFcb-n-Vr/1iJ-0JP==1c70M3&s#]4?Ykm5X@_(6q'R884cE"""
+               b"""H9MJ8X:f1+h<)lt#=BSg3>[:ZC?t!MSA7]@cBPD3sCi+'.E,fo>FEMbN"""
+               b"""G^4U^I!pHnJ:W<)KS>/9Ll%"IN/`jYOHG]iPa.Q$R$jD4S=Q7DTV8*TU"""
+               b"""nsrdW2ZetXKAY/Yd(L?['d?O\\@K2_]Y2%o^qmn*`5Ta:aN;TJbg"GZd"""
+               b"""*^:jeCE.%f\\,!5gtgiEi8N\\UjQ5OekiqBum-X60nF?)@o_%qPq"ad`"""
+               b"""r;HT~>""": bytes(range(255)),
+            b'<~DJpY:@:Wn_DJ(RS~>': b'no padding..',
+            b"""<~@:E_WAS,RgBkhF"D/O92EH6,BF`qtRH$VbC6UX@47n?3D92&&T:Jand;c"""
+            b"""Hat='/U/0JP==1c70M3&r-I,;<FN.OZ`-3]oSW/g+A(H[P~>""":
+                b'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234'
+                b'56789!@#0^&*();:<>,. []{}',
+            b'<~6>q!aA79M(3WK-[!!~>': b'Boundary:\x00\x00\x00\x00',
+            b'<~;fH/TAKYK$D/aMVy~>': b'Space compr:    '
+        }
+
+        for data, res in tests.items():
+            eq(base64.a85decode(data, foldspaces=True), res)
+            eq(base64.a85decode(data.decode("ascii"), foldspaces=True), res)
+
+    def test_a85decode_errors(self):
+        self.assertRaises(binascii.Error, base64.a85decode, "malformed")
+        self.assertRaises(binascii.Error, base64.a85decode, "<~still malformed")
+        self.assertRaises(binascii.Error, base64.a85decode, "also malformed~>")
+        self.assertRaises(TypeError, base64.a85decode, "<~abcx~>")
+        self.assertRaises(TypeError, base64.a85decode, "<~abcdey~>")
+        self.assertRaises(binascii.Error, base64.a85decode, "<~>")
+        self.assertRaises(binascii.Error, base64.a85decode, "<~abcz~>")
+        self.assertRaises(binascii.Error, base64.a85decode, "<~uuuuu~>")
+
+    def test_b85_edge_cases(self):
+        # b85encode must not extend a caller-supplied bytearray in place
+        data = bytearray(b'abc')
+        base64.b85encode(data)
+        self.assertEqual(data, bytearray(b'abc'))
+        # An out-of-range short final chunk is reported as ValueError
+        self.assertRaises(ValueError, base64.b85decode, b'~~')
+
     def test_decode_nonascii_str(self):
         decode_funcs = (base64.b64decode,
                         base64.standard_b64decode,
                         base64.urlsafe_b64decode,
                         base64.b32decode,
-                        base64.b16decode)
+                        base64.b16decode,
+                        base64.b85decode,
+                        base64.a85decode)
         for f in decode_funcs:
             self.assertRaises(ValueError, f, 'with non-ascii \xcb')
 