# HG changeset patch # Parent 2d71d0f954fb3b293bf097d99e958a000eb17696 Document, fix and test quoted-printable newline handling * \n by default (e.g. for soft line breaks) * CRLF if found in input (even in non-text) * Native Python implementation in quopri did not handle CRLF * Typo errors in documentation * quopri uses istext=True * header flag does not affect newline encoding; only istext affects it * Avoid unnecessarily breaking 76-char quoted-printable line ending in escape * Fixes breaking escape codes over two lines by native Python encoder diff -r 2d71d0f954fb Doc/library/binascii.rst --- a/Doc/library/binascii.rst Sat Jan 17 17:33:49 2015 -0800 +++ b/Doc/library/binascii.rst Tue Jan 20 04:38:41 2015 +0000 @@ -62,7 +62,8 @@ .. function:: a2b_qp(string, header=False) Convert a block of quoted-printable data back to binary and return the binary - data. More than one line may be passed at a time. If the optional argument + data. More than one line may be passed at a time, using either + ``b"\r\n"`` or ``b"\n"`` for newlines. If the optional argument *header* is present and true, underscores will be decoded as spaces. .. versionchanged:: 3.2 @@ -71,15 +72,16 @@ .. function:: b2a_qp(data, quotetabs=False, istext=True, header=False) - Convert binary data to a line(s) of ASCII characters in quoted-printable + Convert binary data to line(s) of ASCII characters in quoted-printable encoding. The return value is the converted line(s). If the optional argument *quotetabs* is present and true, all tabs and spaces will be encoded. If the - optional argument *istext* is present and true, newlines are not encoded but + argument *istext* is true (the default), newlines are not encoded, but trailing whitespace will be encoded. If the optional argument *header* is - present and true, spaces will be encoded as underscores per RFC1522. 
If the - optional argument *header* is present and false, newline characters will be - encoded as well; otherwise linefeed conversion might corrupt the binary data - stream. + present and true, spaces will be encoded as underscores per RFC1522. + + The return value uses ``b"\n"`` for hard and soft newlines by default, + but will use ``b"\r\n"`` instead if that sequence is found in + the original data (even if *istext* is false). .. function:: a2b_hqx(string) diff -r 2d71d0f954fb Doc/library/quopri.rst --- a/Doc/library/quopri.rst Sat Jan 17 17:33:49 2015 -0800 +++ b/Doc/library/quopri.rst Tue Jan 20 04:38:41 2015 +0000 @@ -24,24 +24,34 @@ .. function:: decode(input, output, header=False) Decode the contents of the *input* file and write the resulting decoded binary - data to the *output* file. *input* and *output* must be :term:`binary file objects - <file object>`. If the optional argument *header* is present and true, underscore - will be decoded as space. This is used to decode "Q"-encoded headers as + data to the *output* file. The *input* and *output* arguments + must be :term:`binary file objects <file object>`. + If the optional argument *header* is present and true, underscores + will be decoded as spaces. This is used to decode "Q"-encoded headers as described in :rfc:`1522`: "MIME (Multipurpose Internet Mail Extensions) Part Two: Message Header Extensions for Non-ASCII Text". + This function is equivalent to applying :func:`binascii.a2b_qp` to + the file data. + .. function:: encode(input, output, quotetabs, header=False) - Encode the contents of the *input* file and write the resulting quoted- - printable data to the *output* file. *input* and *output* must be - :term:`binary file objects <file object>`. *quotetabs*, a flag which controls - whether to encode embedded spaces and tabs must be provideda and when true it - encodes such embedded whitespace, and when false it leaves them unencoded. 
+ Encode the contents of the *input* file and write the resulting + quoted-printable data to the *output* file. The *input* and + *output* arguments must be :term:`binary file objects <file object>`. + The *quotetabs* flag, which controls whether to encode + embedded spaces and tabs, must be provided. When true, it + encodes such embedded whitespace, and when false, it leaves it unencoded. Note that spaces and tabs appearing at the end of lines are always encoded, - as per :rfc:`1521`. *header* is a flag which controls if spaces are encoded + as per :rfc:`1521`. The *header* flag controls if spaces are encoded as underscores as per :rfc:`1522`. + This function is equivalent to applying :func:`binascii.b2a_qp` with + ``istext=True`` to the file data. Therefore, :func:`encode` should only + be used to encode text data that uses ``b"\r\n"`` or ``b"\n"`` as + newlines. + .. function:: decodestring(s, header=False) diff -r 2d71d0f954fb Lib/quopri.py --- a/Lib/quopri.py Sat Jan 17 17:33:49 2015 -0800 +++ b/Lib/quopri.py Tue Jan 20 04:38:41 2015 +0000 @@ -56,7 +56,7 @@ output.write(odata) return - def write(s, output=output, lineEnd=b'\n'): + def write(s, *, output=output, lineEnd): # RFC 1521 requires that the line ending in a space or tab must have # that trailing character encoded. 
if s and s[-1:] in b' \t': @@ -71,10 +71,16 @@ line = input.readline() if not line: break + # First, write out the previous line + if prevline is not None: + write(prevline, lineEnd=stripped or b'\n') outline = [] # Strip off any readline induced trailing newline stripped = b'' - if line[-1:] == b'\n': + if line[-2:] == b'\r\n': + line = line[:-2] + stripped = b'\r\n' + elif line[-1:] == b'\n': line = line[:-1] stripped = b'\n' # Calculate the un-length-limited encoded line @@ -86,17 +92,18 @@ outline.append(b'_') else: outline.append(c) - # First, write out the previous line - if prevline is not None: - write(prevline) # Now see if we need any soft line breaks because of RFC-imposed # length limitations. Then do the thisline->prevline dance. thisline = EMPTYSTRING.join(outline) + soft_break = b'=' + (stripped or b'\n') while len(thisline) > MAXLINESIZE: + end = thisline.rfind(b"=", MAXLINESIZE-3, MAXLINESIZE) + if end < 0: + end = MAXLINESIZE-1 # Don't forget to include the soft line break `=' sign in the # length calculation! 
- write(thisline[:MAXLINESIZE-1], lineEnd=b'=\n') - thisline = thisline[MAXLINESIZE-1:] + write(thisline[:end], lineEnd=soft_break) + thisline = thisline[end:] # Write out the current line prevline = thisline # Write out the last line, without a trailing newline @@ -131,12 +138,17 @@ if not line: break i, n = 0, len(line) if n > 0 and line[n-1:n] == b'\n': - partial = 0; n = n-1 + partial = False + if line.endswith(b'\r\n'): + eol = b'\r\n' + else: + eol = b'\n' + n = n-len(eol) # Strip trailing whitespace while n > 0 and line[n-1:n] in b" \t\r": n = n-1 else: - partial = 1 + partial = True while i < n: c = line[i:i+1] if c == b'_' and header: @@ -144,7 +156,7 @@ elif c != ESCAPE: new = new + c; i = i+1 elif i+1 == n and not partial: - partial = 1; break + partial = True; break elif i+1 < n and line[i+1] == ESCAPE: new = new + ESCAPE; i = i+2 elif i+2 < n and ishex(line[i+1:i+2]) and ishex(line[i+2:i+3]): @@ -152,7 +164,7 @@ else: # Bad escape sequence -- leave it in new = new + c; i = i+1 if not partial: - output.write(new + b'\n') + output.write(new + eol) new = b'' if new: output.write(new) diff -r 2d71d0f954fb Lib/test/test_binascii.py --- a/Lib/test/test_binascii.py Sat Jan 17 17:33:49 2015 -0800 +++ b/Lib/test/test_binascii.py Tue Jan 20 04:38:41 2015 +0000 @@ -193,6 +193,15 @@ self.assertEqual(binascii.b2a_qp(b'.'), b'=2E') self.assertEqual(binascii.b2a_qp(b'.\n'), b'=2E\n') self.assertEqual(binascii.b2a_qp(b'a.\n'), b'a.\n') + self.assertEqual(binascii.b2a_qp(b'x' * 77, istext=False), + b'x' * 75 + b'=\n' # Non-text mode uses \n by default + b'xx') + self.assertEqual(binascii.b2a_qp(b'x' * 77 + b'\r\n', istext=False), + b'x' * 75 + b'=\r\n' # Switches to CRLF if seen in data + b'xx=0D=0A') + # header=True used to be documented as encoding newlines + self.assertEqual(binascii.b2a_qp(b'newline\n', header=True), + b'newline\n') def test_empty_string(self): # A test for SF bug #1022953. Make sure SystemError is not raised. 
diff -r 2d71d0f954fb Lib/test/test_quopri.py --- a/Lib/test/test_quopri.py Sat Jan 17 17:33:49 2015 -0800 +++ b/Lib/test/test_quopri.py Tue Jan 20 04:38:41 2015 +0000 @@ -113,6 +113,14 @@ zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'''), # Now some really complex stuff ;) (DECSAMPLE, ENCSAMPLE), + + # 73 unescaped characters plus one =XX escape makes a full line + (b'#' * 73 + b'=\n', b'#' * 73 + b'=3D\n'), + + # Escape codes should not be broken by soft line breaks + (b'#' * 74 + b'=\n', b'#' * 74 + b'=\n' b'=3D\n'), + (b'#' * 75 + b'=\n', b'#' * 75 + b'=\n' b'=3D\n'), + (b'#' * 76 + b'=\n', b'#' * 75 + b'=\n' b'#=3D\n'), ) # These are used in the "quotetabs=1" tests. @@ -159,6 +167,43 @@ self.assertEqual(outfp.getvalue(), p) @withpythonimplementation + def test_newline(self): + '''Exercise support of \n and CRLF newlines''' + plain_lines = ( + b'x' * 77 + b'Line 1 \t ', + b'Line 2\t \t', + b'Line 3', + b'No newline', + ) + enc_lines = ( + b'x' * 75 + b'=', + b'xx' b'Line 1 \t=20', + b'Line 2\t =09', + b'Line 3', + b'No newline', + ) + for newline in (b'\r\n', b'\n'): + with self.subTest(repr(newline)): + plaintext = newline.join(plain_lines) + encoded = newline.join(enc_lines) + self.assertEqual(encoded, quopri.encodestring(plaintext)) + self.assertEqual(plaintext, quopri.decodestring(encoded)) + + # Default soft newline is \n if there are no hard newlines + encoded = (b'x' * 75 + b'=\n' + b'xx') + self.assertEqual(encoded, quopri.encodestring(b'x' * 77)) + + @withpythonimplementation + def test_decode_nontext(self): + '''Should decode non-textual =0D and =0A bytes''' + encoded = (b'CR=0D CRLF=0D=0A NL=0A Soft NL=\n' + b'*Soft CRLF=\r\n' + b'*EOF') + data = b'CR\r CRLF\r\n NL\n Soft NL*Soft CRLF*EOF' + self.assertEqual(data, quopri.decodestring(encoded)) + + @withpythonimplementation def test_embedded_ws(self): for p, e in self.ESTRINGS: self.assertEqual(quopri.encodestring(p, quotetabs=True), e) diff -r 2d71d0f954fb 
Modules/binascii.c --- a/Modules/binascii.c Sat Jan 17 17:33:49 2015 -0800 +++ b/Modules/binascii.c Tue Jan 20 04:38:41 2015 +0000 @@ -1334,8 +1334,7 @@ } /* XXX: This is ridiculously complicated to be backward compatible - * (mostly) with the quopri module. It doesn't re-create the quopri - * module bug where text ending in CRLF has the CR encoded */ + * (mostly) with the quopri module. */ /*[clinic input] binascii.b2a_qp @@ -1389,8 +1388,12 @@ ((databuf[in] < 33) && (databuf[in] != '\r') && (databuf[in] != '\n') && (quotetabs || ((databuf[in] != '\t') && (databuf[in] != ' '))))) - { - if ((linelen + 3) >= MAXLINESIZE) { + { /* Input byte needs to be escaped to =XX */ + if ((linelen + 3 > MAXLINESIZE) || + ((linelen + 3 == MAXLINESIZE) && (in+1 < datalen) && + (!istext || ((databuf[in+1] != '\n') && + (databuf[in+1] != '\r' || databuf[in+2] != '\n'))))) + { /* Soft line break needed before escape code */ linelen = 0; if (crlf) odatalen += 3; @@ -1406,7 +1409,7 @@ ((databuf[in] == '\n') || ((in+1 < datalen) && (databuf[in] == '\r') && (databuf[in+1] == '\n')))) - { + { /* Literal hard line break */ linelen = 0; /* Protect against whitespace on end of line */ if (in && ((databuf[in-1] == ' ') || (databuf[in-1] == '\t'))) @@ -1420,10 +1423,11 @@ else in++; } - else { + else { /* Unescaped literal character */ if ((in + 1 != datalen) && (databuf[in+1] != '\n') && (linelen + 1) >= MAXLINESIZE) { + /* Soft line break needed before character */ linelen = 0; if (crlf) odatalen += 3; @@ -1437,8 +1441,7 @@ } } - /* We allocate the output same size as input, this is overkill. - * The previous implementation used calloc() so we'll zero out the + /* The previous implementation used calloc() so we'll zero out the * memory here too, since PyMem_Malloc() does not guarantee that. 
*/ odata = (unsigned char *) PyMem_Malloc(odatalen); @@ -1461,8 +1464,12 @@ (databuf[in] != '\r') && (databuf[in] != '\n') && (quotetabs || (!quotetabs && ((databuf[in] != '\t') && (databuf[in] != ' ')))))) - { - if ((linelen + 3 )>= MAXLINESIZE) { + { /* Input byte needs to be escaped to =XX */ + if ((linelen + 3 > MAXLINESIZE) || + ((linelen + 3 == MAXLINESIZE) && (in+1 < datalen) && + (!istext || ((databuf[in+1] != '\n') && + (databuf[in+1] != '\r' || databuf[in+2] != '\n'))))) + { /* Soft line break needed before escape code */ odata[out++] = '='; if (crlf) odata[out++] = '\r'; odata[out++] = '\n'; @@ -1479,7 +1486,7 @@ ((databuf[in] == '\n') || ((in+1 < datalen) && (databuf[in] == '\r') && (databuf[in+1] == '\n')))) - { + { /* Literal hard line break */ linelen = 0; /* Protect against whitespace on end of line */ if (out && ((odata[out-1] == ' ') || (odata[out-1] == '\t'))) { @@ -1496,10 +1503,11 @@ else in++; } - else { + else { /* Unescaped literal character */ if ((in + 1 != datalen) && (databuf[in+1] != '\n') && (linelen + 1) >= MAXLINESIZE) { + /* Soft line break needed before character */ odata[out++] = '='; if (crlf) odata[out++] = '\r'; odata[out++] = '\n';