# HG changeset patch
# Parent 2d71d0f954fb3b293bf097d99e958a000eb17696
Document, fix and test quoted-printable newline handling

* Output uses \n for newlines by default (e.g. for soft line breaks)
* Output uses CRLF instead if CRLF is found in the input (even for non-text)
* Native Python implementation in quopri did not handle CRLF
* Fix typos in documentation
* quopri uses istext=True
* header flag does not affect newline encoding; only istext affects it
* Avoid unnecessarily breaking 76-char quoted-printable line ending in escape
* Fix native Python encoder splitting an escape code across two lines

diff -r 2d71d0f954fb Doc/library/binascii.rst
--- a/Doc/library/binascii.rst	Sat Jan 17 17:33:49 2015 -0800
+++ b/Doc/library/binascii.rst	Tue Jan 20 04:38:41 2015 +0000
@@ -62,7 +62,8 @@
 .. function:: a2b_qp(string, header=False)
 
    Convert a block of quoted-printable data back to binary and return the binary
-   data. More than one line may be passed at a time. If the optional argument
+   data. More than one line may be passed at a time, using either
+   ``b"\r\n"`` or ``b"\n"`` for newlines. If the optional argument
    *header* is present and true, underscores will be decoded as spaces.
 
    .. versionchanged:: 3.2
@@ -71,15 +72,16 @@
 
 .. function:: b2a_qp(data, quotetabs=False, istext=True, header=False)
 
-   Convert binary data to a line(s) of ASCII characters in quoted-printable
+   Convert binary data to line(s) of ASCII characters in quoted-printable
    encoding.  The return value is the converted line(s). If the optional argument
    *quotetabs* is present and true, all tabs and spaces will be encoded.   If the
-   optional argument *istext* is present and true, newlines are not encoded but
+   argument *istext* is true (the default), newlines are not encoded, but
    trailing whitespace will be encoded. If the optional argument *header* is
-   present and true, spaces will be encoded as underscores per RFC1522. If the
-   optional argument *header* is present and false, newline characters will be
-   encoded as well; otherwise linefeed conversion might corrupt the binary data
-   stream.
+   present and true, spaces will be encoded as underscores per :rfc:`1522`.
+
+   The return value uses ``b"\n"`` for hard and soft newlines by default,
+   but will use ``b"\r\n"`` instead if that sequence is found in
+   the original data (even if *istext* is false).
 
 
 .. function:: a2b_hqx(string)
diff -r 2d71d0f954fb Doc/library/quopri.rst
--- a/Doc/library/quopri.rst	Sat Jan 17 17:33:49 2015 -0800
+++ b/Doc/library/quopri.rst	Tue Jan 20 04:38:41 2015 +0000
@@ -24,24 +24,34 @@
 .. function:: decode(input, output, header=False)
 
    Decode the contents of the *input* file and write the resulting decoded binary
-   data to the *output* file. *input* and *output* must be :term:`binary file objects
-   <file object>`.  If the optional argument *header* is present and true, underscore
-   will be decoded as space. This is used to decode "Q"-encoded headers as
+   data to the *output* file. The *input* and *output* arguments
+   must be :term:`binary file objects <file object>`.
+   If the optional argument *header* is present and true, underscores
+   will be decoded as spaces. This is used to decode "Q"-encoded headers as
    described in :rfc:`1522`: "MIME (Multipurpose Internet Mail Extensions)
    Part Two: Message Header Extensions for Non-ASCII Text".
 
+   This function is equivalent to applying :func:`binascii.a2b_qp` to
+   the file data.
+
 
 .. function:: encode(input, output, quotetabs, header=False)
 
-   Encode the contents of the *input* file and write the resulting quoted-
-   printable data to the *output* file. *input* and *output* must be
-   :term:`binary file objects <file object>`. *quotetabs*, a flag which controls
-   whether to encode embedded spaces and tabs must be provideda and when true it
-   encodes such embedded whitespace, and when false it leaves them unencoded.
+   Encode the contents of the *input* file and write the resulting
+   quoted-printable data to the *output* file. The *input* and
+   *output* arguments must be :term:`binary file objects <file object>`.
+   The *quotetabs* flag, which controls whether to encode
+   embedded spaces and tabs, must be provided. When true, it
+   encodes such embedded whitespace, and when false, it leaves it unencoded.
    Note that spaces and tabs appearing at the end of lines are always encoded,
-   as per :rfc:`1521`.  *header* is a flag which controls if spaces are encoded
+   as per :rfc:`1521`.  The *header* flag controls whether spaces are encoded
    as underscores as per :rfc:`1522`.
 
+   This function is equivalent to applying :func:`binascii.b2a_qp` with
+   ``istext=True`` to the file data. Therefore, :func:`encode` should only
+   be used to encode text data that uses ``b"\r\n"`` or ``b"\n"`` as
+   newlines.
+
 
 .. function:: decodestring(s, header=False)
 
diff -r 2d71d0f954fb Lib/quopri.py
--- a/Lib/quopri.py	Sat Jan 17 17:33:49 2015 -0800
+++ b/Lib/quopri.py	Tue Jan 20 04:38:41 2015 +0000
@@ -56,7 +56,7 @@
         output.write(odata)
         return
 
-    def write(s, output=output, lineEnd=b'\n'):
+    def write(s, *, output=output, lineEnd):
         # RFC 1521 requires that the line ending in a space or tab must have
         # that trailing character encoded.
         if s and s[-1:] in b' \t':
@@ -71,10 +71,16 @@
         line = input.readline()
         if not line:
             break
+        # First, write out the previous line
+        if prevline is not None:
+            write(prevline, lineEnd=stripped or b'\n')
         outline = []
         # Strip off any readline induced trailing newline
         stripped = b''
-        if line[-1:] == b'\n':
+        if line[-2:] == b'\r\n':
+            line = line[:-2]
+            stripped = b'\r\n'
+        elif line[-1:] == b'\n':
             line = line[:-1]
             stripped = b'\n'
         # Calculate the un-length-limited encoded line
@@ -86,17 +92,18 @@
                 outline.append(b'_')
             else:
                 outline.append(c)
-        # First, write out the previous line
-        if prevline is not None:
-            write(prevline)
         # Now see if we need any soft line breaks because of RFC-imposed
         # length limitations.  Then do the thisline->prevline dance.
         thisline = EMPTYSTRING.join(outline)
+        soft_break = b'=' + (stripped or b'\n')
         while len(thisline) > MAXLINESIZE:
+            end = thisline.rfind(b"=", MAXLINESIZE-3, MAXLINESIZE)
+            if end < 0:
+                end = MAXLINESIZE-1
             # Don't forget to include the soft line break `=' sign in the
             # length calculation!
-            write(thisline[:MAXLINESIZE-1], lineEnd=b'=\n')
-            thisline = thisline[MAXLINESIZE-1:]
+            write(thisline[:end], lineEnd=soft_break)
+            thisline = thisline[end:]
         # Write out the current line
         prevline = thisline
     # Write out the last line, without a trailing newline
@@ -131,12 +138,17 @@
         if not line: break
         i, n = 0, len(line)
         if n > 0 and line[n-1:n] == b'\n':
-            partial = 0; n = n-1
+            partial = False
+            if line.endswith(b'\r\n'):
+                eol = b'\r\n'
+            else:
+                eol = b'\n'
+            n = n-len(eol)
             # Strip trailing whitespace
             while n > 0 and line[n-1:n] in b" \t\r":
                 n = n-1
         else:
-            partial = 1
+            partial = True
         while i < n:
             c = line[i:i+1]
             if c == b'_' and header:
@@ -144,7 +156,7 @@
             elif c != ESCAPE:
                 new = new + c; i = i+1
             elif i+1 == n and not partial:
-                partial = 1; break
+                partial = True; break
             elif i+1 < n and line[i+1] == ESCAPE:
                 new = new + ESCAPE; i = i+2
             elif i+2 < n and ishex(line[i+1:i+2]) and ishex(line[i+2:i+3]):
@@ -152,7 +164,7 @@
             else: # Bad escape sequence -- leave it in
                 new = new + c; i = i+1
         if not partial:
-            output.write(new + b'\n')
+            output.write(new + eol)
             new = b''
     if new:
         output.write(new)
diff -r 2d71d0f954fb Lib/test/test_binascii.py
--- a/Lib/test/test_binascii.py	Sat Jan 17 17:33:49 2015 -0800
+++ b/Lib/test/test_binascii.py	Tue Jan 20 04:38:41 2015 +0000
@@ -193,6 +193,15 @@
         self.assertEqual(binascii.b2a_qp(b'.'), b'=2E')
         self.assertEqual(binascii.b2a_qp(b'.\n'), b'=2E\n')
         self.assertEqual(binascii.b2a_qp(b'a.\n'), b'a.\n')
+        self.assertEqual(binascii.b2a_qp(b'x' * 77, istext=False),
+            b'x' * 75 + b'=\n'  # Non-text mode uses \n by default
+            b'xx')
+        self.assertEqual(binascii.b2a_qp(b'x' * 77 + b'\r\n', istext=False),
+            b'x' * 75 + b'=\r\n'  # Switches to CRLF if seen in data
+            b'xx=0D=0A')
+        # The header flag used to be documented as affecting newline encoding
+        self.assertEqual(binascii.b2a_qp(b'newline\n', header=True),
+            b'newline\n')
 
     def test_empty_string(self):
         # A test for SF bug #1022953.  Make sure SystemError is not raised.
diff -r 2d71d0f954fb Lib/test/test_quopri.py
--- a/Lib/test/test_quopri.py	Sat Jan 17 17:33:49 2015 -0800
+++ b/Lib/test/test_quopri.py	Tue Jan 20 04:38:41 2015 +0000
@@ -113,6 +113,14 @@
 zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'''),
         # Now some really complex stuff ;)
         (DECSAMPLE, ENCSAMPLE),
+
+        # 73 unescaped characters plus one =XX escape makes a full line
+        (b'#' * 73 + b'=\n', b'#' * 73 + b'=3D\n'),
+
+        # Escape codes should not be broken by soft line breaks
+        (b'#' * 74 + b'=\n', b'#' * 74 + b'=\n' b'=3D\n'),
+        (b'#' * 75 + b'=\n', b'#' * 75 + b'=\n' b'=3D\n'),
+        (b'#' * 76 + b'=\n', b'#' * 75 + b'=\n' b'#=3D\n'),
         )
 
     # These are used in the "quotetabs=1" tests.
@@ -159,6 +167,43 @@
             self.assertEqual(outfp.getvalue(), p)
 
     @withpythonimplementation
+    def test_newline(self):
+        '''Exercise support of \n and CRLF newlines'''
+        plain_lines = (
+            b'x' * 77 + b'Line 1 \t ',
+            b'Line 2\t \t',
+            b'Line 3',
+            b'No newline',
+        )
+        enc_lines = (
+            b'x' * 75 + b'=',
+            b'xx' b'Line 1 \t=20',
+            b'Line 2\t =09',
+            b'Line 3',
+            b'No newline',
+        )
+        for newline in (b'\r\n', b'\n'):
+            with self.subTest(repr(newline)):
+                plaintext = newline.join(plain_lines)
+                encoded = newline.join(enc_lines)
+                self.assertEqual(encoded, quopri.encodestring(plaintext))
+                self.assertEqual(plaintext, quopri.decodestring(encoded))
+
+        # Default soft newline is \n if there are no hard newlines
+        encoded = (b'x' * 75 + b'=\n'
+            b'xx')
+        self.assertEqual(encoded, quopri.encodestring(b'x' * 77))
+
+    @withpythonimplementation
+    def test_decode_nontext(self):
+        '''Should decode non-textual =0D and =0A bytes'''
+        encoded = (b'CR=0D CRLF=0D=0A NL=0A Soft NL=\n'
+            b'*Soft CRLF=\r\n'
+            b'*EOF')
+        data = b'CR\r CRLF\r\n NL\n Soft NL*Soft CRLF*EOF'
+        self.assertEqual(data, quopri.decodestring(encoded))
+
+    @withpythonimplementation
     def test_embedded_ws(self):
         for p, e in self.ESTRINGS:
             self.assertEqual(quopri.encodestring(p, quotetabs=True), e)
diff -r 2d71d0f954fb Modules/binascii.c
--- a/Modules/binascii.c	Sat Jan 17 17:33:49 2015 -0800
+++ b/Modules/binascii.c	Tue Jan 20 04:38:41 2015 +0000
@@ -1334,8 +1334,7 @@
 }
 
 /* XXX: This is ridiculously complicated to be backward compatible
- * (mostly) with the quopri module.  It doesn't re-create the quopri
- * module bug where text ending in CRLF has the CR encoded */
+ * (mostly) with the quopri module. */
 
 /*[clinic input]
 binascii.b2a_qp
@@ -1389,8 +1388,12 @@
             ((databuf[in] < 33) &&
              (databuf[in] != '\r') && (databuf[in] != '\n') &&
              (quotetabs || ((databuf[in] != '\t') && (databuf[in] != ' ')))))
-        {
-            if ((linelen + 3) >= MAXLINESIZE) {
+        { /* Input byte needs to be escaped to =XX */
+            if ((linelen + 3 > MAXLINESIZE) ||
+                ((linelen + 3 == MAXLINESIZE) && (in+1 < datalen) &&
+                    (!istext || ((databuf[in+1] != '\n') &&
+                        (databuf[in+1] != '\r' || databuf[in+2] != '\n')))))
+            { /* Soft line break needed before escape code */
                 linelen = 0;
                 if (crlf)
                     odatalen += 3;
@@ -1406,7 +1409,7 @@
                 ((databuf[in] == '\n') ||
                  ((in+1 < datalen) && (databuf[in] == '\r') &&
                  (databuf[in+1] == '\n'))))
-            {
+            { /* Literal hard line break */
                 linelen = 0;
                 /* Protect against whitespace on end of line */
                 if (in && ((databuf[in-1] == ' ') || (databuf[in-1] == '\t')))
@@ -1420,10 +1423,11 @@
                 else
                     in++;
             }
-            else {
+            else { /* Unescaped literal character */
                 if ((in + 1 != datalen) &&
                     (databuf[in+1] != '\n') &&
                     (linelen + 1) >= MAXLINESIZE) {
+                    /* Soft line break needed before character */
                     linelen = 0;
                     if (crlf)
                         odatalen += 3;
@@ -1437,8 +1441,7 @@
         }
     }
 
-    /* We allocate the output same size as input, this is overkill.
-     * The previous implementation used calloc() so we'll zero out the
+    /* The previous implementation used calloc() so we'll zero out the
      * memory here too, since PyMem_Malloc() does not guarantee that.
      */
     odata = (unsigned char *) PyMem_Malloc(odatalen);
@@ -1461,8 +1464,12 @@
              (databuf[in] != '\r') && (databuf[in] != '\n') &&
              (quotetabs ||
             (!quotetabs && ((databuf[in] != '\t') && (databuf[in] != ' '))))))
-        {
-            if ((linelen + 3 )>= MAXLINESIZE) {
+        { /* Input byte needs to be escaped to =XX */
+            if ((linelen + 3 > MAXLINESIZE) ||
+                ((linelen + 3 == MAXLINESIZE) && (in+1 < datalen) &&
+                    (!istext || ((databuf[in+1] != '\n') &&
+                        (databuf[in+1] != '\r' || databuf[in+2] != '\n')))))
+            { /* Soft line break needed before escape code */
                 odata[out++] = '=';
                 if (crlf) odata[out++] = '\r';
                 odata[out++] = '\n';
@@ -1479,7 +1486,7 @@
                 ((databuf[in] == '\n') ||
                  ((in+1 < datalen) && (databuf[in] == '\r') &&
                  (databuf[in+1] == '\n'))))
-            {
+            { /* Literal hard line break */
                 linelen = 0;
                 /* Protect against whitespace on end of line */
                 if (out && ((odata[out-1] == ' ') || (odata[out-1] == '\t'))) {
@@ -1496,10 +1503,11 @@
                 else
                     in++;
             }
-            else {
+            else { /* Unescaped literal character */
                 if ((in + 1 != datalen) &&
                     (databuf[in+1] != '\n') &&
                     (linelen + 1) >= MAXLINESIZE) {
+                    /* Soft line break needed before character */
                     odata[out++] = '=';
                     if (crlf) odata[out++] = '\r';
                     odata[out++] = '\n';