Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(239)

Delta Between Two Patch Sets: Lib/email/quoprimime.py

Issue 5803: email/quoprimime: encode and decode are very slow on large messages
Left Patch Set: Created 8 years, 10 months ago
Right Patch Set: Created 6 years, 8 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « no previous file | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
LEFTRIGHT
1 # Copyright (C) 2001-2006 Python Software Foundation 1 # Copyright (C) 2001-2006 Python Software Foundation
2 # Author: Ben Gertzfield 2 # Author: Ben Gertzfield
3 # Contact: email-sig@python.org 3 # Contact: email-sig@python.org
4 4
5 """Quoted-printable content transfer encoding per RFCs 2045-2047. 5 """Quoted-printable content transfer encoding per RFCs 2045-2047.
6 6
7 This module handles the content transfer encoding method defined in RFC 2045 7 This module handles the content transfer encoding method defined in RFC 2045
8 to encode US ASCII-like 8-bit data called `quoted-printable'. It is used to 8 to encode US ASCII-like 8-bit data called `quoted-printable'. It is used to
9 safely encode text that is in a character set similar to the 7-bit US ASCII 9 safely encode text that is in a character set similar to the 7-bit US ASCII
10 character set, but that includes some 8-bit characters that are normally not 10 character set, but that includes some 8-bit characters that are normally not
(...skipping 11 matching lines...) Expand all
22 22
23 This module does not do the line wrapping or end-of-line character 23 This module does not do the line wrapping or end-of-line character
24 conversion necessary for proper internationalized headers; it only 24 conversion necessary for proper internationalized headers; it only
25 does dumb encoding and decoding. To deal with the various line 25 does dumb encoding and decoding. To deal with the various line
26 wrapping issues, use the email.header module. 26 wrapping issues, use the email.header module.
27 """ 27 """
28 28
29 __all__ = [ 29 __all__ = [
30 'body_decode', 30 'body_decode',
31 'body_encode', 31 'body_encode',
32 'body_quopri_check', 32 'body_length',
33 'body_quopri_len',
34 'decode', 33 'decode',
35 'decodestring', 34 'decodestring',
36 'encode',
37 'encodestring',
38 'header_decode', 35 'header_decode',
39 'header_encode', 36 'header_encode',
40 'header_quopri_check', 37 'header_length',
41 'header_quopri_len',
42 'quote', 38 'quote',
43 'unquote', 39 'unquote',
44 ] 40 ]
45 41
46 import re 42 import re
47 43 import io
48 from string import hexdigits 44
49 from email.utils import fix_eols 45 from string import ascii_letters, digits, hexdigits
50 46
51 CRLF = '\r\n' 47 CRLF = '\r\n'
52 NL = '\n' 48 NL = '\n'
53 49 EMPTYSTRING = ''
54 # See also Charset.py 50
55 MISC_LEN = 7 51 # Build a mapping of octets to the expansion of that octet. Since we're only
56 52 # going to have 256 of these things, this isn't terribly inefficient
57 hqre = re.compile(r'[^-a-zA-Z0-9!*+/ ]') 53 # space-wise. Remember that headers and bodies have different sets of safe
58 bqre = re.compile(r'[^ !-<>-~\t]') 54 # characters. Initialize both maps with the full expansion, and then override
59 bqre2 = re.compile(r'[^ !-<>-~\t\r\n]') 55 # the safe bytes with the more compact form.
60 56 _QUOPRI_MAP = ['=%02X' % c for c in range(256)]
61 57 _QUOPRI_HEADER_MAP = _QUOPRI_MAP[:]
62 58 _QUOPRI_BODY_MAP = _QUOPRI_MAP[:]
59
60 # Safe header bytes which need no encoding.
61 for c in b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii'):
62 _QUOPRI_HEADER_MAP[c] = chr(c)
63 # Headers have one other special encoding; spaces become underscores.
64 _QUOPRI_HEADER_MAP[ord(' ')] = '_'
65
66 # Safe body bytes which need no encoding.
67 for c in (b' !"#$%&\'()*+,-./0123456789:;<>'
68 b'?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
69 b'abcdefghijklmnopqrstuvwxyz{|}~\t'):
70 _QUOPRI_BODY_MAP[c] = chr(c)
71
72
63 73
64 # Helpers 74 # Helpers
65 def header_quopri_check(c): 75 def header_check(octet):
66 """Return True if the character should be escaped with header quopri.""" 76 """Return True if the octet should be escaped with header quopri."""
67 return bool(hqre.match(c)) 77 return chr(octet) != _QUOPRI_HEADER_MAP[octet]
68 78
69 79
70 def body_quopri_check(c): 80 def body_check(octet):
71 """Return True if the character should be escaped with body quopri.""" 81 """Return True if the octet should be escaped with body quopri."""
72 return bool(bqre.match(c)) 82 return chr(octet) != _QUOPRI_BODY_MAP[octet]
73 83
74 84
75 def header_quopri_len(s): 85 def header_length(bytearray):
76 """Return the length of str when it is encoded with header quopri.""" 86 """Return a header quoted-printable encoding length.
77 count = 0 87
78 for c in s: 88 Note that this does not include any RFC 2047 chrome added by
79 if hqre.match(c): 89 `header_encode()`.
80 count += 3 90
81 else: 91 :param bytearray: An array of bytes (a.k.a. octets).
82 count += 1 92 :return: The length in bytes of the byte array when it is encoded with
83 return count 93 quoted-printable for headers.
84 94 """
85 95 return sum(len(_QUOPRI_HEADER_MAP[octet]) for octet in bytearray)
86 def body_quopri_len(str): 96
87 """Return the length of str when it is encoded with body quopri.""" 97
88 count = 0 98 def body_length(bytearray):
89 for c in str: 99 """Return a body quoted-printable encoding length.
90 if bqre.match(c): 100
91 count += 3 101 :param bytearray: An array of bytes (a.k.a. octets).
92 else: 102 :return: The length in bytes of the byte array when it is encoded with
93 count += 1 103 quoted-printable for bodies.
94 return count 104 """
105 return sum(len(_QUOPRI_BODY_MAP[octet]) for octet in bytearray)
95 106
96 107
97 def _max_append(L, s, maxlen, extra=''): 108 def _max_append(L, s, maxlen, extra=''):
109 if not isinstance(s, str):
110 s = chr(s)
98 if not L: 111 if not L:
99 L.append(s.lstrip()) 112 L.append(s.lstrip())
100 elif len(L[-1]) + len(s) <= maxlen: 113 elif len(L[-1]) + len(s) <= maxlen:
101 L[-1] += extra + s 114 L[-1] += extra + s
102 else: 115 else:
103 L.append(s.lstrip()) 116 L.append(s.lstrip())
104 117
105 118
106 def unquote(s): 119 def unquote(s):
107 """Turn a string in the form =AB to the ASCII character with value 0xab""" 120 """Turn a string in the form =AB to the ASCII character with value 0xab"""
108 return chr(int(s[1:3], 16)) 121 return chr(int(s[1:3], 16))
109 122
110 123
111 def quote(c): 124 def quote(c):
112 return "=%02X" % ord(c) 125 return _QUOPRI_MAP[ord(c)]
113 126
114 def quote_match(m): 127
115 c = m.group(0)[0] 128 def header_encode(header_bytes, charset='iso-8859-1'):
116 return "=%02X" % ord(c)
117
118
119
120 def header_encode(header, charset="iso-8859-1", keep_eols=False,
121 maxlinelen=76, eol=NL):
122 """Encode a single header line with quoted-printable (like) encoding. 129 """Encode a single header line with quoted-printable (like) encoding.
123 130
124 Defined in RFC 2045, this `Q' encoding is similar to quoted-printable, but 131 Defined in RFC 2045, this `Q' encoding is similar to quoted-printable, but
125 used specifically for email header fields to allow charsets with mostly 7 132 used specifically for email header fields to allow charsets with mostly 7
126 bit characters (and some 8 bit) to remain more or less readable in non-RFC 133 bit characters (and some 8 bit) to remain more or less readable in non-RFC
127 2045 aware mail clients. 134 2045 aware mail clients.
128 135
129 charset names the character set to use to encode the header. It defaults 136 charset names the character set to use in the RFC 2046 header. It
130 to iso-8859-1. 137 defaults to iso-8859-1.
131 138 """
132 The resulting string will be in the form: 139 # Return empty headers as an empty string.
133 140 if not header_bytes:
134 "=?charset?q?I_f=E2rt_in_your_g=E8n=E8ral_dire=E7tion?\\n 141 return ''
135 =?charset?q?Silly_=C8nglish_Kn=EEghts?=" 142 # Iterate over every byte, encoding if necessary.
136 143 encoded = header_bytes.decode('latin1').translate(_QUOPRI_HEADER_MAP)
137 with each line wrapped safely at, at most, maxlinelen characters (defaults
138 to 76 characters). If maxlinelen is None, the entire string is encoded in
139 one chunk with no splitting.
140
141 End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted
142 to the canonical email line separator \\r\\n unless the keep_eols
143 parameter is True (the default is False).
144
145 Each line of the header will be terminated in the value of eol, which
146 defaults to "\\n". Set this to "\\r\\n" if you are using the result of
147 this function directly in email.
148 """
149 # Return empty headers unchanged
150 if not header:
151 return header
152
153 if not keep_eols:
154 header = fix_eols(header)
155
156 # Quopri encode each line, in encoded chunks no greater than maxlinelen in
157 # length, after the RFC chrome is added in.
158 quoted = []
159 if maxlinelen is None:
160 # An obnoxiously large number that's good enough
161 max_encoded = 100000
162 else:
163 max_encoded = maxlinelen - len(charset) - MISC_LEN - 1
164
165 for c in header:
166 # Space may be represented as _ instead of =20 for readability
167 if c == ' ':
168 _max_append(quoted, '_', max_encoded)
169 # These characters can be included verbatim
170 elif not hqre.match(c):
171 _max_append(quoted, c, max_encoded)
172 # Otherwise, replace with hex value like =E2
173 else:
174 _max_append(quoted, "=%02X" % ord(c), max_encoded)
175
176 # Now add the RFC chrome to each encoded chunk and glue the chunks 144 # Now add the RFC chrome to each encoded chunk and glue the chunks
177 # together. BAW: should we be able to specify the leading whitespace in 145 # together.
178 # the joiner? 146 return '=?%s?q?%s?=' % (charset, encoded)
179 joiner = eol + ' ' 147
180 return joiner.join(['=?%s?q?%s?=' % (charset, line) for line in quoted]) 148
181 149 _QUOPRI_BODY_ENCODE_MAP = _QUOPRI_BODY_MAP[:]
182 150 for c in b'\r\n':
183 151 _QUOPRI_BODY_ENCODE_MAP[c] = chr(c)
184 152
185 def encode(body, binary=False, maxlinelen=76, eol=NL): 153 def body_encode(body, maxlinelen=76, eol=NL):
186 """Encode with quoted-printable, wrapping at maxlinelen characters. 154 """Encode with quoted-printable, wrapping at maxlinelen characters.
187
188 If binary is False (the default), end-of-line characters will be converted
189 to the canonical email end-of-line sequence \\r\\n. Otherwise they will
190 be left verbatim.
191 155
192 Each line of encoded text will end with eol, which defaults to "\\n". Set 156 Each line of encoded text will end with eol, which defaults to "\\n". Set
193 this to "\\r\\n" if you will be using the result of this function directly 157 this to "\\r\\n" if you will be using the result of this function directly
194 in an email. 158 in an email.
195 159
196 Each line will be wrapped at, at most, maxlinelen characters (defaults to 160 Each line will be wrapped at, at most, maxlinelen characters before the
197 76 characters). Long lines will have the `soft linefeed' quoted-printable 161 eol string (maxlinelen defaults to 76 characters, the maximum value
198 character "=" appended to them, so the decoded text will be identical to 162 permitted by RFC 2045). Long lines will have the 'soft line break'
199 the original text. 163 quoted-printable character "=" appended to them, so the decoded text will
200 164 be identical to the original text.
201 This algorithm is considerably more efficient than the one included with 165
202 Python. The only difference I am aware of is that the Python algorithm will 166 The minimum maxlinelen is 4 to have room for a quoted character ("=XX")
203 output a line with length maxlinelen + 1 when the input line is exactly 167 followed by a soft line break. Smaller values will generate a
204 maxlinelen long and ends in ' ' or '\t'. I consider this a bug in the 168 ValueError.
205 Python implementation. 169
206 """ 170 """
171
172 if maxlinelen < 4:
173 raise ValueError("maxlinelen must be at least 4")
207 if not body: 174 if not body:
208 return body 175 return body
209 176
210 if not binary:
211 body = fix_eols(body)
212
213 # quote special characters 177 # quote special characters
214 body = bqre2.sub(quote_match, body) 178 body = body.translate(_QUOPRI_BODY_ENCODE_MAP)
215 179
216 eq_eol = '=' + eol 180 soft_break = '=' + eol
217 # leave space for the '=' at the end of a line 181 # leave space for the '=' at the end of a line
218 maxlinelen1 = maxlinelen - 1 182 maxlinelen1 = maxlinelen - 1
219 183
220 encoded_body = [] 184 encoded_body = []
185 append = encoded_body.append
186
221 for line in body.splitlines(): 187 for line in body.splitlines():
222 # break up the line into pieces no longer than maxlinelen - 1 188 # break up the line into pieces no longer than maxlinelen - 1
223 i, j = 0, maxlinelen1 189 start = 0
224 linelen = len(line) 190 laststart = len(line) - 1 - maxlinelen
225 while j < linelen: 191 while start <= laststart:
192 stop = start + maxlinelen1
226 # make sure we don't break up an escape sequence 193 # make sure we don't break up an escape sequence
227 if line[j-2] == '=': 194 if line[stop - 2] == '=':
228 j = j - 2 195 append(line[start:stop - 1])
229 elif line[j - 1] == '=': 196 start = stop - 2
230 j = j - 1 197 elif line[stop - 1] == '=':
231 encoded_body.append(line[i:j] + '=') 198 append(line[start:stop])
232 i, j = j, j + maxlinelen1 199 start = stop - 1
200 else:
201 append(line[start:stop] + '=')
202 start = stop
233 203
234 # handle rest of line, special case if line ends in whitespace 204 # handle rest of line, special case if line ends in whitespace
235 if line and line[-1] in ' \t': 205 if line and line[-1] in ' \t':
236 # match Python implementation "bug" 206 room = start - laststart
237 #if i > 0 and linelen == i + 1: 207 if room >= 3:
238 # encoded_body[-1] = encoded_body[-1][:-1] + line[-1] + eq_eol 208 # It's a whitespace character at end-of-line, and we have room
239 #else: 209 # for the three-character quoted encoding.
240 # encoded_body.append(line[i:] + eq_eol) 210 q = quote(line[-1])
241 encoded_body.append(line[i:] + eq_eol) 211 elif room == 2:
242 else: 212 # There's room for the whitespace character and a soft break.
243 encoded_body.append(line[i:]) 213 q = line[-1] + soft_break
214 else:
215 # There's room only for a soft break. The quoted whitespace
216 # will be the only content on the subsequent line.
217 q = soft_break + quote(line[-1])
218 append(line[start:-1] + q)
219 else:
220 append(line[start:])
244 221
245 # add back final newline if present 222 # add back final newline if present
246 if body[-1] in CRLF: 223 if body[-1] in CRLF:
247 encoded_body.append('') 224 append('')
248 225
249 return eol.join(encoded_body) 226 return eol.join(encoded_body)
250 227
251 228
252 # For convenience and backwards compatibility w/ standard base64 module
253 body_encode = encode
254 encodestring = encode
255
256
257
258 229
259 # BAW: I'm not sure if the intent was for the signature of this function to be 230 # BAW: I'm not sure if the intent was for the signature of this function to be
260 # the same as base64MIME.decode() or not... 231 # the same as base64MIME.decode() or not...
261 def decode(encoded, eol=NL): 232 def decode(encoded, eol=NL):
262 """Decode a quoted-printable string. 233 """Decode a quoted-printable string.
263 234
264 Lines are separated with eol, which defaults to \\n. 235 Lines are separated with eol, which defaults to \\n.
265 """ 236 """
266 if not encoded: 237 if not encoded:
267 return encoded 238 return encoded
(...skipping 25 matching lines...) Expand all
293 decoded += unquote(line[i:i+3]) 264 decoded += unquote(line[i:i+3])
294 i += 3 265 i += 3
295 # Otherwise, not in form =AB, pass literally 266 # Otherwise, not in form =AB, pass literally
296 else: 267 else:
297 decoded += c 268 decoded += c
298 i += 1 269 i += 1
299 270
300 if i == n: 271 if i == n:
301 decoded += eol 272 decoded += eol
302 # Special case if original string did not end with eol 273 # Special case if original string did not end with eol
303 if not encoded.endswith(eol) and decoded.endswith(eol): 274 if encoded[-1] not in '\r\n' and decoded.endswith(eol):
304 decoded = decoded[:-1] 275 decoded = decoded[:-1]
305 return decoded 276 return decoded
306 277
307 278
308 # For convenience and backwards compatibility w/ standard base64 module 279 # For convenience and backwards compatibility w/ standard base64 module
309 body_decode = decode 280 body_decode = decode
310 decodestring = decode 281 decodestring = decode
311 282
312 283
313
314 284
315 def _unquote_match(match): 285 def _unquote_match(match):
316 """Turn a match in the form =AB to the ASCII character with value 0xab""" 286 """Turn a match in the form =AB to the ASCII character with value 0xab"""
317 s = match.group(0) 287 s = match.group(0)
318 return unquote(s) 288 return unquote(s)
319 289
320 290
321 # Header decoding is done a bit differently 291 # Header decoding is done a bit differently
322 def header_decode(s): 292 def header_decode(s):
323 """Decode a string encoded with RFC 2045 MIME header `Q' encoding. 293 """Decode a string encoded with RFC 2045 MIME header `Q' encoding.
324 294
325 This function does not parse a full MIME header value encoded with 295 This function does not parse a full MIME header value encoded with
326 quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use 296 quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
327 the high level email.header class for that functionality. 297 the high level email.header class for that functionality.
328 """ 298 """
329 s = s.replace('_', ' ') 299 s = s.replace('_', ' ')
330 return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s) 300 return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s, re.ASCII)
LEFTRIGHT
« no previous file | no next file » | Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Toggle Comments ('s')

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+