Index: email/feedparser.py
===================================================================
--- email/feedparser.py	(revision 46492)
+++ email/feedparser.py	(working copy)
@@ -28,8 +28,7 @@
 
 NLCRE = re.compile('\r\n|\r|\n')
 NLCRE_bol = re.compile('(\r\n|\r|\n)')
-NLCRE_eol = re.compile('(\r\n|\r|\n)$')
-NLCRE_crack = re.compile('(\r\n|\r|\n)')
+ENDRE = re.compile(r'(--)?[ \t]*(\r\n|\r|\n)?$')
 # RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
 # except controls, SP, and ":".
 headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
@@ -38,7 +37,10 @@
 
 NeedMoreData = object()
 
+def FalsePred(line):
+    return False    # default predicate: no line is ever a false EOF
 
+
 
 class BufferedSubFile(object):
     """A file-ish object that can have new data loaded into it.
@@ -47,6 +49,9 @@
     current predicate matches the current line, a false EOF response
     (i.e. empty string) is returned instead.  This lets the parser adhere to a
     simple abstraction -- it parses until EOF closes the current message.
+
+    This object supports both readline() and iteration: it is optimized for
+    the latter.
     """
     def __init__(self):
         # The last partial line pushed into this object.
@@ -55,14 +60,24 @@
         self._lines = []
         # The stack of false-EOF checking predicates.
         self._eofstack = []
+        self._at_eof = FalsePred
         # A flag indicating whether the file has been closed or not.
         self._closed = False
 
     def push_eof_matcher(self, pred):
         self._eofstack.append(pred)
+        self._eofstack.append(self._at_eof)  # restored by pop_eof_matcher()
+        if self._at_eof is FalsePred:
+            self._at_eof = pred
+        else:
+            oldpred = self._at_eof
+            def at_eof(line):
+                return pred(line) or oldpred(line)  # innermost first
+            self._at_eof = at_eof
 
     def pop_eof_matcher(self):
-        return self._eofstack.pop()
+        self._at_eof = self._eofstack.pop()  # predicate saved by the push
+        return self._eofstack.pop()     # pred
 
     def close(self):
         # Don't forget any trailing partial line.
@@ -70,10 +85,10 @@
         self._partial = ''
         self._closed = True
 
-    def readline(self):
+    def next(self):
         if not self._lines:
             if self._closed:
-                return ''
+                raise StopIteration
             return NeedMoreData
         # Pop the line off the stack and see if it matches the current
         # false-EOF predicate.
@@ -81,36 +96,53 @@
         # RFC 2046, section 5.1.2 requires us to recognize outer level
         # boundaries at any level of inner nesting.  Do this, but be sure it's
         # in the order of most to least nested.
-        for ateof in self._eofstack[::-1]:
-            if ateof(line):
-                # We're at the false EOF.  But push the last line back first.
-                self._lines.append(line)
-                return ''
+        if self._at_eof(line):
+            # We're at the false EOF.  But push the last line back first.
+            self._lines.append(line)
+            raise StopIteration
         return line
 
+    def readline(self):
+        try:
+            return self.next()    # a line, or NeedMoreData
+        except StopIteration:
+            return ''             # real EOF or false EOF
+
+    def consume_lines(self, lines):
+        """Append buffered lines to `lines` up to EOF or a false EOF."""
+        # This can be implemented a lot more efficiently than next().
+        count = 0
+        ateof = self._at_eof
+        for line in self._lines[::-1]:  # the next line to read is stored last
+            if ateof(line):
+                break
+            count += 1
+        if count:
+            read = self._lines[-1:-count-1:-1]  # in reading order
+            del self._lines[-count:]
+            lines.extend(read)
+        if self._lines or self._closed:
+            return '' # ateof or real EOF
+        else:
+            return NeedMoreData
+
     def unreadline(self, line):
         # Let the consumer push a line back into the buffer.
         assert line is not NeedMoreData
+        # XXX We should also assert that line is not empty.
         self._lines.append(line)
 
     def push(self, data):
         """Push some new data into this object."""
         # Handle any previous leftovers
         data, self._partial = self._partial + data, ''
-        # Crack into lines, but preserve the newlines on the end of each
-        parts = NLCRE_crack.split(data)
-        # The *ahem* interesting behaviour of re.split when supplied grouping
-        # parentheses is that the last element of the resulting list is the
-        # data after the final RE.  In the case of a NL/CR terminated string,
-        # this is the empty string.
+        # Crack into lines, but preserve the newlines on the end of each.
+        parts = _splitlines(data)
+        # The last element of the resulting list is the data after the final
+        # NL/CR ('' if data ended with one).  Hold it back as the new partial
+        # line until more data is pushed or the file is closed.
         self._partial = parts.pop()
-        # parts is a list of strings, alternating between the line contents
-        # and the eol character(s).  Gather up a list of lines after
-        # re-attaching the newlines.
-        lines = []
-        for i in range(len(parts) // 2):
-            lines.append(parts[i*2] + parts[i*2+1])
-        self.pushlines(lines)
+        self.pushlines(parts)  # only complete lines remain, EOLs attached
 
     def pushlines(self, lines):
         # Reverse and insert at the front of the lines.
@@ -122,13 +154,35 @@
     def __iter__(self):
         return self
 
-    def next(self):
-        line = self.readline()
-        if line == '':
-            raise StopIteration
-        return line
 
+
+def _splitlines(data):
+    # Split data at EOLs, exactly as for <file>.readlines() except that the
+    # last element is the data after the final EOL ('' if data ends with one).
+    if '\r' not in data:                # only \n
+        return _splitlines1(data, '\n')
+    if '\n' not in data:                # only \r
+        return _splitlines1(data, '\r')
+    lines = []                          # hope for only \r\n
+    for part in data.split('\r\n'):
+        if '\r' in part or '\n' in part: # mixed EOLs, split the part
+            lines.extend(_splitlines(part))
+            lines[-1] += '\r\n'
+        else:
+            lines.append(part + '\r\n')
+    lines[-1] = lines[-1][:-2]          # nothing followed the final part
+    return lines
 
+
+def _splitlines1(data, eol):
+    # Split on the one EOL flavour present, then re-attach it to every piece
+    # except the last.  The last piece is whatever followed the final EOL
+    # (possibly the empty string), so it stays unterminated.
+    pieces = data.split(eol)
+    tail = pieces.pop()
+    return [piece + eol for piece in pieces] + [tail]
+
+
 
 class FeedParser:
     """A feed-style parser of email."""
@@ -294,9 +348,12 @@
             # this onto the input stream until we've scanned past the
             # preamble.
             separator = '--' + boundary
-            boundaryre = re.compile(
-                '(?P<sep>' + re.escape(separator) +
-                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
+            # Few lines are boundaries, so use a fast rejection test.  Note
+            # that boundary_match may return False rather than None if it
+            # fails, unlike <re>.match().  Match group 1 is the end marker,
+            # group 2 is the EOL.  Slicing keeps an empty line from raising.
+            def boundary_match(line):
+                return line[:1] == '-' and _boundary_match(line, separator)
             capturing_preamble = True
             preamble = []
             linesep = False
@@ -307,14 +364,14 @@
                     continue
                 if line == '':
                     break
-                mo = boundaryre.match(line)
+                mo = boundary_match(line)
                 if mo:
                     # If we're looking at the end boundary, we're done with
                     # this multipart.  If there was a newline at the end of
                     # the closing boundary, then we need to initialize the
                     # epilogue with the empty string (see below).
-                    if mo.group('end'):
-                        linesep = mo.group('linesep')
+                    if mo.group(1):
+                        linesep = mo.group(2)
                         break
                     # We saw an inter-part boundary.  Were we in the preamble?
                     if capturing_preamble:
@@ -322,9 +379,9 @@
                             # According to RFC 2046, the last newline belongs
                             # to the boundary.
                             lastline = preamble[-1]
-                            eolmo = NLCRE_eol.search(lastline)
-                            if eolmo:
-                                preamble[-1] = lastline[:-len(eolmo.group(0))]
+                            eol = _eol_end(lastline)
+                            if eol:
+                                preamble[-1] = lastline[:-len(eol)]
                             self._cur.preamble = EMPTYSTRING.join(preamble)
                         capturing_preamble = False
                         self._input.unreadline(line)
@@ -338,13 +395,13 @@
                         if line is NeedMoreData:
                             yield NeedMoreData
                             continue
-                        mo = boundaryre.match(line)
+                        mo = boundary_match(line)
                         if not mo:
                             self._input.unreadline(line)
                             break
                     # Recurse to parse this subpart; the input stream points
                     # at the subpart's first line.
-                    self._input.push_eof_matcher(boundaryre.match)
+                    self._input.push_eof_matcher(boundary_match)
                     for retval in self._parsegen():
                         if retval is NeedMoreData:
                             yield NeedMoreData
@@ -359,16 +416,16 @@
                         if epilogue == '':
                             self._last.epilogue = None
                         elif epilogue is not None:
-                            mo = NLCRE_eol.search(epilogue)
-                            if mo:
-                                end = len(mo.group(0))
+                            eol = _eol_end(epilogue)
+                            if eol:
+                                end = len(eol)
                                 self._last.epilogue = epilogue[:-end]
                     else:
                         payload = self._last.get_payload()
                         if isinstance(payload, basestring):
-                            mo = NLCRE_eol.search(payload)
-                            if mo:
-                                payload = payload[:-len(mo.group(0))]
+                            eol = _eol_end(payload)
+                            if eol:
+                                payload = payload[:-len(eol)]
                                 self._last.set_payload(payload)
                     self._input.pop_eof_matcher()
                     self._pop_message()
@@ -387,10 +444,8 @@
                 self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                 self._cur.set_payload(EMPTYSTRING.join(preamble))
                 epilogue = []
-                for line in self._input:
-                    if line is NeedMoreData:
-                        yield NeedMoreData
-                        continue
+                while self._input.consume_lines(epilogue) is NeedMoreData:
+                    yield NeedMoreData
                 self._cur.epilogue = EMPTYSTRING.join(epilogue)
                 return
             # If the end boundary ended in a newline, we'll need to make sure
@@ -399,11 +454,8 @@
                 epilogue = ['']
             else:
                 epilogue = []
-            for line in self._input:
-                if line is NeedMoreData:
-                    yield NeedMoreData
-                    continue
-                epilogue.append(line)
+            while self._input.consume_lines(epilogue) is NeedMoreData:
+                yield NeedMoreData
             # Any CRLF at the front of the epilogue is not technically part of
             # the epilogue.  Also, watch out for an empty string epilogue,
             # which means a single newline.
@@ -417,11 +469,8 @@
         # Otherwise, it's some non-multipart type, so the entire rest of the
         # file contents becomes the payload.
         lines = []
-        for line in self._input:
-            if line is NeedMoreData:
-                yield NeedMoreData
-                continue
-            lines.append(line)
+        while self._input.consume_lines(lines) is NeedMoreData:
+            yield NeedMoreData
         self._cur.set_payload(EMPTYSTRING.join(lines))
 
     def _parse_headers(self, lines):
@@ -449,9 +498,9 @@
             if line.startswith('From '):
                 if lineno == 0:
                     # Strip off the trailing newline
-                    mo = NLCRE_eol.search(line)
-                    if mo:
-                        line = line[:-len(mo.group(0))]
+                    eol = _eol_end(line)
+                    if eol:
+                        line = line[:-len(eol)]
                     self._cur.set_unixfrom(line)
                     continue
                 elif lineno == len(lines) - 1:
@@ -478,3 +527,25 @@
         if lastheader:
             # XXX reconsider the joining of folded lines
             self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
+
+
+
+def _boundary_match(data, sep):
+    # ENDRE match on the text after the sep prefix; None if sep is no prefix.
+    if data[:len(sep)] == sep:
+        return ENDRE.match(data[len(sep):])
+
+
+def _eol_end(data):
+    """Return the EOL that terminates data, or None if there is none.
+
+    The recognized EOLs are CRLF, bare CR and bare LF.
+    """
+    if not data:
+        return None
+    # Try CRLF first so that it is never reported as a bare LF.
+    for eol in ('\r\n', '\r', '\n'):
+        if data.endswith(eol):
+            return eol
+    # The last character is not part of an EOL.
+    return None