Index: email/feedparser.py =================================================================== --- email/feedparser.py (revision 46492) +++ email/feedparser.py (working copy) @@ -28,8 +28,7 @@ NLCRE = re.compile('\r\n|\r|\n') NLCRE_bol = re.compile('(\r\n|\r|\n)') -NLCRE_eol = re.compile('(\r\n|\r|\n)$') -NLCRE_crack = re.compile('(\r\n|\r|\n)') +ENDRE = re.compile(r'(--)?[ \t]*(\r\n|\r|\n)?$') # RFC 2822 $3.6.8 Optional fields. ftext is %d33-57 / %d59-126, Any character # except controls, SP, and ":". headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])') @@ -38,7 +37,10 @@ NeedMoreData = object() +def FalsePred(line): + return False + class BufferedSubFile(object): """A file-ish object that can have new data loaded into it. @@ -47,6 +49,9 @@ current predicate matches the current line, a false EOF response (i.e. empty string) is returned instead. This lets the parser adhere to a simple abstraction -- it parses until EOF closes the current message. + + This object supports both readline() and iteration: it is optimized for + the latter. """ def __init__(self): # The last partial line pushed into this object. @@ -55,14 +60,24 @@ self._lines = [] # The stack of false-EOF checking predicates. self._eofstack = [] + self._at_eof = FalsePred # A flag indicating whether the file has been closed or not. self._closed = False def push_eof_matcher(self, pred): self._eofstack.append(pred) + self._eofstack.append(self._at_eof) + if self._at_eof is FalsePred: + self._at_eof = pred + else: + oldpred = self._at_eof + def at_eof(line): + return pred(line) or oldpred(line) + self._at_eof = at_eof def pop_eof_matcher(self): - return self._eofstack.pop() + self._at_eof = self._eofstack.pop() + return self._eofstack.pop() # pred def close(self): # Don't forget any trailing partial line. @@ -70,10 +85,10 @@ self._partial = '' self._closed = True - def readline(self): + def next(self): if not self._lines: if self._closed: - return '' + raise StopIteration return NeedMoreData # Pop the line off the stack and see if it matches the current # false-EOF predicate. @@ -81,36 +96,53 @@ # RFC 2046, section 5.1.2 requires us to recognize outer level # boundaries at any level of inner nesting. Do this, but be sure it's # in the order of most to least nested. - for ateof in self._eofstack[::-1]: - if ateof(line): - # We're at the false EOF. But push the last line back first. - self._lines.append(line) - return '' + if self._at_eof(line): + # We're at the false EOF. But push the last line back first. + self._lines.append(line) + raise StopIteration return line + def readline(self): + try: + return self.next() + except StopIteration: + return '' + + def consume_lines(self, lines): + """Read all lines until EOF or NeedMoreData.""" + # This can be implemented a lot more efficiently than next(). + count = 0 + ateof = self._at_eof + for line in self._lines[::-1]: + if ateof(line): + break + count += 1 + if count: + read = self._lines[-1:-count-1:-1] + del self._lines[-count:] + lines.extend(read) + if self._lines or self._closed: + return '' # ateof or real EOF + else: + return NeedMoreData + def unreadline(self, line): # Let the consumer push a line back into the buffer. assert line is not NeedMoreData + # XXX We should also assert that line is not empty. self._lines.append(line) def push(self, data): """Push some new data into this object.""" # Handle any previous leftovers data, self._partial = self._partial + data, '' - # Crack into lines, but preserve the newlines on the end of each - parts = NLCRE_crack.split(data) - # The *ahem* interesting behaviour of re.split when supplied grouping - # parentheses is that the last element of the resulting list is the - # data after the final RE. In the case of a NL/CR terminated string, - # this is the empty string. + # Crack into lines, but preserve the newlines on the end of each. + parts = _splitlines(data) + # The last element of the resulting list is the data after the final + # NL/CR. In the case of a NL/CR terminated string, this is the empty + # string. self._partial = parts.pop() - # parts is a list of strings, alternating between the line contents - # and the eol character(s). Gather up a list of lines after - # re-attaching the newlines. - lines = [] - for i in range(len(parts) // 2): - lines.append(parts[i*2] + parts[i*2+1]) - self.pushlines(lines) + self.pushlines(parts) def pushlines(self, lines): # Reverse and insert at the front of the lines. @@ -122,13 +154,35 @@ def __iter__(self): return self - def next(self): - line = self.readline() - if line == '': - raise StopIteration - return line + +def _splitlines(data): + # Split data at EOLs, exactly as for .readlines() except that if the + # data ends with an EOL, the last element is an empty string. + if '\r' not in data: # only \n + return _splitlines1(data, '\n') + if '\n' not in data: # only \r + return _splitlines1(data, '\r') + # Hope for only \r\n. + lines = [] + for part in data.split('\r\n'): + if '\r' in part or '\n' in part: # mixed EOLs, split the part + lines.extend(_splitlines(part)) + lines[-1] += '\r\n' + else: + lines.append(part + '\r\n') + return lines + +def _splitlines1(data, eol): + # Split quickly, then paste the EOLs back on. This is actually faster + # than splitting using find()! + parts = data.split(eol) + for i in range(len(parts) - 1): + parts[i] += eol + return parts + + class FeedParser: """A feed-style parser of email.""" @@ -294,9 +348,12 @@ # this onto the input stream until we've scanned past the # preamble. separator = '--' + boundary - boundaryre = re.compile( - '(?P' + re.escape(separator) + - r')(?P--)?(?P[ \t]*)(?P\r\n|\r|\n)?$') + # Few lines are boundaries, so use a fast rejection test. Note + # that boundarymatch may return False rather than None if it + # fails, unlike .match(). Match group 1 is the end marker, + # group 2 is the EOL. + def boundary_match(line): + return line[0] == '-' and _boundary_match(line, separator) capturing_preamble = True preamble = [] linesep = False @@ -307,14 +364,14 @@ continue if line == '': break - mo = boundaryre.match(line) + mo = boundary_match(line) if mo: # If we're looking at the end boundary, we're done with # this multipart. If there was a newline at the end of # the closing boundary, then we need to initialize the # epilogue with the empty string (see below). - if mo.group('end'): - linesep = mo.group('linesep') + if mo.group(1): + linesep = mo.group(2) break # We saw an inter-part boundary. Were we in the preamble? if capturing_preamble: @@ -322,9 +379,9 @@ # According to RFC 2046, the last newline belongs # to the boundary. lastline = preamble[-1] - eolmo = NLCRE_eol.search(lastline) - if eolmo: - preamble[-1] = lastline[:-len(eolmo.group(0))] + eol = _eol_end(lastline) + if eol: + preamble[-1] = lastline[:-len(eol)] self._cur.preamble = EMPTYSTRING.join(preamble) capturing_preamble = False self._input.unreadline(line) @@ -338,13 +395,13 @@ if line is NeedMoreData: yield NeedMoreData continue - mo = boundaryre.match(line) + mo = boundary_match(line) if not mo: self._input.unreadline(line) break # Recurse to parse this subpart; the input stream points # at the subpart's first line. - self._input.push_eof_matcher(boundaryre.match) + self._input.push_eof_matcher(boundary_match) for retval in self._parsegen(): if retval is NeedMoreData: yield NeedMoreData @@ -359,16 +416,16 @@ if epilogue == '': self._last.epilogue = None elif epilogue is not None: - mo = NLCRE_eol.search(epilogue) - if mo: - end = len(mo.group(0)) + eol = _eol_end(epilogue) + if eol: + end = len(eol) self._last.epilogue = epilogue[:-end] else: payload = self._last.get_payload() if isinstance(payload, basestring): - mo = NLCRE_eol.search(payload) - if mo: - payload = payload[:-len(mo.group(0))] + eol = _eol_end(payload) + if eol: + payload = payload[:-len(eol)] self._last.set_payload(payload) self._input.pop_eof_matcher() self._pop_message() @@ -387,10 +444,8 @@ self._cur.defects.append(errors.StartBoundaryNotFoundDefect()) self._cur.set_payload(EMPTYSTRING.join(preamble)) epilogue = [] - for line in self._input: - if line is NeedMoreData: - yield NeedMoreData - continue + while self._input.consume_lines(epilogue) is NeedMoreData: + yield NeedMoreData self._cur.epilogue = EMPTYSTRING.join(epilogue) return # If the end boundary ended in a newline, we'll need to make sure @@ -399,11 +454,8 @@ epilogue = [''] else: epilogue = [] - for line in self._input: - if line is NeedMoreData: - yield NeedMoreData - continue - epilogue.append(line) + while self._input.consume_lines(epilogue) is NeedMoreData: + yield NeedMoreData # Any CRLF at the front of the epilogue is not technically part of # the epilogue. Also, watch out for an empty string epilogue, # which means a single newline. @@ -417,11 +469,8 @@ # Otherwise, it's some non-multipart type, so the entire rest of the # file contents becomes the payload. lines = [] - for line in self._input: - if line is NeedMoreData: - yield NeedMoreData - continue - lines.append(line) + while self._input.consume_lines(lines) is NeedMoreData: + yield NeedMoreData self._cur.set_payload(EMPTYSTRING.join(lines)) def _parse_headers(self, lines): @@ -449,9 +498,9 @@ if line.startswith('From '): if lineno == 0: # Strip off the trailing newline - mo = NLCRE_eol.search(line) - if mo: - line = line[:-len(mo.group(0))] + eol = _eol_end(line) + if eol: + line = line[:-len(eol)] self._cur.set_unixfrom(line) continue elif lineno == len(lines) - 1: @@ -478,3 +527,25 @@ if lastheader: # XXX reconsider the joining of folded lines self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n') + + + +def _boundary_match(data, sep): + if not data.startswith(sep): + return None + return ENDRE.match(data[len(sep):]) + + +def _eol_end(data): + """If data ends with an EOL, return the EOL, otherwise None.""" + if not data: + return None + if data[-1] == '\r': + return '\r' + elif data[-1] == '\n': + if data.endswith('\r\n'): + return '\r\n' + else: + return '\n' + else: + return None