"""A readline()-style interface to the parts of a multipart message.

The MultiFile class makes each part of a multipart message "feel" like
an ordinary file, as long as you use fp.readline().  Allows recursive
use, for nested multipart messages.  Probably best used together
with module mimetools.

Suggested use:

real_fp = open(...)
fp = MultiFile(real_fp)

'read some lines from fp'
fp.push(separator)
while 1:
        'read lines from fp until it returns an empty string' (A)
        if not fp.next(): break
fp.pop()
'read remaining lines from fp until it returns an empty string'

The latter sequence may be used recursively at (A).
It is also allowed to use multiple push()...pop() sequences.

If seekable is given as 0, the class code will not do the bookkeeping
it normally attempts in order to make seeks relative to the beginning of the
current file part.  This may be useful when using MultiFile with a
non-seekable stream object.
"""

# 2001-04-03: Geoffrey T. Dairiki
#
# This is a re-implementation of the stock python multifile.py
#
# The main changes:
#
#  1. Efficiency:
#
#     This version supports calling the read() method with an argument.
#     (In many cases, I've found that reading a MultiFile line by line
#     is just too slow --- remember multipart messages often contain
#     large binary attachments.)
#
#     This version performs reads on the underlying input stream in
#     larger chunks as well, and uses a regular expression search to
#     search for separator lines.
#
#  2. Buglets fixed
#
#     The original version has a buglet regarding its handling of the
#     newline which precedes a separator line.  According to RFC 2046,
#     section 5.1.1 the newline preceding a separator is part of the
#     separator, not part of the preceding content.  The old version
#     of multifile.py treats the newline as part of the content.  Thus,
#     it introduces a spurious empty line at the end of each content.
#
#     Matching of the separators: RFC 2046, section 5.1.1 also states,
#     that if the beginning of a line matches the separator, it is a
#     separator.  The old code ignores only trailing white space when
#     looking for a separator line.  This code ignores trailing anything,
#     on the separator line.

# 'string' is no longer used below (str methods are used instead), but the
# import is kept so the module's import surface is unchanged.
import string
import re

__all__ = ["MultiFile", "Error", "IllegalSeek", "UnexpectedEOF",
           "EndmarkMissing"]


class Error(RuntimeError):
    """Base class for the errors raised by this module."""
    pass


class IllegalSeek(Error):
    """Raised by tell() and seek() on a MultiFile that is not seekable."""

    def __init__(self, detail = "Illegal seek on multifile"):
        Error.__init__(self, detail)


class UnexpectedEOF(Error):
    """Raised when the underlying file ends before an expected mark."""

    def __init__(self, detail = "Sudden EOF on multifile"):
        Error.__init__(self, detail)


class EndmarkMissing(Error):
    """Raised when an enclosing boundary is met before the current end marker."""
    pass


# Matches (with an empty match) only at the very end of a string.  Used as a
# stand-in "mark" when the end of the underlying file ends the current part.
_END_OF_STRING = re.compile(r'\Z')


class MultiFile:
    """Present the current part of a multipart stream as a file.

    push(sep) starts treating lines that begin with "--" + sep as part
    boundaries, next() advances to the following part, and pop() returns
    to the enclosing level.  read(), readline() and readlines() stop at
    the next boundary; the newline that precedes a boundary line is
    treated as part of the boundary, not of the content.
    """

    def __init__(self, fp, seekable=1):
        """Wrap the file object fp.

        fp must provide read(); tell() and seek() are used only when
        seekable is true.  If fp.tell() fails, the MultiFile silently
        becomes non-seekable.
        """
        self.fp = fp
        self.readahead = ''         # Read from fp but not yet consumed
        self.pos = 0                # Position in underlying file
        self.seekable = seekable
        if seekable:
            try:
                self.pos = fp.tell()
            except Exception:
                # Best effort: a stream that cannot report its position
                # is simply treated as non-seekable.
                self.seekable = 0
        self.startpos = self.pos    # Start of current 'file'
        self.endpos = None          # End of current 'file', if known
        self.limit = None           # min(enclosing endpos's)
        self.sep = None             # current boundary string

        # Matches current separator, current end marker,
        # or any enclosing separators or end markers:
        self.mark_re = re.compile('(?=a)b') # Never matches
        # Same as above, but includes trailing cruft through newline.
        self.markline_re = self.mark_re
        # Maximum length of a match for self.mark_re.
        self.max_mark_len = 0

        self.done = 0               # End marker passed?
        self.stack = []             # Saved state of the enclosing levels

    def tell(self):
        """Return the read position relative to the start of the current part.

        Raises IllegalSeek if the MultiFile is not seekable.
        """
        if not self.seekable:
            raise IllegalSeek
        return self.pos - self.startpos

    def seek(self, pos, whence=0):
        """Move the read position within the current part.

        whence is 0 (relative to the start of the part, the default),
        1 (relative to the current position) or 2 (relative to the end
        of the part).

        Raises IllegalSeek if the MultiFile is not seekable, ValueError
        if the target lies before the start of the part, and
        UnexpectedEOF if whence is 2 and the end of the part cannot be
        found.
        """
        if not self.seekable:
            raise IllegalSeek
        # Figure out where end of current file is.
        if self.endpos is None:
            curpos = self.pos
            try:
                # Reading to the end of the part records self.endpos.
                while self.read(4096):
                    pass
            finally:
                self.pos = curpos
                self.readahead = ''
        if whence == 1:
            newpos = self.pos + pos
        elif whence == 2:
            if self.endpos is None:
                # The scan above reached end of file without a mark.
                raise UnexpectedEOF
            newpos = self.endpos + pos
        else:
            newpos = self.startpos + pos
        if newpos < self.startpos:
            raise ValueError("Seek past beginning of file")
        self.pos = newpos
        self.fp.seek(self.pos)
        self.readahead = ''

    def __read_more(self, size = 8192):
        """Append up to size more characters from fp to the read-ahead buffer.

        Raises UnexpectedEOF if fp has nothing left.
        """
        hunk = self.fp.read(size)
        if not hunk:
            raise UnexpectedEOF
        self.readahead = self.readahead + hunk

    def __do_read(self, size):
        """Make the read-ahead buffer at least size long, or raise UnexpectedEOF."""
        assert size >= 0
        while 1:
            need = size - len(self.readahead)
            if need <= 0:
                break
            self.__read_more(need)

    def __try_to_read(self, size):
        """Like __do_read(), but report end of file instead of raising.

        Returns a match object positioned at the end of the read-ahead
        buffer if fp was exhausted, or None if size characters are
        buffered.
        """
        try:
            self.__do_read(size)
        except UnexpectedEOF:
            return _END_OF_STRING.search(self.readahead)
        else:
            return None

    def __fillbuf(self, size):
        """Buffer data for a read and return how many characters to hand out.

        size is the caller's request; a negative size means "up to the
        end of the current part".  Sets self.endpos when the end of the
        part is discovered.  Raises EndmarkMissing if a boundary of an
        enclosing level is met first.
        """
        if self.endpos is not None:
            # If we know where the end is, no need to search for marks.
            # Just read in the data.
            nleft = self.endpos - self.pos
            if size >= 0:
                nleft = min(size, nleft)
            if nleft <= 0:
                return 0
            self.__do_read(nleft)
            return nleft

        # Remember this now: size may be reduced by self.limit below.
        unbounded = size < 0

        # NOTE(review): '^' in mark_re also matches at index 0 of the
        # read-ahead buffer, so after a bounded read that stops in the
        # middle of a line, remaining text that begins with a marker
        # would be taken for a mark.  Confirm whether callers can hit this.
        if unbounded:
            # Read until mark found
            if self.sep is None:
                self.readahead = self.readahead + self.fp.read()
                eof = _END_OF_STRING.search(self.readahead)
            else:
                eof = self.mark_re.search(self.readahead)
                while not eof:
                    self.__read_more()
                    eof = self.mark_re.search(self.readahead)
        else:
            if self.limit is not None:
                size = min(size, self.limit - self.pos)
            # Read enough to include mark if there is one
            eof = self.__try_to_read(size + self.max_mark_len)
            if self.sep is not None:
                eof = self.mark_re.search(self.readahead, 0,
                                          size + self.max_mark_len)

        if eof:
            if (self.sep is not None) and (eof.group('badmark') is not None):
                raise EndmarkMissing(
                    "Missing endmarker (sep = '%s')" % self.sep)
            self.endpos = self.pos + eof.start()
            avail = eof.start()
        else:
            avail = len(self.readahead)

        if unbounded:
            # Hand out everything up to the mark.  (Taking min() with the
            # negative size here would yield -1 and chop the last
            # character off the data.)
            return avail
        return min(size, avail)

    def read(self, size = -1):
        """Read up to size characters of the current part (all of it by default).

        Returns '' at the end of the part.  May raise UnexpectedEOF or
        EndmarkMissing while looking for the end of the part.
        """
        size = self.__fillbuf(size)
        hunk = self.readahead[:size]
        self.readahead = self.readahead[size:]
        self.pos = self.pos + size
        return hunk

    def readline(self):
        """Read one line of the current part, including its newline.

        The newline before a boundary belongs to the boundary, so the
        last line of a part is returned without it.  Returns '' at the
        end of the part.
        """
        if self.endpos is not None and self.pos >= self.endpos:
            return ''
        try:
            while 1:
                line_len = self.readahead.find('\n') + 1
                if line_len:
                    return self.read(line_len)
                self.__read_more()
        except UnexpectedEOF:
            # Outside any push(), end of file simply ends the data;
            # return the unterminated last line.
            if self.sep is None:
                return self.read()
            raise

    def readlines(self):
        """Read the rest of the current part and return it as a list of lines.

        Each line keeps its trailing newline, except a final line that
        is not newline-terminated.
        """
        lines = self.read().split('\n')
        last = lines.pop()
        lines = [line + '\n' for line in lines]
        if last:
            lines.append(last)
        return lines

    def __at_mark(self):
        """Return a match for the marker line at the read position, or None.

        The match covers the whole marker line, through its newline (or
        through end of file if the marker is on the last, unterminated
        line).
        """
        self.__try_to_read(self.max_mark_len)
        mark = self.mark_re.match(self.readahead)
        if mark:
            # Find end of marker line
            try:
                while 1:
                    mark = self.markline_re.match(self.readahead)
                    if mark:
                        break
                    self.__read_more(256)
            except UnexpectedEOF:
                # The marker is on the last line and has no newline.
                # re.M is required here just as for mark_re: the pattern
                # relies on '^' matching right after the leading newline.
                mark = re.match(self.mark_re.pattern + r".*\Z",
                                self.readahead, re.M)
                assert mark
        return mark

    def next(self):
        """Skip to the start of the next part.

        Returns 1 if there is another part at the current level, or 0
        if the end marker has been passed, a boundary of an enclosing
        level was met, or no separator is active.  Raises UnexpectedEOF
        if the underlying file ends before any boundary.
        """
        if self.done:
            return 0
        # Skip whatever is left of the current part.
        while self.read(4096):
            pass
        if self.sep is None:        # no marks on stack
            return 0
        if self.endpos is None:
            # Data ran out and no boundary was found.
            raise UnexpectedEOF
        if self.pos > self.endpos:
            assert self.seekable
            self.pos = self.endpos
            self.fp.seek(self.pos)
            self.readahead = ''

        mark = self.__at_mark()
        assert mark
        if mark.group('badmark') is not None:
            return 0

        # Consume the marker line; the next part starts right after it.
        mark_len = mark.end()
        self.readahead = self.readahead[mark_len:]
        self.pos = self.pos + mark_len
        self.startpos = self.pos

        if mark.group('endmark') is not None:
            self.endpos = self.pos
            self.done = 1
            return 0
        else:
            self.endpos = None
            return 1

    def push(self, sep):
        """Start splitting the data that follows on the boundary string sep.

        The current state is saved and restored by the matching pop().
        Raises Error if called past the end of the current part, after
        its end marker, or while positioned at a marker; raises
        ValueError if sep is None.
        """
        if self.endpos is not None and self.pos > self.endpos:
            raise Error('bad MultiFile.push() call')
        if self.done or self.__at_mark():
            raise Error('bad MultiFile.push() call')
        if sep is None:
            raise ValueError('bad separator')
        self.stack.append( (self.sep, self.startpos, self.endpos, self.limit,
                            self.mark_re, self.markline_re,
                            self.max_mark_len) )
        if self.endpos is not None:
            # Nested parts may not extend past the innermost known end.
            self.limit = min([x for x in (self.endpos, self.limit)
                              if x is not None])
        self.sep = sep
        self.startpos, self.endpos = self.pos, None
        self.__compute_regexps()

    def __compute_regexps(self):
        """Build mark_re, markline_re and max_mark_len for the current level.

        In a match, group 'endmark' is set for the current end marker,
        group 'badmark' for a separator or end marker of an enclosing
        level, and neither for the current separator.
        """
        def common_prefix(strings):
            prefix = strings[0]
            for item in strings[1:]:
                while prefix != item[:len(prefix)]:
                    prefix = prefix[:-1]
            return prefix

        assert self.sep is not None
        mark = self.section_divider(self.sep)
        endmark = self.end_marker(self.sep)
        badmarks = []
        # stack[0] is the state saved by the outermost push(); its
        # separator is None, so it contributes no marks.
        for sep in [entry[0] for entry in self.stack[1:]]:
            badmarks.append(self.end_marker(sep))
            badmarks.append(self.section_divider(sep))

        marks = [mark, endmark] + badmarks
        prefix = common_prefix(marks)

        def remove_prefix(x, p=prefix):
            return x[len(p):]

        mark = re.escape(remove_prefix(mark))
        endmark = re.escape(remove_prefix(endmark))
        badmarks = '|'.join([re.escape(remove_prefix(m)) for m in badmarks])
        prefix = re.escape(prefix)
        if not badmarks:
            badmarks = '(?=a)b'     # never matches

        # The optional leading newline is part of the mark (RFC 2046).
        regexp = ( r'(?:\r?\n)?^%s(?:(?P<endmark>%s)|%s|(?P<badmark>%s))'
                   % (prefix, endmark, mark, badmarks) )

        # + 2 allows for the "\r\n" that may precede the mark.
        self.max_mark_len = max(map(len, marks)) + 2
        self.mark_re = re.compile(regexp, re.M)
        self.markline_re = re.compile(regexp + r'.*\n', re.M)

    def pop(self):
        """Return to the level that was active before the matching push().

        Raises Error if there is no push() to undo.
        """
        try:
            ( self.sep, self.startpos, self.endpos, self.limit,
              self.mark_re, self.markline_re,
              self.max_mark_len ) = self.stack.pop()
        except IndexError:
            raise Error('bad MultiFile.pop() call')
        self.done = 0

    def is_data(self, line):
        """Return true unless line starts with '--'."""
        return line[:2] != '--'

    def section_divider(self, str):
        """Return the separator line prefix for the boundary string str."""
        return "--" + str

    def end_marker(self, str):
        """Return the end marker line prefix for the boundary string str."""
        return "--" + str + "--"

#End of multifile.py