r"""Byte stream reader to process mboxo style mailbox files.

It replaces any occurrence of b'\n>From ' with b'\nFrom '.
The class handles matches that are split across reads, provided that
the read size is at least 7 bytes.
"""

import mailbox

FROM_MANGLED = b'\n>From '
FROM_UNMANGLED = b'\nFrom '

# We want to match the 7 bytes b'\n>From ' in the input stream.
# However, the sequence can be split over multiple reads: the split can
# fall anywhere after the leading b'\n' and before the trailing b' '.
# If a chunk ends with such a partial match, the partial match is withheld
# and re-read together with the bytes that follow it.
# FROMS holds every proper, non-empty prefix of FROM_MANGLED, longest first:
FROMS = tuple(
    FROM_MANGLED[:length] for length in range(len(FROM_MANGLED) - 1, 0, -1)
)


class MboxoReader(mailbox._PartialFile):
    """Partial-file reader that undoes mboxo ">From " quoting on the fly.

    Data returned by the read methods has every b'\\n>From ' replaced by
    b'\\nFrom '.  The filter is applied per chunk in ``_read``; it is
    designed for ``read(size)`` style access with ``size`` of at least
    ``len(FROM_MANGLED)`` bytes (or no size limit).  Line-oriented reads
    go through the same logic and get no special treatment.

    Attributes:
        remain: number of trailing raw bytes that the most recent read
            withheld because they might be the start of a mangled
            sequence.  The file position has already been rewound over
            them, so they are delivered by the next read.
    """

    def __init__(self, f, start=None, stop=None):
        """Wrap the same underlying file and byte range as *f*.

        Args:
            f: an object exposing ``_file``, ``_start`` and ``_stop``
                (for example a ``mailbox._PartialFile``).
            start: accepted for signature compatibility; ignored, the
                range always comes from ``f._start``.
            stop: accepted for signature compatibility; ignored, the
                range always comes from ``f._stop``.
        """
        self.remain = 0  # bytes withheld by the most recent read
        super().__init__(f._file, start=f._start, stop=f._stop)

    # Override the read method to provide mboxo filtering
    def _read(self, size, read_method):
        """Read one chunk and return it with mangled From lines restored.

        Args:
            size: maximum number of raw bytes to read; ``None`` or a
                negative value means "up to the end of the range".
            read_method: bound method of the underlying file used to
                fetch the bytes.

        Returns:
            The filtered bytes.  A trailing partial match is left out and
            the position is rewound over it, unless no further data can
            follow or leaving it out would make the result empty.
        """
        chunk = super()._read(size, read_method)

        held = 0
        # A trailing partial match only matters while more data can follow;
        # once the range is exhausted it can never be completed.
        if self._pos < self._stop and chunk.endswith(FROMS):
            # Each entry in FROMS contains exactly one LF, at its start, so
            # the last LF in the chunk marks where the partial match begins.
            held = len(chunk) - chunk.rindex(b'\n')
            if held == len(chunk):
                # Withholding everything would return b'', which callers
                # interpret as end of data; hand the bytes over instead.
                held = 0

        self.remain = held
        if held:
            # Rewind now so that tell()/seek() always see the position of
            # the last byte actually delivered to the caller.
            super().seek(-held, 1)
            chunk = chunk[:-held]

        return chunk.replace(FROM_MANGLED, FROM_UNMANGLED)