Add getstate()/setstate() to the incremental encoders and decoders:
the base classes in Lib/codecs.py, the utf-16 and utf-8-sig codecs,
and round-trip tests in Lib/test/test_codecs.py.
Note: the utf-16 IncrementalDecoder.setstate() resets self.decoder (the
attribute that reset() and getstate() use) when the saved byte order is None.

Index: Lib/encodings/utf_16.py
===================================================================
--- Lib/encodings/utf_16.py	(Revision 54756)
+++ Lib/encodings/utf_16.py	(Arbeitskopie)
@@ -34,6 +34,19 @@
         codecs.IncrementalEncoder.reset(self)
         self.encoder = None
 
+    def getstate(self):
+        if self.encoder is not None:
+            return sys.byteorder
+        return None
+
+    def setstate(self, state):
+        if state is None:
+            self.encoder = None
+        elif state == 'little':
+            self.encoder = codecs.utf_16_le_encode
+        else:
+            self.encoder = codecs.utf_16_be_encode
+
 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
     def __init__(self, errors='strict'):
         codecs.BufferedIncrementalDecoder.__init__(self, errors)
@@ -56,6 +69,24 @@
         codecs.BufferedIncrementalDecoder.reset(self)
         self.decoder = None
 
+    def getstate(self):
+        state = codecs.BufferedIncrementalDecoder.getstate(self)
+        if self.decoder is codecs.utf_16_le_decode:
+            return (state, "little")
+        elif self.decoder is codecs.utf_16_be_decode:
+            return (state, "big")
+        return (state, None)
+
+    def setstate(self, state):
+        codecs.BufferedIncrementalDecoder.setstate(self, state[0])
+        state = state[1]
+        if state is None:
+            self.decoder = None
+        elif state == 'little':
+            self.decoder = codecs.utf_16_le_decode
+        else:
+            self.decoder = codecs.utf_16_be_decode
+
 class StreamWriter(codecs.StreamWriter):
     def __init__(self, stream, errors='strict'):
         self.bom_written = False
Index: Lib/encodings/utf_8_sig.py
===================================================================
--- Lib/encodings/utf_8_sig.py	(Revision 54756)
+++ Lib/encodings/utf_8_sig.py	(Arbeitskopie)
@@ -29,7 +29,7 @@
 
     def encode(self, input, final=False):
         if self.first:
-            self.first = False
+            self.first = None
             return codecs.BOM_UTF8 + codecs.utf_8_encode(input, self.errors)[0]
         else:
             return codecs.utf_8_encode(input, self.errors)[0]
@@ -38,26 +38,51 @@
         codecs.IncrementalEncoder.reset(self)
         self.first = True
 
+    def getstate(self):
+        return self.first
+
+    def setstate(self, state):
+        self.first = state
+
 class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
     def __init__(self, errors='strict'):
         codecs.BufferedIncrementalDecoder.__init__(self, errors)
         self.first = True
 
     def _buffer_decode(self, input, errors, final):
-        if self.first and codecs.BOM_UTF8.startswith(input): # might be a BOM
+        if self.first:
             if len(input) < 3:
-                # not enough data to decide if this really is a BOM
-                # => try again on the next call
-                return (u"", 0)
-            (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
-            self.first = False
-            return (output, consumed+3)
+                if codecs.BOM_UTF8.startswith(input):
+                    # not enough data to decide if this really is a BOM
+                    # => try again on the next call
+                    return (u"", 0)
+                else:
+                    self.first = None
+            else:
+                self.first = None
+                if input[:3] == codecs.BOM_UTF8:
+                    (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
+                    return (output, consumed+3)
         return codecs.utf_8_decode(input, errors, final)
 
     def reset(self):
         codecs.BufferedIncrementalDecoder.reset(self)
         self.first = True
 
+    def getstate(self):
+        state = codecs.BufferedIncrementalDecoder.getstate(self)
+        if state is None and self.first is None:
+            return None
+        return (state, self.first)
+
+    def setstate(self, state):
+        if state is None:
+            first = None
+        else:
+            (state, first) = state
+        codecs.BufferedIncrementalDecoder.setstate(self, state)
+        self.first = first
+
 class StreamWriter(codecs.StreamWriter):
     def reset(self):
         codecs.StreamWriter.reset(self)
Index: Lib/test/test_codecs.py
===================================================================
--- Lib/test/test_codecs.py	(Revision 54756)
+++ Lib/test/test_codecs.py	(Arbeitskopie)
@@ -23,6 +23,26 @@
             self._buffer = self._buffer[size:]
             return s
 
+def all_partial_decodes(encoding, s):
+    for i in xrange(len(s)+1):
+        d = codecs.getincrementaldecoder(encoding)()
+        part1 = d.decode(s[:i])
+        state = d.getstate()
+        d = codecs.getincrementaldecoder(encoding)()
+        d.setstate(state)
+        part2 = d.decode(s[i:], True)
+        yield part1+part2
+
+def all_partial_encodes(encoding, u):
+    for i in xrange(len(u)+1):
+        d = codecs.getincrementalencoder(encoding)()
+        part1 = d.encode(u[:i])
+        state = d.getstate()
+        d = codecs.getincrementalencoder(encoding)()
+        d.setstate(state)
+        part2 = d.encode(u[i:], True)
+        yield part1+part2
+
 class ReadTest(unittest.TestCase):
     def check_partial(self, input, partialresults):
         # get a StreamReader for the encoding and feed the bytestring version
@@ -294,6 +314,12 @@
     def test_errors(self):
         self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)
 
+    def test_decoder_state(self):
+        for s in all_partial_decodes(self.encoding, self.spamle):
+            self.assertEqual(s, "spamspam")
+        for s in all_partial_decodes(self.encoding, self.spambe):
+            self.assertEqual(s, "spamspam")
+
 class UTF16LETest(ReadTest):
     encoding = "utf-16-le"
 
@@ -357,6 +383,11 @@
             ]
         )
 
+    def test_decoder_state(self):
+        u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
+        for s in all_partial_decodes(self.encoding, u.encode(self.encoding)):
+            self.assertEqual(s, u)
+
 class UTF7Test(ReadTest):
     encoding = "utf-7"
 
@@ -429,6 +460,16 @@
         # SF bug #1601501: check that the codec works with a buffer
         unicode("\xef\xbb\xbf", "utf-8-sig")
 
+    def test_bom(self):
+        d = codecs.getincrementaldecoder("utf-8-sig")()
+        s = u"spam"
+        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)
+
+    def test_decoder_state(self):
+        u = u"\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
+        for s in all_partial_decodes(self.encoding, u.encode(self.encoding)):
+            self.assertEqual(s, u)
+
 class EscapeDecodeTest(unittest.TestCase):
     def test_empty(self):
         self.assertEquals(codecs.escape_decode(""), ("", 0))
@@ -1066,7 +1107,11 @@
     "punycode",
     "unicode_internal"
 ]
-broken_incremental_coders = broken_unicode_with_streams[:]
+broken_incremental_coders = broken_unicode_with_streams + [
+    "idna",
+    "zlib_codec",
+    "bz2_codec",
+]
 
 # The following encodings only support "strict" mode
 only_strict_mode = [
@@ -1215,6 +1260,16 @@
         table_type = type(cp1140.encoding_table)
         self.assertEqual(table_type, table_type)
 
+    def test_decoder_state(self):
+        # Check that getstate() and setstate() handle the state properly
+        u = u"abc123"
+        for encoding in all_unicode_encodings:
+            if encoding not in broken_incremental_coders:
+                for s in all_partial_decodes(encoding, u.encode(encoding)):
+                    self.assertEqual(s, u)
+                for s in all_partial_encodes(encoding, u):
+                    self.assertEqual(s.decode(encoding), u)
+
 class BasicStrTest(unittest.TestCase):
     def test_basics(self):
         s = "abc123"
Index: Lib/codecs.py
===================================================================
--- Lib/codecs.py	(Revision 54756)
+++ Lib/codecs.py	(Arbeitskopie)
@@ -181,6 +181,18 @@
         Resets the encoder to the initial state.
         """
 
+    def getstate(self):
+        """
+        Return the current state of the encoder.
+        """
+        return None
+
+    def setstate(self, state):
+        """
+        Set the current state of the encoder. state must have been returned by
+        getstate().
+        """
+
 class BufferedIncrementalEncoder(IncrementalEncoder):
     """
     This subclass of IncrementalEncoder can be used as the baseclass for an
@@ -208,6 +220,12 @@
         IncrementalEncoder.reset(self)
         self.buffer = ""
 
+    def getstate(self):
+        return self.buffer or None
+
+    def setstate(self, state):
+        self.buffer = state or ""
+
 class IncrementalDecoder(object):
     """
     An IncrementalDecoder decodes an input in multiple steps. The input can be
@@ -235,6 +253,18 @@
         Resets the decoder to the initial state.
         """
 
+    def getstate(self):
+        """
+        Return the current state of the decoder.
+        """
+        return None
+
+    def setstate(self, state):
+        """
+        Set the current state of the decoder. state must have been returned by
+        getstate().
+        """
+
 class BufferedIncrementalDecoder(IncrementalDecoder):
     """
     This subclass of IncrementalDecoder can be used as the baseclass for an
@@ -262,6 +292,12 @@
         IncrementalDecoder.reset(self)
         self.buffer = ""
 
+    def getstate(self):
+        return self.buffer or None
+
+    def setstate(self, state):
+        self.buffer = state or ""
+
 #
 # The StreamWriter and StreamReader class provide generic working
 # interfaces which can be used to implement new encoding submodules