Index: Lib/encodings/utf_8_sig.py
===================================================================
RCS file: Lib/encodings/utf_8_sig.py
diff -N Lib/encodings/utf_8_sig.py
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ Lib/encodings/utf_8_sig.py 5 Apr 2005 20:25:28 -0000
@@ -0,0 +1,73 @@
+""" Python 'utf-8-sig' Codec
+This works similarly to UTF-8 with the following changes:
+
+* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
+  first three bytes.
+
+* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
+  bytes will be skipped.
+"""
+import codecs
+
+### Codec APIs
+
+def encode(input, errors='strict'):
+    """Stateless encoder: return (BOM + UTF-8 bytes, len(input))."""
+    return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input))
+
+def decode(input, errors='strict'):
+    """Stateless decoder: skip a leading BOM and decode the rest as UTF-8.
+
+    The input is treated as complete (final), and the consumed count
+    includes the three BOM bytes when they were skipped.
+    """
+    prefix = 0
+    if input.startswith(codecs.BOM_UTF8):
+        input = input[3:]
+        prefix = 3
+    (output, consumed) = codecs.utf_8_decode(input, errors, True)
+    return (output, consumed+prefix)
+
+class StreamWriter(codecs.StreamWriter):
+    def reset(self):
+        codecs.StreamWriter.reset(self)
+        # drop the per-instance override so the next write emits a BOM again
+        try:
+            del self.encode
+        except AttributeError:
+            pass
+
+    def encode(self, input, errors='strict'):
+        # only the first call prepends the BOM; later calls are plain UTF-8
+        self.encode = codecs.utf_8_encode
+        return encode(input, errors)
+
+class StreamReader(codecs.StreamReader):
+    def reset(self):
+        codecs.StreamReader.reset(self)
+        # drop the per-instance override so BOM detection runs again
+        try:
+            del self.decode
+        except AttributeError:
+            pass
+
+    def decode(self, input, errors='strict'):
+        if len(input) < 3:
+            if codecs.BOM_UTF8.startswith(input):
+                # not enough data to decide if this is a BOM
+                # => try again on the next call
+                return (u"", 0)
+        elif input[:3] == codecs.BOM_UTF8:
+            self.decode = codecs.utf_8_decode
+            # decode non-final: an incomplete trailing sequence is left
+            # unconsumed for the next call instead of raising
+            (output, consumed) = codecs.utf_8_decode(input[3:], errors)
+            return (output, consumed+3)
+        # no BOM present: this and all later chunks are plain UTF-8
+        self.decode = codecs.utf_8_decode
+        return codecs.utf_8_decode(input, errors)
+
+### encodings module API
+
+def getregentry():
+    return (encode, decode, StreamReader, StreamWriter)
Index: Lib/test/test_codecs.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/test/test_codecs.py,v
retrieving revision 1.22
diff -u -r1.22 test_codecs.py
--- 
Lib/test/test_codecs.py 4 Apr 2005 21:38:47 -0000 1.22
+++ Lib/test/test_codecs.py 5 Apr 2005 20:25:30 -0000
@@ -315,6 +315,33 @@
             ]
         )
 
+class UTF8SigTest(ReadTest):
+    encoding = "utf-8-sig"
+
+    def test_partial(self):
+        self.check_partial(
+            u"\ufeff\x00\xff\u07ff\u0800\uffff",
+            [
+                u"",
+                u"",
+                u"", # First BOM has been read and skipped
+                u"",
+                u"",
+                u"\ufeff", # Second BOM has been read and emitted
+                u"\ufeff\x00", # "\x00" read and emitted
+                u"\ufeff\x00", # First byte of encoded u"\xff" read
+                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
+                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
+                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
+                u"\ufeff\x00\xff\u07ff",
+                u"\ufeff\x00\xff\u07ff",
+                u"\ufeff\x00\xff\u07ff\u0800",
+                u"\ufeff\x00\xff\u07ff\u0800",
+                u"\ufeff\x00\xff\u07ff\u0800",
+                u"\ufeff\x00\xff\u07ff\u0800\uffff",
+            ]
+        )
+
 class EscapeDecodeTest(unittest.TestCase):
     def test_empty_escape_decode(self):
         self.assertEquals(codecs.escape_decode(""), ("", 0))
@@ -762,6 +789,7 @@
     "utf_16_le",
     "utf_7",
     "utf_8",
+    "utf_8_sig",
 ]
 
 if hasattr(codecs, "mbcs_encode"):