Index: Lib/encodings/utf_8_sig.py =================================================================== RCS file: Lib/encodings/utf_8_sig.py diff -N Lib/encodings/utf_8_sig.py --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ Lib/encodings/utf_8_sig.py 9 Aug 2005 13:37:53 -0000 @@ -0,0 +1,57 @@ +""" Python 'utf-8-sig' Codec +This works similarly to UTF-8 with the following changes: + +* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the + first three bytes. + +* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these + bytes will be skipped. +""" +import codecs + +### Codec APIs + +def encode(input, errors='strict'): + return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0], len(input)) + +def decode(input, errors='strict'): + prefix = 0 + if input.startswith(codecs.BOM_UTF8): + input = input[3:] + prefix = 3 + (output, consumed) = codecs.utf_8_decode(input, errors, True) + return (output, consumed+prefix) + +class StreamWriter(codecs.StreamWriter): + def reset(self): + codecs.StreamWriter.reset(self) + try: + del self.encode + except AttributeError: + pass + + def encode(self, input, errors='strict'): + self.encode = codecs.utf_8_encode + return encode(input, errors) + +class StreamReader(codecs.StreamReader): + def reset(self): + codecs.StreamReader.reset(self) + try: + del self.decode + except AttributeError: + pass + + def decode(self, input, errors='strict'): + if len(input) < 3 and codecs.BOM_UTF8.startswith(input): + # not enough data to decide if this is a BOM + # => try again on the next call + return (u"", 0) + self.decode = codecs.utf_8_decode + return decode(input, errors) + +### encodings module API + +def getregentry(): + + return (encode,decode,StreamReader,StreamWriter) Index: Lib/test/test_codecs.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/test/test_codecs.py,v retrieving revision 1.24 diff -u -r1.24 test_codecs.py --- 
Lib/test/test_codecs.py 20 Jul 2005 22:15:39 -0000 1.24 +++ Lib/test/test_codecs.py 9 Aug 2005 13:37:54 -0000 @@ -317,6 +317,33 @@ ] ) +class UTF8SigTest(ReadTest): + encoding = "utf-8-sig" + + def test_partial(self): + self.check_partial( + u"\ufeff\x00\xff\u07ff\u0800\uffff", + [ + u"", + u"", + u"", # First BOM has been read and skipped + u"", + u"", + u"\ufeff", # Second BOM has been read and emitted + u"\ufeff\x00", # "\x00" read and emitted + u"\ufeff\x00", # First byte of encoded u"\xff" read + u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read + u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read + u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read + u"\ufeff\x00\xff\u07ff", + u"\ufeff\x00\xff\u07ff", + u"\ufeff\x00\xff\u07ff\u0800", + u"\ufeff\x00\xff\u07ff\u0800", + u"\ufeff\x00\xff\u07ff\u0800", + u"\ufeff\x00\xff\u07ff\u0800\uffff", + ] + ) + class EscapeDecodeTest(unittest.TestCase): def test_empty_escape_decode(self): self.assertEquals(codecs.escape_decode(""), ("", 0)) @@ -876,6 +903,7 @@ UTF16LETest, UTF16BETest, UTF8Test, + UTF8SigTest, EscapeDecodeTest, RecodingTest, PunycodeTest, Index: Misc/NEWS =================================================================== RCS file: /cvsroot/python/python/dist/src/Misc/NEWS,v retrieving revision 1.1320 diff -u -r1.1320 NEWS --- Misc/NEWS 18 Jul 2005 08:53:17 -0000 1.1320 +++ Misc/NEWS 9 Aug 2005 13:38:00 -0000 @@ -370,6 +370,7 @@ line ending. Remove the special handling of a "\r\n" that has been split between two lines. +- Patch #1177307: Added a new codec utf_8_sig for UTF-8 with a BOM signature. 
Build ----- Index: Doc/lib/libcodecs.tex =================================================================== RCS file: /cvsroot/python/python/dist/src/Doc/lib/libcodecs.tex,v retrieving revision 1.35 diff -u -r1.35 libcodecs.tex --- Doc/lib/libcodecs.tex 1 Jan 2005 00:28:34 -0000 1.35 +++ Doc/lib/libcodecs.tex 9 Aug 2005 13:38:01 -0000 @@ -886,6 +886,10 @@ {U8, UTF, utf8} {all languages} +\lineiii{utf_8_sig} + {} + {all languages} + \end{longtableiii} A number of codecs are specific to Python, so their codec names have @@ -1054,3 +1058,20 @@ \begin{funcdesc}{ToUnicode}{label} Convert a label to Unicode, as specified in \rfc{3490}. \end{funcdesc} + +\subsection{\module{encodings.utf_8_sig} --- + UTF-8 codec with BOM signature} + +\declaremodule{standard}{encodings.utf_8_sig} +\modulesynopsis{UTF-8 codec with BOM signature} +% XXX The next line triggers a formatting bug, so it's commented out +% until that can be fixed. +%\moduleauthor{Walter D\"orwald} + +\versionadded{2.5} + +This module implements a variant of the UTF-8 codec: On encoding a UTF-8 +encoded BOM will be prepended to the UTF-8 encoded bytes. For the stateful +encoder this is only done once (on the first write to the byte stream). +For decoding an optional UTF-8 encoded BOM at the start of the data will be +skipped.