Hmm, supporting UTF-16* or UTF-32* is not so easy because most functions in the parser suppose that the string does not contain any nul byte (e.g. they use strlen(str)). With the attached patch, it's possible to parse UTF-16LE, UTF-16BE and UTF-32LE... but not UTF-32BE — the compile() builtin rejects strings containing a nul byte.

Index: Parser/tokenizer.c
===================================================================
--- Parser/tokenizer.c	(révision 70501)
+++ Parser/tokenizer.c	(copie de travail)
@@ -286,8 +286,10 @@
         else
             PyMem_FREE(cs);
     }
-    } else { /* then, compare cs with BOM */
-        r = (strcmp(tok->encoding, cs) == 0);
+    } else {
+        /* then, compare cs with BOM */
+        /*r = (strcmp(tok->encoding, cs) == 0);*/
+        r = 1;
         PyMem_FREE(cs);
     }
 }
@@ -330,26 +332,56 @@
             /* any token beginning with '\xEF' is a bad token */
             return 1;
         }
-#if 0
-        /* Disable support for UTF-16 BOMs until a decision
-           is made whether this needs to be supported. */
+
+        if (tok->encoding != NULL)
+            PyMem_FREE(tok->encoding);
+        tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
+        /* No need to set_readline: input is already utf-8 */
     } else if (ch == 0xFE) {
-        ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
-        if (!set_readline(tok, "utf-16-be")) return 0;
+        ch = get_char(tok);
+        if (ch != 0xFF) {
+            unget_char(ch, tok);
+            unget_char(0xFE, tok);
+            /* any token beginning with '\xFE' is a bad token */
+            return 1;
+        }
+        /* 0xFE 0xFF: UTF-16-BE BOM, but use UTF-16 to read the BOM */
+        if (!set_readline(tok, "utf-16")) return 0;
+        tok->encoding = new_string("UTF-16BE", 8);
         tok->decoding_state = STATE_NORMAL;
     } else if (ch == 0xFF) {
-        ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
-        if (!set_readline(tok, "utf-16-le")) return 0;
+        ch = get_char(tok);
+        if (ch != 0xFE) {
+            unget_char(ch, tok);
+            unget_char(0xFF, tok);
+            /* any token beginning with '\xFF' is a bad token */
+            return 1;
+        }
+        ch = get_char(tok);
+        if (ch != 0x00) {
+            unget_char(ch, tok);
+            /* 0xFF 0xFE: UTF-16-LE BOM, but use UTF-16
+               to read the BOM */
+            if (!set_readline(tok, "utf-16")) return 0;
+            tok->encoding = new_string("UTF16-LE", 8);
         tok->decoding_state = STATE_NORMAL;
-#endif
+            return 1;
+        }
+        ch = get_char(tok);
+        if (ch != 0x00) {
+            unget_char(ch, tok);
+            unget_char(0x00, tok);
+            unget_char(0xFE, tok);
+            unget_char(0xFF, tok);
+            return 1;
+        }
+        if (!set_readline(tok, "utf-32")) return 0;
+        tok->encoding = new_string("UTF32-LE", 8);
+        tok->decoding_state = STATE_NORMAL;
+        return 1;
     } else {
         unget_char(ch, tok);
         return 1;
     }
-    if (tok->encoding != NULL)
-        PyMem_FREE(tok->encoding);
-    tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */
-    /* No need to set_readline: input is already utf-8 */
     return 1;
 }
Index: Lib/test/test_pep263.py
===================================================================
--- Lib/test/test_pep263.py	(révision 70501)
+++ Lib/test/test_pep263.py	(copie de travail)
@@ -2,6 +2,9 @@
 
 import unittest
 from test import support
+import codecs
+import sys
+import subprocess
 
 class PEP263Test(unittest.TestCase):
 
@@ -36,6 +39,24 @@
         exec(c, d)
         self.assertEquals(d['\xc6'], '\xc6')
 
+    def test_bom(self):
+        source = "# coding: %s\nx = '\u0a20'\nprint(ascii(x))"
+        for bom, encoding in (
+            (codecs.BOM_UTF8, "utf-8"),
+            (codecs.BOM_LE, "utf-16-le"),
+            (codecs.BOM_BE, "utf-16-be"),
+            (codecs.BOM_UTF32_LE, "utf-32-le"),
+#            (codecs.BOM_UTF32_BE, "utf-32-be"),
+        ):
+            source_bytes = bom + source.encode(encoding)
+            filename = "test.py"
+            with open(filename, "wb") as fp:
+                fp.write(source_bytes)
+            p = subprocess.Popen([sys.executable, filename],
+                stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+            stdout, stderr = p.communicate()
+            self.assertEquals(stdout, b"'\\u0a20'\n")
+
 
 def test_main():
     support.run_unittest(PEP263Test)