Index: Doc/library/codecs.rst =================================================================== --- Doc/library/codecs.rst (Revision 58878) +++ Doc/library/codecs.rst (Arbeitskopie) @@ -1135,6 +1135,8 @@ | uu_codec | uu | byte string | Convert the operand using | | | | | uuencode | +--------------------+---------------------------+----------------+---------------------------+ +| xml | | Unicode string | An XML meta codec | ++--------------------+---------------------------+----------------+---------------------------+ | zlib_codec | zip, zlib | byte string | Compress the operand | | | | | using gzip | +--------------------+---------------------------+----------------+---------------------------+ @@ -1218,3 +1220,39 @@ is only done once (on the first write to the byte stream). For decoding an optional UTF-8 encoded BOM at the start of the data will be skipped. + +:mod:`encodings.xml` --- XML meta codec +--------------------------------------- + +.. module:: encodings.xml + :synopsis: XML meta codec +.. moduleauthor:: Walter Dörwald + +.. versionadded:: 2.6 + +This module implements a codec that can be used for encoding and decoding XML. +Once the encoding has been determined the decoding/encoding process falls back +to using the codec for that particular encoding to do the rest of the work, so +this XML codec supports all encodings supported by Python itself. + +On decoding the XML codec determines the encoding by either inspecting the +first few bytes of the byte stream/string or by extracting the encoding from +the XML declaration. If the encoding can't be determined from the first few +bytes and there is no XML declaration the codec falls back to using UTF-8. +When the encoding is specified by an external source (e.g. a Content-Type +header in an HTTP response), this encoding can be passed as an argument to the +codec, which will then bypass encoding detection. 
If there's an XML declaration +in the input, the XML declaration passed to the application then will contain +the externally specified encoding instead of the original one. + +On encoding the XML codec extracts the encoding from the XML declaration and +will encode the output in that encoding. If there's no XML declaration UTF-8 +will be used. It's possible to pass an external encoding to the encoder too. +The encoder will then encode the output in that encoding and put the correct +encoding into the XML declaration (if there is one). + + +.. seealso:: + + http://www.w3.org/TR/2004/REC-xml-20040204/#sec-guessing + Autodetection of Character Encodings in XML Index: Lib/encodings/xml.py =================================================================== --- Lib/encodings/xml.py (Revision 0) +++ Lib/encodings/xml.py (Revision 0) @@ -0,0 +1,198 @@ +# -*- coding: iso-8859-1 -*- +""" +Python 'xml' Codec +""" + + +import codecs + + +def decode(input, errors="strict", encoding=None): + if encoding is None: + encoding = codecs.detect_xml_encoding(input, True) + if encoding == "xml": + raise ValueError("xml not allowed as encoding name") + (input, consumed) = codecs.getdecoder(encoding)(input, errors) + return (codecs.fix_xml_encoding(input, unicode(encoding), True), consumed) + + +def encode(input, errors="strict", encoding=None): + consumed = len(input) + if encoding is None: + encoding = codecs.detect_xml_encoding(input, True) + else: + input = codecs.fix_xml_encoding(input, unicode(encoding), True) + if encoding == "xml": + raise ValueError("xml not allowed as encoding name") + info = codecs.lookup(encoding) + return (info.encode(input, errors)[0], consumed) + + +class IncrementalDecoder(codecs.IncrementalDecoder): + def __init__(self, errors="strict", encoding=None): + self.decoder = None + self.encoding = encoding + codecs.IncrementalDecoder.__init__(self, errors) + self._errors = errors # Store ``errors`` somewhere else, because we have to hide it in a property + 
self.buffer = "" + self.headerfixed = False + + def iterdecode(self, input): + for part in input: + result = self.decode(part, False) + if result: + yield result + result = self.decode("", True) + if result: + yield result + + def decode(self, input, final=False): + # We're doing basically the same as a ``BufferedIncrementalDecoder``, + # but since the buffer is only relevant until the encoding has been detected + # (in which case the buffer of the underlying codec might kick in), + # we're implementing buffering ourselves to avoid some overhead. + if self.decoder is None: + input = self.buffer + input + self.encoding = codecs.detect_xml_encoding(input, final) + if self.encoding is None: + self.buffer = input # retry the complete input on the next call + return u"" # no encoding determined yet, so no output + if self.encoding == "xml": + raise ValueError("xml not allowed as encoding name") + self.buffer = "" # isn't needed any more, as the decoder might keep its own buffer + self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors) + if self.headerfixed: + return self.decoder.decode(input, final) + # If we haven't fixed the header yet, the content of ``self.buffer`` is a ``unicode`` object + output = self.buffer + self.decoder.decode(input, final) + newoutput = codecs.fix_xml_encoding(output, unicode(self.encoding), final) + if newoutput is None: + self.buffer = output # retry fixing the declaration (but keep the decoded stuff) + return u"" + self.headerfixed = True + return newoutput + + def reset(self): + codecs.IncrementalDecoder.reset(self) + self.decoder = None + self.buffer = "" + self.headerfixed = False + + def _geterrors(self): + return self._errors + + def _seterrors(self, errors): + # Setting ``errors`` must be done on the real decoder too + if self.decoder is not None: + self.decoder.errors = errors + self._errors = errors + errors = property(_geterrors, _seterrors) + + +class IncrementalEncoder(codecs.IncrementalEncoder): + def 
__init__(self, errors="strict", encoding=None): + self.encoder = None + self.encoding = encoding + codecs.IncrementalEncoder.__init__(self, errors) + self._errors = errors # Store ``errors`` somewhere else, because we have to hide it in a property + self.buffer = u"" + + def iterencode(self, input): + for part in input: + result = self.encode(part, False) + if result: + yield result + result = self.encode(u"", True) + if result: + yield result + + def encode(self, input, final=False): + if self.encoder is None: + input = self.buffer + input + if self.encoding is not None: + # Replace encoding in the declaration with the specified one + newinput = codecs.fix_xml_encoding(input, unicode(self.encoding), final) + if newinput is None: # declaration not complete => Retry next time + self.buffer = input + return "" + input = newinput + else: + # Use encoding from the XML declaration + self.encoding = codecs.detect_xml_encoding(input, final) + if self.encoding is not None: + if self.encoding == "xml": + raise ValueError("xml not allowed as encoding name") + info = codecs.lookup(self.encoding) + self.encoder = info.incrementalencoder(self._errors) + self.buffer = u"" + else: + self.buffer = input + return "" + return self.encoder.encode(input, final) + + def reset(self): + codecs.IncrementalEncoder.reset(self) + self.encoder = None + self.buffer = u"" + + def _geterrors(self): + return self._errors + + def _seterrors(self, errors): + # Setting ``errors ``must be done on the real encoder too + if self.encoder is not None: + self.encoder.errors = errors + self._errors = errors + errors = property(_geterrors, _seterrors) + + +class StreamWriter(codecs.StreamWriter): + def __init__(self, stream, errors="strict", encoding="utf-8", header=False): + codecs.StreamWriter.__init__(self, stream, errors) + self.encoder = IncrementalEncoder(errors) + self._errors = errors + + def encode(self, input, errors='strict'): + return (self.encoder.encode(input, False), len(input)) + + def 
_geterrors(self): + return self._errors + + def _seterrors(self, errors): + # Setting ``errors`` must be done on the encoder too + if self.encoder is not None: + self.encoder.errors = errors + self._errors = errors + errors = property(_geterrors, _seterrors) + + +class StreamReader(codecs.StreamReader): + def __init__(self, stream, errors="strict"): + codecs.StreamReader.__init__(self, stream, errors) + self.decoder = IncrementalDecoder(errors) + self._errors = errors + + def decode(self, input, errors='strict'): + return (self.decoder.decode(input, False), len(input)) + + def _geterrors(self): + return self._errors + + def _seterrors(self, errors): + # Setting ``errors`` must be done on the decoder too + if self.decoder is not None: + self.decoder.errors = errors + self._errors = errors + errors = property(_geterrors, _seterrors) + + +def getregentry(): + return codecs.CodecInfo( + name="xml", + encode=encode, + decode=decode, + incrementalencoder=IncrementalEncoder, + incrementaldecoder=IncrementalDecoder, + streamwriter=StreamWriter, + streamreader=StreamReader, + ) Index: Lib/test/test_codecs.py =================================================================== --- Lib/test/test_codecs.py (Revision 58878) +++ Lib/test/test_codecs.py (Arbeitskopie) @@ -1406,7 +1406,216 @@ info.streamwriter, 'strict') as srw: self.assertEquals(srw.read(), u"\xfc") +class XMLCodecTest(unittest.TestCase): + def test_detectencoding_str(self): + self.assert_(codecs.detect_xml_encoding("") is None) + self.assert_(codecs.detect_xml_encoding("\xef") is None) + self.assertEqual(codecs.detect_xml_encoding("\xef\x33"), "utf-8") + self.assert_(codecs.detect_xml_encoding("\xef\xbb") is None) + self.assertEqual(codecs.detect_xml_encoding("\xef\xbb\x33"), "utf-8") + self.assertEqual(codecs.detect_xml_encoding("\xef\xbb\xbf"), "utf-8-sig") + self.assert_(codecs.detect_xml_encoding("\xff") is None) + self.assertEqual(codecs.detect_xml_encoding("\xff\x33"), "utf-8") + 
self.assert_(codecs.detect_xml_encoding("\xff\xfe") is None) + self.assertEqual(codecs.detect_xml_encoding("\xff\xfe\x33"), "utf-16") + self.assert_(codecs.detect_xml_encoding("\xff\xfe\x00") is None) + self.assertEqual(codecs.detect_xml_encoding("\xff\xfe\x00\x33"), "utf-16") + self.assertEqual(codecs.detect_xml_encoding("\xff\xfe\x00\x00"), "utf-32") + self.assert_(codecs.detect_xml_encoding("\x00") is None) + self.assertEqual(codecs.detect_xml_encoding("\x00\x33"), "utf-8") + self.assert_(codecs.detect_xml_encoding("\x00\x00") is None) + self.assertEqual(codecs.detect_xml_encoding("\x00\x00\x33"), "utf-8") + self.assert_(codecs.detect_xml_encoding("\x00\x00\xfe") is None) + self.assertEqual(codecs.detect_xml_encoding("\x00\x00\x00\x33"), "utf-8") + self.assertEqual(codecs.detect_xml_encoding("\x00\x00\x00<"), "utf-32-be") + self.assertEqual(codecs.detect_xml_encoding("\x00\x00\xfe\xff"), "utf-32") + self.assert_(codecs.detect_xml_encoding("<") is None) + self.assertEqual(codecs.detect_xml_encoding("<\x33"), "utf-8") + self.assert_(codecs.detect_xml_encoding("<\x00") is None) + self.assertEqual(codecs.detect_xml_encoding("<\x00\x33"), "utf-8") + self.assert_(codecs.detect_xml_encoding("<\x00\x00") is None) + self.assertEqual(codecs.detect_xml_encoding("<\x00\x00\x33"), "utf-8") + self.assertEqual(codecs.detect_xml_encoding("<\x00\x00\x00"), "utf-32-le") + self.assert_(codecs.detect_xml_encoding(""), "utf-8") + self.assert_(codecs.detect_xml_encoding("g\xfcrk\u20ac"): + # Check stateless decoder + d = codecs.getdecoder("xml") + self.assertEqual( + d(input.encode(encoding))[0], + input.replace("'x'", repr(encoding)) + ) + + # Check stateless decoder with specified encoding + self.assertEqual( + d(input.encode(encoding), encoding=encoding)[0], + input.replace("'x'", repr(encoding)) + ) + + # Check incremental decoder + id = codecs.getincrementaldecoder("xml")() + self.assertEqual( + "".join(id.iterdecode(input.encode(encoding))), + input.replace("'x'", 
repr(encoding)) + ) + + # Check incremental decoder with specified encoding + id = codecs.getincrementaldecoder("xml")(encoding) + self.assertEqual( + "".join(id.iterdecode(input.encode(encoding))), + input.replace("'x'", repr(encoding)) + ) + + # Autodetectable encodings + checkauto("utf-8-sig") + checkauto("utf-16") + checkauto("utf-16-le") + checkauto("utf-16-be") + checkauto("utf-32") + checkauto("utf-32-le") + checkauto("utf-32-be") + + def checkdecl(encoding, input=u"\u20ac"): + # Check stateless decoder with encoding autodetection + d = codecs.getdecoder("xml") + input = input % encoding + self.assertEqual(d(input.encode(encoding))[0], input) + + # Check stateless decoder with specified encoding + self.assertEqual(d(input.encode(encoding), encoding=encoding)[0], input) + + # Check incremental decoder with encoding autodetection + id = codecs.getincrementaldecoder("xml")() + self.assertEqual("".join(id.iterdecode(input.encode(encoding))), input) + + # Check incremental decoder with specified encoding + id = codecs.getincrementaldecoder("xml")(encoding) + self.assertEqual("".join(id.iterdecode(input.encode(encoding))), input) + + # Use correct declaration + checkdecl("utf-8") + checkdecl("iso-8859-1", u"") + checkdecl("iso-8859-15") + checkdecl("cp1252") + + # No recursion + self.assertRaises(ValueError, "".decode, "xml") + + def test_encoder(self): + def check(encoding, input=u"g\xfcrk\u20ac"): + # Check stateless encoder with encoding autodetection + e = codecs.getencoder("xml") + inputdecl = input.replace("'x'", repr(encoding)) + self.assertEqual(e(inputdecl)[0].decode(encoding), inputdecl) + + # Check stateless encoder with specified encoding + self.assertEqual(e(input, encoding=encoding)[0].decode(encoding), inputdecl) + + # Check incremental encoder with encoding autodetection + ie = codecs.getincrementalencoder("xml")() + self.assertEqual("".join(ie.iterencode(inputdecl)).decode(encoding), inputdecl) + + # Check incremental encoder with specified 
encoding + ie = codecs.getincrementalencoder("xml")(encoding=encoding) + self.assertEqual("".join(ie.iterencode(input)).decode(encoding), inputdecl) + + # Autodetectable encodings + check("utf-8-sig") + check("utf-16") + check("utf-16-le") + check("utf-16-be") + check("utf-32") + check("utf-32-le") + check("utf-32-be") + check("utf-8") + check("iso-8859-1", u"") + check("iso-8859-15") + check("cp1252") + + # No recursion + self.assertRaises(ValueError, u"".encode, "xml") + + def test_main(): test_support.run_unittest( UTF32Test, @@ -1435,6 +1644,7 @@ BasicStrTest, CharmapTest, WithStmtTest, + XMLCodecTest ) Index: Modules/_codecs_functions.c =================================================================== --- Modules/_codecs_functions.c (Revision 0) +++ Modules/_codecs_functions.c (Revision 0) @@ -0,0 +1,115 @@ +/* ------------------------------------------------------------------------ + + _codecs_functions -- bit shared between 8bit and unicode implementations + of functions in the codecs module. + + ------------------------------------------------------------------------ */ + +/* Parses a pseudoattr. Returns 2 if a name has been found, 1 if we're at the + end of the declaration, 0 if we didn't have enough data and -1 on error. + The pseudoattr name is put into namestart and nameend, + The pseudoattr value is put into valuestart and valueend. */ +static int STRINGLIB_PARSEPSEUDOATTR( + const STRINGLIB_CHAR *s, const STRINGLIB_CHAR *end, + const STRINGLIB_CHAR **namestart, const STRINGLIB_CHAR **nameend, + const STRINGLIB_CHAR **valuestart, const STRINGLIB_CHAR **valueend) +{ + STRINGLIB_CHAR quote; + + /* goto beginning of next word */ + while (s there's no pseudoattr there */ + if (s[0] == '?' 
&& s[1] == '>') + return 1; + } + + *namestart = s; + while (s continue */ + str = *encodingend+1; + } + } +} Index: Modules/_codecsmodule.c =================================================================== --- Modules/_codecsmodule.c (Revision 58878) +++ Modules/_codecsmodule.c (Arbeitskopie) @@ -1073,6 +1073,444 @@ return PyCodec_LookupError(name); } +/* --- Functions for XML codecs ------------------------------------------- */ + +static int cmpu2s(const Py_UNICODE *u, const char *s, Py_ssize_t len) +{ + while (len) + { + if (*u != *s) + return *u - *s; + ++u; + ++s; + --len; + } + return 0; +} + +/* define unicode version of parsepseudoattr/parseencoding */ +#define STRINGLIB_PARSEPSEUDOATTR parse_xml_pseudoattr_unicode +#define STRINGLIB_PARSEENCODING parse_xml_encoding_unicode +#define STRINGLIB_CHAR Py_UNICODE +#define STRINGLIB_ISALPHA(c) Py_UNICODE_ISALPHA(c) +#define STRINGLIB_CMP2CHAR(u, s, l) cmpu2s(u, s, l) + +#include "_codecs_functions.c" + +#undef STRINGLIB_PARSEPSEUDOATTR +#undef STRINGLIB_PARSEENCODING +#undef STRINGLIB_CHAR +#undef STRINGLIB_ISALPHA +#undef STRINGLIB_CMP2CHAR + +/* define str version of parsepseudoattr/parseencoding */ +#define STRINGLIB_PARSEPSEUDOATTR parse_xml_pseudoattr_str +#define STRINGLIB_PARSEENCODING parse_xml_encoding_str +#define STRINGLIB_CHAR char +#define STRINGLIB_ISALPHA(c) (((c)>='a' && (c)<='z') || ((c)>='A' && (c)<='Z')) +#define STRINGLIB_CMP2CHAR(u, s, l) strncmp(u, s, l) + +#include "_codecs_functions.c" + +#undef STRINGLIB_PARSEPSEUDOATTR +#undef STRINGLIB_PARSEENCODING +#undef STRINGLIB_CHAR +#undef STRINGLIB_ISALPHA +#undef STRINGLIB_CMP2CHAR + +/* Parses a unicode XML declaration and returns the position of the encoding in + encodingstart/encodingend. Return values are the same as for parseencoding(). 
*/ +int parse_xml_declaration_unicode(const Py_UNICODE *str, const Py_UNICODE *strend, const Py_UNICODE **encodingstart, const Py_UNICODE **encodingend) +{ + Py_ssize_t strlen = strend - str; + + if (strlen>0) + { + if (*str++ != '<') + return 1; + if (strlen>1) + { + if (*str++ != '?') + return 1; + if (strlen>2) + { + if (*str++ != 'x') + return 1; + if (strlen>3) + { + if (*str++ != 'm') + return 1; + if (strlen>4) + { + if (*str++ != 'l') + return 1; + if (strlen>5) + { + if (*str != ' ' && *str != '\t' && *str != '\r' && *str != '\n') + return 1; + return parse_xml_encoding_unicode(++str, strend, encodingstart, encodingend); + } + } + } + } + } + } + return 0; +} + +/* We're using bits to store all possible candidate encodings (or variants, i.e. + * we have two bits for the variants of UTF-16 and two for the + * variants of UTF-32). + * + * Prefixes for various XML encodings + * (see http://www.w3.org/TR/2004/REC-xml-20040204/#sec-guessing) + * UTF-8-SIG xEF xBB xBF + * UTF-16 (LE) xFF xFE ~x00|~x00 + * UTF-16 (BE) xFE xFF + * UTF-16-LE < x00 ? x00 + * UTF-16-BE x00 < + * UTF-32 (LE) xFF xFE x00 x00 + * UTF-32 (BE) x00 x00 xFE xFF + * UTF-32-LE < x00 x00 x00 + * UTF-32-BE x00 x00 x00 < + * XML-DECL < ? 
x m l +*/ + +#define CANDIDATE_UTF_8_SIG (1<<0) +#define CANDIDATE_UTF_16_AS_LE (1<<1) +#define CANDIDATE_UTF_16_AS_BE (1<<2) +#define CANDIDATE_UTF_16_LE (1<<3) +#define CANDIDATE_UTF_16_BE (1<<4) +#define CANDIDATE_UTF_32_AS_LE (1<<5) +#define CANDIDATE_UTF_32_AS_BE (1<<6) +#define CANDIDATE_UTF_32_LE (1<<7) +#define CANDIDATE_UTF_32_BE (1<<8) +#define CANDIDATE_DECL (1<<9) +#define CANDIDATES ((CANDIDATE_DECL<<1)-1) /* All bits */ + +#if 0 +/* for debugging output */ +void DUMPCANDIDATES(int candidates) +{ + if (candidates&CANDIDATE_UTF_8_SIG) + printf("u8s "); + else + printf("--- "); + if (candidates&CANDIDATE_UTF_16_AS_LE) + printf("u16(le) "); + else + printf("------- "); + if (candidates&CANDIDATE_UTF_16_AS_BE) + printf("u16(be) "); + else + printf("------- "); + if (candidates&CANDIDATE_UTF_16_LE) + printf("u16le "); + else + printf("----- "); + if (candidates&CANDIDATE_UTF_16_BE) + printf("u16be "); + else + printf("----- "); + if (candidates&CANDIDATE_UTF_32_AS_LE) + printf("u32(le) "); + else + printf("------- "); + if (candidates&CANDIDATE_UTF_32_AS_BE) + printf("u32(be) "); + else + printf("------- "); + if (candidates&CANDIDATE_UTF_32_LE) + printf("u32le "); + else + printf("----- "); + if (candidates&CANDIDATE_UTF_32_BE) + printf("u32be "); + else + printf("----- "); + if (candidates&CANDIDATE_DECL) + printf("decl\n"); + else + printf("----\n"); +} + +/* for debugging output */ +void DUMPBYTE(char c) +{ + printf("-> %02x\n", (int)(unsigned char)c); +} +#else +#define DUMPCANDIDATES(x) +#define DUMPBYTE(x) +#endif + +static PyObject *detect_xml_encoding_str(const char *str, Py_ssize_t len, int final) +{ + const char *origstr; + Py_ssize_t origlen; + int candidates = CANDIDATES; /* all 10 encodings are still possible */ + const char *strend; + char firstbytes[4]; + + origlen = len; + origstr = str; + strend = str + len; + + /* For each byte in the input delete the appropriate bit if the + * encoding has the wrong value in this spot. 
If no bits remain + * we default to UTF-8. If only one bit remains (and we had enough input) + * this is the resulting encoding. + */ + DUMPCANDIDATES(candidates); + if (len) + { + /* Check first byte */ + firstbytes[0] = *str; + DUMPBYTE(*str); + if (firstbytes[0] != '\xef') + candidates &= ~CANDIDATE_UTF_8_SIG; + if (firstbytes[0] != '\xff') + candidates &= ~CANDIDATE_UTF_32_AS_LE& + ~CANDIDATE_UTF_16_AS_LE; + if (firstbytes[0] != '\xfe') + candidates &= ~CANDIDATE_UTF_16_AS_BE; + if (firstbytes[0] != '<') + candidates &= ~CANDIDATE_UTF_32_LE& + ~CANDIDATE_UTF_16_LE& + ~CANDIDATE_DECL; + if (firstbytes[0] != '\x00') + candidates &= ~CANDIDATE_UTF_32_AS_BE& + ~CANDIDATE_UTF_32_BE& + ~CANDIDATE_UTF_16_BE; + DUMPCANDIDATES(candidates); + if (++str, --len) + { + /* Check second byte */ + firstbytes[1] = *str; + DUMPBYTE(*str); + if (firstbytes[1] != '\xbb') + candidates &= ~CANDIDATE_UTF_8_SIG; + if (firstbytes[1] != '\xfe') + candidates &= ~CANDIDATE_UTF_16_AS_LE& + ~CANDIDATE_UTF_32_AS_LE; + if (firstbytes[1] != '\xff') + candidates &= ~CANDIDATE_UTF_16_AS_BE; + if (firstbytes[1] != '\x00') + candidates &= ~CANDIDATE_UTF_16_LE& + ~CANDIDATE_UTF_32_AS_BE& + ~CANDIDATE_UTF_32_LE& + ~CANDIDATE_UTF_32_BE; + if (firstbytes[1] != '<') + candidates &= ~CANDIDATE_UTF_16_BE; + if (firstbytes[1] != '?') + candidates &= ~CANDIDATE_DECL; + DUMPCANDIDATES(candidates); + if (++str, --len) + { + /* Check third byte */ + firstbytes[2] = *str; + DUMPBYTE(*str); + if (firstbytes[2] != '\xbf') + candidates &= ~CANDIDATE_UTF_8_SIG; + if (firstbytes[2] != '?') + candidates &= ~CANDIDATE_UTF_16_LE; + if (firstbytes[2] != '\x00') + candidates &= ~CANDIDATE_UTF_32_AS_LE& + ~CANDIDATE_UTF_32_LE& + ~CANDIDATE_UTF_32_BE; + if (firstbytes[2] != '\xfe') + candidates &= ~CANDIDATE_UTF_32_AS_BE; + if (firstbytes[2] != 'x') + candidates &= ~CANDIDATE_DECL; + DUMPCANDIDATES(candidates); + if (++str, --len) + { + /* Check fourth byte */ + firstbytes[3] = *str; + DUMPBYTE(*str); + if (firstbytes[3] 
== '\x00' && firstbytes[2] == '\x00') + candidates &= ~CANDIDATE_UTF_16_AS_LE; + if (firstbytes[3] != '\x00') + candidates &= ~CANDIDATE_UTF_16_LE& + ~CANDIDATE_UTF_32_AS_LE& + ~CANDIDATE_UTF_32_LE; + if (firstbytes[3] != '\xff') + candidates &= ~CANDIDATE_UTF_32_AS_BE; + if (firstbytes[3] != '<') + candidates &= ~CANDIDATE_UTF_32_BE; + if (firstbytes[3] != 'm') + candidates &= ~CANDIDATE_DECL; + DUMPCANDIDATES(candidates); + if (++str, --len) + { + /* Check fifth byte */ + DUMPBYTE(*str); + if (*str != 'l') + candidates &= ~CANDIDATE_DECL; + DUMPCANDIDATES(candidates); + if (++str, --len) + { + /* Check sixth byte */ + DUMPBYTE(*str); + if (*str != ' ' && *str != '\t' && *str != '\r' && *str != '\n') + candidates &= ~CANDIDATE_DECL; + DUMPCANDIDATES(candidates); + } + } + } + } + } + } + if (candidates == 0) + return PyString_FromString("utf-8"); + else if (!(candidates & (candidates-1))) /* only one encoding remaining */ + { + if ((candidates == CANDIDATE_UTF_8_SIG) && (origlen >= 3)) + return PyString_FromString("utf-8-sig"); + else if ((candidates == CANDIDATE_UTF_16_AS_LE) && (origlen >= 2)) + return PyString_FromString("utf-16"); + else if ((candidates == CANDIDATE_UTF_16_AS_BE) && (origlen >= 2)) + return PyString_FromString("utf-16"); + else if ((candidates == CANDIDATE_UTF_16_LE) && (origlen >= 4)) + return PyString_FromString("utf-16-le"); + else if ((candidates == CANDIDATE_UTF_16_BE) && (origlen >= 2)) + return PyString_FromString("utf-16-be"); + else if ((candidates == CANDIDATE_UTF_32_AS_LE) && (origlen >= 4)) + return PyString_FromString("utf-32"); + else if ((candidates == CANDIDATE_UTF_32_AS_BE) && (origlen >= 4)) + return PyString_FromString("utf-32"); + else if ((candidates == CANDIDATE_UTF_32_LE) && (origlen >= 4)) + return PyString_FromString("utf-32-le"); + else if ((candidates == CANDIDATE_UTF_32_BE) && (origlen >= 4)) + return PyString_FromString("utf-32-be"); + else if ((candidates == CANDIDATE_DECL) && (origlen >= 6)) + { + const char 
*encodingstart; + const char *encodingend; + + switch (parse_xml_encoding_str(str, strend, &encodingstart, &encodingend)) + { + case -1: + return NULL; + case 0: /* don't know yet => let the "final" check below decide between utf-8 and None */ + break; + case 1: /* not found => default to utf-8 */ + return PyString_FromString("utf-8"); + case 2: /* found it */ + return PyString_FromStringAndSize(encodingstart, encodingend-encodingstart); + } + } + } + if (final) /* if this is the last call, and we haven't determined an encoding yet, we default to UTF-8 */ + return PyString_FromString("utf-8"); + /* We don't know yet */ + Py_RETURN_NONE; +} + +static PyObject *detect_xml_encoding_unicode(const Py_UNICODE *str, Py_ssize_t len, int final) +{ + const Py_UNICODE *encodingstart; + const Py_UNICODE *encodingend; + + switch (parse_xml_declaration_unicode(str, str+len, &encodingstart, &encodingend)) + { + case -1: + return NULL; + case 0: /* don't know yet */ + if (final) /* we won't get better data, so default to utf-8 */ + goto utf8; + Py_RETURN_NONE; + case 1: /* not found => default to UTF-8 */ + goto utf8; + case 2: /* found it => return the encoding name as a new unicode object */ + return PyUnicode_FromUnicode(encodingstart, encodingend-encodingstart); + } + utf8: + return PyUnicode_DecodeASCII("utf-8", 5, NULL); +} + +static PyObject *detect_xml_encoding(PyObject *self, PyObject *args) +{ + PyObject *obj; + int final = 0; + + if (!PyArg_ParseTuple(args, "O|i:detect_xml_encoding", &obj, &final)) + return NULL; + + if (PyString_Check(obj)) + return detect_xml_encoding_str(PyString_AS_STRING(obj), PyString_GET_SIZE(obj), final); + else if (PyUnicode_Check(obj)) + return detect_xml_encoding_unicode(PyUnicode_AS_UNICODE(obj), PyUnicode_GET_SIZE(obj), final); + else + { + PyErr_SetString(PyExc_TypeError, "expected str or unicode"); + return NULL; + } +} + +static char detect_xml_encoding__doc__[] = +"detect_xml_encoding(str[, final=False]) -> str or None\n\ +\n\ +Tries to detect the XML encoding from the 
first few bytes of the string\n\ +or the encoding declaration in the XML header. Return the name of the\n\ +encoding or None, if the encoding is ambiguous."; + +PyObject *fix_xml_encoding(PyObject *self, PyObject *args) +{ + PyObject *strobj; + const Py_UNICODE *strstart; + const Py_UNICODE *strend; + int final = 0; + const Py_UNICODE *enc; + Py_ssize_t enclen; + const Py_UNICODE *encodingstart; + const Py_UNICODE *encodingend; + + /* the name after ':' is what shows up in TypeError messages, so it must match the exported name */ + if (!PyArg_ParseTuple(args, "O!u#|i:fix_xml_encoding", &PyUnicode_Type, &strobj, &enc, &enclen, &final)) + return NULL; + + strstart = PyUnicode_AS_UNICODE(strobj); + strend = strstart + PyUnicode_GET_SIZE(strobj); + switch (parse_xml_declaration_unicode(strstart, strend, &encodingstart, &encodingend)) + { + case -1: + return NULL; + case 0: /* don't know yet */ + if (final) /* we won't get better data, so use what we have */ + goto original; + Py_RETURN_NONE; + case 1: /* not found => return original string */ + goto original; + case 2: /* found it */ + { + /* yes => put the encoding name into this spot and return the new string */ + PyObject *newobj = PyUnicode_FromUnicode(NULL, (encodingstart-strstart) + enclen + (strend - encodingend)); + Py_UNICODE *new; + if (!newobj) + return NULL; + new = PyUnicode_AS_UNICODE(newobj); + #define Py_UNICODE_STPCOPY(target, source, length) (Py_UNICODE_COPY(target, source, length), (target)+(length)) + new = Py_UNICODE_STPCOPY(new, strstart, encodingstart-strstart); + new = Py_UNICODE_STPCOPY(new, enc, enclen); + (void) Py_UNICODE_STPCOPY(new, encodingend, strend - encodingend); + return newobj; + } + } + Py_RETURN_NONE; + original: + Py_INCREF(strobj); + return strobj; +} + +static char fix_xml_encoding__doc__[] = +"fix_xml_encoding(unicode, encoding[, final=False]) -> unicode or None\n\ +\n\ +Replaces the encoding specification in the XML declaration at the start of the\n\ +first argument with the encoding specified. If there's no XML declaration the\n\ +original string is returned. 
If the string isn't long enough to find an\n\ +encoding None is returned."; + + /* --- Module API --------------------------------------------------------- */ static PyMethodDef _codecs_functions[] = { @@ -1129,6 +1567,10 @@ register_error__doc__}, {"lookup_error", lookup_error, METH_VARARGS, lookup_error__doc__}, + {"detect_xml_encoding", detect_xml_encoding, METH_VARARGS, + detect_xml_encoding__doc__}, + {"fix_xml_encoding", fix_xml_encoding, METH_VARARGS, + fix_xml_encoding__doc__}, {NULL, NULL} /* sentinel */ };