Index: Doc/library/codecs.rst
===================================================================
--- Doc/library/codecs.rst (Revision 58878)
+++ Doc/library/codecs.rst (Arbeitskopie)
@@ -1135,6 +1135,8 @@
| uu_codec | uu | byte string | Convert the operand using |
| | | | uuencode |
+--------------------+---------------------------+----------------+---------------------------+
+| xml | | Unicode string | An XML meta codec |
++--------------------+---------------------------+----------------+---------------------------+
| zlib_codec | zip, zlib | byte string | Compress the operand |
| | | | using gzip |
+--------------------+---------------------------+----------------+---------------------------+
@@ -1218,3 +1220,39 @@
is only done once (on the first write to the byte stream). For decoding an
optional UTF-8 encoded BOM at the start of the data will be skipped.
+
+:mod:`encodings.xml` --- XML meta codec
+---------------------------------------
+
+.. module:: encodings.xml
+ :synopsis: XML meta codec
+.. moduleauthor:: Walter Dörwald
+
+.. versionadded:: 2.6
+
+This module implements a codec that can be used for encoding and decoding XML.
+Once the encoding has been determined the decoding/encoding process falls back
+to using the codec for that particular encoding to do the rest of the work, so
+this XML codec supports all encodings supported by Python itself.
+
+On decoding, the XML codec determines the encoding either by inspecting the
+first few bytes of the byte stream/string or by extracting the encoding from
+the XML declaration. If the encoding can't be determined from the first few
+bytes and there is no XML declaration, the codec falls back to using UTF-8.
+When the encoding is specified by an external source (e.g. a Content-Type
+header in an HTTP response), this encoding can be passed as an argument to the
+codec, which will then bypass encoding detection. If there's an XML declaration
+in the input, the XML declaration passed to the application will then contain
+the externally specified encoding instead of the original one.
+
+On encoding, the XML codec extracts the encoding from the XML declaration and
+encodes the output in that encoding. If there's no XML declaration, UTF-8
+is used. It's also possible to pass an external encoding to the encoder.
+The encoder will then encode the output in that encoding and put the correct
+encoding into the XML declaration (if there is one).
+
+
+.. seealso::
+
+ http://www.w3.org/TR/2004/REC-xml-20040204/#sec-guessing
+ Autodetection of Character Encodings in XML
Index: Lib/encodings/xml.py
===================================================================
--- Lib/encodings/xml.py (Revision 0)
+++ Lib/encodings/xml.py (Revision 0)
@@ -0,0 +1,198 @@
+# -*- coding: iso-8859-1 -*-
+"""
+Python 'xml' Codec
+"""
+
+
+import codecs
+
+
+def decode(input, errors="strict", encoding=None):
+ if encoding is None:
+ encoding = codecs.detect_xml_encoding(input, True)
+ if encoding == "xml":
+ raise ValueError("xml not allowed as encoding name")
+ (input, consumed) = codecs.getdecoder(encoding)(input, errors)
+ return (codecs.fix_xml_encoding(input, unicode(encoding), True), consumed)
+
+
+def encode(input, errors="strict", encoding=None):
+ consumed = len(input)
+ if encoding is None:
+ encoding = codecs.detect_xml_encoding(input, True)
+ else:
+ input = codecs.fix_xml_encoding(input, unicode(encoding), True)
+ if encoding == "xml":
+ raise ValueError("xml not allowed as encoding name")
+ info = codecs.lookup(encoding)
+ return (info.encode(input, errors)[0], consumed)
+
+
+class IncrementalDecoder(codecs.IncrementalDecoder):
+ def __init__(self, errors="strict", encoding=None):
+ self.decoder = None
+ self.encoding = encoding
+ codecs.IncrementalDecoder.__init__(self, errors)
+ self._errors = errors # Store ``errors`` somewhere else, because we have to hide it in a property
+ self.buffer = ""
+ self.headerfixed = False
+
+ def iterdecode(self, input):
+ for part in input:
+ result = self.decode(part, False)
+ if result:
+ yield result
+ result = self.decode("", True)
+ if result:
+ yield result
+
+ def decode(self, input, final=False):
+ # We're doing basically the same as a ``BufferedIncrementalDecoder``,
+ # but since the buffer is only relevant until the encoding has been detected
+ # (in which case the buffer of the underlying codec might kick in),
+ # we're implementing buffering ourselves to avoid some overhead.
+ if self.decoder is None:
+ input = self.buffer + input
+ self.encoding = codecs.detect_xml_encoding(input, final)
+ if self.encoding is None:
+ self.buffer = input # retry the complete input on the next call
+ return u"" # no encoding determined yet, so no output
+ if self.encoding == "xml":
+ raise ValueError("xml not allowed as encoding name")
+ self.buffer = "" # isn't needed any more, as the decoder might keep its own buffer
+ self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
+ if self.headerfixed:
+ return self.decoder.decode(input, final)
+ # If we haven't fixed the header yet, the content of ``self.buffer`` is a ``unicode`` object
+ output = self.buffer + self.decoder.decode(input, final)
+ newoutput = codecs.fix_xml_encoding(output, unicode(self.encoding), final)
+ if newoutput is None:
+ self.buffer = output # retry fixing the declaration (but keep the decoded stuff)
+ return u""
+ self.headerfixed = True
+ return newoutput
+
+ def reset(self):
+ codecs.IncrementalDecoder.reset(self)
+ self.decoder = None
+ self.buffer = ""
+ self.headerfixed = False
+
+ def _geterrors(self):
+ return self._errors
+
+ def _seterrors(self, errors):
+ # Setting ``errors`` must be done on the real decoder too
+ if self.decoder is not None:
+ self.decoder.errors = errors
+ self._errors = errors
+ errors = property(_geterrors, _seterrors)
+
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+ def __init__(self, errors="strict", encoding=None):
+ self.encoder = None
+ self.encoding = encoding
+ codecs.IncrementalEncoder.__init__(self, errors)
+ self._errors = errors # Store ``errors`` somewhere else, because we have to hide it in a property
+ self.buffer = u""
+
+ def iterencode(self, input):
+ for part in input:
+ result = self.encode(part, False)
+ if result:
+ yield result
+ result = self.encode(u"", True)
+ if result:
+ yield result
+
+ def encode(self, input, final=False):
+ if self.encoder is None:
+ input = self.buffer + input
+ if self.encoding is not None:
+ # Replace encoding in the declaration with the specified one
+ newinput = codecs.fix_xml_encoding(input, unicode(self.encoding), final)
+ if newinput is None: # declaration not complete => Retry next time
+ self.buffer = input
+ return ""
+ input = newinput
+ else:
+ # Use encoding from the XML declaration
+ self.encoding = codecs.detect_xml_encoding(input, final)
+ if self.encoding is not None:
+ if self.encoding == "xml":
+ raise ValueError("xml not allowed as encoding name")
+ info = codecs.lookup(self.encoding)
+ self.encoder = info.incrementalencoder(self._errors)
+ self.buffer = u""
+ else:
+ self.buffer = input
+ return ""
+ return self.encoder.encode(input, final)
+
+ def reset(self):
+ codecs.IncrementalEncoder.reset(self)
+ self.encoder = None
+ self.buffer = u""
+
+ def _geterrors(self):
+ return self._errors
+
+ def _seterrors(self, errors):
+ # Setting ``errors ``must be done on the real encoder too
+ if self.encoder is not None:
+ self.encoder.errors = errors
+ self._errors = errors
+ errors = property(_geterrors, _seterrors)
+
+
+class StreamWriter(codecs.StreamWriter):
+ def __init__(self, stream, errors="strict", encoding="utf-8", header=False):
+ codecs.StreamWriter.__init__(self, stream, errors)
+ self.encoder = IncrementalEncoder(errors)
+ self._errors = errors
+
+ def encode(self, input, errors='strict'):
+ return (self.encoder.encode(input, False), len(input))
+
+ def _geterrors(self):
+ return self._errors
+
+ def _seterrors(self, errors):
+ # Setting ``errors`` must be done on the encoder too
+ if self.encoder is not None:
+ self.encoder.errors = errors
+ self._errors = errors
+ errors = property(_geterrors, _seterrors)
+
+
+class StreamReader(codecs.StreamReader):
+ def __init__(self, stream, errors="strict"):
+ codecs.StreamReader.__init__(self, stream, errors)
+ self.decoder = IncrementalDecoder(errors)
+ self._errors = errors
+
+ def decode(self, input, errors='strict'):
+ return (self.decoder.decode(input, False), len(input))
+
+ def _geterrors(self):
+ return self._errors
+
+ def _seterrors(self, errors):
+ # Setting ``errors`` must be done on the decoder too
+ if self.decoder is not None:
+ self.decoder.errors = errors
+ self._errors = errors
+ errors = property(_geterrors, _seterrors)
+
+
+def getregentry():
+ return codecs.CodecInfo(
+ name="xml",
+ encode=encode,
+ decode=decode,
+ incrementalencoder=IncrementalEncoder,
+ incrementaldecoder=IncrementalDecoder,
+ streamwriter=StreamWriter,
+ streamreader=StreamReader,
+ )
Index: Lib/test/test_codecs.py
===================================================================
--- Lib/test/test_codecs.py (Revision 58878)
+++ Lib/test/test_codecs.py (Arbeitskopie)
@@ -1406,7 +1406,216 @@
info.streamwriter, 'strict') as srw:
self.assertEquals(srw.read(), u"\xfc")
+class XMLCodecTest(unittest.TestCase):
+ def test_detectencoding_str(self):
+ self.assert_(codecs.detect_xml_encoding("") is None)
+ self.assert_(codecs.detect_xml_encoding("\xef") is None)
+ self.assertEqual(codecs.detect_xml_encoding("\xef\x33"), "utf-8")
+ self.assert_(codecs.detect_xml_encoding("\xef\xbb") is None)
+ self.assertEqual(codecs.detect_xml_encoding("\xef\xbb\x33"), "utf-8")
+ self.assertEqual(codecs.detect_xml_encoding("\xef\xbb\xbf"), "utf-8-sig")
+ self.assert_(codecs.detect_xml_encoding("\xff") is None)
+ self.assertEqual(codecs.detect_xml_encoding("\xff\x33"), "utf-8")
+ self.assert_(codecs.detect_xml_encoding("\xff\xfe") is None)
+ self.assertEqual(codecs.detect_xml_encoding("\xff\xfe\x33"), "utf-16")
+ self.assert_(codecs.detect_xml_encoding("\xff\xfe\x00") is None)
+ self.assertEqual(codecs.detect_xml_encoding("\xff\xfe\x00\x33"), "utf-16")
+ self.assertEqual(codecs.detect_xml_encoding("\xff\xfe\x00\x00"), "utf-32")
+ self.assert_(codecs.detect_xml_encoding("\x00") is None)
+ self.assertEqual(codecs.detect_xml_encoding("\x00\x33"), "utf-8")
+ self.assert_(codecs.detect_xml_encoding("\x00\x00") is None)
+ self.assertEqual(codecs.detect_xml_encoding("\x00\x00\x33"), "utf-8")
+ self.assert_(codecs.detect_xml_encoding("\x00\x00\xfe") is None)
+ self.assertEqual(codecs.detect_xml_encoding("\x00\x00\x00\x33"), "utf-8")
+ self.assertEqual(codecs.detect_xml_encoding("\x00\x00\x00<"), "utf-32-be")
+ self.assertEqual(codecs.detect_xml_encoding("\x00\x00\xfe\xff"), "utf-32")
+ self.assert_(codecs.detect_xml_encoding("<") is None)
+ self.assertEqual(codecs.detect_xml_encoding("<\x33"), "utf-8")
+ self.assert_(codecs.detect_xml_encoding("<\x00") is None)
+ self.assertEqual(codecs.detect_xml_encoding("<\x00\x33"), "utf-8")
+ self.assert_(codecs.detect_xml_encoding("<\x00\x00") is None)
+ self.assertEqual(codecs.detect_xml_encoding("<\x00\x00\x33"), "utf-8")
+ self.assertEqual(codecs.detect_xml_encoding("<\x00\x00\x00"), "utf-32-le")
+ self.assert_(codecs.detect_xml_encoding("") is None)
+ self.assert_(codecs.detect_xml_encoding(""), "utf-8")
+ self.assert_(codecs.detect_xml_encoding("g\xfcrk\u20ac"):
+ # Check stateless decoder
+ d = codecs.getdecoder("xml")
+ self.assertEqual(
+ d(input.encode(encoding))[0],
+ input.replace("'x'", repr(encoding))
+ )
+
+ # Check stateless decoder with specified encoding
+ self.assertEqual(
+ d(input.encode(encoding), encoding=encoding)[0],
+ input.replace("'x'", repr(encoding))
+ )
+
+ # Check incremental decoder
+ id = codecs.getincrementaldecoder("xml")()
+ self.assertEqual(
+ "".join(id.iterdecode(input.encode(encoding))),
+ input.replace("'x'", repr(encoding))
+ )
+
+ # Check incremental decoder with specified encoding
+ id = codecs.getincrementaldecoder("xml")(encoding)
+ self.assertEqual(
+ "".join(id.iterdecode(input.encode(encoding))),
+ input.replace("'x'", repr(encoding))
+ )
+
+ # Autodetectable encodings
+ checkauto("utf-8-sig")
+ checkauto("utf-16")
+ checkauto("utf-16-le")
+ checkauto("utf-16-be")
+ checkauto("utf-32")
+ checkauto("utf-32-le")
+ checkauto("utf-32-be")
+
+ def checkdecl(encoding, input=u"\u20ac"):
+ # Check stateless decoder with encoding autodetection
+ d = codecs.getdecoder("xml")
+ input = input % encoding
+ self.assertEqual(d(input.encode(encoding))[0], input)
+
+ # Check stateless decoder with specified encoding
+ self.assertEqual(d(input.encode(encoding), encoding=encoding)[0], input)
+
+ # Check incremental decoder with encoding autodetection
+ id = codecs.getincrementaldecoder("xml")()
+ self.assertEqual("".join(id.iterdecode(input.encode(encoding))), input)
+
+ # Check incremental decoder with specified encoding
+ id = codecs.getincrementaldecoder("xml")(encoding)
+ self.assertEqual("".join(id.iterdecode(input.encode(encoding))), input)
+
+ # Use correct declaration
+ checkdecl("utf-8")
+ checkdecl("iso-8859-1", u"")
+ checkdecl("iso-8859-15")
+ checkdecl("cp1252")
+
+ # No recursion
+ self.assertRaises(ValueError, "".decode, "xml")
+
+ def test_encoder(self):
+ def check(encoding, input=u"g\xfcrk\u20ac"):
+ # Check stateless encoder with encoding autodetection
+ e = codecs.getencoder("xml")
+ inputdecl = input.replace("'x'", repr(encoding))
+ self.assertEqual(e(inputdecl)[0].decode(encoding), inputdecl)
+
+ # Check stateless encoder with specified encoding
+ self.assertEqual(e(input, encoding=encoding)[0].decode(encoding), inputdecl)
+
+ # Check incremental encoder with encoding autodetection
+ ie = codecs.getincrementalencoder("xml")()
+ self.assertEqual("".join(ie.iterencode(inputdecl)).decode(encoding), inputdecl)
+
+ # Check incremental encoder with specified encoding
+ ie = codecs.getincrementalencoder("xml")(encoding=encoding)
+ self.assertEqual("".join(ie.iterencode(input)).decode(encoding), inputdecl)
+
+ # Autodetectable encodings
+ check("utf-8-sig")
+ check("utf-16")
+ check("utf-16-le")
+ check("utf-16-be")
+ check("utf-32")
+ check("utf-32-le")
+ check("utf-32-be")
+ check("utf-8")
+ check("iso-8859-1", u"")
+ check("iso-8859-15")
+ check("cp1252")
+
+ # No recursion
+ self.assertRaises(ValueError, u"".encode, "xml")
+
+
def test_main():
test_support.run_unittest(
UTF32Test,
@@ -1435,6 +1644,7 @@
BasicStrTest,
CharmapTest,
WithStmtTest,
+ XMLCodecTest
)
Index: Modules/_codecs_functions.c
===================================================================
--- Modules/_codecs_functions.c (Revision 0)
+++ Modules/_codecs_functions.c (Revision 0)
@@ -0,0 +1,115 @@
+/* ------------------------------------------------------------------------
+
+ _codecs_functions -- bit shared between 8bit and unicode implementations
+ of functions in the codecs module.
+
+ ------------------------------------------------------------------------ */
+
+/* Parses a pseudoattr. Returns 2 if a name has been found, 1 if we're at the
+ end of the declaration, 0 if we didn't have enough data and -1 on error.
+ The pseudoattr name is put into namestart and nameend;
+ the pseudoattr value is put into valuestart and valueend. */
+static int STRINGLIB_PARSEPSEUDOATTR(
+ const STRINGLIB_CHAR *s, const STRINGLIB_CHAR *end,
+ const STRINGLIB_CHAR **namestart, const STRINGLIB_CHAR **nameend,
+ const STRINGLIB_CHAR **valuestart, const STRINGLIB_CHAR **valueend)
+{
+ STRINGLIB_CHAR quote;
+
+ /* goto beginning of next word */
+ while (s there's no pseudoattr there */
+ if (s[0] == '?' && s[1] == '>')
+ return 1;
+ }
+
+ *namestart = s;
+ while (s continue */
+ str = *encodingend+1;
+ }
+ }
+}
Index: Modules/_codecsmodule.c
===================================================================
--- Modules/_codecsmodule.c (Revision 58878)
+++ Modules/_codecsmodule.c (Arbeitskopie)
@@ -1073,6 +1073,444 @@
return PyCodec_LookupError(name);
}
+/* --- Functions for XML codecs ------------------------------------------- */
+
+/* Compares the first len items of the Py_UNICODE buffer u with the first len
+   bytes of s. Returns 0 if all of them are equal, otherwise the difference
+   (*u - *s) of the first pair that differs. */
+static int cmpu2s(const Py_UNICODE *u, const char *s, Py_ssize_t len)
+{
+    for (; len; --len, ++u, ++s)
+    {
+        if (*u != *s)
+            return *u - *s;
+    }
+    return 0;
+}
+
+/* define unicode version of parsepseudoattr/parseencoding */
+#define STRINGLIB_PARSEPSEUDOATTR parse_xml_pseudoattr_unicode
+#define STRINGLIB_PARSEENCODING parse_xml_encoding_unicode
+#define STRINGLIB_CHAR Py_UNICODE
+#define STRINGLIB_ISALPHA(c) Py_UNICODE_ISALPHA(c)
+#define STRINGLIB_CMP2CHAR(u, s, l) cmpu2s(u, s, l)
+
+#include "_codecs_functions.c"
+
+#undef STRINGLIB_PARSEPSEUDOATTR
+#undef STRINGLIB_PARSEENCODING
+#undef STRINGLIB_CHAR
+#undef STRINGLIB_ISALPHA
+#undef STRINGLIB_CMP2CHAR
+
+/* define str version of parsepseudoattr/parseencoding */
+#define STRINGLIB_PARSEPSEUDOATTR parse_xml_pseudoattr_str
+#define STRINGLIB_PARSEENCODING parse_xml_encoding_str
+#define STRINGLIB_CHAR char
+#define STRINGLIB_ISALPHA(c) (((c)>='a' && (c)<='z') || ((c)>='A' && (c)<='Z'))
+#define STRINGLIB_CMP2CHAR(u, s, l) strncmp(u, s, l)
+
+#include "_codecs_functions.c"
+
+#undef STRINGLIB_PARSEPSEUDOATTR
+#undef STRINGLIB_PARSEENCODING
+#undef STRINGLIB_CHAR
+#undef STRINGLIB_ISALPHA
+#undef STRINGLIB_CMP2CHAR
+
+/* Parses a unicode XML declaration and stores the position of the encoding
+   name in encodingstart/encodingend.
+   Returns 1 if the input does not start with "<?xml" followed by a whitespace
+   character (i.e. there is no XML declaration), 0 if the input is too short
+   to decide, and otherwise whatever parse_xml_encoding_unicode() returns for
+   the text following that whitespace character. */
+int parse_xml_declaration_unicode(const Py_UNICODE *str, const Py_UNICODE *strend, const Py_UNICODE **encodingstart, const Py_UNICODE **encodingend)
+{
+    static const char prefix[5] = {'<', '?', 'x', 'm', 'l'};
+    Py_ssize_t len = strend - str;
+    Py_ssize_t i;
+
+    /* Check as much of the fixed prefix as is available */
+    for (i = 0; i < 5; ++i)
+    {
+        if (i >= len)
+            return 0; /* everything so far matched, but we need more data */
+        if (str[i] != prefix[i])
+            return 1;
+    }
+
+    /* The prefix must be followed by a whitespace character */
+    if (len <= 5)
+        return 0;
+    if (str[5] != ' ' && str[5] != '\t' && str[5] != '\r' && str[5] != '\n')
+        return 1;
+
+    return parse_xml_encoding_unicode(str + 6, strend, encodingstart, encodingend);
+}
+
+/* We're using bits to store all possible candidate encodings (or variants, i.e.
+ * we have two bits for the variants of UTF-16 and two for the
+ * variants of UTF-32).
+ *
+ * Prefixes for various XML encodings
+ * (see http://www.w3.org/TR/2004/REC-xml-20040204/#sec-guessing)
+ * UTF-8-SIG xEF xBB xBF
+ * UTF-16 (LE) xFF xFE ~x00|~x00
+ * UTF-16 (BE) xFE xFF
+ * UTF-16-LE < x00 ? x00
+ * UTF-16-BE x00 <
+ * UTF-32 (LE) xFF xFE x00 x00
+ * UTF-32 (BE) x00 x00 xFE xFF
+ * UTF-32-LE < x00 x00 x00
+ * UTF-32-BE x00 x00 x00 <
+ * XML-DECL < ? x m l
+*/
+
+#define CANDIDATE_UTF_8_SIG (1<<0)
+#define CANDIDATE_UTF_16_AS_LE (1<<1)
+#define CANDIDATE_UTF_16_AS_BE (1<<2)
+#define CANDIDATE_UTF_16_LE (1<<3)
+#define CANDIDATE_UTF_16_BE (1<<4)
+#define CANDIDATE_UTF_32_AS_LE (1<<5)
+#define CANDIDATE_UTF_32_AS_BE (1<<6)
+#define CANDIDATE_UTF_32_LE (1<<7)
+#define CANDIDATE_UTF_32_BE (1<<8)
+#define CANDIDATE_DECL (1<<9)
+#define CANDIDATES ((CANDIDATE_DECL<<1)-1) /* All bits */
+
+#if 0
+/* for debugging output: prints one column per candidate bit (its label if
+   the bit is set in candidates, dashes of the same width otherwise) */
+void DUMPCANDIDATES(int candidates)
+{
+    static const struct
+    {
+        int mask;
+        const char *present;
+        const char *absent;
+    } columns[] = {
+        {CANDIDATE_UTF_8_SIG,    "u8s ",     "--- "},
+        {CANDIDATE_UTF_16_AS_LE, "u16(le) ", "------- "},
+        {CANDIDATE_UTF_16_AS_BE, "u16(be) ", "------- "},
+        {CANDIDATE_UTF_16_LE,    "u16le ",   "----- "},
+        {CANDIDATE_UTF_16_BE,    "u16be ",   "----- "},
+        {CANDIDATE_UTF_32_AS_LE, "u32(le) ", "------- "},
+        {CANDIDATE_UTF_32_AS_BE, "u32(be) ", "------- "},
+        {CANDIDATE_UTF_32_LE,    "u32le ",   "----- "},
+        {CANDIDATE_UTF_32_BE,    "u32be ",   "----- "},
+        {CANDIDATE_DECL,         "decl\n",   "----\n"}
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(columns)/sizeof(columns[0]); ++i)
+        printf("%s", (candidates & columns[i].mask) ? columns[i].present : columns[i].absent);
+}
+
+/* for debugging output: prints the byte c as two hex digits */
+void DUMPBYTE(char c)
+{
+    unsigned char byte = (unsigned char)c;
+
+    printf("-> %02x\n", (int)byte);
+}
+#else
+#define DUMPCANDIDATES(x)
+#define DUMPBYTE(x)
+#endif
+
+/* Detects the encoding of the byte string str (of length len) from its first
+   bytes (BOM or zero-byte pattern) or from the encoding named in its XML
+   declaration. Returns a new str object with the encoding name, None if more
+   data is required to decide, or NULL if parsing the declaration reported an
+   error. If final is true no more data will follow, so instead of None the
+   default "utf-8" is returned. */
+static PyObject *detect_xml_encoding_str(const char *str, Py_ssize_t len, int final)
+{
+    Py_ssize_t origlen;
+    int candidates = CANDIDATES; /* all 10 encodings are still possible */
+    const char *strend;
+    char firstbytes[4];
+
+    origlen = len;
+    strend = str + len;
+
+    /* For each byte in the input delete the appropriate bit if the
+     * encoding has the wrong value in this spot. If no bits remain
+     * we default to UTF-8. If only one bit remains (and we had enough input)
+     * this is the resulting encoding.
+     */
+    DUMPCANDIDATES(candidates);
+    if (len)
+    {
+        /* Check first byte */
+        firstbytes[0] = *str;
+        DUMPBYTE(*str);
+        if (firstbytes[0] != '\xef')
+            candidates &= ~CANDIDATE_UTF_8_SIG;
+        if (firstbytes[0] != '\xff')
+            candidates &= ~CANDIDATE_UTF_32_AS_LE&
+                          ~CANDIDATE_UTF_16_AS_LE;
+        if (firstbytes[0] != '\xfe')
+            candidates &= ~CANDIDATE_UTF_16_AS_BE;
+        if (firstbytes[0] != '<')
+            candidates &= ~CANDIDATE_UTF_32_LE&
+                          ~CANDIDATE_UTF_16_LE&
+                          ~CANDIDATE_DECL;
+        if (firstbytes[0] != '\x00')
+            candidates &= ~CANDIDATE_UTF_32_AS_BE&
+                          ~CANDIDATE_UTF_32_BE&
+                          ~CANDIDATE_UTF_16_BE;
+        DUMPCANDIDATES(candidates);
+        if (++str, --len)
+        {
+            /* Check second byte */
+            firstbytes[1] = *str;
+            DUMPBYTE(*str);
+            if (firstbytes[1] != '\xbb')
+                candidates &= ~CANDIDATE_UTF_8_SIG;
+            if (firstbytes[1] != '\xfe')
+                candidates &= ~CANDIDATE_UTF_16_AS_LE&
+                              ~CANDIDATE_UTF_32_AS_LE;
+            if (firstbytes[1] != '\xff')
+                candidates &= ~CANDIDATE_UTF_16_AS_BE;
+            if (firstbytes[1] != '\x00')
+                candidates &= ~CANDIDATE_UTF_16_LE&
+                              ~CANDIDATE_UTF_32_AS_BE&
+                              ~CANDIDATE_UTF_32_LE&
+                              ~CANDIDATE_UTF_32_BE;
+            if (firstbytes[1] != '<')
+                candidates &= ~CANDIDATE_UTF_16_BE;
+            if (firstbytes[1] != '?')
+                candidates &= ~CANDIDATE_DECL;
+            DUMPCANDIDATES(candidates);
+            if (++str, --len)
+            {
+                /* Check third byte */
+                firstbytes[2] = *str;
+                DUMPBYTE(*str);
+                if (firstbytes[2] != '\xbf')
+                    candidates &= ~CANDIDATE_UTF_8_SIG;
+                if (firstbytes[2] != '?')
+                    candidates &= ~CANDIDATE_UTF_16_LE;
+                if (firstbytes[2] != '\x00')
+                    candidates &= ~CANDIDATE_UTF_32_AS_LE&
+                                  ~CANDIDATE_UTF_32_LE&
+                                  ~CANDIDATE_UTF_32_BE;
+                if (firstbytes[2] != '\xfe')
+                    candidates &= ~CANDIDATE_UTF_32_AS_BE;
+                if (firstbytes[2] != 'x')
+                    candidates &= ~CANDIDATE_DECL;
+                DUMPCANDIDATES(candidates);
+                if (++str, --len)
+                {
+                    /* Check fourth byte */
+                    firstbytes[3] = *str;
+                    DUMPBYTE(*str);
+                    /* FF FE 00 00 is the UTF-32 BOM, not a UTF-16 one */
+                    if (firstbytes[3] == '\x00' && firstbytes[2] == '\x00')
+                        candidates &= ~CANDIDATE_UTF_16_AS_LE;
+                    if (firstbytes[3] != '\x00')
+                        candidates &= ~CANDIDATE_UTF_16_LE&
+                                      ~CANDIDATE_UTF_32_AS_LE&
+                                      ~CANDIDATE_UTF_32_LE;
+                    if (firstbytes[3] != '\xff')
+                        candidates &= ~CANDIDATE_UTF_32_AS_BE;
+                    if (firstbytes[3] != '<')
+                        candidates &= ~CANDIDATE_UTF_32_BE;
+                    if (firstbytes[3] != 'm')
+                        candidates &= ~CANDIDATE_DECL;
+                    DUMPCANDIDATES(candidates);
+                    if (++str, --len)
+                    {
+                        /* Check fifth byte */
+                        DUMPBYTE(*str);
+                        if (*str != 'l')
+                            candidates &= ~CANDIDATE_DECL;
+                        DUMPCANDIDATES(candidates);
+                        if (++str, --len)
+                        {
+                            /* Check sixth byte */
+                            DUMPBYTE(*str);
+                            if (*str != ' ' && *str != '\t' && *str != '\r' && *str != '\n')
+                                candidates &= ~CANDIDATE_DECL;
+                            DUMPCANDIDATES(candidates);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    if (candidates == 0)
+        return PyString_FromString("utf-8");
+    else if (!(candidates & (candidates-1))) /* only one encoding remaining */
+    {
+        if ((candidates == CANDIDATE_UTF_8_SIG) && (origlen >= 3))
+            return PyString_FromString("utf-8-sig");
+        else if ((candidates == CANDIDATE_UTF_16_AS_LE) && (origlen >= 2))
+            return PyString_FromString("utf-16");
+        else if ((candidates == CANDIDATE_UTF_16_AS_BE) && (origlen >= 2))
+            return PyString_FromString("utf-16");
+        else if ((candidates == CANDIDATE_UTF_16_LE) && (origlen >= 4))
+            return PyString_FromString("utf-16-le");
+        else if ((candidates == CANDIDATE_UTF_16_BE) && (origlen >= 2))
+            return PyString_FromString("utf-16-be");
+        else if ((candidates == CANDIDATE_UTF_32_AS_LE) && (origlen >= 4))
+            return PyString_FromString("utf-32");
+        else if ((candidates == CANDIDATE_UTF_32_AS_BE) && (origlen >= 4))
+            return PyString_FromString("utf-32");
+        else if ((candidates == CANDIDATE_UTF_32_LE) && (origlen >= 4))
+            return PyString_FromString("utf-32-le");
+        else if ((candidates == CANDIDATE_UTF_32_BE) && (origlen >= 4))
+            return PyString_FromString("utf-32-be");
+        else if ((candidates == CANDIDATE_DECL) && (origlen >= 6))
+        {
+            const char *encodingstart;
+            const char *encodingend;
+
+            switch (parse_xml_encoding_str(str, strend, &encodingstart, &encodingend))
+            {
+                case -1:
+                    return NULL;
+                case 0: /* don't know yet */
+                    /* Same as in detect_xml_encoding_unicode(): on the last
+                       call we won't get better data, so default to utf-8
+                       instead of handing None to the caller */
+                    if (final)
+                        return PyString_FromString("utf-8");
+                    Py_RETURN_NONE;
+                case 1: /* not found => default to utf-8 */
+                    return PyString_FromString("utf-8");
+                case 2: /* found it */
+                    return PyString_FromStringAndSize(encodingstart, encodingend-encodingstart);
+            }
+        }
+    }
+    if (final) /* if this is the last call, and we haven't determined an encoding yet, we default to UTF-8 */
+        return PyString_FromString("utf-8");
+    /* We don't know yet */
+    Py_RETURN_NONE;
+}
+
+/* Determines the encoding named in the XML declaration at the start of the
+   unicode buffer str (of length len). Returns a new unicode object with the
+   encoding name, u"utf-8" if there is no declaration/encoding (or if final is
+   true and the declaration is incomplete), None if more data is needed, or
+   NULL if the parser reported an error. */
+static PyObject *detect_xml_encoding_unicode(const Py_UNICODE *str, Py_ssize_t len, int final)
+{
+    const Py_UNICODE *encodingstart;
+    const Py_UNICODE *encodingend;
+    int result = parse_xml_declaration_unicode(str, str+len, &encodingstart, &encodingend);
+
+    if (result == -1)
+        return NULL;
+    if (result == 2) /* found it => return the encoding name */
+        return PyUnicode_FromUnicode(encodingstart, encodingend-encodingstart);
+    if (result == 0 && !final) /* don't know yet, but more data may follow */
+        Py_RETURN_NONE;
+    /* not found, or we won't get better data => default to UTF-8 */
+    return PyUnicode_DecodeASCII("utf-8", 5, NULL);
+}
+
+/* Python-level entry point detect_xml_encoding(obj[, final]): dispatches to
+   the str or the unicode implementation depending on the type of obj and
+   raises TypeError for any other type. */
+static PyObject *detect_xml_encoding(PyObject *self, PyObject *args)
+{
+    PyObject *obj;
+    int final = 0;
+
+    if (!PyArg_ParseTuple(args, "O|i:detect_xml_encoding", &obj, &final))
+        return NULL;
+
+    if (PyString_Check(obj))
+        return detect_xml_encoding_str(PyString_AS_STRING(obj), PyString_GET_SIZE(obj), final);
+
+    if (PyUnicode_Check(obj))
+        return detect_xml_encoding_unicode(PyUnicode_AS_UNICODE(obj), PyUnicode_GET_SIZE(obj), final);
+
+    PyErr_SetString(PyExc_TypeError, "expected str or unicode");
+    return NULL;
+}
+
+/* Docstring for detect_xml_encoding(); referenced from the module's method table. */
+static char detect_xml_encoding__doc__[] =
+"detect_xml_encoding(str[, final=False]) -> str or None\n\
+\n\
+Tries to detect the XML encoding from the first few bytes of the string\n\
+or the encoding declaration in the XML header. Return the name of the\n\
+encoding or None, if the encoding is ambiguous.";
+
+/* Python-level entry point fix_xml_encoding(unicode, encoding[, final]):
+   replaces the encoding name in the XML declaration at the start of the
+   unicode string with encoding. Returns a new unicode object with the
+   replacement applied, the original string if there is no declaration or no
+   encoding in it (or if final is true and the declaration is incomplete),
+   None if more data is needed, or NULL on error. */
+PyObject *fix_xml_encoding(PyObject *self, PyObject *args)
+{
+    PyObject *strobj;
+    const Py_UNICODE *strstart;
+    const Py_UNICODE *strend;
+    int final = 0;
+    const Py_UNICODE *enc;
+    Py_ssize_t enclen;
+    const Py_UNICODE *encodingstart;
+    const Py_UNICODE *encodingend;
+
+    /* The name after the ':' is what shows up in argument error messages,
+       so it has to be the name the function is registered under */
+    if (!PyArg_ParseTuple(args, "O!u#|i:fix_xml_encoding", &PyUnicode_Type, &strobj, &enc, &enclen, &final))
+        return NULL;
+
+    strstart = PyUnicode_AS_UNICODE(strobj);
+    strend = strstart + PyUnicode_GET_SIZE(strobj);
+    switch (parse_xml_declaration_unicode(strstart, strend, &encodingstart, &encodingend))
+    {
+        case -1:
+            return NULL;
+        case 0: /* don't know yet */
+            if (final) /* we won't get better data, so use what we have */
+                goto original;
+            Py_RETURN_NONE;
+        case 1: /* not found => return original string */
+            goto original;
+        case 2: /* found it */
+        {
+            /* yes => put the encoding name into this spot and return the new string */
+            /* new length = text before the old name + new name + text after the old name */
+            PyObject *newobj = PyUnicode_FromUnicode(NULL, (encodingstart-strstart) + enclen + (strend - encodingend));
+            Py_UNICODE *new;
+            if (!newobj)
+                return NULL;
+            new = PyUnicode_AS_UNICODE(newobj);
+            /* copies length items and evaluates to the position after the copied data */
+            #define Py_UNICODE_STPCOPY(target, source, length) (Py_UNICODE_COPY(target, source, length), (target)+(length))
+            new = Py_UNICODE_STPCOPY(new, strstart, encodingstart-strstart);
+            new = Py_UNICODE_STPCOPY(new, enc, enclen);
+            (void) Py_UNICODE_STPCOPY(new, encodingend, strend - encodingend);
+            return newobj;
+        }
+    }
+    Py_RETURN_NONE;
+    original:
+    Py_INCREF(strobj);
+    return strobj;
+}
+
+/* Docstring for fix_xml_encoding(); referenced from the module's method table. */
+static char fix_xml_encoding__doc__[] =
+"fix_xml_encoding(unicode, encoding[, final=False]) -> unicode or None\n\
+\n\
+Replaces the encoding specification in the XML declaration at the start of the\n\
+first argument with the encoding specified. If there's no XML declaration the\n\
+original string is returned. If the string isn't long enough to find an\n\
+encoding None is returned, unless final is true, in which case the original\n\
+string is returned.";
+
+
/* --- Module API --------------------------------------------------------- */
static PyMethodDef _codecs_functions[] = {
@@ -1129,6 +1567,10 @@
register_error__doc__},
{"lookup_error", lookup_error, METH_VARARGS,
lookup_error__doc__},
+ {"detect_xml_encoding", detect_xml_encoding, METH_VARARGS,
+ detect_xml_encoding__doc__},
+ {"fix_xml_encoding", fix_xml_encoding, METH_VARARGS,
+ fix_xml_encoding__doc__},
{NULL, NULL} /* sentinel */
};