Index: Doc/library/codecs.rst
===================================================================
--- Doc/library/codecs.rst	(Revision 58878)
+++ Doc/library/codecs.rst	(Arbeitskopie)
@@ -1135,6 +1135,8 @@
 | uu_codec           | uu                        | byte string    | Convert the operand using |
 |                    |                           |                | uuencode                  |
 +--------------------+---------------------------+----------------+---------------------------+
+| xml                |                           | Unicode string | An XML meta codec         |
++--------------------+---------------------------+----------------+---------------------------+
 | zlib_codec         | zip, zlib                 | byte string    | Compress the operand      |
 |                    |                           |                | using gzip                |
 +--------------------+---------------------------+----------------+---------------------------+
@@ -1218,3 +1220,39 @@
 is only done once (on the first write to the byte stream).  For decoding an
 optional UTF-8 encoded BOM at the start of the data will be skipped.
 
+
+:mod:`encodings.xml` --- XML meta codec
+---------------------------------------
+
+.. module:: encodings.xml
+   :synopsis: XML meta codec
+.. moduleauthor:: Walter Dörwald
+
+.. versionadded:: 2.6
+
+This module implements a codec that can be used for encoding and decoding XML.
+Once the encoding has been determined the decoding/encoding process falls back
+to using the codec for that particular encoding to do the rest of the work, so
+this XML codec supports all encodings supported by Python itself.
+
+On decoding the XML codec determines the encoding by either inspecting the
+first few bytes of the byte stream/string or by extracting the encoding from
+the XML declaration. If the encoding can't be determined from the first few
+bytes and there is no XML declaration the codec falls back to using UTF-8.
+When the encoding is specified by an external source (e.g. a Content-Type
+header in an HTTP response), this encoding can be passed as an argument to the
+codec, which will then bypass encoding detection. If the input contains an XML
+declaration, the declaration passed on to the application will name the
+externally specified encoding instead of the original one.
+
+On encoding the XML codec extracts the encoding from the XML declaration and
+will encode the output in that encoding. If there's no XML declaration UTF-8
+will be used. It's possible to pass an external encoding to the encoder too.
+The encoder will then encode the output in that encoding and put the correct
+encoding into the XML declaration (if there is one).
+
+
+.. seealso::
+
+   http://www.w3.org/TR/2004/REC-xml-20040204/#sec-guessing
+      Autodetection of Character Encodings in XML
Index: Lib/encodings/xml.py
===================================================================
--- Lib/encodings/xml.py	(Revision 0)
+++ Lib/encodings/xml.py	(Revision 0)
@@ -0,0 +1,198 @@
+# -*- coding: iso-8859-1 -*-
+"""
+Python 'xml' Codec
+"""
+
+
+import codecs
+
+
+def decode(input, errors="strict", encoding=None):
+    if encoding is None:
+        encoding = codecs.detect_xml_encoding(input, True)
+    if encoding == "xml":
+        raise ValueError("xml not allowed as encoding name")
+    (input, consumed) = codecs.getdecoder(encoding)(input, errors)
+    return (codecs.fix_xml_encoding(input, unicode(encoding), True), consumed)
+
+
+def encode(input, errors="strict", encoding=None):
+    consumed = len(input)
+    if encoding is None:
+        encoding = codecs.detect_xml_encoding(input, True)
+    else:
+        input = codecs.fix_xml_encoding(input, unicode(encoding), True)
+    if encoding == "xml":
+        raise ValueError("xml not allowed as encoding name")
+    info = codecs.lookup(encoding)
+    return (info.encode(input, errors)[0], consumed)
+
+
+class IncrementalDecoder(codecs.IncrementalDecoder):
+    def __init__(self, errors="strict", encoding=None):
+        self.decoder = None
+        self.encoding = self._encoding = encoding # ``_encoding`` keeps the caller's choice; ``encoding`` may later hold a detected one
+        codecs.IncrementalDecoder.__init__(self, errors)
+        self._errors = errors # Store ``errors`` somewhere else, because we have to hide it in a property
+        self.buffer = ""
+        self.headerfixed = False
+
+    def iterdecode(self, input):
+        for part in input:
+            result = self.decode(part, False)
+            if result:
+                yield result
+        result = self.decode("", True)
+        if result:
+            yield result
+
+    def decode(self, input, final=False):
+        # Like a ``BufferedIncrementalDecoder``, but the buffer is only relevant until the
+        # encoding is known (afterwards the underlying codec may buffer on its own), so we
+        # buffer ourselves. An encoding passed to ``__init__`` bypasses the detection.
+        if self.decoder is None:
+            input = self.buffer + input
+            self.encoding = self._encoding if self._encoding is not None else codecs.detect_xml_encoding(input, final)
+            if self.encoding is None:
+                self.buffer = input # retry the complete input on the next call
+                return u"" # no encoding determined yet, so no output
+            if self.encoding == "xml":
+                raise ValueError("xml not allowed as encoding name")
+            self.buffer = "" # isn't needed any more, as the decoder might keep its own buffer
+            self.decoder = codecs.getincrementaldecoder(self.encoding)(self._errors)
+        if self.headerfixed:
+            return self.decoder.decode(input, final)
+        # If we haven't fixed the header yet, the content of ``self.buffer`` is a ``unicode`` object
+        output = self.buffer + self.decoder.decode(input, final)
+        newoutput = codecs.fix_xml_encoding(output, unicode(self.encoding), final)
+        if newoutput is None:
+            self.buffer = output # retry fixing the declaration (but keep the decoded stuff)
+            return u""
+        self.headerfixed = True
+        return newoutput
+
+    def reset(self):
+        codecs.IncrementalDecoder.reset(self)
+        self.decoder = None
+        self.encoding = self._encoding # forget a detected encoding, but keep an explicitly requested one
+        self.buffer = ""
+        self.headerfixed = False
+
+    def _geterrors(self):
+        return self._errors
+
+    def _seterrors(self, errors):
+        # Setting ``errors`` must be done on the real decoder too
+        if self.decoder is not None:
+            self.decoder.errors = errors
+        self._errors = errors
+    errors = property(_geterrors, _seterrors)
+
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+    def __init__(self, errors="strict", encoding=None):
+        self.encoder = None
+        self.encoding = encoding
+        codecs.IncrementalEncoder.__init__(self, errors)
+        self._errors = errors # Store ``errors`` somewhere else, because we have to hide it in a property
+        self.buffer = u""
+
+    def iterencode(self, input):
+        for part in input:
+            result = self.encode(part, False)
+            if result:
+                yield result
+        result = self.encode(u"", True)
+        if result:
+            yield result
+
+    def encode(self, input, final=False):
+        if self.encoder is None:
+            input = self.buffer + input
+            if self.encoding is not None:
+                # Replace encoding in the declaration with the specified one
+                newinput = codecs.fix_xml_encoding(input, unicode(self.encoding), final)
+                if newinput is None: # declaration not complete => Retry next time
+                    self.buffer = input
+                    return ""
+                input = newinput
+            else:
+                # Use encoding from the XML declaration
+                self.encoding = codecs.detect_xml_encoding(input, final)
+            if self.encoding is not None:
+                if self.encoding == "xml":
+                    raise ValueError("xml not allowed as encoding name")
+                info = codecs.lookup(self.encoding)
+                self.encoder = info.incrementalencoder(self._errors)
+                self.buffer = u""
+            else:
+                self.buffer = input
+                return ""
+        return self.encoder.encode(input, final)
+
+    def reset(self):
+        codecs.IncrementalEncoder.reset(self)
+        self.encoder = None
+        self.buffer = u""
+
+    def _geterrors(self):
+        return self._errors
+
+    def _seterrors(self, errors):
+        # Setting ``errors`` must be done on the real encoder too
+        if self.encoder is not None:
+            self.encoder.errors = errors
+        self._errors = errors
+    errors = property(_geterrors, _seterrors)
+
+
+class StreamWriter(codecs.StreamWriter):
+    def __init__(self, stream, errors="strict", encoding="utf-8", header=False):
+        codecs.StreamWriter.__init__(self, stream, errors)
+        self.encoder = IncrementalEncoder(errors)
+        self._errors = errors
+
+    def encode(self, input, errors='strict'):
+        return (self.encoder.encode(input, False), len(input))
+
+    def _geterrors(self):
+        return self._errors
+
+    def _seterrors(self, errors):
+        # Setting ``errors`` must be done on the encoder too
+        if self.encoder is not None:
+            self.encoder.errors = errors
+        self._errors = errors
+    errors = property(_geterrors, _seterrors)
+
+
+class StreamReader(codecs.StreamReader):
+    def __init__(self, stream, errors="strict"):
+        codecs.StreamReader.__init__(self, stream, errors)
+        self.decoder = IncrementalDecoder(errors)
+        self._errors = errors
+
+    def decode(self, input, errors='strict'):
+        return (self.decoder.decode(input, False), len(input))
+
+    def _geterrors(self):
+        return self._errors
+
+    def _seterrors(self, errors):
+        # Setting ``errors`` must be done on the decoder too
+        if self.decoder is not None:
+            self.decoder.errors = errors
+        self._errors = errors
+    errors = property(_geterrors, _seterrors)
+
+
+def getregentry():
+    return codecs.CodecInfo(
+        name="xml",
+        encode=encode,
+        decode=decode,
+        incrementalencoder=IncrementalEncoder,
+        incrementaldecoder=IncrementalDecoder,
+        streamwriter=StreamWriter,
+        streamreader=StreamReader,
+    )
Index: Lib/test/test_codecs.py
===================================================================
--- Lib/test/test_codecs.py	(Revision 58878)
+++ Lib/test/test_codecs.py	(Arbeitskopie)
@@ -1406,7 +1406,216 @@
                                        info.streamwriter, 'strict') as srw:
             self.assertEquals(srw.read(), u"\xfc")
 
+class XMLCodecTest(unittest.TestCase):
 
+    def test_detectencoding_str(self):
+        self.assert_(codecs.detect_xml_encoding("") is None)
+        self.assert_(codecs.detect_xml_encoding("\xef") is None)
+        self.assertEqual(codecs.detect_xml_encoding("\xef\x33"), "utf-8")
+        self.assert_(codecs.detect_xml_encoding("\xef\xbb") is None)
+        self.assertEqual(codecs.detect_xml_encoding("\xef\xbb\x33"), "utf-8")
+        self.assertEqual(codecs.detect_xml_encoding("\xef\xbb\xbf"), "utf-8-sig")
+        self.assert_(codecs.detect_xml_encoding("\xff") is None)
+        self.assertEqual(codecs.detect_xml_encoding("\xff\x33"), "utf-8")
+        self.assert_(codecs.detect_xml_encoding("\xff\xfe") is None)
+        self.assertEqual(codecs.detect_xml_encoding("\xff\xfe\x33"), "utf-16")
+        self.assert_(codecs.detect_xml_encoding("\xff\xfe\x00") is None)
+        self.assertEqual(codecs.detect_xml_encoding("\xff\xfe\x00\x33"), "utf-16")
+        self.assertEqual(codecs.detect_xml_encoding("\xff\xfe\x00\x00"), "utf-32")
+        self.assert_(codecs.detect_xml_encoding("\x00") is None)
+        self.assertEqual(codecs.detect_xml_encoding("\x00\x33"), "utf-8")
+        self.assert_(codecs.detect_xml_encoding("\x00\x00") is None)
+        self.assertEqual(codecs.detect_xml_encoding("\x00\x00\x33"), "utf-8")
+        self.assert_(codecs.detect_xml_encoding("\x00\x00\xfe") is None)
+        self.assertEqual(codecs.detect_xml_encoding("\x00\x00\x00\x33"), "utf-8")
+        self.assertEqual(codecs.detect_xml_encoding("\x00\x00\x00<"), "utf-32-be")
+        self.assertEqual(codecs.detect_xml_encoding("\x00\x00\xfe\xff"), "utf-32")
+        self.assert_(codecs.detect_xml_encoding("<") is None)
+        self.assertEqual(codecs.detect_xml_encoding("<\x33"), "utf-8")
+        self.assert_(codecs.detect_xml_encoding("<\x00") is None)
+        self.assertEqual(codecs.detect_xml_encoding("<\x00\x33"), "utf-8")
+        self.assert_(codecs.detect_xml_encoding("<\x00\x00") is None)
+        self.assertEqual(codecs.detect_xml_encoding("<\x00\x00\x33"), "utf-8")
+        self.assertEqual(codecs.detect_xml_encoding("<\x00\x00\x00"), "utf-32-le")
+        self.assert_(codecs.detect_xml_encoding("<?") is None)
+        self.assert_(codecs.detect_xml_encoding("<?x") is None)
+        self.assert_(codecs.detect_xml_encoding("<?xm") is None)
+        self.assert_(codecs.detect_xml_encoding("<?xml") is None)
+        self.assert_(codecs.detect_xml_encoding("<?xml\r") is None)
+        self.assert_(codecs.detect_xml_encoding("<?xml\rversion='1.0'") is None)
+        self.assert_(codecs.detect_xml_encoding("<?xml\rversion='1.0' encoding='x") is None)
+        self.assertEqual(codecs.detect_xml_encoding("<?xml\rversion='1.0' encoding='x'"), "x")
+        self.assertEqual(codecs.detect_xml_encoding('<?xml\rversion="1.0" encoding="x"'), "x")
+        self.assertEqual(codecs.detect_xml_encoding('<?xml \r\n\t \r\n\t \r\n\tversion \r\n\t \r\n\t= \r\n\t \r\n\t"1.0" \r\n\t \r\n\t \r\n\tencoding \r\n\t \r\n\t= \r\n\t \r\n\t"x"'), "x")
+        self.assertEqual(codecs.detect_xml_encoding("<?xml\rversion='1.0' ?>"), "utf-8")
+        self.assert_(codecs.detect_xml_encoding("<?xml\rversion='1.0' Encoding='x'") is None) # encoding not recognized (might come later)
+        self.assert_(codecs.detect_xml_encoding("<?xml\rVersion='1.0'") is None)
+        self.assertRaises(ValueError, codecs.detect_xml_encoding, "<?xml\rversion='1.0' encoding=''") # empty encoding
+        self.assert_(codecs.detect_xml_encoding("<", False) is None)
+        self.assertEqual(codecs.detect_xml_encoding("<", True), "utf-8")
+        self.assert_(codecs.detect_xml_encoding("<?", False) is None)
+        self.assertEqual(codecs.detect_xml_encoding("<?", True), "utf-8")
+
+    def test_detectencoding_unicode(self):
+        # Unicode version (only parses the header)
+        self.assert_(codecs.detect_xml_encoding(u'<?xml \r\n\t \r\n\t \r\n\tversion \r\n\t \r\n\t= \r\n\t \r\n\t"1.0" \r\n\t \r\n\t \r\n\tencoding \r\n\t \r\n\t= \r\n\t \r\n\t"x') is None)
+        self.assertEqual(codecs.detect_xml_encoding(u'<?xml \r\n\t \r\n\t \r\n\tversion \r\n\t \r\n\t= \r\n\t \r\n\t"1.0" \r\n\t \r\n\t \r\n\tencoding \r\n\t \r\n\t= \r\n\t \r\n\t"x', True), "utf-8")
+        self.assertEqual(codecs.detect_xml_encoding(u'<?xml \r\n\t \r\n\t \r\n\tversion \r\n\t \r\n\t= \r\n\t \r\n\t"1.0" \r\n\t \r\n\t \r\n\tencoding \r\n\t \r\n\t= \r\n\t \r\n\t"x"'), "x")
+
+    def test_fixencoding(self):
+        s = u'<?xml \r\n\t \r\n\t \r\n\tversion \r\n\t \r\n\t= \r\n\t \r\n\t"1.0" \r\n\t \r\n\t \r\n\tencoding \r\n\t \r\n\t= \r\n\t \r\n\t"x'
+        self.assert_(codecs.fix_xml_encoding(s, u"utf-8") is None)
+
+        s = u'<?xml \r\n\t \r\n\t \r\n\tversion \r\n\t \r\n\t= \r\n\t \r\n\t"1.0" \r\n\t \r\n\t \r\n\tencoding \r\n\t \r\n\t= \r\n\t \r\n\t"x'
+        self.assertEqual(codecs.fix_xml_encoding(s, u"utf-8", True), s)
+
+        s = u'<?xml \r\n\t \r\n\t \r\n\tversion \r\n\t \r\n\t= \r\n\t \r\n\t"1.0" \r\n\t \r\n\t \r\n\tencoding \r\n\t \r\n\t= \r\n\t \r\n\t"x"'
+        self.assertEqual(codecs.fix_xml_encoding(s, u"utf-8"), s.replace('"x"', '"utf-8"'))
+
+    def check_partial(self, decoder, input, *parts):
+        self.assertEqual(len(input), len(parts))
+        for (c, part) in zip(input, parts):
+            self.assertEqual(decoder.decode(c), part)
+
+    def test_partial(self):
+        decoder = codecs.getincrementaldecoder("xml")()
+
+        # UTF-16
+        self.check_partial(decoder, u"\ufeff".encode("utf-16-be"), u"", u"")
+        decoder.reset()
+
+        self.check_partial(decoder, u"\ufeff".encode("utf-16-le"), u"", u"")
+        decoder.reset()
+
+        result = (u"", u"", u"", u"\u1234", u"", u"a")
+
+        self.check_partial(decoder, u"\u1234a".encode("utf-16"), *result)
+        decoder.reset()
+
+        # Fake utf-16 stored big endian
+        self.check_partial(decoder, u"\ufeff\u1234a".encode("utf-16-be"), *result)
+        decoder.reset()
+
+        # Fake utf-16 stored little endian
+        self.check_partial(decoder, u"\ufeff\u1234a".encode("utf-16-le"), *result)
+        decoder.reset()
+
+        # UTF-32
+        result = (u"", u"", u"", u"", u"", u"", u"", u"\u1234", u"", u"", u"", u"a")
+        self.check_partial(decoder, u"\u1234a".encode("utf-32"), *result)
+        decoder.reset()
+
+        # Fake utf-32 stored big endian
+        self.check_partial(decoder, u"\ufeff\u1234a".encode("utf-32-be"), *result)
+        decoder.reset()
+
+        # Fake utf-32 stored little endian
+        self.check_partial(decoder, u"\ufeff\u1234a".encode("utf-32-le"), *result)
+        decoder.reset()
+
+        # UTF-8-Sig
+        self.check_partial(decoder, u"\u1234a".encode("utf-8-sig"), u"", u"", u"", u"", u"", u"\u1234", u"a")
+        decoder.reset()
+
+    def test_decoder(self):
+        def checkauto(encoding, input=u"<?xml encoding='x'?>g\xfcrk\u20ac"):
+            # Check stateless decoder
+            d = codecs.getdecoder("xml")
+            self.assertEqual(
+                d(input.encode(encoding))[0],
+                input.replace("'x'", repr(encoding))
+            )
+
+            # Check stateless decoder with specified encoding
+            self.assertEqual(
+                d(input.encode(encoding), encoding=encoding)[0],
+                input.replace("'x'", repr(encoding))
+            )
+
+            # Check incremental decoder
+            id = codecs.getincrementaldecoder("xml")()
+            self.assertEqual(
+                "".join(id.iterdecode(input.encode(encoding))),
+                input.replace("'x'", repr(encoding))
+            )
+
+            # Check incremental decoder with specified encoding (by keyword: the first positional argument is ``errors``)
+            id = codecs.getincrementaldecoder("xml")(encoding=encoding)
+            self.assertEqual(
+                "".join(id.iterdecode(input.encode(encoding))),
+                input.replace("'x'", repr(encoding))
+            )
+
+        # Autodetectable encodings
+        checkauto("utf-8-sig")
+        checkauto("utf-16")
+        checkauto("utf-16-le")
+        checkauto("utf-16-be")
+        checkauto("utf-32")
+        checkauto("utf-32-le")
+        checkauto("utf-32-be")
+
+        def checkdecl(encoding, input=u"<?xml encoding=%r?><g\xfcrk>\u20ac</g\xfcrk>"):
+            # Check stateless decoder with encoding autodetection
+            d = codecs.getdecoder("xml")
+            input = input % encoding
+            self.assertEqual(d(input.encode(encoding))[0], input)
+
+            # Check stateless decoder with specified encoding
+            self.assertEqual(d(input.encode(encoding), encoding=encoding)[0], input)
+
+            # Check incremental decoder with encoding autodetection
+            id = codecs.getincrementaldecoder("xml")()
+            self.assertEqual("".join(id.iterdecode(input.encode(encoding))), input)
+
+            # Check incremental decoder with specified encoding (by keyword: the first positional argument is ``errors``)
+            id = codecs.getincrementaldecoder("xml")(encoding=encoding)
+            self.assertEqual("".join(id.iterdecode(input.encode(encoding))), input)
+
+        # Use correct declaration
+        checkdecl("utf-8")
+        checkdecl("iso-8859-1", u"<?xml encoding=%r?><g\xfcrk/>")
+        checkdecl("iso-8859-15")
+        checkdecl("cp1252")
+
+        # No recursion
+        self.assertRaises(ValueError, "<?xml encoding='xml'?><gurk/>".decode, "xml")
+
+    def test_encoder(self):
+        def check(encoding, input=u"<?xml encoding='x'?>g\xfcrk\u20ac"):
+            # Check stateless encoder with encoding autodetection
+            e = codecs.getencoder("xml")
+            inputdecl = input.replace("'x'", repr(encoding))
+            self.assertEqual(e(inputdecl)[0].decode(encoding), inputdecl)
+
+            # Check stateless encoder with specified encoding
+            self.assertEqual(e(input, encoding=encoding)[0].decode(encoding), inputdecl)
+
+            # Check incremental encoder with encoding autodetection
+            ie = codecs.getincrementalencoder("xml")()
+            self.assertEqual("".join(ie.iterencode(inputdecl)).decode(encoding), inputdecl)
+
+            # Check incremental encoder with specified encoding
+            ie = codecs.getincrementalencoder("xml")(encoding=encoding)
+            self.assertEqual("".join(ie.iterencode(input)).decode(encoding), inputdecl)
+
+        # Autodetectable encodings
+        check("utf-8-sig")
+        check("utf-16")
+        check("utf-16-le")
+        check("utf-16-be")
+        check("utf-32")
+        check("utf-32-le")
+        check("utf-32-be")
+        check("utf-8")
+        check("iso-8859-1", u"<?xml encoding='x'?><g\xfcrk/>")
+        check("iso-8859-15")
+        check("cp1252")
+
+        # No recursion
+        self.assertRaises(ValueError, u"<?xml encoding='xml'?><gurk/>".encode, "xml")
+
+
 def test_main():
     test_support.run_unittest(
         UTF32Test,
@@ -1435,6 +1644,7 @@
         BasicStrTest,
         CharmapTest,
         WithStmtTest,
+        XMLCodecTest
     )
 
 
Index: Modules/_codecs_functions.c
===================================================================
--- Modules/_codecs_functions.c	(Revision 0)
+++ Modules/_codecs_functions.c	(Revision 0)
@@ -0,0 +1,115 @@
+/* ------------------------------------------------------------------------
+
+   _codecs_functions -- bits shared between the 8-bit and unicode implementations
+                        of functions in the codecs module.
+
+   ------------------------------------------------------------------------ */
+
+/* Parses a pseudoattr. Returns 2 if a complete pseudoattr (name and quoted
+   value) has been found, 1 if we're at the end of the declaration, 0 if we
+   didn't have enough data and -1 on error (with an exception set).
+   The name is put into namestart/nameend, the value into valuestart/valueend. */
+static int STRINGLIB_PARSEPSEUDOATTR(
+    const STRINGLIB_CHAR *s, const STRINGLIB_CHAR *end,
+    const STRINGLIB_CHAR **namestart, const STRINGLIB_CHAR **nameend,
+    const STRINGLIB_CHAR **valuestart, const STRINGLIB_CHAR **valueend)
+{
+    STRINGLIB_CHAR quote;
+
+    /* goto beginning of next word */
+    while (s<end && (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n'))
+        ++s;
+
+    if (s == end) /* don't know yet */
+        return 0;
+
+    if (s[0] == '?')
+    {
+        /* "?>" ends the declaration; a trailing lone '?' may still become "?>" */
+        if (s+1 == end || s[1] == '>')
+            return (s+1 == end) ? 0 : 1;
+    }
+
+    *namestart = s;
+    while (s<end && STRINGLIB_ISALPHA(*s))
+        ++s;
+    if (s == end) /* don't know yet */
+        return 0;
+    *nameend = s;
+
+    if (*namestart == *nameend)
+    {
+        PyErr_SetString(PyExc_ValueError,
+            "malformed XML declaration: empty or malformed pseudoattr name");
+        return -1;
+    }
+
+    while (s<end && (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n'))
+        ++s;
+
+    if (s==end) /* don't know yet */
+        return 0;
+
+    if (*s++ != '=')
+    {
+        PyErr_SetString(PyExc_ValueError, "malformed XML declaration: expected '='");
+        return -1;
+    }
+
+    while (s<end && (*s == ' ' || *s == '\t' || *s == '\r' || *s == '\n'))
+        ++s;
+    if (s == end) /* don't know yet */
+        return 0;
+
+    quote = *s;
+
+    if (quote != '"' && quote != '\'')
+    {
+        PyErr_SetString(PyExc_ValueError, "malformed XML declaration: expected quote");
+        return -1;
+    }
+
+    *valuestart = ++s;
+    while (s < end && *s != quote)
+        ++s;
+    if (s == end) /* don't know yet */
+        return 0;
+    *valueend = s;
+
+    if (*valuestart == *valueend)
+    {
+        PyErr_SetString(PyExc_ValueError, "malformed XML declaration: empty pseudoattr value");
+        return -1;
+    }
+
+    return 2; /* found one */
+}
+
+
+/* finds the pseudo attribute encoding and returns the position in
+   encodingstart/encodingend.
+   Return values are the same as for parsepseudoattr()
+*/
+static int STRINGLIB_PARSEENCODING(
+    const STRINGLIB_CHAR *str, const STRINGLIB_CHAR *strend,
+    const STRINGLIB_CHAR **encodingstart, const STRINGLIB_CHAR **encodingend)
+{
+    while (1)
+    {
+        const STRINGLIB_CHAR *namestart;
+        const STRINGLIB_CHAR *nameend;
+
+        int result = STRINGLIB_PARSEPSEUDOATTR(str, strend, &namestart, &nameend, encodingstart, encodingend);
+
+        switch (result)
+        {
+            default:
+                return result;
+            case 2: /* found one, now check if it's "encoding"  */
+                if ((nameend-namestart == 8) && !STRINGLIB_CMP2CHAR(namestart, "encoding", 8))
+                    return 2;
+                /* not "encoding" => continue */
+                str = *encodingend+1;
+        }
+    }
+}
Index: Modules/_codecsmodule.c
===================================================================
--- Modules/_codecsmodule.c	(Revision 58878)
+++ Modules/_codecsmodule.c	(Arbeitskopie)
@@ -1073,6 +1073,444 @@
     return PyCodec_LookupError(name);
 }
 
+/* --- Functions for XML codecs ------------------------------------------- */
+
+static int cmpu2s(const Py_UNICODE *u, const char *s, Py_ssize_t len)
+{
+    while (len)
+    {
+        if (*u != *s)
+            return *u - *s;
+        ++u;
+        ++s;
+        --len;
+    }
+    return 0;
+}
+
+/* define unicode version of parsepseudoattr/parseencoding */
+#define STRINGLIB_PARSEPSEUDOATTR   parse_xml_pseudoattr_unicode
+#define STRINGLIB_PARSEENCODING     parse_xml_encoding_unicode
+#define STRINGLIB_CHAR              Py_UNICODE
+#define STRINGLIB_ISALPHA(c)        Py_UNICODE_ISALPHA(c)
+#define STRINGLIB_CMP2CHAR(u, s, l) cmpu2s(u, s, l)
+
+#include "_codecs_functions.c"
+
+#undef STRINGLIB_PARSEPSEUDOATTR
+#undef STRINGLIB_PARSEENCODING
+#undef STRINGLIB_CHAR
+#undef STRINGLIB_ISALPHA
+#undef STRINGLIB_CMP2CHAR
+
+/* define str version of parsepseudoattr/parseencoding */
+#define STRINGLIB_PARSEPSEUDOATTR   parse_xml_pseudoattr_str
+#define STRINGLIB_PARSEENCODING     parse_xml_encoding_str
+#define STRINGLIB_CHAR              char
+#define STRINGLIB_ISALPHA(c)        (((c)>='a' && (c)<='z') || ((c)>='A' && (c)<='Z'))
+#define STRINGLIB_CMP2CHAR(u, s, l) strncmp(u, s, l)
+
+#include "_codecs_functions.c"
+
+#undef STRINGLIB_PARSEPSEUDOATTR
+#undef STRINGLIB_PARSEENCODING
+#undef STRINGLIB_CHAR
+#undef STRINGLIB_ISALPHA
+#undef STRINGLIB_CMP2CHAR
+
+/* Parses a unicode XML declaration and returns the position of the encoding in
+   encodingstart/encodingend. Return values are the same as for parseencoding(). */
+int parse_xml_declaration_unicode(const Py_UNICODE *str, const Py_UNICODE *strend, const Py_UNICODE **encodingstart, const Py_UNICODE **encodingend)
+{
+    Py_ssize_t strlen = strend - str;
+
+    if (strlen>0)
+    {
+        if (*str++ != '<')
+            return 1;
+        if (strlen>1)
+        {
+            if (*str++ != '?')
+                return 1;
+            if (strlen>2)
+            {
+                if (*str++ != 'x')
+                    return 1;
+                if (strlen>3)
+                {
+                    if (*str++ != 'm')
+                        return 1;
+                    if (strlen>4)
+                    {
+                        if (*str++ != 'l')
+                            return 1;
+                        if (strlen>5)
+                        {
+                            if (*str != ' ' && *str != '\t' && *str != '\r' && *str != '\n')
+                                return 1;
+                            return parse_xml_encoding_unicode(++str, strend, encodingstart, encodingend);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+/* We're using bits to store all possible candidate encodings (or variants, i.e.
+ * we have two bits for the variants of UTF-16 and two for the
+ * variants of UTF-32).
+ *
+ * Prefixes for various XML encodings
+ * (see http://www.w3.org/TR/2004/REC-xml-20040204/#sec-guessing)
+ * UTF-8-SIG   xEF  xBB  xBF
+ * UTF-16 (LE) xFF  xFE ~x00|~x00
+ * UTF-16 (BE) xFE  xFF
+ * UTF-16-LE    <   x00   ?   x00
+ * UTF-16-BE   x00   <
+ * UTF-32 (LE) xFF  xFE  x00  x00
+ * UTF-32 (BE) x00  x00  xFE  xFF
+ * UTF-32-LE    <   x00  x00  x00
+ * UTF-32-BE   x00  x00  x00   <
+ * XML-DECL     <    ?    x    m    l
+*/
+
+#define CANDIDATE_UTF_8_SIG    (1<<0)
+#define CANDIDATE_UTF_16_AS_LE (1<<1)
+#define CANDIDATE_UTF_16_AS_BE (1<<2)
+#define CANDIDATE_UTF_16_LE    (1<<3)
+#define CANDIDATE_UTF_16_BE    (1<<4)
+#define CANDIDATE_UTF_32_AS_LE (1<<5)
+#define CANDIDATE_UTF_32_AS_BE (1<<6)
+#define CANDIDATE_UTF_32_LE    (1<<7)
+#define CANDIDATE_UTF_32_BE    (1<<8)
+#define CANDIDATE_DECL         (1<<9)
+#define CANDIDATES             ((CANDIDATE_DECL<<1)-1) /* All bits */
+
+#if 0
+/* for debugging output */
+void DUMPCANDIDATES(int candidates)
+{
+    if (candidates&CANDIDATE_UTF_8_SIG)
+        printf("u8s ");
+    else
+        printf("--- ");
+    if (candidates&CANDIDATE_UTF_16_AS_LE)
+        printf("u16(le) ");
+    else
+        printf("------- ");
+    if (candidates&CANDIDATE_UTF_16_AS_BE)
+        printf("u16(be) ");
+    else
+        printf("------- ");
+    if (candidates&CANDIDATE_UTF_16_LE)
+        printf("u16le ");
+    else
+        printf("----- ");
+    if (candidates&CANDIDATE_UTF_16_BE)
+        printf("u16be ");
+    else
+        printf("----- ");
+    if (candidates&CANDIDATE_UTF_32_AS_LE)
+        printf("u32(le) ");
+    else
+        printf("------- ");
+    if (candidates&CANDIDATE_UTF_32_AS_BE)
+        printf("u32(be) ");
+    else
+        printf("------- ");
+    if (candidates&CANDIDATE_UTF_32_LE)
+        printf("u32le ");
+    else
+        printf("----- ");
+    if (candidates&CANDIDATE_UTF_32_BE)
+        printf("u32be ");
+    else
+        printf("----- ");
+    if (candidates&CANDIDATE_DECL)
+        printf("decl\n");
+    else
+        printf("----\n");
+}
+
+/* for debugging output */
+void DUMPBYTE(char c)
+{
+    printf("-> %02x\n", (int)(unsigned char)c);
+}
+#else
+#define DUMPCANDIDATES(x)
+#define DUMPBYTE(x)
+#endif
+
+static PyObject *detect_xml_encoding_str(const char *str, Py_ssize_t len, int final)
+{
+    const char *origstr; /* NOTE(review): assigned below but never read in this function */
+    Py_ssize_t origlen; /* total input length, used by the minimum-length checks below */
+    int candidates = CANDIDATES; /* all 10 encodings are still possible */
+    const char *strend; /* one past the last input byte */
+    char firstbytes[4]; /* copy of the first (up to) four input bytes */
+
+    origlen = len;
+    origstr = str;
+    strend = str + len;
+
+    /* Guess the encoding of XML byte data: returns the encoding name as a string
+     * object, None if more input is needed (only when final is false) or NULL on error.
+     * For each byte in the input delete the appropriate bit if the encoding has the
+     * wrong value in this spot. If no bits remain we default to UTF-8. If only one
+     * bit remains (and we had enough input) this is the resulting encoding. */
+    DUMPCANDIDATES(candidates);
+    if (len)
+    {
+        /* Check first byte */
+        firstbytes[0] = *str;
+        DUMPBYTE(*str);
+        if (firstbytes[0] != '\xef') /* UTF-8 signature is EF BB BF */
+            candidates &= ~CANDIDATE_UTF_8_SIG;
+        if (firstbytes[0] != '\xff') /* little-endian BOMs start with FF FE */
+            candidates &= ~CANDIDATE_UTF_32_AS_LE&
+                          ~CANDIDATE_UTF_16_AS_LE;
+        if (firstbytes[0] != '\xfe') /* UTF-16 big-endian BOM is FE FF */
+            candidates &= ~CANDIDATE_UTF_16_AS_BE;
+        if (firstbytes[0] != '<') /* BOM-less little-endian data or a byte-oriented "<?xml" */
+            candidates &= ~CANDIDATE_UTF_32_LE&
+                          ~CANDIDATE_UTF_16_LE&
+                          ~CANDIDATE_DECL;
+        if (firstbytes[0] != '\x00') /* big-endian forms start with a zero byte */
+            candidates &= ~CANDIDATE_UTF_32_AS_BE&
+                          ~CANDIDATE_UTF_32_BE&
+                          ~CANDIDATE_UTF_16_BE;
+        DUMPCANDIDATES(candidates);
+        if (++str, --len) /* advance; continue only if another byte is available */
+        {
+            /* Check second byte */
+            firstbytes[1] = *str;
+            DUMPBYTE(*str);
+            if (firstbytes[1] != '\xbb')
+                candidates &= ~CANDIDATE_UTF_8_SIG;
+            if (firstbytes[1] != '\xfe')
+                candidates &= ~CANDIDATE_UTF_16_AS_LE&
+                              ~CANDIDATE_UTF_32_AS_LE;
+            if (firstbytes[1] != '\xff')
+                candidates &= ~CANDIDATE_UTF_16_AS_BE;
+            if (firstbytes[1] != '\x00')
+                candidates &= ~CANDIDATE_UTF_16_LE&
+                              ~CANDIDATE_UTF_32_AS_BE&
+                              ~CANDIDATE_UTF_32_LE&
+                              ~CANDIDATE_UTF_32_BE;
+            if (firstbytes[1] != '<') /* BOM-less UTF-16 big-endian starts with 00 '<' */
+                candidates &= ~CANDIDATE_UTF_16_BE;
+            if (firstbytes[1] != '?')
+                candidates &= ~CANDIDATE_DECL;
+            DUMPCANDIDATES(candidates);
+            if (++str, --len)
+            {
+                /* Check third byte */
+                firstbytes[2] = *str;
+                DUMPBYTE(*str);
+                if (firstbytes[2] != '\xbf')
+                    candidates &= ~CANDIDATE_UTF_8_SIG;
+                if (firstbytes[2] != '?')
+                    candidates &= ~CANDIDATE_UTF_16_LE;
+                if (firstbytes[2] != '\x00')
+                    candidates &= ~CANDIDATE_UTF_32_AS_LE&
+                                  ~CANDIDATE_UTF_32_LE&
+                                  ~CANDIDATE_UTF_32_BE;
+                if (firstbytes[2] != '\xfe') /* UTF-32 big-endian BOM is 00 00 FE FF */
+                    candidates &= ~CANDIDATE_UTF_32_AS_BE;
+                if (firstbytes[2] != 'x')
+                    candidates &= ~CANDIDATE_DECL;
+                DUMPCANDIDATES(candidates);
+                if (++str, --len)
+                {
+                    /* Check fourth byte */
+                    firstbytes[3] = *str;
+                    DUMPBYTE(*str);
+                    if (firstbytes[3] == '\x00' && firstbytes[2] == '\x00') /* FF FE 00 00 is the UTF-32 BOM */
+                        candidates &= ~CANDIDATE_UTF_16_AS_LE;
+                    if (firstbytes[3] != '\x00')
+                        candidates &= ~CANDIDATE_UTF_16_LE&
+                                      ~CANDIDATE_UTF_32_AS_LE&
+                                      ~CANDIDATE_UTF_32_LE;
+                    if (firstbytes[3] != '\xff')
+                        candidates &= ~CANDIDATE_UTF_32_AS_BE;
+                    if (firstbytes[3] != '<') /* BOM-less UTF-32 big-endian is 00 00 00 '<' */
+                        candidates &= ~CANDIDATE_UTF_32_BE;
+                    if (firstbytes[3] != 'm')
+                        candidates &= ~CANDIDATE_DECL;
+                    DUMPCANDIDATES(candidates);
+                    if (++str, --len)
+                    {
+                        /* Check fifth byte */
+                        DUMPBYTE(*str);
+                        if (*str != 'l')
+                            candidates &= ~CANDIDATE_DECL;
+                        DUMPCANDIDATES(candidates);
+                        if (++str, --len)
+                        {
+                            /* Check sixth byte: "<?xml" must be followed by whitespace */
+                            DUMPBYTE(*str);
+                            if (*str != ' ' && *str != '\t' && *str != '\r' && *str != '\n')
+                                candidates &= ~CANDIDATE_DECL;
+                            DUMPCANDIDATES(candidates);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    if (candidates == 0)
+        return PyString_FromString("utf-8");
+    else if (!(candidates & (candidates-1))) /* only one encoding remaining */
+    {
+        /* each candidate is only accepted once enough bytes were seen to confirm it */
+        if ((candidates == CANDIDATE_UTF_8_SIG) && (origlen >= 3))
+            return PyString_FromString("utf-8-sig");
+        else if ((candidates == CANDIDATE_UTF_16_AS_LE) && (origlen >= 2))
+            return PyString_FromString("utf-16");
+        else if ((candidates == CANDIDATE_UTF_16_AS_BE) && (origlen >= 2))
+            return PyString_FromString("utf-16");
+        else if ((candidates == CANDIDATE_UTF_16_LE) && (origlen >= 4))
+            return PyString_FromString("utf-16-le");
+        else if ((candidates == CANDIDATE_UTF_16_BE) && (origlen >= 2))
+            return PyString_FromString("utf-16-be");
+        else if ((candidates == CANDIDATE_UTF_32_AS_LE) && (origlen >= 4))
+            return PyString_FromString("utf-32");
+        else if ((candidates == CANDIDATE_UTF_32_AS_BE) && (origlen >= 4))
+            return PyString_FromString("utf-32");
+        else if ((candidates == CANDIDATE_UTF_32_LE) && (origlen >= 4))
+            return PyString_FromString("utf-32-le");
+        else if ((candidates == CANDIDATE_UTF_32_BE) && (origlen >= 4))
+            return PyString_FromString("utf-32-be");
+        else if ((candidates == CANDIDATE_DECL) && (origlen >= 6))
+        {
+            const char *encodingstart;
+            const char *encodingend;
+            /* str now points at the sixth byte (the whitespace after "<?xml"); NOTE(review): confirm the parser expects that and not origstr */
+            switch (parse_xml_encoding_str(str, strend, &encodingstart, &encodingend))
+            {
+                case -1:
+                    return NULL;
+                case 0: /* don't know yet => let the "final" check below decide between utf-8 and None */
+                    break;
+                case 1: /* not found => default to utf-8 */
+                    return PyString_FromString("utf-8");
+                case 2: /* found it  */
+                    return PyString_FromStringAndSize(encodingstart, encodingend-encodingstart);
+            }
+        }
+    }
+    if (final) /* if this is the last call, and we haven't determined an encoding yet, we default to UTF-8 */
+        return PyString_FromString("utf-8");
+    /* We don't know yet */
+    Py_RETURN_NONE;
+}
+
+static PyObject *detect_xml_encoding_unicode(const Py_UNICODE *str, Py_ssize_t len, int final)
+{
+    /* Look for an encoding name in the XML declaration of a unicode string */
+    const Py_UNICODE *namestart;
+    const Py_UNICODE *nameend;
+    int status = parse_xml_declaration_unicode(str, str+len, &namestart, &nameend);
+
+    if (status == -1) /* the parser reported an error */
+        return NULL;
+    if (status == 2) /* found it => return the encoding name as a new string */
+        return PyUnicode_FromUnicode(namestart, nameend-namestart);
+    if (status == 0 && !final) /* don't know yet, but more data may follow */
+    {
+        Py_RETURN_NONE;
+    }
+
+    /* Default to UTF-8 in all remaining cases: no encoding was found (1),
+     * or the result is still undecided (0) although we won't get better
+     * data because this is the final call. */
+    return PyUnicode_DecodeASCII("utf-8", 5, NULL);
+}
+
+static PyObject *detect_xml_encoding(PyObject *self, PyObject *args)
+{
+    PyObject *input;
+    int final = 0;
+
+    if (!PyArg_ParseTuple(args, "O|i:detect_xml_encoding", &input, &final))
+        return NULL;
+
+    /* Dispatch on the argument type; each helper gets the raw buffer,
+     * its length and the "final" flag. */
+    if (PyString_Check(input))
+        return detect_xml_encoding_str(PyString_AS_STRING(input), PyString_GET_SIZE(input), final);
+    if (PyUnicode_Check(input))
+        return detect_xml_encoding_unicode(PyUnicode_AS_UNICODE(input), PyUnicode_GET_SIZE(input), final);
+    /* neither a byte string nor a unicode string: report a TypeError */
+    PyErr_SetString(PyExc_TypeError, "expected str or unicode");
+    return NULL;
+}
+
+static char detect_xml_encoding__doc__[] = /* doc text registered with "detect_xml_encoding" in _codecs_functions */
+"detect_xml_encoding(str[, final=False]) -> str or None\n\
+\n\
+Tries to detect the XML encoding from the first few bytes of the string\n\
+or the encoding declaration in the XML header. Return the name of the\n\
+encoding or None, if the encoding is ambiguous.";
+
+/* Replace the encoding name in the XML declaration at the start of a unicode string.
+ * Returns the new string, the original string, None (more input needed) or NULL (error). */
+static PyObject *fix_xml_encoding(PyObject *self, PyObject *args)
+{
+    PyObject *strobj;
+    const Py_UNICODE *strstart, *strend; /* bounds of the input string */
+    const Py_UNICODE *enc; /* replacement encoding name, enclen characters long */
+    Py_ssize_t enclen;
+    const Py_UNICODE *encodingstart, *encodingend; /* old encoding name inside the declaration */
+    int final = 0; /* nonzero: no more input will follow */
+
+    if (!PyArg_ParseTuple(args, "O!u#|i:fix_xml_encoding", &PyUnicode_Type, &strobj, &enc, &enclen, &final))
+        return NULL;
+
+    strstart = PyUnicode_AS_UNICODE(strobj);
+    strend = strstart + PyUnicode_GET_SIZE(strobj);
+    switch (parse_xml_declaration_unicode(strstart, strend, &encodingstart, &encodingend))
+    {
+        case -1:
+            return NULL;
+        case 0: /* don't know yet */
+            if (final) /* we won't get better data, so use what we have */
+                goto original;
+            Py_RETURN_NONE;
+        case 1: /* not found => return original string */
+            goto original;
+        case 2: /* found it => build prefix + new encoding name + suffix */
+        {
+            const Py_ssize_t prefixlen = encodingstart - strstart;
+            const Py_ssize_t suffixlen = strend - encodingend;
+            PyObject *newobj = PyUnicode_FromUnicode(NULL, prefixlen + enclen + suffixlen);
+            Py_UNICODE *target;
+            if (!newobj)
+                return NULL;
+            target = PyUnicode_AS_UNICODE(newobj);
+            Py_UNICODE_COPY(target, strstart, prefixlen);
+            Py_UNICODE_COPY(target + prefixlen, enc, enclen);
+            Py_UNICODE_COPY(target + prefixlen + enclen, encodingend, suffixlen);
+            return newobj;
+        }
+    }
+    Py_RETURN_NONE; /* only reached for a parser result other than -1, 0, 1 or 2 */
+    original:
+    Py_INCREF(strobj);
+    return strobj;
+}
+
+static char fix_xml_encoding__doc__[] =
+"fix_xml_encoding(unicode, encoding[, final=False]) -> unicode or None\n\
+\n\
+Replaces the encoding specification in the XML declaration at the start of the\n\
+first argument with the encoding specified. If there's no XML declaration the\n\
+original string is returned. If the string isn't long enough to find an\n\
+encoding None is returned (or the original string, if final is true).";
+
+
 /* --- Module API --------------------------------------------------------- */
 
 static PyMethodDef _codecs_functions[] = {
@@ -1129,6 +1567,10 @@
         register_error__doc__},
     {"lookup_error", 		lookup_error,			METH_VARARGS,
         lookup_error__doc__},
+    {"detect_xml_encoding", 	detect_xml_encoding,		METH_VARARGS,
+        detect_xml_encoding__doc__},
+    {"fix_xml_encoding", 	fix_xml_encoding,		METH_VARARGS,
+        fix_xml_encoding__doc__},
     {NULL, NULL}		/* sentinel */
 };