# HG changeset patch
# Parent 0c914a2be764f54217cd93a089fbef862ad4da05

diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -337,14 +337,15 @@
 |                         | in :pep:`383`.                                |
 +-------------------------+-----------------------------------------------+
 
-In addition, the following error handlers are specific to a single codec:
+In addition, the following error handler is specific to the Unicode codecs listed:
 
-+-------------------+---------+-------------------------------------------+
-| Value             | Codec   | Meaning                                   |
-+===================+=========+===========================================+
-|``'surrogatepass'``| utf-8   | Allow encoding and decoding of surrogate  |
-|                   |         | codes in UTF-8.                           |
-+-------------------+---------+-------------------------------------------+
++-------------------+------------------------+-------------------------------------------+
+| Value             | Codecs                 | Meaning                                   |
++===================+========================+===========================================+
+|``'surrogatepass'``| utf-8, utf-16, utf-32, | Allow encoding and decoding of surrogate  |
+|                   | utf-16-be, utf-16-le,  | codes in all the Unicode encoding schemes.|
+|                   | utf-32-be, utf-32-le   |                                           |
++-------------------+------------------------+-------------------------------------------+
 
 .. versionadded:: 3.1
    The ``'surrogateescape'`` and ``'surrogatepass'`` error handlers.
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -304,13 +304,27 @@
                          "[]".encode(self.encoding))
         self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                          "[?]".encode(self.encoding))
-        if (hasattr(self,"ill_formed_sequence")):
-            test_string = "A"
-            bom = "".encode(self.encoding)
-            well_formed_sequence = test_string.encode(self.encoding)[len(bom):]
-            test_sequence = bom + self.ill_formed_sequence + well_formed_sequence
+
+        bom = "".encode(self.encoding)
+        for before, after in [("\U00010fff", "A"), ("[", "]"),
+                              ("A", "\U00010fff")]:
+            before_sequence = before.encode(self.encoding)[len(bom):]
+            after_sequence = after.encode(self.encoding)[len(bom):]
+            test_string = before + "\uDC80" + after
+            test_sequence = (bom + before_sequence + 
+                             self.ill_formed_sequence + after_sequence)
             self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                               self.encoding)
+            self.assertEqual(test_string.encode(self.encoding,
+                                                "surrogatepass"),
+                             test_sequence)
+            self.assertEqual(test_string, 
+                             test_sequence.decode(self.encoding, 
+                                                  "surrogatepass"))
+            self.assertEqual(before + after,
+                             test_sequence.decode(self.encoding, "ignore"))
+            self.assertEqual(before + "\ufffd" + after,
+                             test_sequence.decode(self.encoding, "replace"))
 
 class UTF32Test(CommonUTFTest):
     encoding = "utf-32"
@@ -407,6 +421,8 @@
 class UTF32LETest(CommonUTFTest):
     encoding = "utf-32-le"
 
+    ill_formed_sequence = b"\x80\xdc\x00\x00"
+
     def test_partial(self):
         self.check_partial(
             "\x00\xff\u0100\uffff",
@@ -447,6 +463,8 @@
 class UTF32BETest(CommonUTFTest):
     encoding = "utf-32-be"
 
+    ill_formed_sequence = b"\x00\x00\xdc\x80"
+
     def test_partial(self):
         self.check_partial(
             "\x00\xff\u0100\uffff",
@@ -569,6 +587,8 @@
 class UTF16LETest(CommonUTFTest):
     encoding = "utf-16-le"
 
+    ill_formed_sequence = b"\x80\xdc"
+
     def test_partial(self):
         self.check_partial(
             "\x00\xff\u0100\uffff",
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5093,6 +5093,7 @@
     const unsigned char *q, *e;
     int bo = 0;       /* assume native ordering by default */
     const char *errmsg = "";
+    const char *encoding = "utf32le";
     /* Offsets from q for retrieving bytes in the right order. */
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
     int iorder[] = {0, 1, 2, 3};
@@ -5194,9 +5195,11 @@
         q += 4;
         continue;
       utf32Error:
+        if (bo == 1)
+            encoding = "utf32be";
         if (unicode_decode_call_errorhandler(
                 errors, &errorHandler,
-                "utf32", errmsg,
+                encoding, errmsg,
                 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                 &unicode, &outpos))
             goto onError;
@@ -5300,9 +5303,22 @@
         else {
             Py_ssize_t newpos;
             Py_ssize_t repsize, k, morebytes;
+            const char* encoding;
+            switch (byteorder){
+            case 0:
+                encoding = "utf-32";
+                break;
+            case -1:
+                encoding = "utf-32-le";
+                break;
+            case 1:
+                encoding = "utf-32-be";
+                break;
+            }
             rep = unicode_encode_call_errorhandler(
-                  errors, &errorHandler, "utf-32", "surrogates not allowed",
+                  errors, &errorHandler, encoding, "surrogates not allowed",
                   str, &exc, i-1, i, &newpos);
+
             if (!rep)
                 goto error;
 
@@ -5437,6 +5453,7 @@
     int bo = 0;       /* assume native ordering by default */
     int native_ordering = 0;
     const char *errmsg = "";
+    const char *encoding = "utf16le";;
     /* Offsets from q for retrieving byte pairs in the right order. */
 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
     int ihi = 1, ilo = 0;
@@ -5578,6 +5595,7 @@
             errmsg = "unexpected end of data";
             startinpos = (((const char *)q) - 2) - starts;
             endinpos = ((const char *)e) + 1 - starts;
+
             goto utf16Error;
         }
         if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
@@ -5600,13 +5618,19 @@
         errmsg = "illegal encoding";
         startinpos = (((const char *)q)-2)-starts;
         endinpos = startinpos+2;
+
         /* Fall through to report the error */
 
       utf16Error:
+        if (ilo)
+            encoding = "utf16be";
+        /* e points at the last input byte rather than at the end ('\0'), but
+           the error handler call below expects a pointer to the end. */
+        e += 1;
         if (unicode_decode_call_errorhandler(
                 errors,
                 &errorHandler,
-                "utf16", errmsg,
+                encoding, errmsg,
                 &starts,
                 (const char **)&e,
                 &startinpos,
@@ -5616,6 +5640,8 @@
                 &unicode,
                 &outpos))
             goto onError;
+        /* Restore e: here it always points one byte before the end ('\0') */
+        e -= 1;        
     }
     /* remaining byte at the end? (size should be even) */
     if (e == q) {
@@ -5753,8 +5779,20 @@
         else {
             Py_ssize_t newpos;
             Py_ssize_t repsize, k, morebytes;
+            const char* encoding;
+            switch (byteorder){
+            case 0:
+                encoding = "utf-16";
+                break;
+            case -1:
+                encoding = "utf-16-le";
+                break;
+            case 1:
+                encoding = "utf-16-be";
+                break;
+            }
             rep = unicode_encode_call_errorhandler(
-                  errors, &errorHandler, "utf-16", "surrogates not allowed",
+                  errors, &errorHandler, encoding, "surrogates not allowed",
                   str, &exc, i-1, i, &newpos);
             if (!rep)
                 goto error;
diff --git a/Python/codecs.c b/Python/codecs.c
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -731,6 +731,14 @@
     }
 }
 
+/* Endianness switches; defaults to little endian */
+
+#ifdef WORDS_BIGENDIAN
+# define BYTEORDER_IS_BIG_ENDIAN
+#else
+# define BYTEORDER_IS_LITTLE_ENDIAN
+#endif
+
 /* This handler is declared static until someone demonstrates
    a need to call it directly. */
 static PyObject *
@@ -738,24 +746,67 @@
 {
     PyObject *restuple;
     PyObject *object;
+    PyObject *encode;
+    char *encoding;
+    int bytelength = 3;
+    int le = -1;
     Py_ssize_t i;
     Py_ssize_t start;
     Py_ssize_t end;
     PyObject *res;
     if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
-        char *outp;
+        unsigned char *outp;
         if (PyUnicodeEncodeError_GetStart(exc, &start))
             return NULL;
         if (PyUnicodeEncodeError_GetEnd(exc, &end))
             return NULL;
         if (!(object = PyUnicodeEncodeError_GetObject(exc)))
             return NULL;
-        res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
+        if (!(encode = PyUnicodeEncodeError_GetEncoding(exc))) {
+            Py_DECREF(object);
+            return NULL;
+        }
+        if (!(encoding = PyUnicode_AsUTF8(encode))) {
+            Py_DECREF(object);
+            Py_DECREF(encode);
+            return NULL;
+        }
+        Py_DECREF(encode);
+
+        if (strcmp(encoding, "utf-8") == 0){
+            /*no need to check others*/
+        }
+        else if (strcmp(encoding, "utf-16") == 0) {
+            bytelength = 2;
+#ifdef BYTEORDER_IS_BIG_ENDIAN
+            le = 0;
+#endif
+        }
+        else if (strcmp(encoding, "utf-16-le") == 0) 
+            bytelength = 2;
+        else if (strcmp(encoding, "utf-16-be") == 0) {
+            bytelength = 2;
+            le = 0;
+        }
+        else if (strcmp(encoding, "utf-32") == 0) {
+            bytelength = 4;
+#ifdef BYTEORDER_IS_BIG_ENDIAN
+            le = 0;
+#endif
+        }
+        else if (strcmp(encoding, "utf-32-le") == 0)
+            bytelength = 4;
+        else if (strcmp(encoding, "utf-32-be") == 0) {
+            bytelength = 4;
+            le = 0;
+        }
+        res = PyBytes_FromStringAndSize(NULL, bytelength*(end-start));
+
         if (!res) {
             Py_DECREF(object);
             return NULL;
         }
-        outp = PyBytes_AsString(res);
+        outp = (unsigned char*)PyBytes_AsString(res);
         for (i = start; i < end; i++) {
             /* object is guaranteed to be "ready" */
             Py_UCS4 ch = PyUnicode_READ_CHAR(object, i);
@@ -766,9 +817,37 @@
                 Py_DECREF(object);
                 return NULL;
             }
-            *outp++ = (char)(0xe0 | (ch >> 12));
-            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
-            *outp++ = (char)(0x80 | (ch & 0x3f));
+            switch (bytelength) {
+            case 3:
+                *outp++ = (unsigned char)(0xe0 | (ch >> 12));
+                *outp++ = (unsigned char)(0x80 | ((ch >> 6) & 0x3f));
+                *outp++ = (unsigned char)(0x80 | (ch & 0x3f));
+                break;
+            case 2:
+                if (le) {
+                    *outp++ = (unsigned char) ch;
+                    *outp++ = (unsigned char)(ch >> 8);
+                } 
+                else {
+                    *outp++ = (unsigned char)(ch >> 8);
+                    *outp++ = (unsigned char) ch;
+                }
+                break;
+            case 4:
+                if (le) {
+                    *outp++ = (unsigned char) ch;
+                    *outp++ = (unsigned char)(ch >> 8);
+                    *outp++ = (unsigned char)(ch >> 16);
+                    *outp++ = (unsigned char)(ch >> 24);
+                }
+                else {
+                    *outp++ = (unsigned char)(ch >> 24);
+                    *outp++ = (unsigned char)(ch >> 16);
+                    *outp++ = (unsigned char)(ch >> 8);
+                    *outp++ = (unsigned char) ch;
+                }
+                break;
+            }
         }
         restuple = Py_BuildValue("(On)", res, end);
         Py_DECREF(res);
@@ -780,24 +859,77 @@
         Py_UCS4 ch = 0;
         if (PyUnicodeDecodeError_GetStart(exc, &start))
             return NULL;
+        if (PyUnicodeDecodeError_GetEnd(exc, &end))
+            return NULL;
         if (!(object = PyUnicodeDecodeError_GetObject(exc)))
             return NULL;
         if (!(p = (unsigned char*)PyBytes_AsString(object))) {
             Py_DECREF(object);
             return NULL;
         }
+        if (!(encode = PyUnicodeDecodeError_GetEncoding(exc))) {
+            Py_DECREF(object);
+            return NULL;
+        }
+        if (!(encoding = PyUnicode_AsUTF8(encode))) {
+            Py_DECREF(object);
+            Py_DECREF(encode);
+            return NULL;
+        }
+        Py_DECREF(encode);
+
+        if (strcmp(encoding, "utf8") == 0){
+            /*no need to check others*/
+        }
+        else if (strcmp(encoding, "utf16le") == 0) 
+            bytelength = 2;
+        else if (strcmp(encoding, "utf16be") == 0) {
+            bytelength = 2;
+            le = 0;
+        }
+        else if (strcmp(encoding, "utf32le") == 0)
+            bytelength = 4;
+        else if (strcmp(encoding, "utf32be") == 0) {
+            bytelength = 4;
+            le = 0;
+        }
+
         /* Try decoding a single surrogate character. If
            there are more, let the codec call us again. */
         p += start;
-        if ((p[0] & 0xf0) == 0xe0 ||
-            (p[1] & 0xc0) == 0x80 ||
-            (p[2] & 0xc0) == 0x80) {
-            /* it's a three-byte code */
-            ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
-            if (ch < 0xd800 || ch > 0xdfff)
-                /* it's not a surrogate - fail */
+
+        switch (bytelength) {
+        case 3:
+            if ((p[0] & 0xf0) == 0xe0 ||
+                (p[1] & 0xc0) == 0x80 ||
+                (p[2] & 0xc0) == 0x80) {
+                /* it's a three-byte code */
+                ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
+                if (ch < 0xd800 || ch > 0xdfff)
+                    /* it's not a surrogate - fail */
+                    ch = 0;
+            }
+            break;
+        case 2:
+            if (end - start != 2)
+                break;
+            if (le)
+                ch = p[1] << 8 | p[0];
+            else
+                ch = p[0] << 8 | p[1];
+            break;
+        case 4:
+            if (end - start != 4)
+                break;
+            if (le)
+                ch = (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0];
+            else
+                ch = (p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
+            if (ch > 0x10ffff)
                 ch = 0;
+            break;
         }
+            
         Py_DECREF(object);
         if (ch == 0) {
             PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
@@ -806,7 +938,7 @@
         res = PyUnicode_FromOrdinal(ch);
         if (res == NULL)
             return NULL;
-        return Py_BuildValue("(Nn)", res, start+3);
+        return Py_BuildValue("(Nn)", res, start + bytelength);
     }
     else {
         wrong_exception_type(exc);