diff --git a/Lib/test/test_codeccallbacks.py b/Lib/test/test_codeccallbacks.py
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -76,6 +76,20 @@
             b"&#12473;&#12497;&#12514; \xe4nd eggs"
         )
 
+    def test_test_xmlcharrefreplace_with_surrogates(self):
+        # A non-BMP character must come out as one numeric character
+        # reference, whether it is written directly or as a surrogate
+        # pair; a lone surrogate is emitted under its own code.
+        strings = ['\U0001f49d', '\ud83d\udc9d', '\ud83d', '\udc9d']
+        expected = [b'&#128157;', b'&#128157;', b'&#55357;', b'&#56477;']
+        for encoding in ['ascii', 'latin1']:
+            for s, exp in zip(strings, expected):
+                self.assertEqual(s.encode(encoding, 'xmlcharrefreplace'),
+                                 exp)
+                # An encodable character after the replaced one is kept.
+                self.assertEqual((s+'X').encode(encoding, 'xmlcharrefreplace'),
+                                 exp+b'X')
+
     def test_xmlcharnamereplace(self):
         # This time use a named character entity for unencodable
         # characters, if one is available.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -4707,7 +4707,21 @@
                 }
                 /* generate replacement (temporarily (mis)uses p) */
                 for (p = collstart; p < collend; ++p) {
-                    str += sprintf(str, "&#%d;", (int)*p);
+                    /* A high surrogate followed, before collend, by a low
+                       surrogate is joined into one code point and written
+                       as a single character reference. */
+                    if ((0xD800 <= *p && *p <= 0xDBFF) &&
+                        (p+1 < collend) &&
+                        (0xDC00 <= *(p+1) && *(p+1) <= 0xDFFF)) {
+                        Py_UNICODE fst = *p;
+                        /* ++p also consumes the low surrogate */
+                        Py_UCS4 ch = (((((Py_UCS4)fst & 0x03FF) << 10) |
+                                       ((Py_UCS4)(*(++p)) & 0x03FF)) + 0x10000);
+                        str += sprintf(str, "&#%d;", (int)ch);
+                    }
+                    else {
+                        str += sprintf(str, "&#%d;", (int)*p);
+                    }
                 }
                 p = collend;
                 break;