diff -r 889023da7454 Lib/codecs.py
--- a/Lib/codecs.py	Mon Mar 16 08:31:38 2015 +0200
+++ b/Lib/codecs.py	Mon Mar 16 08:55:36 2015 +0200
@@ -1076,6 +1076,127 @@ def make_encoding_map(decoding_map):
             m[v] = None
     return m
 
+_surrogates_re = None
+
+def convert_surrogates(data, errors):
+    """Replace lone surrogates in *data* using the error handler *errors*.
+
+    Each run of surrogate code points (U+D800-U+DFFF) is reported to the
+    handler returned by lookup_error(errors) as a UnicodeTranslateError.
+    The handler returns the replacement string and the position in *data*
+    at which scanning resumes.  *data* is returned unchanged when it
+    contains no surrogates.
+    """
+    handler = None
+    global _surrogates_re
+    if _surrogates_re is None:
+        import re
+        # Surrogates end at U+DFFF; U+E000 starts the Private Use Area.
+        _surrogates_re = re.compile('[\ud800-\udfff]+')
+    pos = 0
+    res = []
+    while True:
+        m = _surrogates_re.search(data, pos)
+        if m:
+            if handler is None:
+                # Look the handler up lazily so clean input never needs it.
+                handler = lookup_error(errors)
+            res.append(data[pos: m.start()])
+            repl, pos = handler(UnicodeTranslateError(data, m.start(), m.end(),
+                                                      'lone surrogates'))
+            res.append(repl)
+        elif pos:
+            res.append(data[pos:])
+            return ''.join(res)
+        else:
+            return data
+
+def convert_surrogateescape(data, errors):
+    """Apply the decoding error handler *errors* to escaped bytes in *data*.
+
+    Each run of surrogates is encoded back to bytes with the 'ascii' codec
+    and the 'surrogateescape' handler, and those bytes are reported to the
+    handler returned by lookup_error(errors) as a UnicodeDecodeError.  The
+    position the handler returns is taken as an offset into that run.
+
+    Raises UnicodeTranslateError, with offsets relative to *data*, when a
+    run contains a surrogate outside U+DC80-U+DCFF or when the handler
+    raises UnicodeDecodeError.  *data* is returned unchanged when it
+    contains no surrogates.
+    """
+    handler = None
+    global _surrogates_re
+    if _surrogates_re is None:
+        import re
+        # Surrogates end at U+DFFF; U+E000 starts the Private Use Area.
+        _surrogates_re = re.compile('[\ud800-\udfff]+')
+    pos = 0
+    res = []
+    while True:
+        m = _surrogates_re.search(data, pos)
+        if m:
+            if handler is None:
+                handler = lookup_error(errors)
+            start = m.start()
+            res.append(data[pos: start])
+            try:
+                baddata = data[start: m.end()].encode('ascii', 'surrogateescape')
+            except UnicodeEncodeError as err:
+                raise UnicodeTranslateError(data,
+                        err.start + start, err.end + start,
+                        r'surrogates not in range \udc80-\udcff') from None
+            try:
+                repl, pos = handler(UnicodeDecodeError('unicode', baddata,
+                                                       0, len(baddata),
+                                                       'lone surrogates'))
+            except UnicodeDecodeError as err:
+                raise UnicodeTranslateError(data,
+                        err.start + start,
+                        err.end + start,
+                        err.reason) from None
+            # The handler's position is an offset into baddata; rebase it.
+            pos += start
+            res.append(repl)
+        elif pos:
+            res.append(data[pos:])
+            return ''.join(res)
+        else:
+            return data
+
+_astral_re = None
+
+def convert_astrals(data, errors):
+    """Replace non-BMP characters in *data* using the error handler *errors*.
+
+    Each run of characters above U+FFFF is reported to the handler returned
+    by lookup_error(errors) as a UnicodeTranslateError.  The handler returns
+    the replacement string and the position in *data* at which scanning
+    resumes.  *data* is returned unchanged when it contains no such
+    characters.
+    """
+    handler = None
+    global _astral_re
+    if _astral_re is None:
+        import re
+        _astral_re = re.compile(r'[^\u0000-\uffff]+')
+    pos = 0
+    res = []
+    while True:
+        m = _astral_re.search(data, pos)
+        if m:
+            if handler is None:
+                handler = lookup_error(errors)
+            res.append(data[pos: m.start()])
+            repl, pos = handler(UnicodeTranslateError(data, m.start(), m.end(),
+                                                      'astral characters'))
+            res.append(repl)
+        elif pos:
+            res.append(data[pos:])
+            return ''.join(res)
+        else:
+            return data
+
+
 ### error handlers
 
 try:
diff -r 889023da7454 Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py	Mon Mar 16 08:31:38 2015 +0200
+++ b/Lib/test/test_codecs.py	Mon Mar 16 08:55:36 2015 +0200
@@ -1721,6 +1721,91 @@ class CodecsModuleTest(unittest.TestCase
             self.assertRaises(UnicodeError,
                 codecs.decode, b'abc', 'undefined', errors)
 
+    def test_convert_surrogates(self):
+        self.assertRaises(TypeError, codecs.convert_surrogates)
+        self.assertRaises(TypeError, codecs.convert_surrogates,
+                          'abc', 'strict', '')
+        with self.assertRaises(UnicodeTranslateError):
+            codecs.convert_surrogates('a\ud800b', 'strict')
+        # Private Use Area characters are not surrogates.
+        self.assertEqual(codecs.convert_surrogates('a\ue000b', 'strict'),
+                         'a\ue000b')
+        tests = [
+            ('ignore', ('', '')),
+            ('replace', ('\ufffd','\ufffd')),
+            ('backslashreplace', ('\\ud800', '\\udfff')),
+            # ('namereplace', ('\\ud800', '\\udfff')),
+            # ('xmlcharrefreplace', ('&#55296;', '&#57343;')),
+            # ('surrogatepass', ('\ud800', '\udfff')),
+        ]
+        for (error, args) in tests:
+            for tmpl in ('a{}b', 'a{}b{}', 'a{}{}c', 'a{}b{}c'):
+                data = tmpl.format('\ud800', '\udfff')
+                expected = tmpl.format(*args)
+                with self.subTest(error=error, data=data):
+                    self.assertEqual(codecs.convert_surrogates(data, error),
+                                     expected)
+
+    def test_convert_surrogateescape(self):
+        self.assertRaises(TypeError, codecs.convert_surrogateescape)
+        self.assertRaises(TypeError, codecs.convert_surrogateescape,
+                          'abc', 'strict', '')
+        with self.assertRaises(UnicodeTranslateError):
+            codecs.convert_surrogateescape('a\udc80b', 'strict')
+        with self.assertRaises(TypeError):
+            codecs.convert_surrogateescape('a\udc80b', 'namereplace')
+        with self.assertRaises(TypeError):
+            codecs.convert_surrogateescape('a\udc80b', 'xmlcharrefreplace')
+        with self.assertRaises(UnicodeTranslateError):
+            codecs.convert_surrogateescape('a\udc80b', 'surrogatepass')
+        tests = [
+            ('ignore', ('', '')),
+            ('replace', ('\ufffd','\ufffd')),
+            ('backslashreplace', ('\\x80','\\xff')),
+            ('surrogateescape', ('\udc80','\udcff')),
+        ]
+        for (error, args) in tests:
+            for tmpl in ('a{}b', 'a{}b{}', 'a{}{}c', 'a{}b{}c'):
+                data = tmpl.format('\udc80', '\udcff')
+                expected = tmpl.format(*args)
+                if error == 'replace':
+                    expected = expected.replace('\ufffd\ufffd', '\ufffd')
+                with self.subTest(error=error, data=data):
+                    self.assertEqual(codecs.convert_surrogateescape(data, error),
+                                     expected)
+        for error in ('strict', 'ignore', 'replace',
+                      'backslashreplace', 'namereplace', 'xmlcharrefreplace',
+                      'surrogatepass', 'surrogateescape'):
+            with self.assertRaises(UnicodeTranslateError):
+                codecs.convert_surrogateescape('\udc7f', error)
+            with self.assertRaises(UnicodeTranslateError):
+                codecs.convert_surrogateescape('\udd00', error)
+
+    def test_convert_astrals(self):
+        self.assertRaises(TypeError, codecs.convert_astrals)
+        self.assertRaises(TypeError, codecs.convert_astrals,
+                          'abc', 'strict', '')
+        with self.assertRaises(UnicodeTranslateError):
+            codecs.convert_astrals('a\U00010280b', 'strict')
+        # with self.assertRaises(UnicodeTranslateError):
+        #     codecs.convert_astrals('a\U00010280b', 'surrogatepass')
+        with self.assertRaises(TypeError):
+            codecs.convert_astrals('a\U00010280b', 'surrogateescape')
+        tests = [
+            ('ignore', ('', '')),
+            ('replace', ('\ufffd','\ufffd')),
+            ('backslashreplace', ('\\U00010280', '\\U000e007f')),
+            # ('namereplace', ('\\N{LYCIAN LETTER A}', '\\N{CANCEL TAG}')),
+            # ('xmlcharrefreplace', ('&#66176;','&#917631;')),
+        ]
+        for (error, args) in tests:
+            for tmpl in ('a{}b', 'a{}b{}', 'a{}{}c', 'a{}b{}c'):
+                data = tmpl.format('\U00010280', '\U000e007f')
+                expected = tmpl.format(*args)
+                with self.subTest(error=error, data=data):
+                    self.assertEqual(codecs.convert_astrals(data, error),
+                                     expected)
+
 class StreamReaderTest(unittest.TestCase):
 
     def setUp(self):