import codecs
import re

# One or more consecutive surrogate code points. Surrogates occupy
# U+D800-U+DFFF only; U+E000 and above are ordinary private-use characters
# and must not be matched.
_SURROGATE_RUN = re.compile('[\ud800-\udfff]+')


def convert_surrogates(data, errors='strict'):
    """Return *data* with every run of surrogate code points handled.

    Each maximal run of surrogates (U+D800-U+DFFF) is reported to the codec
    error handler registered under the name *errors* as a
    ``UnicodeTranslateError``.  The handler either raises or returns a
    ``(replacement, resume_position)`` pair; the replacement is spliced into
    the result and scanning continues at the returned position.

    The handler is looked up only when a surrogate is actually found, so a
    string without surrogates is returned unchanged (the same object) even
    if *errors* names an unknown handler.

    Raises:
        UnicodeTranslateError: from the ``'strict'`` handler (the default).
        LookupError: if *errors* is not a registered handler name and
            *data* contains a surrogate.
    """
    match = _SURROGATE_RUN.search(data)
    if match is None:
        # Fast path: nothing to replace, hand back the original object.
        return data

    handler = codecs.lookup_error(errors)
    pieces = []
    pos = 0
    while match is not None:
        pieces.append(data[pos:match.start()])
        repl, pos = handler(
            UnicodeTranslateError(data, match.start(), match.end(),
                                  'lone surrogates'))
        pieces.append(repl)
        # Resume where the handler asked, not necessarily at match.end().
        match = _SURROGATE_RUN.search(data, pos)
    pieces.append(data[pos:])
    return ''.join(pieces)


def convert_surrogateescape(data, errors='strict'):
    """Return *data* with surrogate-escaped bytes handled as decode errors.

    Each maximal run of surrogates is first turned back into the bytes it
    stands for by encoding it with the ``'surrogateescape'`` handler (which
    accepts only U+DC80-U+DCFF).  Those bytes are then reported to the codec
    error handler registered under *errors* as a ``UnicodeDecodeError``, and
    the handler's replacement is spliced into the result.

    The handler is looked up only when a surrogate is actually found, so a
    string without surrogates is returned unchanged (the same object).

    Raises:
        UnicodeTranslateError: if a run contains a surrogate outside
            U+DC80-U+DCFF, or if the handler raises ``UnicodeDecodeError``
            (as ``'strict'`` does).  Positions refer to *data*.
        LookupError: if *errors* is not a registered handler name and
            *data* contains a surrogate.
    """
    match = _SURROGATE_RUN.search(data)
    if match is None:
        # Fast path: nothing to replace, hand back the original object.
        return data

    handler = codecs.lookup_error(errors)
    pieces = []
    pos = 0
    while match is not None:
        start = match.start()
        pieces.append(data[pos:start])

        try:
            # Every escaped surrogate encodes to exactly one byte, so offsets
            # into ``raw`` map 1:1 onto offsets into the matched run.
            raw = data[start:match.end()].encode('ascii', 'surrogateescape')
        except UnicodeEncodeError as err:
            # err positions are relative to the slice; shift them to *data*.
            raise UnicodeTranslateError(
                data, err.start + start, err.end + start,
                r'surrogates not in range \udc80-\udcff') from None

        try:
            repl, rel_pos = handler(
                UnicodeDecodeError('unicode', raw, 0, len(raw),
                                   'lone surrogates'))
        except UnicodeDecodeError as err:
            # Re-express the failure in terms of the original string.
            raise UnicodeTranslateError(
                data, err.start + start, err.end + start,
                err.reason) from None

        # The handler's resume position is relative to ``raw``.
        pos = rel_pos + start
        pieces.append(repl)
        match = _SURROGATE_RUN.search(data, pos)
    pieces.append(data[pos:])
    return ''.join(pieces)