# workaround for unicode escaping buffer overflow, # see http://sourceforge.net/tracker/index.php?func=detail&aid=1541585&group_id=5470&atid=305470 # written by G. Brandl and Th. Waldmann if len(u'\U00010000') == 1: # UCS-4, needs fixing def fixed_unicode_escape(string, quotes): ret = qchar = '' if quotes: qchar = ("'" in string and not '"' in string) and '"' or "'" ret = 'u' + qchar for ch in string: och = ord(ch) # escape quotes and backslashes if ch == qchar or ch == '\\': ret += '\\' + str(ch) # map 21-bit characters to '\U00xxxxxx' elif och >= 0x10000: ret += '\\U%08x' % och # map 16-bit characters to '\uxxxx' elif och >= 0x100: ret += '\\u%04x' % och # map special whitespace to '\t', '\n', '\r' elif ch == '\t': ret += '\\t' elif ch == '\n': ret += '\\n' elif ch == '\r': ret += '\\r' # map non-printable US ASCII to '\xhh' elif och < 0x20 or och >= 0x7F: ret += '\\x%02x' % och else: ret += str(ch) if quotes: ret += qchar return ret def new_repr(x, old_repr=repr): if isinstance(x, unicode): return fixed_unicode_escape(x, 1) else: return old_repr(x) # patch the builtin repr with fixed implementation import __builtin__ orig_repr = __builtin__.repr __builtin__.repr = new_repr import codecs from encodings import unicode_escape class Codec(codecs.Codec): def unicode_escape_encode(cls, inputobj, errors='strict'): return fixed_unicode_escape(inputobj, 0), len(inputobj) encode = classmethod(unicode_escape_encode) # Note: Binding this as C function will result in the class not # converting them to a method. This is intended. decode = codecs.unicode_escape_decode class StreamWriter(Codec, codecs.StreamWriter): pass class StreamReader(Codec, codecs.StreamReader): pass def getregentry(): return (Codec.encode, Codec.decode, StreamReader, StreamWriter) # patch also the unicode_escape Codec: unicode_escape.Codec = Codec unicode_escape.StreamWriter = StreamWriter unicode_escape.StreamReader = StreamReader unicode_escape.getregentry = getregentry else: # UCS-2, not vulnerable pass if __name__ == '__main__': print "Trying to crash. If you have a non-fixed python 2.3/2.4, you'll see some msg from glibc." print "Trying repr..." assert(repr(u"\U00010000" * 39 + u"\uffff" * 4096) == repr(u"\U00010000" * 39 + u"\uffff" * 4096)) print "Trying encode unicode-escape ..." x = (u"\U00010000" * 39 + u"\uffff" * 4096).encode('unicode-escape') print "Finished."