"""Sanity-check and pretty-print the CHARACTERS table.

When run, the script first reports every configured encoding for which no
candidate character is encodable, then prints a ``CHARACTERS = (...)``
listing in which each character is annotated with the configured encodings
able to encode it.
"""
import unicodedata

CHARACTERS = (
    # First try printable and common characters to have a readable filename.
    # For each character, the listed encodings are just examples of encodings
    # able to encode the character (the list is not exhaustive).

    # U+00E6 (Latin Small Letter Ae): cp1252, iso-8859-1
    '\u00E6',
    # U+0130 (Latin Capital Letter I With Dot Above): cp1254, iso8859_3
    '\u0130',
    # U+0141 (Latin Capital Letter L With Stroke): cp1250, cp1257
    '\u0141',
    # U+03C6 (Greek Small Letter Phi): cp1253
    '\u03C6',
    # U+041A (Cyrillic Capital Letter Ka): cp1251
    '\u041A',
    # U+05D0 (Hebrew Letter Alef): cp424
    '\u05D0',
    # U+060C (Arabic Comma): cp864, cp1006, iso8859_6, mac_arabic
    '\u060C',
    # U+062A (Arabic Letter Teh): cp720
    '\u062A',
    # U+0E01 (Thai Character Ko Kai): cp874
    '\u0E01',

    # Then try more "special" characters. "special" because they may be
    # interpreted or displayed differently depending on the exact locale
    # encoding and the font.

    # U+00A0 (No-Break Space)
    '\u00A0',
    # U+20AC (Euro Sign)
    '\u20AC',
)

# Code page numbers; each one is probed as the codec 'cp<number>' with the
# 'strict' error handler.
CODE_PAGES = (
    (424, 720, 864, 874, 932, 950, 1006) + tuple(range(1250, 1257 + 1))
)

# Encodings probed with the 'surrogateescape' error handler.
# NOTE: 'utf8' was commented out of this list in the original script.
LOCALE_ENCODINGS = [
    'iso-8859-1', 'iso8859_3', 'iso8859_6', 'mac_arabic', 'mac_farsi']


def is_encodable(character, encoding, errors):
    """Return True if *character* survives an encode/decode round trip.

    The character is encoded to *encoding* with the *errors* handler and
    decoded back the same way. It only counts as encodable when the result
    equals the input: lossy handlers such as 'replace' or 'ignore' never
    raise, so the absence of an exception alone proves nothing.

    Returns False when either step raises UnicodeError. Other exceptions
    (e.g. LookupError for an unknown codec name) propagate to the caller.
    """
    try:
        decoded = character.encode(encoding, errors).decode(encoding, errors)
    except UnicodeError:
        return False
    return decoded == character


def check_encoding(encoding, errors):
    """Print a message if no entry of CHARACTERS is encodable to *encoding*.

    Nothing is printed as soon as one candidate character is encodable.
    Always returns None.
    """
    if any(is_encodable(character, encoding, errors)
           for character in CHARACTERS):
        return
    print("No character for encoding %s:%s :-(" % (encoding, errors))


def _supporting_encodings(character):
    """Return the names of the configured encodings able to encode *character*.

    Code pages come first (probed with 'strict'), followed by the locale
    encodings (probed with 'surrogateescape'), each in declaration order.
    """
    code_page_names = ['cp%s' % code_page for code_page in CODE_PAGES]
    supported = [name for name in code_page_names
                 if is_encodable(character, name, 'strict')]
    supported.extend(name for name in LOCALE_ENCODINGS
                     if is_encodable(character, name, 'surrogateescape'))
    return supported


# Step 1: make sure every configured encoding has at least one candidate.
for code_page in CODE_PAGES:
    check_encoding('cp%s' % code_page, 'strict')
for encoding in LOCALE_ENCODINGS:
    check_encoding(encoding, 'surrogateescape')

# Step 2: print the table, sorted by code point, with one comment line per
# character listing the encodings that can encode it.
print("CHARACTERS = (")
for character in sorted(CHARACTERS):
    encodings = _supporting_encodings(character)
    name = unicodedata.name(character).title()
    print(" # U+%04X (%s): Encodable to %s"
          % (ord(character), name, ', '.join(encodings)))
    print(" '\\u%04X'," % ord(character))
print(")")