# works on Windows only
import os
import sys
import codecs
from ctypes import *

winver = sys.getwindowsversion()

# Python 2/3 shims: a text type called `unicode` and a byte-oriented stdout.
if sys.version_info >= (3,):
    unicode = str
    stdout = sys.stdout.buffer
else:
    stdout = sys.stdout


def chcp(cp):
    """Run the shell's ``chcp`` command for code page *cp*, discarding output."""
    os.system(u"chcp %d > NUL" % cp)


def _take_fst(f):
    """Return a wrapper around *f* that yields only item 0 of f's result."""
    def f2(*a, **kw):
        return f(*a, **kw)[0]
    return f2


if sys.version_info >= (3,3):
    # codecs.code_page_encode/decode return a (result, consumed) pair;
    # only the converted data is wanted here.
    cp_enc = _take_fst(codecs.code_page_encode)
    cp_dec = _take_fst(codecs.code_page_decode)
else:
    # Older interpreters lack codecs.code_page_*; call the Win32 converters
    # directly.  kernel32 exports use the stdcall convention, so they must be
    # reached through `windll` (with `cdll`, 32-bit ctypes raises ValueError
    # about the argument bytes left on the stack).
    def cp_dec(cp, bs):
        """Decode byte string *bs* from code page *cp*.

        Returns the decoded text.  Raises ValueError carrying the API return
        value when MultiByteToWideChar reports failure (result <= 0).
        """
        if not bs:
            # The API fails for zero-length input; an empty input simply
            # decodes to empty text, as the codecs-based path does.
            return u''
        capacity = len(bs) * 2
        buf = create_unicode_buffer(capacity)
        n = windll.kernel32.MultiByteToWideChar(
            c_uint(cp), c_int32(0), c_char_p(bs), c_int(len(bs)),
            buf, c_int(capacity))
        if n <= 0:
            raise ValueError(n)
        return buf[:n]

    def cp_enc(cp, us):
        """Encode text *us* into code page *cp*.

        Returns the encoded bytes.  Raises ValueError carrying the API return
        value when WideCharToMultiByte reports failure (result <= 0).
        """
        if not us:
            # The API fails for zero-length input; an empty input simply
            # encodes to empty bytes, as the codecs-based path does.
            return b''
        capacity = len(us) * 4
        buf = create_string_buffer(capacity)
        n = windll.kernel32.WideCharToMultiByte(
            c_uint(cp), c_int32(0), c_wchar_p(us), c_int(len(us)),
            buf, c_int(capacity),
            c_char_p(None), c_char_p(None))
        if n <= 0:
            raise ValueError(n)
        return buf[:n]


def test_cp_dec(cp, bs):
    """Compare cp_dec against Python's own ``cp<N>`` codec when decoding *bs*.

    Returns ``(equal, cp_dec_result, python_codec_result)``; the Python codec
    is run with the 'replace' error handler.
    """
    mbcs = cp_dec(cp, bs)
    pycp = bs.decode(u"cp%d" % cp, u'replace')
    return (mbcs == pycp, mbcs, pycp)


def test_cp_enc(cp, us):
    """Compare cp_enc against Python's own ``cp<N>`` codec when encoding *us*.

    Returns ``(equal, cp_enc_result, python_codec_result)``; the Python codec
    is run with the 'replace' error handler.
    """
    mbcs = cp_enc(cp, us)
    pycp = us.encode(u"cp%d" % cp, u'replace')
    return (mbcs == pycp, mbcs, pycp)


# Now we add a few encodings that someone else has found strange.
# https://ftfy.readthedocs.io/en/latest/#module-ftfy.bad_codecs.sloppy
known_pages = list(range(1250,1259)) + [437, 874]  # Known-bad per ftfy.
# Two byte values to probe, run through every code page in both directions.
known_bytes = b'\x81\x8D'
dec_results = {cp: test_cp_dec(cp, known_bytes) for cp in known_pages}
# latin-1 maps each byte to the code point with the same number.
known_chars = known_bytes.decode('latin-1')
enc_results = {cp: test_cp_enc(cp, known_chars) for cp in known_pages}

# Put the console into code page 65001 before emitting UTF-8 bytes.
chcp(65001)


def pr65001(s=u'', nl=True):
    """Write *s* to the byte-level stdout as UTF-8.

    s  -- value to print; converted to text first.
    nl -- when true, a CRLF line terminator is written after the text.
    """
    # Convert with `unicode` (the file's text-type alias), not `str`:
    # on Python 2 `str()` would force non-ASCII text through the ASCII codec.
    stdout.write(unicode(s).encode(u'utf_8'))
    if nl:
        stdout.write(b'\r\n')


pr65001(u"Results for Py {0} on Windows {1}:".format(
    unicode(tuple(sys.version_info)), unicode(tuple(winver))))
for cp in known_pages:
    pr65001(u"cp%d:" % cp, False)
    # Only mismatching directions are reported: element 0 of each result
    # tuple is the "both implementations agree" flag.
    if not dec_results[cp][0]:
        pr65001(u' dec=(')
        pr65001(u',\n'.join(u' ' + unicode(repr(s)) for s in dec_results[cp]))
        pr65001(u')', False)
    if not enc_results[cp][0]:
        pr65001(u', enc=(')
        pr65001(u',\n'.join(u' ' + unicode(repr(s)) for s in enc_results[cp]))
        pr65001(u')', False)
    pr65001(u'')
pr65001(u'')