diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst --- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1136,6 +1136,10 @@ particular, the following variants typic +-----------------+--------------------------------+--------------------------------+ | koi8_r | | Russian | +-----------------+--------------------------------+--------------------------------+ +| koi8_t | | Tajik | +| | | | +| | | .. versionadded:: 3.5 | ++-----------------+--------------------------------+--------------------------------+ | koi8_u | | Ukrainian | +-----------------+--------------------------------+--------------------------------+ | mac_cyrillic | maccyrillic | Bulgarian, Byelorussian, | diff --git a/Doc/whatsnew/3.5.rst b/Doc/whatsnew/3.5.rst --- a/Doc/whatsnew/3.5.rst +++ b/Doc/whatsnew/3.5.rst @@ -118,7 +118,8 @@ Other Language Changes Some smaller changes made to the core Python language are: -* None yet. +* New Tajik :ref:`codec ` ``koi8_t``. (Contributed by + Serhiy Storchaka in :issue:`22681`.) diff --git a/Lib/encodings/koi8_r.py b/Lib/encodings/koi8_t.py copy from Lib/encodings/koi8_r.py copy to Lib/encodings/koi8_t.py --- a/Lib/encodings/koi8_r.py +++ b/Lib/encodings/koi8_t.py @@ -1,6 +1,7 @@ -""" Python Character Mapping Codec koi8_r generated from 'MAPPINGS/VENDORS/MISC/KOI8-R.TXT' with gencodec.py. - -"""#" +""" Python Character Mapping Codec koi8_t +""" +# http://ru.wikipedia.org/wiki/КОИ-8 +# http://www.opensource.apple.com/source/libiconv/libiconv-4/libiconv/tests/KOI8-T.TXT import codecs @@ -32,7 +33,7 @@ class StreamReader(Codec,codecs.StreamRe def getregentry(): return codecs.CodecInfo( - name='koi8-r', + name='koi8-t', encode=Codec().encode, decode=Codec().decode, incrementalencoder=IncrementalEncoder, @@ -173,69 +174,69 @@ decoding_table = ( '}' # 0x7D -> RIGHT CURLY BRACKET '~' # 0x7E -> TILDE '\x7f' # 0x7F -> DELETE - '\u2500' # 0x80 -> BOX DRAWINGS LIGHT HORIZONTAL - '\u2502' # 0x81 -> BOX DRAWINGS LIGHT VERTICAL - '\u250c' # 0x82 -> BOX DRAWINGS LIGHT DOWN AND RIGHT - '\u2510' # 0x83 -> BOX DRAWINGS LIGHT DOWN AND LEFT - '\u2514' # 0x84 -> BOX DRAWINGS LIGHT UP AND RIGHT - '\u2518' # 0x85 -> BOX DRAWINGS LIGHT UP AND LEFT - '\u251c' # 0x86 -> BOX DRAWINGS LIGHT VERTICAL AND RIGHT - '\u2524' # 0x87 -> BOX DRAWINGS LIGHT VERTICAL AND LEFT - '\u252c' # 0x88 -> BOX DRAWINGS LIGHT DOWN AND HORIZONTAL - '\u2534' # 0x89 -> BOX DRAWINGS LIGHT UP AND HORIZONTAL - '\u253c' # 0x8A -> BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL - '\u2580' # 0x8B -> UPPER HALF BLOCK - '\u2584' # 0x8C -> LOWER HALF BLOCK - '\u2588' # 0x8D -> FULL BLOCK - '\u258c' # 0x8E -> LEFT HALF BLOCK - '\u2590' # 0x8F -> RIGHT HALF BLOCK - '\u2591' # 0x90 -> LIGHT SHADE - '\u2592' # 0x91 -> MEDIUM SHADE - '\u2593' # 0x92 -> DARK SHADE - '\u2320' # 0x93 -> TOP HALF INTEGRAL - '\u25a0' # 0x94 -> BLACK SQUARE - '\u2219' # 0x95 -> BULLET OPERATOR - '\u221a' # 0x96 -> SQUARE ROOT - '\u2248' # 0x97 -> ALMOST EQUAL TO - '\u2264' # 0x98 -> LESS-THAN OR EQUAL TO - '\u2265' # 0x99 -> GREATER-THAN OR EQUAL TO - '\xa0' # 0x9A -> NO-BREAK SPACE - '\u2321' # 0x9B -> BOTTOM HALF INTEGRAL - '\xb0' # 0x9C -> DEGREE SIGN - '\xb2' # 0x9D -> SUPERSCRIPT TWO - '\xb7' # 0x9E -> MIDDLE DOT - '\xf7' # 0x9F -> DIVISION SIGN - '\u2550' # 0xA0 -> BOX DRAWINGS DOUBLE HORIZONTAL - '\u2551' # 0xA1 -> BOX DRAWINGS DOUBLE VERTICAL - '\u2552' # 0xA2 -> BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE + '\u049b' # 0x80 -> CYRILLIC SMALL LETTER KA WITH DESCENDER + '\u0493' # 0x81 -> CYRILLIC SMALL LETTER GHE WITH STROKE + '\u201a' # 0x82 -> SINGLE LOW-9 QUOTATION MARK + '\u0492' # 0x83 -> CYRILLIC CAPITAL LETTER GHE WITH STROKE + '\u201e' # 0x84 -> DOUBLE LOW-9 QUOTATION MARK + '\u2026' # 0x85 -> HORIZONTAL ELLIPSIS + '\u2020' # 0x86 -> DAGGER + '\u2021' # 0x87 -> DOUBLE DAGGER + '\ufffe' # 0x88 -> UNDEFINED + '\u2030' # 0x89 -> PER MILLE SIGN + '\u04b3' # 0x8A -> CYRILLIC SMALL LETTER HA WITH DESCENDER + '\u2039' # 0x8B -> SINGLE LEFT-POINTING ANGLE QUOTATION MARK + '\u04b2' # 0x8C -> CYRILLIC CAPITAL LETTER HA WITH DESCENDER + '\u04b7' # 0x8D -> CYRILLIC SMALL LETTER CHE WITH DESCENDER + '\u04b6' # 0x8E -> CYRILLIC CAPITAL LETTER CHE WITH DESCENDER + '\ufffe' # 0x8F -> UNDEFINED + '\u049a' # 0x90 -> CYRILLIC CAPITAL LETTER KA WITH DESCENDER + '\u2018' # 0x91 -> LEFT SINGLE QUOTATION MARK + '\u2019' # 0x92 -> RIGHT SINGLE QUOTATION MARK + '\u201c' # 0x93 -> LEFT DOUBLE QUOTATION MARK + '\u201d' # 0x94 -> RIGHT DOUBLE QUOTATION MARK + '\u2022' # 0x95 -> BULLET + '\u2013' # 0x96 -> EN DASH + '\u2014' # 0x97 -> EM DASH + '\ufffe' # 0x98 -> UNDEFINED + '\u2122' # 0x99 -> TRADE MARK SIGN + '\ufffe' # 0x9A -> UNDEFINED + '\u203a' # 0x9B -> SINGLE RIGHT-POINTING ANGLE QUOTATION MARK + '\ufffe' # 0x9C -> UNDEFINED + '\ufffe' # 0x9D -> UNDEFINED + '\ufffe' # 0x9E -> UNDEFINED + '\ufffe' # 0x9F -> UNDEFINED + '\ufffe' # 0xA0 -> UNDEFINED + '\u04ef' # 0xA1 -> CYRILLIC SMALL LETTER U WITH MACRON + '\u04ee' # 0xA2 -> CYRILLIC CAPITAL LETTER U WITH MACRON '\u0451' # 0xA3 -> CYRILLIC SMALL LETTER IO - '\u2553' # 0xA4 -> BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE - '\u2554' # 0xA5 -> BOX DRAWINGS DOUBLE DOWN AND RIGHT - '\u2555' # 0xA6 -> BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE - '\u2556' # 0xA7 -> BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE - '\u2557' # 0xA8 -> BOX DRAWINGS DOUBLE DOWN AND LEFT - '\u2558' # 0xA9 -> BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE - '\u2559' # 0xAA -> BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE - '\u255a' # 0xAB -> BOX DRAWINGS DOUBLE UP AND RIGHT - '\u255b' # 0xAC -> BOX DRAWINGS UP SINGLE AND LEFT DOUBLE - '\u255c' # 0xAD -> BOX DRAWINGS UP DOUBLE AND LEFT SINGLE - '\u255d' # 0xAE -> BOX DRAWINGS DOUBLE UP AND LEFT - '\u255e' # 0xAF -> BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE - '\u255f' # 0xB0 -> BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE - '\u2560' # 0xB1 -> BOX DRAWINGS DOUBLE VERTICAL AND RIGHT - '\u2561' # 0xB2 -> BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE + '\xa4' # 0xA4 -> CURRENCY SIGN + '\u04e3' # 0xA5 -> CYRILLIC SMALL LETTER I WITH MACRON + '\xa6' # 0xA6 -> BROKEN BAR + '\xa7' # 0xA7 -> SECTION SIGN + '\ufffe' # 0xA8 -> UNDEFINED + '\ufffe' # 0xA9 -> UNDEFINED + '\ufffe' # 0xAA -> UNDEFINED + '\xab' # 0xAB -> LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + '\xac' # 0xAC -> NOT SIGN + '\xad' # 0xAD -> SOFT HYPHEN + '\xae' # 0xAE -> REGISTERED SIGN + '\ufffe' # 0xAF -> UNDEFINED + '\xb0' # 0xB0 -> DEGREE SIGN + '\xb1' # 0xB1 -> PLUS-MINUS SIGN + '\xb2' # 0xB2 -> SUPERSCRIPT TWO '\u0401' # 0xB3 -> CYRILLIC CAPITAL LETTER IO - '\u2562' # 0xB4 -> BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE - '\u2563' # 0xB5 -> BOX DRAWINGS DOUBLE VERTICAL AND LEFT - '\u2564' # 0xB6 -> BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE - '\u2565' # 0xB7 -> BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE - '\u2566' # 0xB8 -> BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL - '\u2567' # 0xB9 -> BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE - '\u2568' # 0xBA -> BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE - '\u2569' # 0xBB -> BOX DRAWINGS DOUBLE UP AND HORIZONTAL - '\u256a' # 0xBC -> BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE - '\u256b' # 0xBD -> BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE - '\u256c' # 0xBE -> BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL + '\ufffe' # 0xB4 -> UNDEFINED + '\u04e2' # 0xB5 -> CYRILLIC CAPITAL LETTER I WITH MACRON + '\xb6' # 0xB6 -> PILCROW SIGN + '\xb7' # 0xB7 -> MIDDLE DOT + '\ufffe' # 0xB8 -> UNDEFINED + '\u2116' # 0xB9 -> NUMERO SIGN + '\ufffe' # 0xBA -> UNDEFINED + '\xbb' # 0xBB -> RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + '\ufffe' # 0xBC -> UNDEFINED + '\ufffe' # 0xBD -> UNDEFINED + '\ufffe' # 0xBE -> UNDEFINED '\xa9' # 0xBF -> COPYRIGHT SIGN '\u044e' # 0xC0 -> CYRILLIC SMALL LETTER YU '\u0430' # 0xC1 -> CYRILLIC SMALL LETTER A diff --git a/Lib/locale.py b/Lib/locale.py --- a/Lib/locale.py +++ b/Lib/locale.py @@ -696,6 +696,7 @@ locale_encoding_alias = { 'euc_kr': 'eucKR', 'utf_8': 'UTF-8', 'koi8_r': 'KOI8-R', + 'koi8_t': 'KOI8-T', 'koi8_u': 'KOI8-U', 'cp1251': 'CP1251', 'cp1255': 'CP1255', diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -1754,6 +1754,7 @@ all_unicode_encodings = [ "iso8859_9", "johab", "koi8_r", + "koi8_t", "koi8_u", "latin_1", "mac_cyrillic", diff --git a/Lib/test/test_unicode.py b/Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1929,7 +1929,7 @@ class UnicodeTest(string_tests.CommonTes 'cp863', 'cp865', 'cp866', 'cp1125', 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6', - 'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1', + 'iso8859_7', 'iso8859_9', 'koi8_r', 'koi8_t', 'koi8_u', 'latin_1', 'mac_cyrillic', 'mac_latin2', 'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', @@ -1957,14 +1957,14 @@ class UnicodeTest(string_tests.CommonTes 'cp863', 'cp865', 'cp866', 'cp1125', 'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15', 'iso8859_2', 'iso8859_4', 'iso8859_5', - 'iso8859_9', 'koi8_r', 'latin_1', + 'iso8859_9', 'koi8_r', 'koi8_u', 'latin_1', 'mac_cyrillic', 'mac_latin2', ### These have undefined mappings: #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255', #'cp1256', 'cp1257', 'cp1258', #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874', - #'iso8859_3', 'iso8859_6', 'iso8859_7', + #'iso8859_3', 'iso8859_6', 'iso8859_7', 'koi8_t', #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish', ### These fail the round-trip: diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -704,7 +704,7 @@ class ElementTreeTest(unittest.TestCase) 'mac-roman', 'mac-turkish', 'iso2022-jp', 'iso2022-jp-1', 'iso2022-jp-2', 'iso2022-jp-2004', 'iso2022-jp-3', 'iso2022-jp-ext', - 'koi8-r', 'koi8-u', + 'koi8-r', 'koi8-t', 'koi8-u', 'hz', 'ptcp154', ] for encoding in supported_encodings: diff --git a/Misc/NEWS b/Misc/NEWS --- a/Misc/NEWS +++ b/Misc/NEWS @@ -181,6 +181,8 @@ Core and Builtins Library ------- +- Issue #22681: Added support for the koi8_t encoding. + - Issue #18853: Fixed ResourceWarning in shlex.__nain__. - Issue #9351: Defaults set with set_defaults on an argparse subparser