diff -r c79fd57f86b6 Lib/test/test_codecencodings_cn.py --- a/Lib/test/test_codecencodings_cn.py +++ b/Lib/test/test_codecencodings_cn.py @@ -49,6 +49,8 @@ class Test_GB18030(multibytecodec_support.TestBase (b"abc\x84\x32\x80\x80def", "replace", 'abc\ufffd2\ufffd\ufffddef'), (b"abc\x81\x30\x81\x30def", "strict", 'abc\x80def'), (b"abc\x86\x30\x81\x30def", "replace", 'abc\ufffd0\ufffd0def'), + # issue24117 + (b"\x81\x30\xFF\x30", "strict", None), ) has_iso10646 = True @@ -81,6 +83,10 @@ class Test_HZ(multibytecodec_support.TestBase, uni (b'ab~{\x81\x81\x41\x44~}cd', 'replace', 'ab\uFFFD\uFFFD\u804Acd'), (b'ab~{\x41\x44~}cd', 'replace', 'ab\u804Acd'), (b"ab~{\x79\x79\x41\x44~}cd", "replace", "ab\ufffd\ufffd\u804acd"), + # issue24117 + ('hi~', 'strict', b'hi~~'), # escape ~ + (b'~{Dc~~:C~}', 'strict', None), # ~~ only in ASCII mode + (b'~{Dc~\n:C~}', 'strict', None), # ~\n only in ASCII mode ) def test_main(): diff -r c79fd57f86b6 Modules/cjkcodecs/_codecs_cn.c --- a/Modules/cjkcodecs/_codecs_cn.c +++ b/Modules/cjkcodecs/_codecs_cn.c @@ -279,7 +279,8 @@ DECODER(gb18030) REQUIRE_INBUF(4); c3 = INBYTE3; c4 = INBYTE4; - if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39) + if (c < 0x81 || c > 0xFE || c3 < 0x81 || c3 > 0xFE || + c4 < 0x30 || c4 > 0x39) return 1; c -= 0x81; c2 -= 0x30; c3 -= 0x81; c4 -= 0x30; @@ -348,14 +349,27 @@ ENCODER(hz) DBCHAR code; if (c < 0x80) { - if (state->i == 0) { - WRITEBYTE1((unsigned char)c); - NEXT(1, 1); + if (c == '~') { + if (state->i == 0) { + WRITEBYTE2('~', '~'); + NEXT(1, 2); + } + else { + WRITEBYTE4('~', '}', '~', '~'); + NEXT(1, 4); + state->i = 0; + } } else { - WRITEBYTE3('~', '}', (unsigned char)c); - NEXT(1, 3); - state->i = 0; + if (state->i == 0) { + WRITEBYTE1((unsigned char)c); + NEXT(1, 1); + } + else { + WRITEBYTE3('~', '}', (unsigned char)c); + NEXT(1, 3); + state->i = 0; + } } continue; } @@ -404,19 +418,18 @@ DECODER(hz) Py_UCS4 decoded; if (c == '~') { - unsigned char c2 = INBYTE2; + unsigned char c2; 
REQUIRE_INBUF(2); - if (c2 == '~') { - OUTCHAR('~'); - NEXT_IN(2); - continue; - } - else if (c2 == '{' && state->i == 0) + c2 = INBYTE2; + + if (c2 == '{' && state->i == 0) state->i = 1; /* set GB */ else if (c2 == '}' && state->i == 1) state->i = 0; /* set ASCII */ - else if (c2 == '\n') + else if (c2 == '~' && state->i == 0) + OUTCHAR('~'); + else if (c2 == '\n' && state->i == 0) ; /* line-continuation */ else return 1; diff -r c79fd57f86b6 Modules/cjkcodecs/README --- a/Modules/cjkcodecs/README +++ b/Modules/cjkcodecs/README @@ -77,3 +77,54 @@ Notes on implmentation characteristics of each cod - U+007E TILDE is mapped to SHIFT-JIS 0x7e. - U+FF3C FULL-WIDTH REVERSE SOLIDUS is mapped to SHIFT-JIS 815f. + +6) GB2312 codec + + There exist two implementations of GB2312 that differ in two code points. + + bytes Implementation A Implementation B + A1A4 U+00B7 MIDDLE DOT U+30FB KATAKANA MIDDLE DOT + A1AA U+2014 EM DASH U+2015 HORIZONTAL BAR + + Implementation A is compatible with GBK and GB18030, while Implementation + B is not. + + Python 2.x/3.x are using Implementation B. + + As of 2015, Microsoft .NET Framework is using Implementation A. + iconv-1.14, php-5.6, ActivePerl-5.20, Java-1.7 are using Implementation B. + Ruby-2.2 is compatible with both Implementation A and Implementation B; it + internally converts the conflicting characters to Implementation A. + + +7) GBK codec + + This implementation can be seen as either "GBK without PUA code points", + or "CP936 v2.01 without euro sign (0x80 <-> U+20AC)". + + The GBK standard defines a total of 23940 two-byte sequences, 2149 of which + are mapped to the PUA of the BMP. In this implementation, these 2149 sequences + are discarded, so this implementation has 21791 (=23940-2149) two-byte + sequences. + + The 2149 (=2054+95) sequences are mapped to 0xE000-0xE864 in the PUA. + + The 2054 are empty positions; nothing was assigned to them. + + The 95 were assigned characters.
When the GBK standard was published + in 1995, these 95 characters were not yet included in Unicode, so they were + mapped to the PUA. They have been included in later versions of Unicode. + + +8) GB18030 codec + + This codec implements the full GB18030-2000 standard. + 25 characters are mapped to the PUA of the BMP. + + + To implement the full GB18030-2005 codec, just modify the mapping as below: + + bytes GB18030-2000 GB18030-2005 + A8BC U+E7C7 U+1E3F + 8135F437 U+1E3F U+E7C7 + * U+1E3F is "LATIN SMALL LETTER M WITH ACUTE".