# Patch summary for issue24117 (the issue number is taken from the "# issue24117"
# markers in the added test lines). Touches one test file and one C codec file.
#
# NOTE(review): the line structure of this diff has been collapsed; the last
#   hunk header ("@@ -397,7 +412,7 @@") is split across two physical lines.
#   Restore the original line breaks before attempting to apply the patch.
#
# Lib/test/test_codecencodings_cn.py
#   * Test_GB18030: adds the byte input "\x81\x30\xFF\x30" in "strict" mode with
#     an expected value of None (presumably "must fail" -- confirm against
#     test_multibytecodec_support, which is not shown here).
#   * Test_HZ: adds an encode case (u'hi~' -> 'hi~~') and two byte inputs with
#     expected value None that place '~~' and '~\n' between '~{' and '~}'.
#
# Modules/cjkcodecs/_codecs_cn.c
#   * DECODER(gb18030): the four-byte check previously tested only lower bounds
#     for c and c3; it now also rejects c > 0xFE and c3 > 0xFE before `return 4`.
#   * ENCODER(hz): for c < 0x80, '~' is now written doubled: "~~" when
#     state->i == 0, otherwise "~}~~" followed by state->i = 0. All other
#     characters below 0x80 keep the previous behavior, just nested one level
#     deeper. (The decoder comments label state->i 0 as ASCII and 1 as GB.)
#   * DECODER(hz): IN2 is now read only after REQUIRE_INBUF(2) instead of in the
#     declaration that preceded it. '~~' and '~\n' are accepted only when
#     state->i == 0; in the other state they presumably reach the trailing
#     `return 2` -- the lines between the two hunks are not visible, TODO confirm.
#
diff -r e8783c581928 Lib/test/test_codecencodings_cn.py --- a/Lib/test/test_codecencodings_cn.py +++ b/Lib/test/test_codecencodings_cn.py @@ -46,6 +46,8 @@ class Test_GB18030(test_multibytecodec_support.Tes ("abc\x80\x80\xc1\xc4", "ignore", u"abc\u804a"), ("abc\x84\x39\x84\x39\xc1\xc4", "replace", u"abc\ufffd\u804a"), (u"\u30fb", "strict", "\x819\xa79"), + # issue24117 + (b"\x81\x30\xFF\x30", "strict", None), ) has_iso10646 = True @@ -76,6 +78,10 @@ class Test_HZ(test_multibytecodec_support.TestBase (b'ab~cd', 'replace', u'ab\uFFFDd'), (b'ab\xffcd', 'replace', u'ab\uFFFDcd'), (b'ab~{\x81\x81\x41\x44~}cd', 'replace', u'ab\uFFFD\uFFFD\u804Acd'), + # issue24117 + (u'hi~', 'strict', b'hi~~'), # escape ~ + (b'~{Dc~~:C~}', 'strict', None), # ~~ only in ASCII mode + (b'~{Dc~\n:C~}', 'strict', None), # ~\n only in ASCII mode ) def test_main(): diff -r e8783c581928 Modules/cjkcodecs/_codecs_cn.c --- a/Modules/cjkcodecs/_codecs_cn.c +++ b/Modules/cjkcodecs/_codecs_cn.c @@ -266,7 +266,8 @@ DECODER(gb18030) REQUIRE_INBUF(4) c3 = IN3; c4 = IN4; - if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39) + if (c < 0x81 || c > 0xFE || c3 < 0x81 || c3 > 0xFE || + c4 < 0x30 || c4 > 0x39) return 4; c -= 0x81; c2 -= 0x30; c3 -= 0x81; c4 -= 0x30; @@ -333,14 +334,27 @@ ENCODER(hz) DBCHAR code; if (c < 0x80) { - if (state->i == 0) { - WRITE1((unsigned char)c) - NEXT(1, 1) + if (c == '~') { + if (state->i == 0) { + WRITE2('~', '~') + NEXT(1, 2) + } + else { + WRITE4('~', '}', '~', '~') + NEXT(1, 4) + state->i = 0; + } } else { - WRITE3('~', '}', (unsigned char)c) - NEXT(1, 3) - state->i = 0; + if (state->i == 0) { + WRITE1((unsigned char)c) + NEXT(1, 1) + } + else { + WRITE3('~', '}', (unsigned char)c) + NEXT(1, 3) + state->i = 0; + } } continue; } @@ -385,10 +399,11 @@ DECODER(hz) unsigned char c = IN1; if (c == '~') { - unsigned char c2 = IN2; + unsigned char c2; REQUIRE_INBUF(2) - if (c2 == '~') { + c2 = IN2; + if (c2 == '~' && state->i == 0) { WRITE1('~') NEXT(2, 1) continue; @@ -397,7 
+412,7 @@ DECODER(hz) state->i = 1; /* set GB */ else if (c2 == '}' && state->i == 1) state->i = 0; /* set ASCII */ - else if (c2 == '\n') + else if (c2 == '\n' && state->i == 0) ; /* line-continuation */ else return 2;