diff -r 0b2d4089180c Include/unicodeobject.h --- a/Include/unicodeobject.h Wed Apr 10 17:01:38 2013 -0400 +++ b/Include/unicodeobject.h Thu Apr 11 01:48:14 2013 +0200 @@ -933,6 +933,13 @@ PyAPI_FUNC(int) _PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer, Py_ssize_t length, Py_UCS4 maxchar); +/* Append a Unicode character. + Return 0 on success, raise an exception and return -1 on error. */ +PyAPI_FUNC(int) +_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, + Py_UCS4 ch + ); + /* Append a Unicode string. Return 0 on success, raise an exception and return -1 on error. */ PyAPI_FUNC(int) diff -r 0b2d4089180c Modules/cjkcodecs/_codecs_cn.c --- a/Modules/cjkcodecs/_codecs_cn.c Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/_codecs_cn.c Thu Apr 11 01:48:14 2013 +0200 @@ -23,12 +23,12 @@ * A844 undefined U+2015 HORIZONTAL BAR */ -#define GBK_DECODE(dc1, dc2, assi) \ - if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \ - else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \ - else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \ - else TRYMAP_DEC(gb2312, assi, dc1 ^ 0x80, dc2 ^ 0x80); \ - else TRYMAP_DEC(gbkext, assi, dc1, dc2); +#define GBK_DECODE(dc1, dc2, writer) \ + if ((dc1) == 0xa1 && (dc2) == 0xaa) OUT(0x2014); \ + else if ((dc1) == 0xa8 && (dc2) == 0x44) OUT(0x2015); \ + else if ((dc1) == 0xa1 && (dc2) == 0xa4) OUT(0x00b7); \ + else TRYMAP_DEC(gb2312, writer, dc1 ^ 0x80, dc2 ^ 0x80); \ + else TRYMAP_DEC(gbkext, writer, dc1, dc2); #define GBK_ENCODE(code, assi) \ if ((code) == 0x2014) (assi) = 0xa1aa; \ @@ -43,7 +43,7 @@ ENCODER(gb2312) { while (inleft > 0) { - Py_UNICODE c = IN1; + Py_UCS4 c = IN1; DBCHAR code; if (c < 0x80) { @@ -73,17 +73,15 @@ DECODER(gb2312) while (inleft > 0) { unsigned char c = **inbuf; - REQUIRE_OUTBUF(1) - if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); continue; } REQUIRE_INBUF(2) - TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) { - NEXT(2, 1) + TRYMAP_DEC(gb2312, writer, c ^ 0x80, IN2 
^ 0x80) { + NEXT_IN(2); } else return 1; } @@ -99,7 +97,7 @@ DECODER(gb2312) ENCODER(gbk) { while (inleft > 0) { - Py_UNICODE c = IN1; + Py_UCS4 c = IN1; DBCHAR code; if (c < 0x80) { @@ -130,20 +128,18 @@ DECODER(gbk) while (inleft > 0) { unsigned char c = IN1; - REQUIRE_OUTBUF(1) - if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); continue; } REQUIRE_INBUF(2) - GBK_DECODE(c, IN2, **outbuf) + GBK_DECODE(c, IN2, writer) else return 1; - NEXT(2, 1) + NEXT_IN(2); } return 0; @@ -157,7 +153,7 @@ DECODER(gbk) ENCODER(gb18030) { while (inleft > 0) { - ucs4_t c = IN1; + Py_UCS4 c = IN1; DBCHAR code; if (c < 0x80) { @@ -174,7 +170,7 @@ ENCODER(gb18030) return 1; #endif else if (c >= 0x10000) { - ucs4_t tc = c - 0x10000; + Py_UCS4 tc = c - 0x10000; REQUIRE_OUTBUF(4) @@ -208,7 +204,7 @@ ENCODER(gb18030) utrrange++) if (utrrange->first <= c && c <= utrrange->last) { - Py_UNICODE tc; + Py_UCS4 tc; tc = c - utrrange->first + utrrange->base; @@ -247,11 +243,9 @@ DECODER(gb18030) while (inleft > 0) { unsigned char c = IN1, c2; - REQUIRE_OUTBUF(1) - if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); continue; } @@ -261,7 +255,7 @@ DECODER(gb18030) if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */ const struct _gb18030_to_unibmp_ranges *utr; unsigned char c3, c4; - ucs4_t lseq; + Py_UCS4 lseq; REQUIRE_INBUF(4) c3 = IN3; @@ -272,34 +266,34 @@ DECODER(gb18030) c3 -= 0x81; c4 -= 0x30; if (c < 4) { /* U+0080 - U+FFFF */ - lseq = ((ucs4_t)c * 10 + c2) * 1260 + - (ucs4_t)c3 * 10 + c4; + lseq = ((Py_UCS4)c * 10 + c2) * 1260 + + (Py_UCS4)c3 * 10 + c4; if (lseq < 39420) { for (utr = gb18030_to_unibmp_ranges; lseq >= (utr + 1)->base; utr++) ; - OUT1(utr->first - utr->base + lseq) - NEXT(4, 1) + OUT(utr->first - utr->base + lseq); + NEXT_IN(4); continue; } } else if (c >= 15) { /* U+10000 - U+10FFFF */ - lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2) - * 1260 + (ucs4_t)c3 * 10 + c4; + lseq = 0x10000 + (((Py_UCS4)c-15) * 10 + c2) + * 1260 + (Py_UCS4)c3 * 10 + c4; if (lseq 
<= 0x10FFFF) { - WRITEUCS4(lseq); - NEXT_IN(4) + OUT(lseq); + NEXT_IN(4); continue; } } return 1; } - GBK_DECODE(c, c2, **outbuf) - else TRYMAP_DEC(gb18030ext, **outbuf, c, c2); + GBK_DECODE(c, c2, writer) + else TRYMAP_DEC(gb18030ext, writer, c, c2); else return 1; - NEXT(2, 1) + NEXT_IN(2); } return 0; @@ -329,7 +323,7 @@ ENCODER_RESET(hz) ENCODER(hz) { while (inleft > 0) { - Py_UNICODE c = IN1; + Py_UCS4 c = IN1; DBCHAR code; if (c < 0x80) { @@ -389,8 +383,8 @@ DECODER(hz) REQUIRE_INBUF(2) if (c2 == '~') { - WRITE1('~') - NEXT(2, 1) + OUT('~'); + NEXT_IN(2); continue; } else if (c2 == '{' && state->i == 0) @@ -401,7 +395,7 @@ DECODER(hz) ; /* line-continuation */ else return 1; - NEXT(2, 0); + NEXT_IN(2); continue; } @@ -409,14 +403,13 @@ DECODER(hz) return 1; if (state->i == 0) { /* ASCII mode */ - WRITE1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); } else { /* GB mode */ REQUIRE_INBUF(2) - REQUIRE_OUTBUF(1) - TRYMAP_DEC(gb2312, **outbuf, c, IN2) { - NEXT(2, 1) + TRYMAP_DEC(gb2312, writer, c, IN2) { + NEXT_IN(2); } else return 1; diff -r 0b2d4089180c Modules/cjkcodecs/_codecs_hk.c --- a/Modules/cjkcodecs/_codecs_hk.c Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/_codecs_hk.c Thu Apr 11 01:48:14 2013 +0200 @@ -39,7 +39,7 @@ static const DBCHAR big5hkscs_pairenc_ta ENCODER(big5hkscs) { while (inleft > 0) { - ucs4_t c = **inbuf; + Py_UCS4 c = **inbuf; DBCHAR code; Py_ssize_t insize; @@ -103,26 +103,24 @@ DECODER(big5hkscs) { while (inleft > 0) { unsigned char c = IN1; - ucs4_t decoded; - - REQUIRE_OUTBUF(1) + Py_UCS4 decoded; if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); continue; } REQUIRE_INBUF(2) if (0xc6 > c || c > 0xc8 || (c < 0xc7 && IN2 < 0xa1)) { - TRYMAP_DEC(big5, **outbuf, c, IN2) { - NEXT(2, 1) + TRYMAP_DEC(big5, writer, c, IN2) { + NEXT_IN(2); continue; } } - TRYMAP_DEC(big5hkscs, decoded, c, IN2) + TRYMAP_DEC_CHAR(big5hkscs, decoded, c, IN2) { int s = BH2S(c, IN2); const unsigned char *hintbase; @@ -146,25 +144,25 @@ 
DECODER(big5hkscs) return MBERR_INTERNAL; if (hintbase[s >> 3] & (1 << (s & 7))) { - WRITEUCS4(decoded | 0x20000) - NEXT_IN(2) + OUT(decoded | 0x20000); + NEXT_IN(2); } else { - OUT1(decoded) - NEXT(2, 1) + OUT(decoded); + NEXT_IN(2); } continue; } switch ((c << 8) | IN2) { - case 0x8862: WRITE2(0x00ca, 0x0304); break; - case 0x8864: WRITE2(0x00ca, 0x030c); break; - case 0x88a3: WRITE2(0x00ea, 0x0304); break; - case 0x88a5: WRITE2(0x00ea, 0x030c); break; + case 0x8862: OUTCHAR2(0x00ca, 0x0304); break; + case 0x8864: OUTCHAR2(0x00ca, 0x030c); break; + case 0x88a3: OUTCHAR2(0x00ea, 0x0304); break; + case 0x88a5: OUTCHAR2(0x00ea, 0x030c); break; default: return 1; } - NEXT(2, 2) /* all decoded codepoints are pairs, above. */ + NEXT_IN(2); /* all decoded codepoints are pairs, above. */ } return 0; diff -r 0b2d4089180c Modules/cjkcodecs/_codecs_iso2022.c --- a/Modules/cjkcodecs/_codecs_iso2022.c Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/_codecs_iso2022.c Thu Apr 11 01:48:14 2013 +0200 @@ -102,8 +102,8 @@ /*-*- internal data structures -*-*/ typedef int (*iso2022_init_func)(void); -typedef ucs4_t (*iso2022_decode_func)(const unsigned char *data); -typedef DBCHAR (*iso2022_encode_func)(const ucs4_t *data, Py_ssize_t *length); +typedef Py_UCS4 (*iso2022_decode_func)(const unsigned char *data); +typedef DBCHAR (*iso2022_encode_func)(const Py_UCS4 *data, Py_ssize_t *length); struct iso2022_designation { unsigned char mark; @@ -158,7 +158,7 @@ ENCODER(iso2022) while (inleft > 0) { const struct iso2022_designation *dsg; DBCHAR encoded; - ucs4_t c = **inbuf; + Py_UCS4 c = **inbuf; Py_ssize_t insize; if (c < 0x80) { @@ -196,9 +196,9 @@ ENCODER(iso2022) length = 2; #if Py_UNICODE_SIZE == 2 if (length == 2) { - ucs4_t u4in[2]; - u4in[0] = (ucs4_t)IN1; - u4in[1] = (ucs4_t)IN2; + Py_UCS4 u4in[2]; + u4in[0] = (Py_UCS4)IN1; + u4in[1] = (Py_UCS4)IN2; encoded = dsg->encoder(u4in, &length); } else encoded = dsg->encoder(&c, &length); @@ -277,7 +277,7 @@ ENCODER(iso2022) 
WRITE2(encoded >> 8, encoded & 0xff) NEXT_OUT(2) } - NEXT_IN(insize) + NEXT_IN(insize); } return 0; @@ -376,45 +376,43 @@ iso2022processesc(const void *config, Mu return 0; } -#define ISO8859_7_DECODE(c, assi) \ - if ((c) < 0xa0) (assi) = (c); \ - else if ((c) < 0xc0 && (0x288f3bc9L & (1L << ((c)-0xa0)))) \ - (assi) = (c); \ - else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 || \ - (0xbffffd77L & (1L << ((c)-0xb4))))) \ - (assi) = 0x02d0 + (c); \ - else if ((c) == 0xa1) (assi) = 0x2018; \ - else if ((c) == 0xa2) (assi) = 0x2019; \ - else if ((c) == 0xaf) (assi) = 0x2015; +#define ISO8859_7_DECODE(c, writer) \ + if ((c) < 0xa0) OUT(c); \ + else if ((c) < 0xc0 && (0x288f3bc9L & (1L << ((c)-0xa0)))) \ + OUT(c); \ + else if ((c) >= 0xb4 && (c) <= 0xfe && ((c) >= 0xd4 || \ + (0xbffffd77L & (1L << ((c)-0xb4))))) \ + OUT(0x02d0 + (c)); \ + else if ((c) == 0xa1) OUT(0x2018); \ + else if ((c) == 0xa2) OUT(0x2019); \ + else if ((c) == 0xaf) OUT(0x2015); static Py_ssize_t iso2022processg2(const void *config, MultibyteCodec_State *state, const unsigned char **inbuf, Py_ssize_t *inleft, - Py_UNICODE **outbuf, Py_ssize_t *outleft) + _PyUnicodeWriter *writer) { /* not written to use encoder, decoder functions because only few * encodings use G2 designations in CJKCodecs */ if (STATE_G2 == CHARSET_ISO8859_1) { if (IN3 < 0x80) - OUT1(IN3 + 0x80) + OUT(IN3 + 0x80); else return 3; } else if (STATE_G2 == CHARSET_ISO8859_7) { - ISO8859_7_DECODE(IN3 ^ 0x80, **outbuf) + ISO8859_7_DECODE(IN3 ^ 0x80, writer) else return 3; } else if (STATE_G2 == CHARSET_ASCII) { if (IN3 & 0x80) return 3; - else **outbuf = IN3; + else OUT(IN3); } else return MBERR_INTERNAL; (*inbuf) += 3; *inleft -= 3; - (*outbuf) += 1; - *outleft -= 1; return 0; } @@ -429,8 +427,8 @@ DECODER(iso2022) if (STATE_GETFLAG(F_ESCTHROUGHOUT)) { /* ESC throughout mode: * for non-iso2022 escape sequences */ - WRITE1(c) /* assume as ISO-8859-1 */ - NEXT(1, 1) + OUT(c); /* assume as ISO-8859-1 */ + NEXT_IN(1); if (IS_ESCEND(c)) { 
STATE_CLEARFLAG(F_ESCTHROUGHOUT) } @@ -449,32 +447,32 @@ DECODER(iso2022) else if (CONFIG_ISSET(USE_G2) && IN2 == 'N') {/* SS2 */ REQUIRE_INBUF(3) err = iso2022processg2(config, state, - inbuf, &inleft, outbuf, &outleft); + inbuf, &inleft, writer); if (err != 0) return err; } else { - WRITE1(ESC) + OUT(ESC); STATE_SETFLAG(F_ESCTHROUGHOUT) - NEXT(1, 1) + NEXT_IN(1); } break; case SI: if (CONFIG_ISSET(NO_SHIFT)) goto bypass; STATE_CLEARFLAG(F_SHIFTED) - NEXT_IN(1) + NEXT_IN(1); break; case SO: if (CONFIG_ISSET(NO_SHIFT)) goto bypass; STATE_SETFLAG(F_SHIFTED) - NEXT_IN(1) + NEXT_IN(1); break; case LF: STATE_CLEARFLAG(F_SHIFTED) - WRITE1(LF) - NEXT(1, 1) + OUT(LF); + NEXT_IN(1); break; default: if (c < 0x20) /* C0 */ @@ -484,7 +482,7 @@ DECODER(iso2022) else { const struct iso2022_designation *dsg; unsigned char charset; - ucs4_t decoded; + Py_UCS4 decoded; if (STATE_GETFLAG(F_SHIFTED)) charset = STATE_G1; @@ -492,8 +490,8 @@ DECODER(iso2022) charset = STATE_G0; if (charset == CHARSET_ASCII) { -bypass: WRITE1(c) - NEXT(1, 1) +bypass: OUT(c); + NEXT_IN(1); break; } @@ -518,17 +516,15 @@ bypass: return dsg->width; if (decoded < 0x10000) { - WRITE1(decoded) - NEXT_OUT(1) + OUT(decoded); } else if (decoded < 0x30000) { - WRITEUCS4(decoded) + OUT(decoded); } else { /* JIS X 0213 pairs */ - WRITE2(decoded >> 16, decoded & 0xffff) - NEXT_OUT(2) + OUTCHAR2(decoded >> 16, decoded & 0xffff); } - NEXT_IN(dsg->width) + NEXT_IN(dsg->width); } break; } @@ -577,18 +573,18 @@ ksx1001_init(void) return 0; } -static ucs4_t +static Py_UCS4 ksx1001_decoder(const unsigned char *data) { - ucs4_t u; - TRYMAP_DEC(ksx1001, u, data[0], data[1]) + Py_UCS4 u; + TRYMAP_DEC_CHAR(ksx1001, u, data[0], data[1]) return u; else return MAP_UNMAPPABLE; } static DBCHAR -ksx1001_encoder(const ucs4_t *data, Py_ssize_t *length) +ksx1001_encoder(const Py_UCS4 *data, Py_ssize_t *length) { DBCHAR coded; assert(*length == 1); @@ -613,20 +609,20 @@ jisx0208_init(void) return 0; } -static ucs4_t +static Py_UCS4 
jisx0208_decoder(const unsigned char *data) { - ucs4_t u; + Py_UCS4 u; if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */ return 0xff3c; - else TRYMAP_DEC(jisx0208, u, data[0], data[1]) + else TRYMAP_DEC_CHAR(jisx0208, u, data[0], data[1]) return u; else return MAP_UNMAPPABLE; } static DBCHAR -jisx0208_encoder(const ucs4_t *data, Py_ssize_t *length) +jisx0208_encoder(const Py_UCS4 *data, Py_ssize_t *length) { DBCHAR coded; assert(*length == 1); @@ -654,18 +650,18 @@ jisx0212_init(void) return 0; } -static ucs4_t +static Py_UCS4 jisx0212_decoder(const unsigned char *data) { - ucs4_t u; - TRYMAP_DEC(jisx0212, u, data[0], data[1]) + Py_UCS4 u; + TRYMAP_DEC_CHAR(jisx0212, u, data[0], data[1]) return u; else return MAP_UNMAPPABLE; } static DBCHAR -jisx0212_encoder(const ucs4_t *data, Py_ssize_t *length) +jisx0212_encoder(const Py_UCS4 *data, Py_ssize_t *length) { DBCHAR coded; assert(*length == 1); @@ -705,30 +701,30 @@ jisx0213_init(void) } #define config ((void *)2000) -static ucs4_t +static Py_UCS4 jisx0213_2000_1_decoder(const unsigned char *data) { - ucs4_t u; + Py_UCS4 u; EMULATE_JISX0213_2000_DECODE_PLANE1(u, data[0], data[1]) else if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */ return 0xff3c; - else TRYMAP_DEC(jisx0208, u, data[0], data[1]); - else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]); - else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]) + else TRYMAP_DEC_CHAR(jisx0208, u, data[0], data[1]); + else TRYMAP_DEC_CHAR(jisx0213_1_bmp, u, data[0], data[1]); + else TRYMAP_DEC_CHAR(jisx0213_1_emp, u, data[0], data[1]) u |= 0x20000; - else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]); + else TRYMAP_DEC_CHAR(jisx0213_pair, u, data[0], data[1]); else return MAP_UNMAPPABLE; return u; } -static ucs4_t +static Py_UCS4 jisx0213_2000_2_decoder(const unsigned char *data) { - ucs4_t u; - EMULATE_JISX0213_2000_DECODE_PLANE2(u, data[0], data[1]) - TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]); - else TRYMAP_DEC(jisx0213_2_emp, u, 
data[0], data[1]) + Py_UCS4 u; + EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(u, data[0], data[1]) + TRYMAP_DEC_CHAR(jisx0213_2_bmp, u, data[0], data[1]); + else TRYMAP_DEC_CHAR(jisx0213_2_emp, u, data[0], data[1]) u |= 0x20000; else return MAP_UNMAPPABLE; @@ -736,28 +732,28 @@ jisx0213_2000_2_decoder(const unsigned c } #undef config -static ucs4_t +static Py_UCS4 jisx0213_2004_1_decoder(const unsigned char *data) { - ucs4_t u; + Py_UCS4 u; if (data[0] == 0x21 && data[1] == 0x40) /* F/W REVERSE SOLIDUS */ return 0xff3c; - else TRYMAP_DEC(jisx0208, u, data[0], data[1]); - else TRYMAP_DEC(jisx0213_1_bmp, u, data[0], data[1]); - else TRYMAP_DEC(jisx0213_1_emp, u, data[0], data[1]) + else TRYMAP_DEC_CHAR(jisx0208, u, data[0], data[1]); + else TRYMAP_DEC_CHAR(jisx0213_1_bmp, u, data[0], data[1]); + else TRYMAP_DEC_CHAR(jisx0213_1_emp, u, data[0], data[1]) u |= 0x20000; - else TRYMAP_DEC(jisx0213_pair, u, data[0], data[1]); + else TRYMAP_DEC_CHAR(jisx0213_pair, u, data[0], data[1]); else return MAP_UNMAPPABLE; return u; } -static ucs4_t +static Py_UCS4 jisx0213_2004_2_decoder(const unsigned char *data) { - ucs4_t u; - TRYMAP_DEC(jisx0213_2_bmp, u, data[0], data[1]); - else TRYMAP_DEC(jisx0213_2_emp, u, data[0], data[1]) + Py_UCS4 u; + TRYMAP_DEC_CHAR(jisx0213_2_bmp, u, data[0], data[1]); + else TRYMAP_DEC_CHAR(jisx0213_2_emp, u, data[0], data[1]) u |= 0x20000; else return MAP_UNMAPPABLE; @@ -765,7 +761,7 @@ jisx0213_2004_2_decoder(const unsigned c } static DBCHAR -jisx0213_encoder(const ucs4_t *data, Py_ssize_t *length, void *config) +jisx0213_encoder(const Py_UCS4 *data, Py_ssize_t *length, void *config) { DBCHAR coded; @@ -819,7 +815,7 @@ jisx0213_encoder(const ucs4_t *data, Py_ } static DBCHAR -jisx0213_2000_1_encoder(const ucs4_t *data, Py_ssize_t *length) +jisx0213_2000_1_encoder(const Py_UCS4 *data, Py_ssize_t *length) { DBCHAR coded = jisx0213_encoder(data, length, (void *)2000); if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) @@ -831,7 +827,7 @@ 
jisx0213_2000_1_encoder(const ucs4_t *da } static DBCHAR -jisx0213_2000_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length) +jisx0213_2000_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length) { DBCHAR coded; Py_ssize_t ilength = *length; @@ -854,7 +850,7 @@ jisx0213_2000_1_encoder_paironly(const u } static DBCHAR -jisx0213_2000_2_encoder(const ucs4_t *data, Py_ssize_t *length) +jisx0213_2000_2_encoder(const Py_UCS4 *data, Py_ssize_t *length) { DBCHAR coded = jisx0213_encoder(data, length, (void *)2000); if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) @@ -866,7 +862,7 @@ jisx0213_2000_2_encoder(const ucs4_t *da } static DBCHAR -jisx0213_2004_1_encoder(const ucs4_t *data, Py_ssize_t *length) +jisx0213_2004_1_encoder(const Py_UCS4 *data, Py_ssize_t *length) { DBCHAR coded = jisx0213_encoder(data, length, NULL); if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) @@ -878,7 +874,7 @@ jisx0213_2004_1_encoder(const ucs4_t *da } static DBCHAR -jisx0213_2004_1_encoder_paironly(const ucs4_t *data, Py_ssize_t *length) +jisx0213_2004_1_encoder_paironly(const Py_UCS4 *data, Py_ssize_t *length) { DBCHAR coded; Py_ssize_t ilength = *length; @@ -901,7 +897,7 @@ jisx0213_2004_1_encoder_paironly(const u } static DBCHAR -jisx0213_2004_2_encoder(const ucs4_t *data, Py_ssize_t *length) +jisx0213_2004_2_encoder(const Py_UCS4 *data, Py_ssize_t *length) { DBCHAR coded = jisx0213_encoder(data, length, NULL); if (coded == MAP_UNMAPPABLE || coded == MAP_MULTIPLE_AVAIL) @@ -912,17 +908,17 @@ jisx0213_2004_2_encoder(const ucs4_t *da return MAP_UNMAPPABLE; } -static ucs4_t +static Py_UCS4 jisx0201_r_decoder(const unsigned char *data) { - ucs4_t u; - JISX0201_R_DECODE(*data, u) + Py_UCS4 u; + JISX0201_R_DECODE_CHAR(*data, u) else return MAP_UNMAPPABLE; return u; } static DBCHAR -jisx0201_r_encoder(const ucs4_t *data, Py_ssize_t *length) +jisx0201_r_encoder(const Py_UCS4 *data, Py_ssize_t *length) { DBCHAR coded; JISX0201_R_ENCODE(*data, coded) @@ -930,17 +926,17 
@@ jisx0201_r_encoder(const ucs4_t *data, P return coded; } -static ucs4_t +static Py_UCS4 jisx0201_k_decoder(const unsigned char *data) { - ucs4_t u; - JISX0201_K_DECODE(*data ^ 0x80, u) + Py_UCS4 u; + JISX0201_K_DECODE_CHAR(*data ^ 0x80, u) else return MAP_UNMAPPABLE; return u; } static DBCHAR -jisx0201_k_encoder(const ucs4_t *data, Py_ssize_t *length) +jisx0201_k_encoder(const Py_UCS4 *data, Py_ssize_t *length) { DBCHAR coded; JISX0201_K_ENCODE(*data, coded) @@ -961,18 +957,18 @@ gb2312_init(void) return 0; } -static ucs4_t +static Py_UCS4 gb2312_decoder(const unsigned char *data) { - ucs4_t u; - TRYMAP_DEC(gb2312, u, data[0], data[1]) + Py_UCS4 u; + TRYMAP_DEC_CHAR(gb2312, u, data[0], data[1]) return u; else return MAP_UNMAPPABLE; } static DBCHAR -gb2312_encoder(const ucs4_t *data, Py_ssize_t *length) +gb2312_encoder(const Py_UCS4 *data, Py_ssize_t *length) { DBCHAR coded; assert(*length == 1); @@ -986,14 +982,14 @@ gb2312_encoder(const ucs4_t *data, Py_ss } -static ucs4_t +static Py_UCS4 dummy_decoder(const unsigned char *data) { return MAP_UNMAPPABLE; } static DBCHAR -dummy_encoder(const ucs4_t *data, Py_ssize_t *length) +dummy_encoder(const Py_UCS4 *data, Py_ssize_t *length) { return MAP_UNMAPPABLE; } diff -r 0b2d4089180c Modules/cjkcodecs/_codecs_jp.c --- a/Modules/cjkcodecs/_codecs_jp.c Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/_codecs_jp.c Thu Apr 11 01:48:14 2013 +0200 @@ -20,7 +20,7 @@ ENCODER(cp932) { while (inleft > 0) { - Py_UNICODE c = IN1; + Py_UCS4 c = IN1; DBCHAR code; unsigned char c1, c2; @@ -66,8 +66,8 @@ ENCODER(cp932) } else if (c >= 0xe000 && c < 0xe758) { /* User-defined area */ - c1 = (Py_UNICODE)(c - 0xe000) / 188; - c2 = (Py_UNICODE)(c - 0xe000) % 188; + c1 = (Py_UCS4)(c - 0xe000) / 188; + c2 = (Py_UCS4)(c - 0xe000) % 188; OUT1(c1 + 0xf0) OUT2(c2 < 0x3f ? 
c2 + 0x40 : c2 + 0x41) } @@ -85,31 +85,30 @@ DECODER(cp932) while (inleft > 0) { unsigned char c = IN1, c2; - REQUIRE_OUTBUF(1) if (c <= 0x80) { - OUT1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); continue; } else if (c >= 0xa0 && c <= 0xdf) { if (c == 0xa0) - OUT1(0xf8f0) /* half-width katakana */ + OUT(0xf8f0); /* half-width katakana */ else - OUT1(0xfec0 + c) - NEXT(1, 1) + OUT(0xfec0 + c); + NEXT_IN(1); continue; } else if (c >= 0xfd/* && c <= 0xff*/) { /* Windows compatibility */ - OUT1(0xf8f1 - 0xfd + c) - NEXT(1, 1) + OUT(0xf8f1 - 0xfd + c); + NEXT_IN(1); continue; } REQUIRE_INBUF(2) c2 = IN2; - TRYMAP_DEC(cp932ext, **outbuf, c, c2); + TRYMAP_DEC(cp932ext, writer, c, c2); else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){ if (c2 < 0x40 || (c2 > 0x7e && c2 < 0x80) || c2 > 0xfc) return 1; @@ -119,21 +118,21 @@ DECODER(cp932) c = (2 * c + (c2 < 0x5e ? 0 : 1) + 0x21); c2 = (c2 < 0x5e ? c2 : c2 - 0x5e) + 0x21; - TRYMAP_DEC(jisx0208, **outbuf, c, c2); + TRYMAP_DEC(jisx0208, writer, c, c2); else return 1; } else if (c >= 0xf0 && c <= 0xf9) { if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfc)) - OUT1(0xe000 + 188 * (c - 0xf0) + - (c2 < 0x80 ? c2 - 0x40 : c2 - 0x41)) + OUT(0xe000 + 188 * (c - 0xf0) + + (c2 < 0x80 ? 
c2 - 0x40 : c2 - 0x41)); else return 1; } else return 1; - NEXT(2, 1) + NEXT_IN(2); } return 0; @@ -147,7 +146,7 @@ DECODER(cp932) ENCODER(euc_jis_2004) { while (inleft > 0) { - ucs4_t c = IN1; + Py_UCS4 c = IN1; DBCHAR code; Py_ssize_t insize; @@ -235,13 +234,11 @@ DECODER(euc_jis_2004) { while (inleft > 0) { unsigned char c = IN1; - ucs4_t code; - - REQUIRE_OUTBUF(1) + Py_UCS4 code; if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); continue; } @@ -252,8 +249,8 @@ DECODER(euc_jis_2004) REQUIRE_INBUF(2) c2 = IN2; if (c2 >= 0xa1 && c2 <= 0xdf) { - OUT1(0xfec0 + c2) - NEXT(2, 1) + OUT(0xfec0 + c2); + NEXT_IN(2); } else return 1; @@ -266,16 +263,16 @@ DECODER(euc_jis_2004) c3 = IN3 ^ 0x80; /* JIS X 0213 Plane 2 or JIS X 0212 (see NOTES) */ - EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf, c2, c3) - else TRYMAP_DEC(jisx0213_2_bmp, **outbuf, c2, c3) ; - else TRYMAP_DEC(jisx0213_2_emp, code, c2, c3) { - WRITEUCS4(EMPBASE | code) - NEXT_IN(3) + EMULATE_JISX0213_2000_DECODE_PLANE2(writer, c2, c3) + else TRYMAP_DEC(jisx0213_2_bmp, writer, c2, c3) ; + else TRYMAP_DEC_CHAR(jisx0213_2_emp, code, c2, c3) { + OUT(EMPBASE | code); + NEXT_IN(3); continue; } - else TRYMAP_DEC(jisx0212, **outbuf, c2, c3) ; + else TRYMAP_DEC(jisx0212, writer, c2, c3) ; else return 1; - NEXT(3, 1) + NEXT_IN(3); } else { unsigned char c2; @@ -285,23 +282,23 @@ DECODER(euc_jis_2004) c2 = IN2 ^ 0x80; /* JIS X 0213 Plane 1 */ - EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf, c, c2) - else if (c == 0x21 && c2 == 0x40) **outbuf = 0xff3c; - else if (c == 0x22 && c2 == 0x32) **outbuf = 0xff5e; - else TRYMAP_DEC(jisx0208, **outbuf, c, c2); - else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, c, c2); - else TRYMAP_DEC(jisx0213_1_emp, code, c, c2) { - WRITEUCS4(EMPBASE | code) - NEXT_IN(2) + EMULATE_JISX0213_2000_DECODE_PLANE1(writer, c, c2) + else if (c == 0x21 && c2 == 0x40) OUT(0xff3c); + else if (c == 0x22 && c2 == 0x32) OUT(0xff5e); + else TRYMAP_DEC(jisx0208, writer, c, c2); + else 
TRYMAP_DEC(jisx0213_1_bmp, writer, c, c2); + else TRYMAP_DEC_CHAR(jisx0213_1_emp, code, c, c2) { + OUT(EMPBASE | code); + NEXT_IN(2); continue; } - else TRYMAP_DEC(jisx0213_pair, code, c, c2) { - WRITE2(code >> 16, code & 0xffff) - NEXT(2, 2) + else TRYMAP_DEC_CHAR(jisx0213_pair, code, c, c2) { + OUTCHAR2(code >> 16, code & 0xffff); + NEXT_IN(2); continue; } else return 1; - NEXT(2, 1) + NEXT_IN(2); } } @@ -316,7 +313,7 @@ DECODER(euc_jis_2004) ENCODER(euc_jp) { while (inleft > 0) { - Py_UNICODE c = IN1; + Py_UCS4 c = IN1; DBCHAR code; if (c < 0x80) { @@ -369,11 +366,9 @@ DECODER(euc_jp) while (inleft > 0) { unsigned char c = IN1; - REQUIRE_OUTBUF(1) - if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); continue; } @@ -384,8 +379,8 @@ DECODER(euc_jp) REQUIRE_INBUF(2) c2 = IN2; if (c2 >= 0xa1 && c2 <= 0xdf) { - OUT1(0xfec0 + c2) - NEXT(2, 1) + OUT(0xfec0 + c2); + NEXT_IN(2); } else return 1; @@ -397,8 +392,8 @@ DECODER(euc_jp) c2 = IN2; c3 = IN3; /* JIS X 0212 */ - TRYMAP_DEC(jisx0212, **outbuf, c2 ^ 0x80, c3 ^ 0x80) { - NEXT(3, 1) + TRYMAP_DEC(jisx0212, writer, c2 ^ 0x80, c3 ^ 0x80) { + NEXT_IN(3); } else return 1; @@ -412,13 +407,13 @@ DECODER(euc_jp) #ifndef STRICT_BUILD if (c == 0xa1 && c2 == 0xc0) /* FULL-WIDTH REVERSE SOLIDUS */ - **outbuf = 0xff3c; + OUT(0xff3c); else #endif - TRYMAP_DEC(jisx0208, **outbuf, + TRYMAP_DEC(jisx0208, writer, c ^ 0x80, c2 ^ 0x80) ; else return 1; - NEXT(2, 1) + NEXT_IN(2); } } @@ -433,7 +428,7 @@ DECODER(euc_jp) ENCODER(shift_jis) { while (inleft > 0) { - Py_UNICODE c = IN1; + Py_UCS4 c = IN1; DBCHAR code; unsigned char c1, c2; @@ -488,14 +483,12 @@ DECODER(shift_jis) while (inleft > 0) { unsigned char c = IN1; - REQUIRE_OUTBUF(1) - #ifdef STRICT_BUILD - JISX0201_R_DECODE(c, **outbuf) + JISX0201_R_DECODE(c, writer) #else - if (c < 0x80) **outbuf = c; + if (c < 0x80) OUT(c); #endif - else JISX0201_K_DECODE(c, **outbuf) + else JISX0201_K_DECODE(c, writer) else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xea)){ 
unsigned char c1, c2; @@ -512,13 +505,13 @@ DECODER(shift_jis) #ifndef STRICT_BUILD if (c1 == 0x21 && c2 == 0x40) { /* FULL-WIDTH REVERSE SOLIDUS */ - OUT1(0xff3c) - NEXT(2, 1) + OUT(0xff3c); + NEXT_IN(2); continue; } #endif - TRYMAP_DEC(jisx0208, **outbuf, c1, c2) { - NEXT(2, 1) + TRYMAP_DEC(jisx0208, writer, c1, c2) { + NEXT_IN(2); continue; } else @@ -527,7 +520,7 @@ DECODER(shift_jis) else return 1; - NEXT(1, 1) /* JIS X 0201 */ + NEXT_IN(1); /* JIS X 0201 */ } return 0; @@ -541,7 +534,7 @@ DECODER(shift_jis) ENCODER(shift_jis_2004) { while (inleft > 0) { - ucs4_t c = IN1; + Py_UCS4 c = IN1; DBCHAR code = NOCHAR; int c1, c2; Py_ssize_t insize; @@ -636,11 +629,10 @@ DECODER(shift_jis_2004) while (inleft > 0) { unsigned char c = IN1; - REQUIRE_OUTBUF(1) - JISX0201_DECODE(c, **outbuf) + JISX0201_DECODE(c, writer) else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc)){ unsigned char c1, c2; - ucs4_t code; + Py_UCS4 code; REQUIRE_INBUF(2) c2 = IN2; @@ -654,50 +646,47 @@ DECODER(shift_jis_2004) if (c1 < 0x5e) { /* Plane 1 */ c1 += 0x21; - EMULATE_JISX0213_2000_DECODE_PLANE1(**outbuf, + EMULATE_JISX0213_2000_DECODE_PLANE1(writer, c1, c2) - else TRYMAP_DEC(jisx0208, **outbuf, c1, c2) { - NEXT_OUT(1) + else TRYMAP_DEC(jisx0208, writer, c1, c2) { } - else TRYMAP_DEC(jisx0213_1_bmp, **outbuf, + else TRYMAP_DEC(jisx0213_1_bmp, writer, c1, c2) { - NEXT_OUT(1) } - else TRYMAP_DEC(jisx0213_1_emp, code, c1, c2) { - WRITEUCS4(EMPBASE | code) + else TRYMAP_DEC_CHAR(jisx0213_1_emp, code, c1, c2) { + OUT(EMPBASE | code); } - else TRYMAP_DEC(jisx0213_pair, code, c1, c2) { - WRITE2(code >> 16, code & 0xffff) - NEXT_OUT(2) + else TRYMAP_DEC_CHAR(jisx0213_pair, code, c1, c2) { + OUTCHAR2(code >> 16, code & 0xffff); } else return 1; - NEXT_IN(2) + NEXT_IN(2); } else { /* Plane 2 */ if (c1 >= 0x67) c1 += 0x07; else if (c1 >= 0x63 || c1 == 0x5f) c1 -= 0x37; else c1 -= 0x3d; - EMULATE_JISX0213_2000_DECODE_PLANE2(**outbuf, + EMULATE_JISX0213_2000_DECODE_PLANE2(writer, c1, c2) - 
else TRYMAP_DEC(jisx0213_2_bmp, **outbuf, - c1, c2) ; - else TRYMAP_DEC(jisx0213_2_emp, code, c1, c2) { - WRITEUCS4(EMPBASE | code) - NEXT_IN(2) + else TRYMAP_DEC(jisx0213_2_bmp, writer, + c1, c2) { + } else TRYMAP_DEC_CHAR(jisx0213_2_emp, code, c1, c2) { + OUT(EMPBASE | code); + NEXT_IN(2); continue; } else return 1; - NEXT(2, 1) + NEXT_IN(2); } continue; } else return 1; - NEXT(1, 1) /* JIS X 0201 */ + NEXT_IN(1); /* JIS X 0201 */ } return 0; diff -r 0b2d4089180c Modules/cjkcodecs/_codecs_kr.c --- a/Modules/cjkcodecs/_codecs_kr.c Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/_codecs_kr.c Thu Apr 11 01:48:14 2013 +0200 @@ -34,7 +34,7 @@ static const unsigned char u2cgk_jongseo ENCODER(euc_kr) { while (inleft > 0) { - Py_UNICODE c = IN1; + Py_UCS4 c = IN1; DBCHAR code; if (c < 0x80) { @@ -104,11 +104,9 @@ DECODER(euc_kr) while (inleft > 0) { unsigned char c = IN1; - REQUIRE_OUTBUF(1) - if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); continue; } @@ -145,11 +143,11 @@ DECODER(euc_kr) if (cho == NONE || jung == NONE || jong == NONE) return 1; - OUT1(0xac00 + cho*588 + jung*28 + jong); - NEXT(8, 1) + OUT(0xac00 + cho*588 + jung*28 + jong);; + NEXT_IN(8); } - else TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80) { - NEXT(2, 1) + else TRYMAP_DEC(ksx1001, writer, c ^ 0x80, IN2 ^ 0x80) { + NEXT_IN(2); } else return 1; @@ -167,7 +165,7 @@ DECODER(euc_kr) ENCODER(cp949) { while (inleft > 0) { - Py_UNICODE c = IN1; + Py_UCS4 c = IN1; DBCHAR code; if (c < 0x80) { @@ -197,20 +195,18 @@ DECODER(cp949) while (inleft > 0) { unsigned char c = IN1; - REQUIRE_OUTBUF(1) - if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); continue; } REQUIRE_INBUF(2) - TRYMAP_DEC(ksx1001, **outbuf, c ^ 0x80, IN2 ^ 0x80); - else TRYMAP_DEC(cp949ext, **outbuf, c, IN2); + TRYMAP_DEC(ksx1001, writer, c ^ 0x80, IN2 ^ 0x80); + else TRYMAP_DEC(cp949ext, writer, c, IN2); else return 1; - NEXT(2, 1) + NEXT_IN(2); } return 0; @@ -251,7 +247,7 @@ static const DBCHAR 
u2johabjamo[] = { ENCODER(johab) { while (inleft > 0) { - Py_UNICODE c = IN1; + Py_UCS4 c = IN1; DBCHAR code; if (c < 0x80) { @@ -350,11 +346,9 @@ DECODER(johab) while (inleft > 0) { unsigned char c = IN1, c2; - REQUIRE_OUTBUF(1) - if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); continue; } @@ -381,33 +375,33 @@ DECODER(johab) if (i_cho == FILL) { if (i_jung == FILL) { if (i_jong == FILL) - OUT1(0x3000) + OUT(0x3000); else - OUT1(0x3100 | - johabjamo_jongseong[c_jong]) + OUT(0x3100 | + johabjamo_jongseong[c_jong]); } else { if (i_jong == FILL) - OUT1(0x3100 | - johabjamo_jungseong[c_jung]) + OUT(0x3100 | + johabjamo_jungseong[c_jung]); else return 1; } } else { if (i_jung == FILL) { if (i_jong == FILL) - OUT1(0x3100 | - johabjamo_choseong[c_cho]) + OUT(0x3100 | + johabjamo_choseong[c_cho]); else return 1; } else - OUT1(0xac00 + - i_cho * 588 + - i_jung * 28 + - (i_jong == FILL ? 0 : i_jong)) + OUT(0xac00 + + i_cho * 588 + + i_jung * 28 + + (i_jong == FILL ? 0 : i_jong)); } - NEXT(2, 1) + NEXT_IN(2); } else { /* KS X 1001 except hangul jamos and syllables */ if (c == 0xdf || c > 0xf9 || @@ -424,9 +418,9 @@ DECODER(johab) t1 = t1 + (t2 < 0x5e ? 0 : 1) + 0x21; t2 = (t2 < 0x5e ? 
t2 : t2 - 0x5e) + 0x21; - TRYMAP_DEC(ksx1001, **outbuf, t1, t2); + TRYMAP_DEC(ksx1001, writer, t1, t2); else return 1; - NEXT(2, 1) + NEXT_IN(2); } } } diff -r 0b2d4089180c Modules/cjkcodecs/_codecs_tw.c --- a/Modules/cjkcodecs/_codecs_tw.c Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/_codecs_tw.c Thu Apr 11 01:48:14 2013 +0200 @@ -14,7 +14,7 @@ ENCODER(big5) { while (inleft > 0) { - Py_UNICODE c = **inbuf; + Py_UCS4 c = **inbuf; DBCHAR code; if (c < 0x80) { @@ -43,17 +43,15 @@ DECODER(big5) while (inleft > 0) { unsigned char c = IN1; - REQUIRE_OUTBUF(1) - if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); continue; } REQUIRE_INBUF(2) - TRYMAP_DEC(big5, **outbuf, c, IN2) { - NEXT(2, 1) + TRYMAP_DEC(big5, writer, c, IN2) { + NEXT_IN(2); } else return 1; } @@ -69,7 +67,7 @@ DECODER(big5) ENCODER(cp950) { while (inleft > 0) { - Py_UNICODE c = IN1; + Py_UCS4 c = IN1; DBCHAR code; if (c < 0x80) { @@ -97,21 +95,19 @@ DECODER(cp950) while (inleft > 0) { unsigned char c = IN1; - REQUIRE_OUTBUF(1) - if (c < 0x80) { - OUT1(c) - NEXT(1, 1) + OUT(c); + NEXT_IN(1); continue; } REQUIRE_INBUF(2) - TRYMAP_DEC(cp950ext, **outbuf, c, IN2); - else TRYMAP_DEC(big5, **outbuf, c, IN2); + TRYMAP_DEC(cp950ext, writer, c, IN2); + else TRYMAP_DEC(big5, writer, c, IN2); else return 1; - NEXT(2, 1) + NEXT_IN(2); } return 0; diff -r 0b2d4089180c Modules/cjkcodecs/alg_jisx0201.h --- a/Modules/cjkcodecs/alg_jisx0201.h Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/alg_jisx0201.h Thu Apr 11 01:48:14 2013 +0200 @@ -10,15 +10,24 @@ JISX0201_R_ENCODE(c, assi) \ else JISX0201_K_ENCODE(c, assi) -#define JISX0201_R_DECODE(c, assi) \ +#define JISX0201_R_DECODE_CHAR(c, assi) \ if ((c) < 0x5c) (assi) = (c); \ else if ((c) == 0x5c) (assi) = 0x00a5; \ else if ((c) < 0x7e) (assi) = (c); \ else if ((c) == 0x7e) (assi) = 0x203e; \ else if ((c) == 0x7f) (assi) = 0x7f; -#define JISX0201_K_DECODE(c, assi) \ +#define JISX0201_R_DECODE(c, writer) \ + if ((c) < 0x5c) OUT(c); \ + else if 
((c) == 0x5c) OUT(0x00a5); \ + else if ((c) < 0x7e) OUT(c); \ + else if ((c) == 0x7e) OUT(0x203e); \ + else if ((c) == 0x7f) OUT(0x7f); +#define JISX0201_K_DECODE(c, writer) \ if ((c) >= 0xa1 && (c) <= 0xdf) \ - (assi) = 0xfec0 + (c); -#define JISX0201_DECODE(c, assi) \ - JISX0201_R_DECODE(c, assi) \ - else JISX0201_K_DECODE(c, assi) + OUT(0xfec0 + (c)); +#define JISX0201_K_DECODE_CHAR(c, assi) \ + if ((c) >= 0xa1 && (c) <= 0xdf) \ + (assi) = 0xfec0 + (c); +#define JISX0201_DECODE(c, writer) \ + JISX0201_R_DECODE(c, writer) \ + else JISX0201_K_DECODE(c, writer) diff -r 0b2d4089180c Modules/cjkcodecs/cjkcodecs.h --- a/Modules/cjkcodecs/cjkcodecs.h Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/cjkcodecs.h Thu Apr 11 01:48:14 2013 +0200 @@ -33,7 +33,7 @@ struct dbcs_index { typedef struct dbcs_index decode_map; struct widedbcs_index { - const ucs4_t *map; + const Py_UCS4 *map; unsigned char bottom, top; }; typedef struct widedbcs_index widedecode_map; @@ -56,7 +56,7 @@ struct dbcs_map { }; struct pair_encodemap { - ucs4_t uniseq; + Py_UCS4 uniseq; DBCHAR code; }; @@ -86,7 +86,7 @@ static const struct dbcs_map *mapping_li static Py_ssize_t encoding##_decode( \ MultibyteCodec_State *state, const void *config, \ const unsigned char **inbuf, Py_ssize_t inleft, \ - Py_UNICODE **outbuf, Py_ssize_t outleft) + _PyUnicodeWriter *writer) #define DECODER_RESET(encoding) \ static Py_ssize_t encoding##_decode_reset( \ MultibyteCodec_State *state, const void *config) @@ -101,13 +101,15 @@ static const struct dbcs_map *mapping_li #endif #define NEXT_IN(i) \ - (*inbuf) += (i); \ - (inleft) -= (i); + do { \ + (*inbuf) += (i); \ + (inleft) -= (i); \ + } while (0) #define NEXT_OUT(o) \ (*outbuf) += (o); \ (outleft) -= (o); #define NEXT(i, o) \ - NEXT_IN(i) NEXT_OUT(o) + NEXT_IN(i); NEXT_OUT(o) #define REQUIRE_INBUF(n) \ if (inleft < (n)) \ @@ -121,6 +123,26 @@ static const struct dbcs_map *mapping_li #define IN3 ((*inbuf)[2]) #define IN4 ((*inbuf)[3]) +#define OUT(c) \ + do { \ 
+ if (_PyUnicodeWriter_WriteChar(writer, (c)) < 0) \ + return MBERR_TOOSMALL; \ + } while (0) + +#define OUTCHAR2(c1, c2) \ + do { \ + Py_UCS4 _c1 = (c1); \ + Py_UCS4 _c2 = (c2); \ + if (_PyUnicodeWriter_Prepare(writer, 2, _c1) < 0) \ + return MBERR_TOOSMALL; \ + if (_PyUnicodeWriter_Prepare(writer, 2, _c2) < 0) \ + return MBERR_TOOSMALL; \ + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, _c1); \ + writer->pos++; \ + PyUnicode_WRITE(writer->kind, writer->data, writer->pos, _c2); \ + writer->pos++; \ + } while (0) + #define OUT1(c) ((*outbuf)[0]) = (c); #define OUT2(c) ((*outbuf)[1]) = (c); #define OUT3(c) ((*outbuf)[2]) = (c); @@ -145,19 +167,6 @@ static const struct dbcs_map *mapping_li (*outbuf)[2] = (c3); \ (*outbuf)[3] = (c4); -#if Py_UNICODE_SIZE == 2 -# define WRITEUCS4(c) \ - REQUIRE_OUTBUF(2) \ - (*outbuf)[0] = Py_UNICODE_HIGH_SURROGATE(c); \ - (*outbuf)[1] = Py_UNICODE_LOW_SURROGATE(c); \ - NEXT_OUT(2) -#else -# define WRITEUCS4(c) \ - REQUIRE_OUTBUF(1) \ - **outbuf = (Py_UNICODE)(c); \ - NEXT_OUT(1) -#endif - #define _TRYMAP_ENC(m, assi, val) \ ((m)->map != NULL && (val) >= (m)->bottom && \ (val)<= (m)->top && ((assi) = (m)->map[(val) - \ @@ -167,24 +176,41 @@ static const struct dbcs_map *mapping_li #define TRYMAP_ENC(charset, assi, uni) \ if TRYMAP_ENC_COND(charset, assi, uni) -#define _TRYMAP_DEC(m, assi, val) \ - ((m)->map != NULL && (val) >= (m)->bottom && \ - (val)<= (m)->top && ((assi) = (m)->map[(val) - \ - (m)->bottom]) != UNIINV) -#define TRYMAP_DEC(charset, assi, c1, c2) \ - if _TRYMAP_DEC(&charset##_decmap[c1], assi, c2) +Py_LOCAL_INLINE(int) +_TRYMAP_DEC_WRITE(_PyUnicodeWriter *writer, Py_UCS4 c) +{ + if (c == UNIINV || _PyUnicodeWriter_WriteChar(writer, c) < 0) + return UNIINV; + else + return c; +} -#define _TRYMAP_ENC_MPLANE(m, assplane, asshi, asslo, val) \ - ((m)->map != NULL && (val) >= (m)->bottom && \ - (val)<= (m)->top && \ - ((assplane) = (m)->map[((val) - (m)->bottom)*3]) != 0 && \ +#define _TRYMAP_DEC(m, writer, val) \ + 
((m)->map != NULL && \ + (val) >= (m)->bottom && \ + (val)<= (m)->top && \ + _TRYMAP_DEC_WRITE(writer, (m)->map[(val) - (m)->bottom]) != UNIINV) +#define _TRYMAP_DEC_CHAR(m, assi, val) \ + ((m)->map != NULL && \ + (val) >= (m)->bottom && \ + (val)<= (m)->top && \ + ((assi) = (m)->map[(val) - (m)->bottom]) != UNIINV) +#define TRYMAP_DEC(charset, writer, c1, c2) \ + if _TRYMAP_DEC(&charset##_decmap[c1], writer, c2) +#define TRYMAP_DEC_CHAR(charset, assi, c1, c2) \ + if _TRYMAP_DEC_CHAR(&charset##_decmap[c1], assi, c2) + +#define _TRYMAP_ENC_MPLANE(m, assplane, asshi, asslo, val) \ + ((m)->map != NULL && (val) >= (m)->bottom && \ + (val)<= (m)->top && \ + ((assplane) = (m)->map[((val) - (m)->bottom)*3]) != 0 && \ (((asshi) = (m)->map[((val) - (m)->bottom)*3 + 1]), 1) && \ (((asslo) = (m)->map[((val) - (m)->bottom)*3 + 2]), 1)) #define TRYMAP_ENC_MPLANE(charset, assplane, asshi, asslo, uni) \ if _TRYMAP_ENC_MPLANE(&charset##_encmap[(uni) >> 8], \ assplane, asshi, asslo, (uni) & 0xff) -#define TRYMAP_DEC_MPLANE(charset, assi, plane, c1, c2) \ - if _TRYMAP_DEC(&charset##_decmap[plane][c1], assi, c2) +#define TRYMAP_DEC_MPLANE(charset, writer, plane, c1, c2) \ + if _TRYMAP_DEC(&charset##_decmap[plane][c1], writer, c2) #if Py_UNICODE_SIZE == 2 #define DECODE_SURROGATE(c) \ @@ -323,7 +349,7 @@ find_pairencmap(ucs2_t body, ucs2_t modi const struct pair_encodemap *haystack, int haystacksize) { int pos, min, max; - ucs4_t value = body << 16 | modifier; + Py_UCS4 value = body << 16 | modifier; min = 0; max = haystacksize; diff -r 0b2d4089180c Modules/cjkcodecs/emu_jisx0213_2000.h --- a/Modules/cjkcodecs/emu_jisx0213_2000.h Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/emu_jisx0213_2000.h Thu Apr 11 01:48:14 2013 +0200 @@ -38,6 +38,9 @@ ((c1) == 0x7E && (c2) == 0x7E))) \ return EMULATE_JISX0213_2000_DECODE_INVALID; -#define EMULATE_JISX0213_2000_DECODE_PLANE2(assi, c1, c2) \ +#define EMULATE_JISX0213_2000_DECODE_PLANE2(writer, c1, c2) \ + if (config == (void *)2000 && 
(c1) == 0x7D && (c2) == 0x3B) \ + OUT(0x9B1D); +#define EMULATE_JISX0213_2000_DECODE_PLANE2_CHAR(assi, c1, c2) \ if (config == (void *)2000 && (c1) == 0x7D && (c2) == 0x3B) \ (assi) = 0x9B1D; diff -r 0b2d4089180c Modules/cjkcodecs/mappings_cn.h --- a/Modules/cjkcodecs/mappings_cn.h Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/mappings_cn.h Thu Apr 11 01:48:14 2013 +0200 @@ -4049,7 +4049,7 @@ 0,0},{0,0,0},{0,0,0},{0,0,0},{0,0,0},{0, static const struct _gb18030_to_unibmp_ranges { - Py_UNICODE first, last; + Py_UCS4 first, last; DBCHAR base; } gb18030_to_unibmp_ranges[] = { {128,163,0},{165,166,36},{169,175,38},{178,182,45},{184,214,50},{216,223,81},{ diff -r 0b2d4089180c Modules/cjkcodecs/mappings_jisx0213_pair.h --- a/Modules/cjkcodecs/mappings_jisx0213_pair.h Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/mappings_jisx0213_pair.h Thu Apr 11 01:48:14 2013 +0200 @@ -3,7 +3,7 @@ static const struct widedbcs_index *jisx0213_pair_decmap; static const struct pair_encodemap *jisx0213_pair_encmap; #else -static const ucs4_t __jisx0213_pair_decmap[49] = { +static const Py_UCS4 __jisx0213_pair_decmap[49] = { 810234010,810365082,810496154,810627226,810758298,816525466,816656538, 816787610,816918682,817049754,817574042,818163866,818426010,838283418, 15074048,U,U,U,39060224,39060225,42730240,42730241,39387904,39387905,39453440, diff -r 0b2d4089180c Modules/cjkcodecs/multibytecodec.c --- a/Modules/cjkcodecs/multibytecodec.c Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/multibytecodec.c Thu Apr 11 01:48:14 2013 +0200 @@ -17,8 +17,8 @@ typedef struct { typedef struct { const unsigned char *inbuf, *inbuf_top, *inbuf_end; - Py_UNICODE *outbuf, *outbuf_end; - PyObject *excobj, *outobj; + PyObject *excobj; + _PyUnicodeWriter writer; } MultibyteDecodeBuffer; PyDoc_STRVAR(MultibyteCodec_Encode__doc__, @@ -197,29 +197,6 @@ expand_encodebuffer(MultibyteEncodeBuffe goto errorexit; \ } -static int -expand_decodebuffer(MultibyteDecodeBuffer *buf, Py_ssize_t esize) 
-{
-    Py_ssize_t orgpos, orgsize;
-
-    orgpos = (Py_ssize_t)(buf->outbuf - PyUnicode_AS_UNICODE(buf->outobj));
-    orgsize = PyUnicode_GET_SIZE(buf->outobj);
-    if (PyUnicode_Resize(&buf->outobj, orgsize + (
-        esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize)) == -1)
-        return -1;
-
-    buf->outbuf = PyUnicode_AS_UNICODE(buf->outobj) + orgpos;
-    buf->outbuf_end = PyUnicode_AS_UNICODE(buf->outobj)
-                      + PyUnicode_GET_SIZE(buf->outobj);
-
-    return 0;
-}
-#define REQUIRE_DECODEBUFFER(buf, s) { \
-    if ((s) < 1 || (buf)->outbuf + (s) > (buf)->outbuf_end) \
-        if (expand_decodebuffer(buf, s) == -1) \
-            goto errorexit; \
-}
-
 
 /**
  * MultibyteCodec object
@@ -374,7 +351,7 @@ multibytecodec_decerror(MultibyteCodec *
                         PyObject *errors, Py_ssize_t e)
 {
     PyObject *retobj = NULL, *retuni = NULL;
-    Py_ssize_t retunisize, newpos;
+    Py_ssize_t newpos;
     const char *reason;
     Py_ssize_t esize, start, end;
 
@@ -385,7 +362,6 @@ multibytecodec_decerror(MultibyteCodec *
     else {
         switch (e) {
         case MBERR_TOOSMALL:
-            REQUIRE_DECODEBUFFER(buf, -1);
-            return 0; /* retry it */
+            return -1; /* writer failed: exception is set, do not retry */
         case MBERR_TOOFEW:
             reason = "incomplete multibyte sequence";
@@ -403,8 +379,9 @@ multibytecodec_decerror(MultibyteCodec *
     }
 
     if (errors == ERROR_REPLACE) {
-        REQUIRE_DECODEBUFFER(buf, 1);
-        *buf->outbuf++ = Py_UNICODE_REPLACEMENT_CHARACTER;
+        if (_PyUnicodeWriter_WriteChar(&buf->writer,
+                Py_UNICODE_REPLACEMENT_CHARACTER) < 0)
+            goto errorexit;
     }
     if (errors == ERROR_IGNORE || errors == ERROR_REPLACE) {
         buf->inbuf += esize;
@@ -447,15 +424,8 @@ multibytecodec_decerror(MultibyteCodec *
         goto errorexit;
     }
 
-    if (PyUnicode_AsUnicode(retuni) == NULL)
+    if (_PyUnicodeWriter_WriteStr(&buf->writer, retuni) < 0)
         goto errorexit;
-    retunisize = PyUnicode_GET_SIZE(retuni);
-    if (retunisize > 0) {
-        REQUIRE_DECODEBUFFER(buf, retunisize);
-        memcpy((char *)buf->outbuf, PyUnicode_AS_UNICODE(retuni),
-               retunisize * Py_UNICODE_SIZE);
-        buf->outbuf += retunisize;
-    }
 
     newpos = PyLong_AsSsize_t(PyTuple_GET_ITEM(retobj, 1));
     if (newpos < 0 && !PyErr_Occurred())
@@ -617,10 +587,10 @@ MultibyteCodec_Decode(MultibyteCodecObje { MultibyteCodec_State state; MultibyteDecodeBuffer buf; - PyObject *errorcb; + PyObject *errorcb, *res; Py_buffer pdata; const char *data, *errors = NULL; - Py_ssize_t datalen, finalsize; + Py_ssize_t datalen; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|z:decode", codeckwarglist, &pdata, &errors)) @@ -640,29 +610,22 @@ MultibyteCodec_Decode(MultibyteCodecObje return make_tuple(PyUnicode_New(0, 0), 0); } + _PyUnicodeWriter_Init(&buf.writer, datalen); buf.excobj = NULL; buf.inbuf = buf.inbuf_top = (unsigned char *)data; buf.inbuf_end = buf.inbuf_top + datalen; - buf.outobj = PyUnicode_FromUnicode(NULL, datalen); - if (buf.outobj == NULL) - goto errorexit; - buf.outbuf = PyUnicode_AS_UNICODE(buf.outobj); - if (buf.outbuf == NULL) - goto errorexit; - buf.outbuf_end = buf.outbuf + PyUnicode_GET_SIZE(buf.outobj); if (self->codec->decinit != NULL && self->codec->decinit(&state, self->codec->config) != 0) goto errorexit; while (buf.inbuf < buf.inbuf_end) { - Py_ssize_t inleft, outleft, r; + Py_ssize_t inleft, r; inleft = (Py_ssize_t)(buf.inbuf_end - buf.inbuf); - outleft = (Py_ssize_t)(buf.outbuf_end - buf.outbuf); r = self->codec->decode(&state, self->codec->config, - &buf.inbuf, inleft, &buf.outbuf, outleft); + &buf.inbuf, inleft, &buf.writer); if (r == 0) break; else if (multibytecodec_decerror(self->codec, &state, @@ -670,23 +633,20 @@ MultibyteCodec_Decode(MultibyteCodecObje goto errorexit; } - finalsize = (Py_ssize_t)(buf.outbuf - - PyUnicode_AS_UNICODE(buf.outobj)); - - if (finalsize != PyUnicode_GET_SIZE(buf.outobj)) - if (PyUnicode_Resize(&buf.outobj, finalsize) == -1) - goto errorexit; + res = _PyUnicodeWriter_Finish(&buf.writer); + if (res == NULL) + goto errorexit; PyBuffer_Release(&pdata); Py_XDECREF(buf.excobj); ERROR_DECREF(errorcb); - return make_tuple(buf.outobj, datalen); + return make_tuple(res, datalen); errorexit: PyBuffer_Release(&pdata); ERROR_DECREF(errorcb); 
Py_XDECREF(buf.excobj); - Py_XDECREF(buf.outobj); + _PyUnicodeWriter_Dealloc(&buf.writer); return NULL; } @@ -859,17 +819,7 @@ decoder_prepare_buffer(MultibyteDecodeBu { buf->inbuf = buf->inbuf_top = (const unsigned char *)data; buf->inbuf_end = buf->inbuf_top + size; - if (buf->outobj == NULL) { /* only if outobj is not allocated yet */ - buf->outobj = PyUnicode_FromUnicode(NULL, size); - if (buf->outobj == NULL) - return -1; - buf->outbuf = PyUnicode_AsUnicode(buf->outobj); - if (buf->outbuf == NULL) - return -1; - buf->outbuf_end = buf->outbuf + - PyUnicode_GET_SIZE(buf->outobj); - } - + _PyUnicodeWriter_Init(&buf->writer, size); return 0; } @@ -878,14 +828,13 @@ decoder_feed_buffer(MultibyteStatefulDec MultibyteDecodeBuffer *buf) { while (buf->inbuf < buf->inbuf_end) { - Py_ssize_t inleft, outleft; + Py_ssize_t inleft; Py_ssize_t r; inleft = (Py_ssize_t)(buf->inbuf_end - buf->inbuf); - outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf); r = ctx->codec->decode(&ctx->state, ctx->codec->config, - &buf->inbuf, inleft, &buf->outbuf, outleft); + &buf->inbuf, inleft, &buf->writer); if (r == 0 || r == MBERR_TOOFEW) break; else if (multibytecodec_decerror(ctx->codec, &ctx->state, @@ -1058,8 +1007,9 @@ mbidecoder_decode(MultibyteIncrementalDe MultibyteDecodeBuffer buf; char *data, *wdata = NULL; Py_buffer pdata; - Py_ssize_t wsize, finalsize = 0, size, origpending; + Py_ssize_t wsize, size, origpending; int final = 0; + PyObject *res; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "y*|i:decode", incrementalkwarglist, &pdata, &final)) @@ -1067,7 +1017,8 @@ mbidecoder_decode(MultibyteIncrementalDe data = pdata.buf; size = pdata.len; - buf.outobj = buf.excobj = NULL; + _PyUnicodeWriter_Init(&buf.writer, 1); + buf.excobj = NULL; origpending = self->pendingsize; if (self->pendingsize == 0) { @@ -1109,23 +1060,22 @@ mbidecoder_decode(MultibyteIncrementalDe goto errorexit; } - finalsize = (Py_ssize_t)(buf.outbuf - PyUnicode_AS_UNICODE(buf.outobj)); - if (finalsize != 
PyUnicode_GET_SIZE(buf.outobj)) - if (PyUnicode_Resize(&buf.outobj, finalsize) == -1) - goto errorexit; + res = _PyUnicodeWriter_Finish(&buf.writer); + if (res == NULL) + goto errorexit; PyBuffer_Release(&pdata); if (wdata != data) PyMem_Del(wdata); Py_XDECREF(buf.excobj); - return buf.outobj; + return res; errorexit: PyBuffer_Release(&pdata); if (wdata != NULL && wdata != data) PyMem_Del(wdata); Py_XDECREF(buf.excobj); - Py_XDECREF(buf.outobj); + _PyUnicodeWriter_Dealloc(&buf.writer); return NULL; } @@ -1265,13 +1215,14 @@ mbstreamreader_iread(MultibyteStreamRead const char *method, Py_ssize_t sizehint) { MultibyteDecodeBuffer buf; - PyObject *cres; - Py_ssize_t rsize, finalsize = 0; + PyObject *cres, *res; + Py_ssize_t rsize; if (sizehint == 0) return PyUnicode_New(0, 0); - buf.outobj = buf.excobj = NULL; + _PyUnicodeWriter_Init(&buf.writer, 1); + buf.excobj = NULL; cres = NULL; for (;;) { @@ -1340,29 +1291,27 @@ mbstreamreader_iread(MultibyteStreamRead goto errorexit; } - finalsize = (Py_ssize_t)(buf.outbuf - - PyUnicode_AS_UNICODE(buf.outobj)); Py_DECREF(cres); cres = NULL; - if (sizehint < 0 || finalsize != 0 || rsize == 0) + if (sizehint < 0 || buf.writer.pos != 0 || rsize == 0) break; sizehint = 1; /* read 1 more byte and retry */ } - if (finalsize != PyUnicode_GET_SIZE(buf.outobj)) - if (PyUnicode_Resize(&buf.outobj, finalsize) == -1) - goto errorexit; + res = _PyUnicodeWriter_Finish(&buf.writer); + if (res == NULL) + goto errorexit; Py_XDECREF(cres); Py_XDECREF(buf.excobj); - return buf.outobj; + return res; errorexit: Py_XDECREF(cres); Py_XDECREF(buf.excobj); - Py_XDECREF(buf.outobj); + _PyUnicodeWriter_Dealloc(&buf.writer); return NULL; } diff -r 0b2d4089180c Modules/cjkcodecs/multibytecodec.h --- a/Modules/cjkcodecs/multibytecodec.h Wed Apr 10 17:01:38 2013 -0400 +++ b/Modules/cjkcodecs/multibytecodec.h Thu Apr 11 01:48:14 2013 +0200 @@ -10,11 +10,7 @@ extern "C" { #endif -#ifdef uint32_t -typedef uint32_t ucs4_t; -#else -typedef unsigned int ucs4_t; 
-#endif
+/* Code points use the interpreter-wide Py_UCS4 type; no local alias needed. */
 
 #ifdef uint16_t
 typedef uint16_t ucs2_t, DBCHAR;
@@ -27,7 +23,7 @@ typedef union {
     int i;
     unsigned char c[8];
     ucs2_t u2[4];
-    ucs4_t u4[2];
+    Py_UCS4 u4[2];
 } MultibyteCodec_State;
 
 typedef int (*mbcodec_init)(const void *config);
@@ -44,7 +40,7 @@ typedef Py_ssize_t (*mbencodereset_func)
 typedef Py_ssize_t (*mbdecode_func)(MultibyteCodec_State *state,
                  const void *config,
                  const unsigned char **inbuf, Py_ssize_t inleft,
-                 Py_UNICODE **outbuf, Py_ssize_t outleft);
+                 _PyUnicodeWriter *writer);
 typedef int (*mbdecodeinit_func)(MultibyteCodec_State *state,
                  const void *config);
 typedef Py_ssize_t (*mbdecodereset_func)(MultibyteCodec_State *state,
diff -r 0b2d4089180c Objects/unicodeobject.c
--- a/Objects/unicodeobject.c	Wed Apr 10 17:01:38 2013 -0400
+++ b/Objects/unicodeobject.c	Thu Apr 11 01:48:14 2013 +0200
@@ -12948,6 +12948,16 @@ int
 }
 
 int
+_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
+{
+    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
+        return -1;
+    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
+    writer->pos++;
+    return 0;
+}
+
+int
 _PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
 {
     Py_UCS4 maxchar;