Index: Lib/test/test_multibytecodec.py
===================================================================
--- Lib/test/test_multibytecodec.py	(revision 42831)
+++ Lib/test/test_multibytecodec.py	(working copy)
@@ -9,6 +9,101 @@
 from test import test_multibytecodec_support
 import unittest, StringIO, codecs
 
+class Test_MultibyteCodec(unittest.TestCase):
+
+    def test_nullcoding(self):
+        self.assertEqual(''.decode('gb18030'), u'')
+        self.assertEqual(unicode('', 'gb18030'), u'')
+        self.assertEqual(u''.encode('gb18030'), '')
+
+    def test_str_decode(self):
+        self.assertEqual('abcd'.encode('gb18030'), 'abcd')
+
+
+class Test_IncrementalEncoder(unittest.TestCase):
+
+    def test_stateless(self):
+        # cp949 encoder isn't stateful at all.
+        encoder = codecs.getincrementalencoder('cp949')()
+        self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
+                         '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
+        self.assertEqual(encoder.reset(), None)
+        self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
+                         '\xa1\xd9\xa1\xad\xa1\xd9')
+        self.assertEqual(encoder.reset(), None)
+        self.assertEqual(encoder.encode(u'', True), '')
+        self.assertEqual(encoder.encode(u'', False), '')
+        self.assertEqual(encoder.reset(), None)
+
+    def test_stateful(self):
+        # jisx0213 encoder is stateful for a few codepoints. eg)
+        # U+00E6 => A9DC
+        # U+00E6 U+0300 => ABC4
+        # U+0300 => ABDC
+
+        encoder = codecs.getincrementalencoder('jisx0213')()
+        self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
+        self.assertEqual(encoder.encode(u'\u00e6'), '')
+        self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
+        self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')
+
+        self.assertEqual(encoder.reset(), None)
+        self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
+
+        self.assertEqual(encoder.encode(u'\u00e6'), '')
+        self.assertEqual(encoder.encode('', True), '\xa9\xdc')
+        self.assertEqual(encoder.encode('', True), '')
+
+    def test_stateful_keep_buffer(self):
+        encoder = codecs.getincrementalencoder('jisx0213')()
+        self.assertEqual(encoder.encode(u'\u00e6'), '')
+        self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
+        self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
+        self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
+        self.assertEqual(encoder.reset(), None)
+        self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
+        self.assertEqual(encoder.encode(u'\u00e6'), '')
+        self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
+        self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')
+
+
+class Test_IncrementalDecoder(unittest.TestCase):
+
+    def test_dbcs(self):
+        # cp949 decoder is simple with only 1- or 2-byte sequences.
+        decoder = codecs.getincrementaldecoder('cp949')()
+        self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
+                         u'\ud30c\uc774')
+        self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
+                         u'\uc36c \ub9c8\uc744')
+        self.assertEqual(decoder.decode(''), u'')
+
+    def test_dbcs_keep_buffer(self):
+        decoder = codecs.getincrementaldecoder('cp949')()
+        self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
+        self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
+        self.assertEqual(decoder.decode('\xcc'), u'\uc774')
+
+        self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
+        self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
+        self.assertEqual(decoder.decode('\xcc'), u'\uc774')
+
+    def test_iso2022(self):
+        decoder = codecs.getincrementaldecoder('iso2022-jp')()
+        ESC = '\x1b'
+        self.assertEqual(decoder.decode(ESC + '('), u'')
+        self.assertEqual(decoder.decode('B', True), u'')
+        self.assertEqual(decoder.decode(ESC + '$'), u'')
+        self.assertEqual(decoder.decode('B@$'), u'\u4e16')
+        self.assertEqual(decoder.decode('@$@'), u'\u4e16')
+        self.assertEqual(decoder.decode('$', True), u'\u4e16')
+        self.assertEqual(decoder.reset(), None)
+        self.assertEqual(decoder.decode('@$'), u'@$')
+        self.assertEqual(decoder.decode(ESC + '$'), u'')
+        self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
+        self.assertEqual(decoder.decode('B@$'), u'\u4e16')
+
+
 class Test_StreamWriter(unittest.TestCase):
     if len(u'\U00012345') == 2: # UCS2
         def test_gb18030(self):
@@ -30,15 +125,16 @@
             self.assertEqual(s.getvalue(),
                     '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851')
 
-        # standard utf-8 codecs has broken StreamReader
-        if test_multibytecodec_support.__cjkcodecs__:
-            def test_utf_8(self):
-                s= StringIO.StringIO()
-                c = codecs.lookup('utf-8')[3](s)
-                c.write(u'123')
-                self.assertEqual(s.getvalue(), '123')
-                c.write(u'\U00012345')
-                self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
+        def test_utf_8(self):
+            s= StringIO.StringIO()
+            c = codecs.lookup('utf-8')[3](s)
+            c.write(u'123')
+            self.assertEqual(s.getvalue(), '123')
+            c.write(u'\U00012345')
+            self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
+
+            # Python utf-8 codec can't buffer surrogate pairs yet.
+            if 0:
                 c.write(u'\U00012345'[0])
                 self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
                 c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
@@ -61,22 +157,18 @@
     else: # UCS4
         pass
 
-    def test_nullcoding(self):
-        self.assertEqual(''.decode('gb18030'), u'')
-        self.assertEqual(unicode('', 'gb18030'), u'')
-        self.assertEqual(u''.encode('gb18030'), '')
-
-    def test_str_decode(self):
-        self.assertEqual('abcd'.encode('gb18030'), 'abcd')
-
     def test_streamwriter_strwrite(self):
         s = StringIO.StringIO()
         wr = codecs.getwriter('gb18030')(s)
         wr.write('abcd')
         self.assertEqual(s.getvalue(), 'abcd')
 
+
 def test_main():
     suite = unittest.TestSuite()
+    suite.addTest(unittest.makeSuite(Test_MultibyteCodec))
+    suite.addTest(unittest.makeSuite(Test_IncrementalEncoder))
+    suite.addTest(unittest.makeSuite(Test_IncrementalDecoder))
     suite.addTest(unittest.makeSuite(Test_StreamWriter))
     test_support.run_suite(suite)
 
Index: Lib/test/test_multibytecodec_support.py
===================================================================
--- Lib/test/test_multibytecodec_support.py	(revision 42831)
+++ Lib/test/test_multibytecodec_support.py	(working copy)
@@ -3,15 +3,12 @@
 # test_multibytecodec_support.py
 #   Common Unittest Routines for CJK codecs
 #
-# $CJKCodecs: test_multibytecodec_support.py,v 1.6 2004/06/19 06:09:55 perky Exp $
 
 import sys, codecs, os.path
 import unittest
 from test import test_support
 from StringIO import StringIO
 
-__cjkcodecs__ = 0 # define this as 0 for python
-
 class TestBase:
     encoding = ''   # codec name
     codec    = None # codec tuple (with 4 elements)
@@ -26,6 +23,8 @@
         if self.codec is None:
             self.codec = codecs.lookup(self.encoding)
         self.encode, self.decode, self.reader, self.writer = self.codec
+        self.incrementalencoder = self.codec.incrementalencoder
+        self.incrementaldecoder = self.codec.incrementaldecoder
 
     def test_chunkcoding(self):
         for native, utf8 in zip(*[StringIO(f).readlines()
@@ -47,52 +46,88 @@
         else:
             self.assertRaises(UnicodeError, func, source, scheme)
 
-    if sys.hexversion >= 0x02030000:
-        def test_xmlcharrefreplace(self):
-            if self.has_iso10646:
-                return
+    def test_xmlcharrefreplace(self):
+        if self.has_iso10646:
+            return
 
-            s = u"\u0b13\u0b23\u0b60 nd eggs"
-            self.assertEqual(
-                self.encode(s, "xmlcharrefreplace")[0],
-                "&#2835;&#2851;&#2912; nd eggs"
-            )
+        s = u"\u0b13\u0b23\u0b60 nd eggs"
+        self.assertEqual(
+            self.encode(s, "xmlcharrefreplace")[0],
+            "&#2835;&#2851;&#2912; nd eggs"
+        )
 
-        def test_customreplace(self):
-            if self.has_iso10646:
-                return
+    def test_customreplace(self):
+        if self.has_iso10646:
+            return
 
-            import htmlentitydefs
+        import htmlentitydefs
 
-            names = {}
-            for (key, value) in htmlentitydefs.entitydefs.items():
-                if len(value)==1:
-                    names[value.decode('latin-1')] = self.decode(key)[0]
+        names = {}
+        for (key, value) in htmlentitydefs.entitydefs.items():
+            if len(value)==1:
+                names[value.decode('latin-1')] = self.decode(key)[0]
+            else:
+                names[unichr(int(value[2:-1]))] = self.decode(key)[0]
+
+        def xmlcharnamereplace(exc):
+            if not isinstance(exc, UnicodeEncodeError):
+                raise TypeError("don't know how to handle %r" % exc)
+            l = []
+            for c in exc.object[exc.start:exc.end]:
+                try:
+                    l.append(u"&%s;" % names[c])
+                except KeyError:
+                    l.append(u"&#%d;" % ord(c))
+            return (u"".join(l), exc.end)
+
+        codecs.register_error(
+            "test.xmlcharnamereplace", xmlcharnamereplace)
+
+        if self.xmlcharnametest:
+            sin, sout = self.xmlcharnametest
+        else:
+            sin = u"\xab\u211c\xbb = \u2329\u1234\u232a"
+            sout = "&laquo;&real;&raquo; = &lang;&#4660;&rang;"
+        self.assertEqual(self.encode(sin,
+            "test.xmlcharnamereplace")[0], sout)
+
+    def test_incrementalencoder(self):
+        UTF8Reader = codecs.getreader('utf-8')
+        for sizehint in [None] + range(1, 33) + \
+                        [64, 128, 256, 512, 1024]:
+            istream = UTF8Reader(StringIO(self.tstring[1]))
+            ostream = StringIO()
+            encoder = self.incrementalencoder()
+            while 1:
+                if sizehint is not None:
+                    data = istream.read(sizehint)
                 else:
-                    names[unichr(int(value[2:-1]))] = self.decode(key)[0]
+                    data = istream.read()
 
-            def xmlcharnamereplace(exc):
-                if not isinstance(exc, UnicodeEncodeError):
-                    raise TypeError("don't know how to handle %r" % exc)
-                l = []
-                for c in exc.object[exc.start:exc.end]:
-                    try:
-                        l.append(u"&%s;" % names[c])
-                    except KeyError:
-                        l.append(u"&#%d;" % ord(c))
-                return (u"".join(l), exc.end)
+                if not data:
+                    break
+                e = encoder.encode(data)
+                ostream.write(e)
 
-            codecs.register_error(
-                "test.xmlcharnamereplace", xmlcharnamereplace)
+            self.assertEqual(ostream.getvalue(), self.tstring[0])
 
-            if self.xmlcharnametest:
-                sin, sout = self.xmlcharnametest
-            else:
-                sin = u"\xab\u211c\xbb = \u2329\u1234\u232a"
-                sout = "&laquo;&real;&raquo; = &lang;&#4660;&rang;"
-            self.assertEqual(self.encode(sin,
-                "test.xmlcharnamereplace")[0], sout)
+    def test_incrementaldecoder(self):
+        UTF8Writer = codecs.getwriter('utf-8')
+        for sizehint in [None, -1] + range(1, 33) + \
+                        [64, 128, 256, 512, 1024]:
+            istream = StringIO(self.tstring[0])
+            ostream = UTF8Writer(StringIO())
+            decoder = self.incrementaldecoder()
+            while 1:
+                data = istream.read(sizehint)
+                if not data:
+                    break
+                else:
+                    u = decoder.decode(data)
+                    ostream.write(u)
+            self.assertEqual(ostream.getvalue(), self.tstring[1])
+
     def test_streamreader(self):
         UTF8Writer = codecs.getwriter('utf-8')
         for name in ["read", "readline", "readlines"]:
@@ -113,11 +148,7 @@
             self.assertEqual(ostream.getvalue(), self.tstring[1])
 
     def test_streamwriter(self):
-        if __cjkcodecs__:
-            readfuncs = ('read', 'readline', 'readlines')
-        else:
-            # standard utf8 codec has broken readline and readlines.
-            readfuncs = ('read',)
+        readfuncs = ('read', 'readline', 'readlines')
         UTF8Reader = codecs.getreader('utf-8')
         for name in readfuncs:
             for sizehint in [None] + range(1, 33) + \
@@ -211,10 +242,5 @@
         self.assertEqual(unicode(csetch, self.encoding), unich)
 
 def load_teststring(encoding):
-    if __cjkcodecs__:
-        etxt = open(os.path.join('sampletexts', encoding) + '.txt').read()
-        utxt = open(os.path.join('sampletexts', encoding) + '.utf8').read()
-        return (etxt, utxt)
-    else:
-        from test import cjkencodings_test
-        return cjkencodings_test.teststring[encoding]
+    from test import cjkencodings_test
+    return cjkencodings_test.teststring[encoding]
Index: Tools/unicode/gencjkcodecs.py
===================================================================
--- Tools/unicode/gencjkcodecs.py	(revision 0)
+++ Tools/unicode/gencjkcodecs.py	(revision 0)
@@ -0,0 +1,82 @@
+import os, string
+
+codecs = {
+    'cn': ('gb2312', 'gbk', 'gb18030', 'hz'),
+    'tw': ('big5', 'cp950'),
+    'hk': ('big5hkscs',),
+    'jp': ('cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
+           'euc_jis_2004', 'shift_jis_2004'),
+    'kr': ('cp949', 'euc_kr', 'johab'),
+    'iso2022': ('iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
+                'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext',
+                'iso2022_kr'),
+}
+
+TEMPLATE = string.Template("""\
+#
+# $encoding.py: Python Unicode Codec for $ENCODING
+#
+# Written by Hye-Shik Chang
+#
+
+import _codecs_$owner, codecs
+
+codec = _codecs_$owner.getcodec('$encoding')
+
+class Codec(codecs.Codec):
+    encode = codec.encode
+    decode = codec.decode
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+    def __init__(self, errors='strict'):
+        __enc = codec.IncrementalEncoder(errors)
+        self.encode = __enc.encode
+        self.reset = __enc.reset
+
+class IncrementalDecoder(codecs.IncrementalDecoder):
+    def __init__(self, errors='strict'):
+        __dec = codec.IncrementalDecoder(errors)
+        self.decode = __dec.decode
+        self.reset = __dec.reset
+
+class StreamReader(Codec, codecs.StreamReader):
+    def __init__(self, stream, errors='strict'):
+        codecs.StreamReader.__init__(self, stream, errors)
+        __codec = codec.StreamReader(stream, errors)
+        self.read = __codec.read
+        self.readline = __codec.readline
+        self.readlines = __codec.readlines
+        self.reset = __codec.reset
+
+class StreamWriter(Codec, codecs.StreamWriter):
+    def __init__(self, stream, errors='strict'):
+        codecs.StreamWriter.__init__(self, stream, errors)
+        __codec = codec.StreamWriter(stream, errors)
+        self.write = __codec.write
+        self.writelines = __codec.writelines
+        self.reset = __codec.reset
+
+def getregentry():
+    return codecs.CodecInfo(
+        name='$encoding',
+        encode=Codec().encode,
+        decode=Codec().decode,
+        incrementalencoder=IncrementalEncoder,
+        incrementaldecoder=IncrementalDecoder,
+        streamreader=StreamReader,
+        streamwriter=StreamWriter,
+    )
+""")
+
+def gencodecs(prefix):
+    for loc, encodings in codecs.iteritems():
+        for enc in encodings:
+            code = TEMPLATE.substitute(ENCODING=enc.upper(),
+                                       encoding=enc.lower(),
+                                       owner=loc)
+            codecpath = os.path.join(prefix, enc + '.py')
+            open(codecpath, 'w').write(code)
+
+if __name__ == '__main__':
+    import sys
+    gencodecs(sys.argv[1])
Index: Tools/unicode/Makefile
===================================================================
--- Tools/unicode/Makefile	(revision 42831)
+++ Tools/unicode/Makefile	(working copy)
@@ -15,7 +15,7 @@
 
 all: distclean mappings codecs
 
-codecs: misc windows iso apple ebcdic custom-mappings
+codecs: misc windows iso apple ebcdic custom-mappings cjk
 
 ### Mappings
 
@@ -44,11 +44,11 @@
 	$(RM) -f build/readme.*
 
 iso: build/
-	$(PYTHON) gencodec.py MAPPINGS/ISO8859/ build/iso
+	$(PYTHON) gencodec.py MAPPINGS/ISO8859/ build/ iso
 	$(RM) -f build/isoreadme.*
 
 apple: build/
-	$(PYTHON) gencodec.py MAPPINGS/VENDORS/APPLE/ build/mac_
+	$(PYTHON) gencodec.py MAPPINGS/VENDORS/APPLE/ build/ mac_
 	$(RM) build/mac_dingbats.*
 	$(RM) build/mac_japanese.*
 	$(RM) build/mac_chin*
@@ -72,6 +72,9 @@
 	$(PYTHON) gencodec.py MAPPINGS/VENDORS/MICSFT/EBCDIC/ build/
 	$(RM) -f build/readme.*
 
+cjk: build/
+	$(PYTHON) gencjkcodecs.py build/
+
 ### Cleanup
 
 clean:
Index: Modules/cjkcodecs/multibytecodec.c
===================================================================
--- Modules/cjkcodecs/multibytecodec.c	(revision 42831)
+++ Modules/cjkcodecs/multibytecodec.c	(working copy)
@@ -38,6 +38,12 @@
 are 'ignore' and 'replace' as well as any other name registerd with\n\
 codecs.register_error that is able to handle UnicodeDecodeErrors.");
 
+PyDoc_STRVAR(MultibyteCodec_IncrementalEncoder__doc__,
+"I.IncrementalEncoder([errors]) -> IncrementalEncoder instance");
+
+PyDoc_STRVAR(MultibyteCodec_IncrementalDecoder__doc__,
+"I.IncrementalDecoder([errors]) -> IncrementalDecoder instance");
+
 PyDoc_STRVAR(MultibyteCodec_StreamReader__doc__,
 "I.StreamReader(stream[, errors]) -> StreamReader instance");
 
@@ -45,11 +51,15 @@
 "I.StreamWriter(stream[, errors]) -> StreamWriter instance");
 
 static char *codeckwarglist[] = {"input", "errors", NULL};
+static char *incfactorykwarglist[] = {"errors", NULL};
+static char *incrementalkwarglist[] = {"input", "final", NULL};
 static char *streamkwarglist[] = {"stream", "errors", NULL};
 
 static PyObject *multibytecodec_encode(MultibyteCodec *,
                 MultibyteCodec_State *, const Py_UNICODE **, Py_ssize_t,
                 PyObject *, int);
+static PyObject *mbiencoder_create(MultibyteCodec *codec, const char *errors);
+static PyObject *mbidecoder_create(MultibyteCodec *codec, const char *errors);
 static PyObject *mbstreamreader_create(MultibyteCodec *,
                 PyObject *, const char *);
 static PyObject *mbstreamwriter_create(MultibyteCodec *,
@@ -125,8 +135,7 @@
 {
         Py_ssize_t orgpos, orgsize;
 
-        orgpos = (Py_ssize_t)(buf->outbuf -
-                        PyUnicode_AS_UNICODE(buf->outobj));
+        orgpos = (Py_ssize_t)(buf->outbuf - PyUnicode_AS_UNICODE(buf->outobj));
         orgsize = PyUnicode_GET_SIZE(buf->outobj);
         if (PyUnicode_Resize(&buf->outobj, orgsize + (
             esize < (orgsize >> 1) ? (orgsize >> 1) | 1 : esize)) == -1)
@@ -144,6 +153,11 @@
                 goto errorexit;                                 \
 }
 
+
+/**
+ * MultibyteCodec object
+ */
+
 static int
 multibytecodec_encerror(MultibyteCodec *codec,
                         MultibyteCodec_State *state,
@@ -166,7 +180,7 @@
                 return 0; /* retry it */
         case MBERR_TOOFEW:
                 reason = "incomplete multibyte sequence";
-                esize = (size_t)(buf->inbuf_end - buf->inbuf);
+                esize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf);
                 break;
         case MBERR_INTERNAL:
                 PyErr_SetString(PyExc_RuntimeError,
@@ -309,7 +323,7 @@
                 return 0; /* retry it */
         case MBERR_TOOFEW:
                 reason = "incomplete multibyte sequence";
-                esize = (size_t)(buf->inbuf_end - buf->inbuf);
+                esize = (Py_ssize_t)(buf->inbuf_end - buf->inbuf);
                 break;
         case MBERR_INTERNAL:
                 PyErr_SetString(PyExc_RuntimeError,
@@ -453,7 +467,7 @@
                 goto errorexit;
         }
 
-        finalsize = (Py_ssize_t)((char*)buf.outbuf -
+        finalsize = (Py_ssize_t)((char *)buf.outbuf -
                           PyString_AS_STRING(buf.outobj));
 
         if (finalsize != PyString_GET_SIZE(buf.outobj))
@@ -554,7 +568,7 @@
                 return make_tuple(PyUnicode_FromUnicode(NULL, 0), 0);
         }
 
-        buf.outobj = buf.excobj = NULL;
+        buf.excobj = NULL;
         buf.inbuf = buf.inbuf_top = (unsigned char *)data;
         buf.inbuf_end = buf.inbuf_top + datalen;
         buf.outobj = PyUnicode_FromUnicode(NULL, datalen);
@@ -606,6 +620,32 @@
 }
 
 static PyObject *
+MultibyteCodec_IncrementalEncoder(MultibyteCodecObject *self,
+                                  PyObject *args, PyObject *kwargs)
+{
+        char *errors = NULL;
+
+        if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|s:IncrementalEncoder",
+                                         incfactorykwarglist, &errors))
+                return NULL;
+
+        return mbiencoder_create(self->codec, errors);
+}
+
+static PyObject *
+MultibyteCodec_IncrementalDecoder(MultibyteCodecObject *self,
+                                  PyObject *args, PyObject *kwargs)
+{
+        char *errors = NULL;
+
+        if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|s:IncrementalDecoder",
+                                         incfactorykwarglist, &errors))
+                return NULL;
+
+        return mbidecoder_create(self->codec, errors);
+}
+
+static PyObject *
 MultibyteCodec_StreamReader(MultibyteCodecObject *self,
                             PyObject *args, PyObject *kwargs)
 {
@@ -640,6 +680,12 @@
         {"decode",      (PyCFunction)MultibyteCodec_Decode,
                         METH_VARARGS | METH_KEYWORDS,
                         MultibyteCodec_Decode__doc__},
+        {"IncrementalEncoder", (PyCFunction)MultibyteCodec_IncrementalEncoder,
+                        METH_VARARGS | METH_KEYWORDS,
+                        MultibyteCodec_IncrementalEncoder__doc__},
+        {"IncrementalDecoder", (PyCFunction)MultibyteCodec_IncrementalDecoder,
+                        METH_VARARGS | METH_KEYWORDS,
+                        MultibyteCodec_IncrementalDecoder__doc__},
         {"StreamReader",(PyCFunction)MultibyteCodec_StreamReader,
                         METH_VARARGS | METH_KEYWORDS,
                         MultibyteCodec_StreamReader__doc__},
@@ -656,7 +702,6 @@
 }
 
 
-
 static PyTypeObject MultibyteCodec_Type = {
         PyObject_HEAD_INIT(NULL)
         0,                              /* ob_size */
@@ -690,13 +735,380 @@
         multibytecodec_methods,         /* tp_methods */
 };
 
+
+/**
+ * Utility functions for stateful codec mechanism
+ */
+
+#define STATEFUL_DCTX(o) ((MultibyteStatefulDecoderContext *)(o))
+#define STATEFUL_ECTX(o) ((MultibyteStatefulEncoderContext *)(o))
+
 static PyObject *
+encoder_encode_stateful(MultibyteStatefulEncoderContext *ctx,
+                        PyObject *unistr, int final)
+{
+        PyObject *ucvt, *r = NULL;
+        Py_UNICODE *inbuf, *inbuf_end, *inbuf_tmp = NULL;
+        Py_ssize_t datalen, origpending;
+
+        if (PyUnicode_Check(unistr))
+                ucvt = NULL;
+        else {
+                unistr = ucvt = PyObject_Unicode(unistr);
+                if (unistr == NULL)
+                        return NULL;
+                else if (!PyUnicode_Check(unistr)) {
+                        PyErr_SetString(PyExc_TypeError,
+                                "couldn't convert the object to unicode.");
+                        Py_DECREF(ucvt);
+                        return NULL;
+                }
+        }
+
+        datalen = PyUnicode_GET_SIZE(unistr);
+        origpending = ctx->pendingsize;
+
+        if (ctx->pendingsize > 0) {
+                inbuf_tmp = PyMem_New(Py_UNICODE, datalen + ctx->pendingsize);
+                if (inbuf_tmp == NULL)
+                        goto errorexit;
+                memcpy(inbuf_tmp, ctx->pending,
+                        Py_UNICODE_SIZE * ctx->pendingsize);
+                memcpy(inbuf_tmp + ctx->pendingsize,
+                        PyUnicode_AS_UNICODE(unistr),
+                        Py_UNICODE_SIZE * datalen);
+                datalen += ctx->pendingsize;
+                ctx->pendingsize = 0;
+                inbuf = inbuf_tmp;
+        }
+        else
+                inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr);
+
+        inbuf_end = inbuf + datalen;
+
+        r = multibytecodec_encode(ctx->codec, &ctx->state,
+                        (const Py_UNICODE **)&inbuf,
+                        datalen, ctx->errors, final ? MBENC_FLUSH : 0);
+        if (r == NULL) {
+                /* recover the original pending buffer */
+                memcpy(ctx->pending, inbuf_tmp, Py_UNICODE_SIZE * origpending);
+                ctx->pendingsize = origpending;
+                goto errorexit;
+        }
+
+        if (inbuf < inbuf_end) {
+                ctx->pendingsize = (Py_ssize_t)(inbuf_end - inbuf);
+                if (ctx->pendingsize > MAXENCPENDING) {
+                        /* normal codecs can't reach here */
+                        ctx->pendingsize = 0;
+                        PyErr_SetString(PyExc_UnicodeError,
+                                        "pending buffer overflow");
+                        goto errorexit;
+                }
+                memcpy(ctx->pending, inbuf,
+                       ctx->pendingsize * Py_UNICODE_SIZE);
+        }
+
+        if (inbuf_tmp != NULL)
+                PyMem_Del(inbuf_tmp);
+        Py_XDECREF(ucvt);
+        return r;
+
+errorexit:
+        if (inbuf_tmp != NULL)
+                PyMem_Del(inbuf_tmp);
+        Py_XDECREF(r);
+        Py_XDECREF(ucvt);
+        return NULL;
+}
+
+static int
+decoder_append_pending(MultibyteStatefulDecoderContext *ctx,
+                       MultibyteDecodeBuffer *buf)
+{
+        Py_ssize_t npendings;
+
+        npendings = (Py_ssize_t)(buf->inbuf_end - buf->inbuf);
+        if (npendings + ctx->pendingsize > MAXDECPENDING) {
+                PyErr_SetString(PyExc_UnicodeError, "pending buffer overflow");
+                return -1;
+        }
+        memcpy(ctx->pending + ctx->pendingsize, buf->inbuf, npendings);
+        ctx->pendingsize += npendings;
+        return 0;
+}
+
+static int
+decoder_prepare_buffer(MultibyteDecodeBuffer *buf, const char *data,
+                       Py_ssize_t size)
+{
+        buf->inbuf = buf->inbuf_top = (const unsigned char *)data;
+        buf->inbuf_end = buf->inbuf_top + size;
+        if (buf->outobj == NULL) { /* only if outobj is not allocated yet */
+                buf->outobj = PyUnicode_FromUnicode(NULL, size);
+                if (buf->outobj == NULL)
+                        return -1;
+                buf->outbuf = PyUnicode_AS_UNICODE(buf->outobj);
+                buf->outbuf_end = buf->outbuf +
+                        PyUnicode_GET_SIZE(buf->outobj);
+        }
+
+        return 0;
+}
+
+static int
+decoder_feed_buffer(MultibyteStatefulDecoderContext *ctx,
+                    MultibyteDecodeBuffer *buf)
+{
+        while (buf->inbuf < buf->inbuf_end) {
+                Py_ssize_t inleft, outleft;
+                int r;
+
+                inleft = (Py_ssize_t)(buf->inbuf_end - buf->inbuf);
+                outleft = (Py_ssize_t)(buf->outbuf_end - buf->outbuf);
+
+                r = ctx->codec->decode(&ctx->state, ctx->codec->config,
+                        &buf->inbuf, inleft, &buf->outbuf, outleft);
+                if (r == 0 || r == MBERR_TOOFEW)
+                        break;
+                else if (multibytecodec_decerror(ctx->codec, &ctx->state,
+                                                 buf, ctx->errors, r))
+                        return -1;
+        }
+        return 0;
+}
+
+
+/**
+ * MultibyteIncrementalEncoder object
+ */
+
+static PyObject *
+mbiencoder_encode(MultibyteIncrementalEncoderObject *self,
+                  PyObject *args, PyObject *kwargs)
+{
+        PyObject *data, *r;
+        int final = 0;
+
+        if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|i:encode",
+                        incrementalkwarglist, &data, &final))
+                return NULL;
+
+        r = encoder_encode_stateful(STATEFUL_ECTX(self), data, final);
+        if (r == NULL)
+                return NULL;
+
+        return r;
+}
+
+static PyObject *
+mbiencoder_reset(MultibyteIncrementalEncoderObject *self)
+{
+        if (self->codec->encreset != NULL &&
+            self->codec->encreset(&self->state, self->codec->config) != 0)
+                return NULL;
+        self->pendingsize = 0;
+
+        Py_RETURN_NONE;
+}
+
+static struct PyMethodDef mbiencoder_methods[] = {
+        {"encode",      (PyCFunction)mbiencoder_encode,
+                        METH_VARARGS | METH_KEYWORDS, NULL},
+        {"reset",       (PyCFunction)mbiencoder_reset,
+                        METH_NOARGS, NULL},
+        {NULL,          NULL},
+};
+
+static void
+mbiencoder_dealloc(MultibyteIncrementalEncoderObject *self)
+{
+        if (self->errors > ERROR_MAX) {
+                Py_DECREF(self->errors);
+        }
+        PyObject_Del(self);
+}
+
+static PyTypeObject MultibyteIncrementalEncoder_Type = {
+        PyObject_HEAD_INIT(NULL)
+        0,                              /* ob_size */
+        "MultibyteIncrementalEncoder",  /* tp_name */
+        sizeof(MultibyteIncrementalEncoderObject), /* tp_basicsize */
+        0,                              /* tp_itemsize */
+        /* methods */
+        (destructor)mbiencoder_dealloc, /* tp_dealloc */
+        0,                              /* tp_print */
+        0,                              /* tp_getattr */
+        0,                              /* tp_setattr */
+        0,                              /* tp_compare */
+        0,                              /* tp_repr */
+        0,                              /* tp_as_number */
+        0,                              /* tp_as_sequence */
+        0,                              /* tp_as_mapping */
+        0,                              /* tp_hash */
+        0,                              /* tp_call */
+        0,                              /* tp_str */
+        PyObject_GenericGetAttr,        /* tp_getattro */
+        0,                              /* tp_setattro */
+        0,                              /* tp_as_buffer */
+        Py_TPFLAGS_DEFAULT,             /* tp_flags */
+        0,                              /* tp_doc */
+        0,                              /* tp_traverse */
+        0,                              /* tp_clear */
+        0,                              /* tp_richcompare */
+        0,                              /* tp_weaklistoffset */
+        0,                              /* tp_iter */
+        0,                              /* tp_iternext */
+        mbiencoder_methods,             /* tp_methods */
+};
+
+
+/**
+ * MultibyteIncrementalDecoder object
+ */
+
+static PyObject *
+mbidecoder_decode(MultibyteIncrementalDecoderObject *self,
+                  PyObject *args, PyObject *kwargs)
+{
+        MultibyteDecodeBuffer buf;
+        char *data, *wdata;
+        Py_ssize_t wsize, finalsize = 0, size, origpending;
+        int final = 0;
+
+        if (!_PyArg_ParseTupleAndKeywords_SizeT(args, kwargs, "t#|i:decode",
+                        incrementalkwarglist, &data, &size, &final))
+                return NULL;
+
+        buf.outobj = buf.excobj = NULL;
+        origpending = self->pendingsize;
+
+        if (self->pendingsize == 0) {
+                wsize = size;
+                wdata = data;
+        }
+        else {
+                wsize = size + self->pendingsize;
+                wdata = PyMem_Malloc(wsize);
+                if (wdata == NULL)
+                        goto errorexit;
+                memcpy(wdata, self->pending, self->pendingsize);
+                memcpy(wdata + self->pendingsize, data, size);
+                self->pendingsize = 0;
+        }
+
+        if (decoder_prepare_buffer(&buf, wdata, wsize) != 0)
+                goto errorexit;
+
+        if (decoder_feed_buffer(STATEFUL_DCTX(self), &buf))
+                goto errorexit;
+
+        if (final && buf.inbuf < buf.inbuf_end) {
+                if (multibytecodec_decerror(self->codec, &self->state,
+                                &buf, self->errors, MBERR_TOOFEW)) {
+                        /* recover the original pending buffer */
+                        memcpy(self->pending, wdata, origpending);
+                        self->pendingsize = origpending;
+                        goto errorexit;
+                }
+        }
+
+        if (buf.inbuf < buf.inbuf_end) { /* pending sequence still exists */
+                if (decoder_append_pending(STATEFUL_DCTX(self), &buf) != 0)
+                        goto errorexit;
+        }
+
+        finalsize = (Py_ssize_t)(buf.outbuf - PyUnicode_AS_UNICODE(buf.outobj));
+        if (finalsize != PyUnicode_GET_SIZE(buf.outobj))
+                if (PyUnicode_Resize(&buf.outobj, finalsize) == -1)
+                        goto errorexit;
+
+        if (wdata != data)
+                PyMem_Del(wdata);
+        Py_XDECREF(buf.excobj);
+        return buf.outobj;
+
+errorexit:
+        if (wdata != NULL && wdata != data)
+                PyMem_Del(wdata);
+        Py_XDECREF(buf.excobj);
+        Py_XDECREF(buf.outobj);
+        return NULL;
+}
+
+static PyObject *
+mbidecoder_reset(MultibyteIncrementalDecoderObject *self)
+{
+        if (self->codec->decreset != NULL &&
+            self->codec->decreset(&self->state, self->codec->config) != 0)
+                return NULL;
+        self->pendingsize = 0;
+
+        Py_RETURN_NONE;
+}
+
+static struct PyMethodDef mbidecoder_methods[] = {
+        {"decode",      (PyCFunction)mbidecoder_decode,
+                        METH_VARARGS | METH_KEYWORDS, NULL},
+        {"reset",       (PyCFunction)mbidecoder_reset,
+                        METH_NOARGS, NULL},
+        {NULL,          NULL},
+};
+
+static void
+mbidecoder_dealloc(MultibyteIncrementalDecoderObject *self)
+{
+        if (self->errors > ERROR_MAX) {
+                Py_DECREF(self->errors);
+        }
+        PyObject_Del(self);
+}
+
+static PyTypeObject MultibyteIncrementalDecoder_Type = {
+        PyObject_HEAD_INIT(NULL)
+        0,                              /* ob_size */
+        "MultibyteIncrementalDecoder",  /* tp_name */
+        sizeof(MultibyteIncrementalDecoderObject), /* tp_basicsize */
+        0,                              /* tp_itemsize */
+        /* methods */
+        (destructor)mbidecoder_dealloc, /* tp_dealloc */
+        0,                              /* tp_print */
+        0,                              /* tp_getattr */
+        0,                              /* tp_setattr */
+        0,                              /* tp_compare */
+        0,                              /* tp_repr */
+        0,                              /* tp_as_number */
+        0,                              /* tp_as_sequence */
+        0,                              /* tp_as_mapping */
+        0,                              /* tp_hash */
+        0,                              /* tp_call */
+        0,                              /* tp_str */
+        PyObject_GenericGetAttr,        /* tp_getattro */
+        0,                              /* tp_setattro */
+        0,                              /* tp_as_buffer */
+        Py_TPFLAGS_DEFAULT,             /* tp_flags */
+        0,                              /* tp_doc */
+        0,                              /* tp_traverse */
+        0,                              /* tp_clear */
+        0,                              /* tp_richcompare */
+        0,                              /* tp_weaklistoffset */
+        0,                              /* tp_iter */
+        0,                              /* tp_iternext */
+        mbidecoder_methods,             /* tp_methods */
+};
+
+
+/**
+ * MultibyteStreamReader object
+ */
+
+static PyObject *
 mbstreamreader_iread(MultibyteStreamReaderObject *self,
                      const char *method, Py_ssize_t sizehint)
 {
         MultibyteDecodeBuffer buf;
         PyObject *cres;
-        Py_ssize_t rsize, r, finalsize = 0;
+        Py_ssize_t rsize, finalsize = 0;
 
         if (sizehint == 0)
                 return PyUnicode_FromUnicode(NULL, 0);
@@ -740,40 +1152,14 @@
                 }
 
                 rsize = PyString_GET_SIZE(cres);
-                buf.inbuf = buf.inbuf_top =
-                        (unsigned char *)PyString_AS_STRING(cres);
-                buf.inbuf_end = buf.inbuf_top + rsize;
-                if (buf.outobj == NULL) {
-                        buf.outobj = PyUnicode_FromUnicode(NULL, rsize);
-                        if (buf.outobj == NULL)
-                                goto errorexit;
-                        buf.outbuf = PyUnicode_AS_UNICODE(buf.outobj);
-                        buf.outbuf_end = buf.outbuf +
-                                PyUnicode_GET_SIZE(buf.outobj);
-                }
+                if (decoder_prepare_buffer(&buf, PyString_AS_STRING(cres),
+                                           rsize) != 0)
+                        goto errorexit;
 
-                r = 0;
-                if (rsize > 0)
-                        while (buf.inbuf < buf.inbuf_end) {
-                                Py_ssize_t inleft, outleft;
+                if (rsize > 0 && decoder_feed_buffer(
+                                (MultibyteStatefulDecoderContext *)self, &buf))
+                        goto errorexit;
 
-                                inleft = (Py_ssize_t)(buf.inbuf_end -
-                                                      buf.inbuf);
-                                outleft = (Py_ssize_t)(buf.outbuf_end -
-                                                       buf.outbuf);
-
-                                r = self->codec->decode(&self->state,
-                                                self->codec->config,
-                                                &buf.inbuf, inleft,
-                                                &buf.outbuf, outleft);
-                                if (r == 0 || r == MBERR_TOOFEW)
-                                        break;
-                                else if (multibytecodec_decerror(self->codec,
-                                                &self->state, &buf,
-                                                self->errors, r))
-                                        goto errorexit;
-                        }
-
                 if (rsize == 0 || sizehint < 0) { /* end of file */
                         if (buf.inbuf < buf.inbuf_end &&
                             multibytecodec_decerror(self->codec, &self->state,
@@ -782,20 +1168,9 @@
         }
 
         if (buf.inbuf < buf.inbuf_end) { /* pending sequence exists */
-                Py_ssize_t npendings;
-
-                /* we can't assume that pendingsize is still 0 here.
-                 * because this function can be called recursively
-                 * from error callback */
-                npendings = (Py_ssize_t)(buf.inbuf_end - buf.inbuf);
-                if (npendings + self->pendingsize > MAXDECPENDING) {
-                        PyErr_SetString(PyExc_RuntimeError,
-                                        "pending buffer overflow");
+                if (decoder_append_pending(STATEFUL_DCTX(self),
+                                           &buf) != 0)
                         goto errorexit;
-                }
-                memcpy(self->pending + self->pendingsize, buf.inbuf,
-                       npendings);
-                self->pendingsize += npendings;
         }
 
         finalsize = (Py_ssize_t)(buf.outbuf -
@@ -901,8 +1276,7 @@
                 return NULL;
         self->pendingsize = 0;
 
-        Py_INCREF(Py_None);
-        return Py_None;
+        Py_RETURN_NONE;
 }
 
 static struct PyMethodDef mbstreamreader_methods[] = {
@@ -927,8 +1301,6 @@
         PyObject_Del(self);
 }
 
-
-
 static PyTypeObject MultibyteStreamReader_Type = {
         PyObject_HEAD_INIT(NULL)
         0,                              /* ob_size */
@@ -962,86 +1334,27 @@
         mbstreamreader_methods,         /* tp_methods */
 };
 
+
+/**
+ * MultibyteStreamWriter object
+ */
+
 static int
 mbstreamwriter_iwrite(MultibyteStreamWriterObject *self,
                       PyObject *unistr)
 {
-        PyObject *wr, *ucvt, *r = NULL;
-        Py_UNICODE *inbuf, *inbuf_end, *inbuf_tmp = NULL;
-        Py_ssize_t datalen;
+        PyObject *str, *wr;
 
-        if (PyUnicode_Check(unistr))
-                ucvt = NULL;
-        else {
-                unistr = ucvt = PyObject_Unicode(unistr);
-                if (unistr == NULL)
-                        return -1;
-                else if (!PyUnicode_Check(unistr)) {
-                        PyErr_SetString(PyExc_TypeError,
-                                "couldn't convert the object to unicode.");
-                        Py_DECREF(ucvt);
-                        return -1;
-                }
-        }
+        str = encoder_encode_stateful(STATEFUL_ECTX(self), unistr, 0);
+        if (str == NULL)
+                return -1;
 
-        datalen = PyUnicode_GET_SIZE(unistr);
-        if (datalen == 0) {
-                Py_XDECREF(ucvt);
-                return 0;
-        }
-
-        if (self->pendingsize > 0) {
-                inbuf_tmp = PyMem_New(Py_UNICODE, datalen + self->pendingsize);
-                if (inbuf_tmp == NULL)
-                        goto errorexit;
-                memcpy(inbuf_tmp, self->pending,
-                        Py_UNICODE_SIZE * self->pendingsize);
-                memcpy(inbuf_tmp + self->pendingsize,
-                        PyUnicode_AS_UNICODE(unistr),
-                        Py_UNICODE_SIZE * datalen);
-                datalen += self->pendingsize;
-                self->pendingsize = 0;
-                inbuf = inbuf_tmp;
-        }
-        else
-                inbuf = (Py_UNICODE *)PyUnicode_AS_UNICODE(unistr);
-
-        inbuf_end = inbuf + datalen;
-
-        r = multibytecodec_encode(self->codec, &self->state,
-                (const Py_UNICODE **)&inbuf, datalen, self->errors, 0);
-        if (r == NULL)
-                goto errorexit;
-
-        if (inbuf < inbuf_end) {
-                self->pendingsize = (Py_ssize_t)(inbuf_end - inbuf);
-                if (self->pendingsize > MAXENCPENDING) {
-                        self->pendingsize = 0;
-                        PyErr_SetString(PyExc_RuntimeError,
-                                        "pending buffer overflow");
-                        goto errorexit;
-                }
-                memcpy(self->pending, inbuf,
-                       self->pendingsize * Py_UNICODE_SIZE);
-        }
-
-        wr = PyObject_CallMethod(self->stream, "write", "O", r);
+        wr = PyObject_CallMethod(self->stream, "write", "O", str);
+        Py_DECREF(str);
         if (wr == NULL)
-                goto errorexit;
+                return -1;
 
-        if (inbuf_tmp != NULL)
-                PyMem_Del(inbuf_tmp);
-        Py_DECREF(r);
-        Py_DECREF(wr);
-        Py_XDECREF(ucvt);
         return 0;
-
-errorexit:
-        if (inbuf_tmp != NULL)
-                PyMem_Del(inbuf_tmp);
-        Py_XDECREF(r);
-        Py_XDECREF(ucvt);
-        return -1;
 }
 
 static PyObject *
@@ -1054,10 +1367,8 @@
 
         if (mbstreamwriter_iwrite(self, strobj))
                 return NULL;
-        else {
-                Py_INCREF(Py_None);
-                return Py_None;
-        }
+        else
+                Py_RETURN_NONE;
 }
 
 static PyObject *
@@ -1087,8 +1398,7 @@
                 return NULL;
         }
 
-        Py_INCREF(Py_None);
-        return Py_None;
+        Py_RETURN_NONE;
 }
 
 static PyObject *
@@ -1119,8 +1429,7 @@
         }
         Py_DECREF(pwrt);
 
-        Py_INCREF(Py_None);
-        return Py_None;
+        Py_RETURN_NONE;
 }
 
 static void
@@ -1143,8 +1452,6 @@
         {NULL,          NULL},
 };
 
-
-
 static PyTypeObject MultibyteStreamWriter_Type = {
         PyObject_HEAD_INIT(NULL)
         0,                              /* ob_size */
@@ -1178,6 +1485,11 @@
         mbstreamwriter_methods,         /* tp_methods */
 };
 
+
+/**
+ * Factory functions
+ */
+
 static PyObject *
 __create_codec(PyObject *ignore, PyObject *arg)
 {
@@ -1202,6 +1514,58 @@
 }
 
 static PyObject *
+mbiencoder_create(MultibyteCodec *codec, const char *errors)
+{
+        MultibyteIncrementalEncoderObject *self;
+
+        self = PyObject_New(MultibyteIncrementalEncoderObject,
+                            &MultibyteIncrementalEncoder_Type);
+        if (self == NULL)
+                return NULL;
+
+        self->codec = codec;
+        self->pendingsize = 0;
+        self->errors = get_errorcallback(errors);
+        if (self->errors == NULL)
+                goto errorexit;
+        if (self->codec->encinit != NULL &&
+            self->codec->encinit(&self->state, self->codec->config) != 0)
+                goto errorexit;
+
+        return (PyObject *)self;
+
+errorexit:
+        Py_XDECREF(self);
+        return NULL;
+}
+
+static PyObject *
+mbidecoder_create(MultibyteCodec *codec, const char *errors)
+{
+        MultibyteIncrementalDecoderObject *self;
+
+        self = PyObject_New(MultibyteIncrementalDecoderObject,
+                            &MultibyteIncrementalDecoder_Type);
+        if (self == NULL)
+                return NULL;
+
+        self->codec = codec;
+        self->pendingsize = 0;
+        self->errors = get_errorcallback(errors);
+        if (self->errors == NULL)
+                goto errorexit;
+        if (self->codec->decinit != NULL &&
+            self->codec->decinit(&self->state, self->codec->config) != 0)
+                goto errorexit;
+
+        return (PyObject *)self;
+
+errorexit:
+        Py_XDECREF(self);
+        return NULL;
+}
+
+static PyObject *
 mbstreamreader_create(MultibyteCodec *codec,
                       PyObject *stream, const char *errors)
 {
@@ -1269,6 +1633,10 @@
 {
         if (PyType_Ready(&MultibyteCodec_Type) < 0)
                 return;
+        if (PyType_Ready(&MultibyteIncrementalEncoder_Type) < 0)
+                return;
+        if (PyType_Ready(&MultibyteIncrementalDecoder_Type) < 0)
+                return;
         if (PyType_Ready(&MultibyteStreamReader_Type) < 0)
                 return;
         if (PyType_Ready(&MultibyteStreamWriter_Type) < 0)
Index: Modules/cjkcodecs/multibytecodec.h
===================================================================
--- Modules/cjkcodecs/multibytecodec.h	(revision 42831)
+++ Modules/cjkcodecs/multibytecodec.h	(working copy)
@@ -67,24 +67,46 @@
 	MultibyteCodec *codec;
 } MultibyteCodecObject;
 
+#define _MultibyteStatefulCodec_HEAD		\
+	PyObject_HEAD				\
+	MultibyteCodec *codec;			\
+	MultibyteCodec_State state;		\
+	PyObject *errors;
+
+#define MAXENCPENDING	2
+#define _MultibyteStatefulEncoder_HEAD		\
+	_MultibyteStatefulCodec_HEAD		\
+	Py_UNICODE pending[MAXENCPENDING];	\
+	Py_ssize_t pendingsize;
+typedef struct {
+	_MultibyteStatefulEncoder_HEAD
+} MultibyteStatefulEncoderContext;
+
 #define MAXDECPENDING	8
+#define _MultibyteStatefulDecoder_HEAD		\
+	_MultibyteStatefulCodec_HEAD		\
+	unsigned char pending[MAXDECPENDING];	\
+	Py_ssize_t pendingsize;
 typedef struct {
-	PyObject_HEAD
-	MultibyteCodec *codec;
-	MultibyteCodec_State state;
-	unsigned char pending[MAXDECPENDING];
-	Py_ssize_t pendingsize;
-	PyObject *stream, *errors;
+	_MultibyteStatefulDecoder_HEAD
+} MultibyteStatefulDecoderContext;
+
+typedef struct {
+	_MultibyteStatefulEncoder_HEAD
+} MultibyteIncrementalEncoderObject;
+
+typedef struct {
+	_MultibyteStatefulDecoder_HEAD
+} MultibyteIncrementalDecoderObject;
+
+typedef struct {
+	_MultibyteStatefulDecoder_HEAD
+	PyObject *stream;
 } MultibyteStreamReaderObject;
 
-#define MAXENCPENDING	2
 typedef struct {
-	PyObject_HEAD
-	MultibyteCodec *codec;
-	MultibyteCodec_State state;
-	Py_UNICODE pending[MAXENCPENDING];
-	Py_ssize_t pendingsize;
-	PyObject *stream, *errors;
+	_MultibyteStatefulEncoder_HEAD
+	PyObject *stream;
 } MultibyteStreamWriterObject;
 
 /* positive values for illegal sequences */
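
For reviewers who want to try the new interface by hand, here is a small usage sketch (not part of the patch) showing how the incremental codecs added above are reached through the codec registry. It assumes the patch is applied and the cp949 codec is built; the byte strings are the same EUC-KR samples used in the unit tests above.

    import codecs

    # Decoding in arbitrary chunks: bytes that stop in the middle of a
    # multibyte sequence are buffered until the next call.
    decoder = codecs.getincrementaldecoder('cp949')()
    parts = ['\xc6\xc4\xc0', '\xcc']    # split inside a two-byte sequence
    result = u''.join(decoder.decode(p) for p in parts)
    assert result == u'\ud30c\uc774'

    # Encoding works the same way; final=True flushes any pending state
    # (a no-op for the stateless cp949 encoder).
    encoder = codecs.getincrementalencoder('cp949')()
    chunks = [encoder.encode(u'\ud30c\uc774'), encoder.encode(u'', True)]
    assert ''.join(chunks) == '\xc6\xc4\xc0\xcc'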