Index: Doc/api/concrete.tex =================================================================== --- Doc/api/concrete.tex (revision 46417) +++ Doc/api/concrete.tex (working copy) @@ -1431,6 +1431,18 @@ raised by the codec. \end{cfuncdesc} +\begin{cfuncdesc}{PyObject*}{PyUnicode_DecodeMBCSStateful}{const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed} + If \var{consumed} is \NULL{}, behave like + \cfunction{PyUnicode_DecodeMBCS()}. If \var{consumed} is not \NULL{}, + \cfunction{PyUnicode_DecodeMBCSStateful()} will not decode a trailing lead + byte, and the number of bytes that have been decoded will be stored in + \var{consumed}. + \versionadded{2.5} +\end{cfuncdesc} + \begin{cfuncdesc}{PyObject*}{PyUnicode_EncodeMBCS}{const Py_UNICODE *s, Py_ssize_t size, const char *errors} Index: Include/unicodeobject.h =================================================================== --- Include/unicodeobject.h (revision 46417) +++ Include/unicodeobject.h (working copy) @@ -940,6 +940,13 @@ const char *errors /* error handling */ ); +PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful( + const char *string, /* MBCS encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + Py_ssize_t *consumed /* bytes consumed */ + ); + PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString( PyObject *unicode /* Unicode object */ ); Index: Lib/encodings/mbcs.py =================================================================== --- Lib/encodings/mbcs.py (revision 46417) +++ Lib/encodings/mbcs.py (working copy) @@ -22,9 +22,10 @@ def encode(self, input, final=False): return codecs.mbcs_encode(input,self.errors)[0] -class IncrementalDecoder(codecs.IncrementalDecoder): - def decode(self, input, final=False): - return codecs.mbcs_decode(input,self.errors)[0] +class IncrementalDecoder(codecs.BufferedIncrementalDecoder): + def _buffer_decode(self, input, errors, final): + return codecs.mbcs_decode(input,self.errors,final) + class
StreamWriter(Codec,codecs.StreamWriter): pass Index: Modules/_codecsmodule.c =================================================================== --- Modules/_codecsmodule.c (revision 46417) +++ Modules/_codecsmodule.c (working copy) @@ -518,15 +518,20 @@ PyObject *args) { const char *data; - Py_ssize_t size; + Py_ssize_t size, consumed; const char *errors = NULL; + int final = 1; + PyObject *decoded; - if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode", - &data, &size, &errors)) + if (!PyArg_ParseTuple(args, "t#|zi:mbcs_decode", + &data, &size, &errors, &final)) return NULL; - return codec_tuple(PyUnicode_DecodeMBCS(data, size, errors), - size); + decoded = PyUnicode_DecodeMBCSStateful( + data, size, errors, final ? NULL : &consumed); + if (!decoded) + return NULL; + return codec_tuple(decoded, final ? size : consumed); } #endif /* MS_WINDOWS */ Index: Objects/unicodeobject.c =================================================================== --- Objects/unicodeobject.c (revision 46417) +++ Objects/unicodeobject.c (working copy) @@ -2820,65 +2820,194 @@ /* --- MBCS codecs for Windows -------------------------------------------- */ -PyObject *PyUnicode_DecodeMBCS(const char *s, - Py_ssize_t size, - const char *errors) +#if SIZEOF_INT < SIZEOF_SSIZE_T +#define NEED_RETRY +#endif + +static int is_dbcs_lead_byte(const char *s, int offset) { - PyUnicodeObject *v; + const char *curr = s + offset; + + if (IsDBCSLeadByte(*curr)) { + const char *prev = CharPrev(s, curr); + return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2); + } + return 0; +} + +/* + * Decode an MBCS string into a unicode object. If 'final' is set, convert a + * trailing lead byte too. Returns the number of bytes consumed on success, -1 otherwise.
+ */ +static int decode_mbcs(PyUnicodeObject **v, + const char *s, /* MBCS string */ + int size, /* sizeof MBCS string */ + int final) +{ Py_UNICODE *p; - DWORD usize; + Py_ssize_t n = 0; + int usize = 0; + assert(size >= 0); + + /* Skip trailing lead-byte unless 'final' is set */ + if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1)) + --size; + /* First get the size of the result */ - assert(size < INT_MAX); - usize = MultiByteToWideChar(CP_ACP, 0, s, (int)size, NULL, 0); - if (size > 0 && usize==0) - return PyErr_SetFromWindowsErrWithFilename(0, NULL); + if (size > 0) { + usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0); + if (usize == 0) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } + } - v = _PyUnicode_New(usize); - if (v == NULL) - return NULL; - if (usize == 0) - return (PyObject *)v; - p = PyUnicode_AS_UNICODE(v); - if (0 == MultiByteToWideChar(CP_ACP, 0, s, (int)size, p, usize)) { - Py_DECREF(v); - return PyErr_SetFromWindowsErrWithFilename(0, NULL); + if (*v == NULL) { + /* Create unicode object */ + *v = _PyUnicode_New(usize); + if (*v == NULL) + return -1; } + else { + /* Extend unicode object */ + n = PyUnicode_GET_SIZE(*v); + if (_PyUnicode_Resize(v, n + usize) < 0) + return -1; + } + /* Do the conversion */ + if (size > 0) { + p = PyUnicode_AS_UNICODE(*v) + n; + if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } + } + + return size; +} + +PyObject *PyUnicode_DecodeMBCSStateful(const char *s, + Py_ssize_t size, + const char *errors, + Py_ssize_t *consumed) +{ + PyUnicodeObject *v = NULL; + int done; + + if (consumed) + *consumed = 0; + +#ifdef NEED_RETRY + retry: + if (size > INT_MAX) + done = decode_mbcs(&v, s, INT_MAX, 0); + else +#endif + done = decode_mbcs(&v, s, (int)size, !consumed); + + if (done < 0) { + Py_XDECREF(v); + return NULL; + } + + if (consumed) + *consumed += done; + +#ifdef NEED_RETRY + if (size > INT_MAX) { + s += done; + 
size -= done; + goto retry; + } +#endif + return (PyObject *)v; } -PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, +PyObject *PyUnicode_DecodeMBCS(const char *s, Py_ssize_t size, const char *errors) { - PyObject *repr; - char *s; - DWORD mbcssize; + return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL); +} - /* If there are no characters, bail now! */ - if (size==0) - return PyString_FromString(""); +/* + * Convert a unicode buffer into an MBCS-encoded string object. + * Returns 0 on success, -1 otherwise. + */ +static int encode_mbcs(PyObject **repr, + const Py_UNICODE *p, /* unicode */ + int size) /* size of unicode */ +{ + int mbcssize = 0; + Py_ssize_t n = 0; + assert(size >= 0); + /* First get the size of the result */ - assert(size<INT_MAX); - mbcssize = WideCharToMultiByte(CP_ACP, 0, p, (int)size, NULL, 0, NULL, NULL); - if (mbcssize==0) - return PyErr_SetFromWindowsErrWithFilename(0, NULL); + if (size > 0) { + mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL); + if (mbcssize == 0) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } + } - repr = PyString_FromStringAndSize(NULL, mbcssize); - if (repr == NULL) - return NULL; - if (mbcssize == 0) - return repr; + if (*repr == NULL) { + /* Create string object */ + *repr = PyString_FromStringAndSize(NULL, mbcssize); + if (*repr == NULL) + return -1; + } + else { + /* Extend string object */ + n = PyString_Size(*repr); + if (_PyString_Resize(repr, n + mbcssize) < 0) + return -1; + } /* Do the conversion */ - s = PyString_AS_STRING(repr); - assert(size < INT_MAX); - if (0 == WideCharToMultiByte(CP_ACP, 0, p, (int)size, s, mbcssize, NULL, NULL)) { - Py_DECREF(repr); - return PyErr_SetFromWindowsErrWithFilename(0, NULL); + if (size > 0) { + char *s = PyString_AS_STRING(*repr) + n; + if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) { + PyErr_SetFromWindowsErrWithFilename(0, NULL); + return -1; + } } + + return 0; +} + +PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p, + Py_ssize_t size, + const char *errors) +{ + PyObject *repr = NULL; + int ret; + +#ifdef NEED_RETRY + retry: + if (size > INT_MAX) + ret = encode_mbcs(&repr, p,
INT_MAX); + else +#endif + ret = encode_mbcs(&repr, p, (int)size); + + if (ret < 0) { + Py_XDECREF(repr); + return NULL; + } + +#ifdef NEED_RETRY + if (size > INT_MAX) { + p += INT_MAX; + size -= INT_MAX; + goto retry; + } +#endif + return repr; } @@ -2893,6 +3022,8 @@ NULL); } +#undef NEED_RETRY + #endif /* MS_WINDOWS */ /* --- Character Mapping Codec -------------------------------------------- */