Report syntax-error text and offset in the source file's declared encoding.

When tok->encoding is set, tok->buf is treated as UTF-8 and both the
error line and the error offset are converted back to that encoding;
if the conversion fails, the raw buffer bytes are reported as before.
Whitespace inside the hunks was restored by hand: use "patch -l" if
the context does not match your tree.

--- parsetok.c.orig	2004-09-20 18:28:01.937500000 +0900
+++ parsetok.c	2004-09-20 22:15:19.859375000 +0900
@@ -9,6 +9,7 @@
 #include "parsetok.h"
 #include "errcode.h"
 #include "graminit.h"
+#include "unicodeobject.h"
 
 int Py_TabcheckFlag;
 
@@ -16,6 +17,9 @@
 /* Forward */
 static node *parsetok(struct tok_state *, grammar *, int, perrdetail *, int);
 static void initerr(perrdetail *err_ret, const char* filename);
+#ifdef Py_USING_UNICODE
+static PyObject * dec_utf8(const char *enc, const char *text, size_t len);
+#endif
 
 /* Parse input coming from a string.  Return error code, print some errors. */
 node *
@@ -187,13 +191,49 @@
 		err_ret->lineno = tok->lineno;
 		err_ret->offset = tok->cur - tok->buf;
 		if (tok->buf != NULL) {
-			size_t len = tok->inp - tok->buf;
-			err_ret->text = (char *) PyObject_MALLOC(len + 1);
-			if (err_ret->text != NULL) {
-				if (len > 0)
-					strncpy(err_ret->text, tok->buf, len);
-				err_ret->text[len] = '\0';
-			}
+			char *text = NULL;
+			size_t len = tok->inp - tok->buf;
+#ifdef Py_USING_UNICODE
+			if (tok->encoding) {
+				/* convert source to original encoding */
+				PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);
+				if (lineobj != NULL) {
+					int linelen = PyString_Size(lineobj);
+					const char *line = PyString_AsString(lineobj);
+					/* allocate only when there is data to copy, so
+					   text never points at an unterminated buffer;
+					   a NULL text falls through to the raw copy */
+					if (line != NULL)
+						text = PyObject_MALLOC(linelen + 1);
+					if (text != NULL) {
+						if (linelen)
+							strncpy(text, line, linelen);
+						text[linelen] = '\0';
+					}
+					Py_DECREF(lineobj);
+
+					if (err_ret->offset > 1) {
+						/* adjust error offset */
+						PyObject *offsetobj = dec_utf8(tok->encoding,
+								tok->buf, err_ret->offset-1);
+						if (offsetobj) {
+							err_ret->offset = PyString_Size(offsetobj) + 1;
+							Py_DECREF(offsetobj);
+						}
+					}
+
+				}
+			}
+#endif
+			if (text == NULL) {
+				text = (char *) PyObject_MALLOC(len + 1);
+				if (text != NULL) {
+					if (len > 0)
+						strncpy(text, tok->buf, len);
+					text[len] = '\0';
+				}
+			}
+			err_ret->text = text;
 		}
 	} else if (tok->encoding != NULL) {
 		node* r = PyNode_New(encoding_decl);
@@ -220,3 +260,23 @@
 	err_ret->token = -1;
 	err_ret->expected = -1;
 }
+
+#ifdef Py_USING_UNICODE
+/* Decode LEN bytes of UTF-8 TEXT and re-encode them as ENC.
+   Returns a new reference to the encoded object, or NULL if either
+   step fails.  Callers use NULL only as a cue to fall back to the
+   raw bytes and never report the error, so the pending exception
+   is cleared here rather than leaked into unrelated code. */
+static PyObject *
+dec_utf8(const char *enc, const char *text, size_t len) {
+	PyObject *ret = NULL;
+	PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, NULL);
+	if (unicode_text) {
+		ret = PyUnicode_AsEncodedString(unicode_text, enc, NULL);
+		Py_DECREF(unicode_text);
+	}
+	if (ret == NULL)
+		PyErr_Clear();
+	return ret;
+}
+#endif