Index: Makefile.pre.in
===================================================================
RCS file: /cvsroot/python/python/dist/src/Makefile.pre.in,v
retrieving revision 1.83
diff -u -r1.83 Makefile.pre.in
--- Makefile.pre.in	8 May 2002 08:59:59 -0000	1.83
+++ Makefile.pre.in	9 May 2002 13:36:24 -0000
@@ -190,15 +190,15 @@
 		Parser/node.o \
 		Parser/parser.o \
 		Parser/parsetok.o \
-		Parser/tokenizer.o \
 		Parser/bitset.o \
 		Parser/metagrammar.o
 
-PARSER_OBJS=	$(POBJS) Parser/myreadline.o
+PARSER_OBJS=	$(POBJS) Parser/myreadline.o Parser/tokenizer.o
 
 PGOBJS=		\
 		Objects/obmalloc.o \
 		Python/mysnprintf.o \
+		Parser/tokenizer_pgen.o \
 		Parser/firstsets.o \
 		Parser/grammar.o \
 		Parser/pgen.o \
@@ -415,6 +415,8 @@
 		$(srcdir)/Include/token.h \
 		$(srcdir)/Include/grammar.h
 Parser/metagrammar.o:	$(srcdir)/Parser/metagrammar.c
+
+Parser/tokenizer_pgen.o:	$(srcdir)/Parser/tokenizer.c
 
 Python/compile.o Python/symtable.o: $(GRAMMAR_H)
 
Index: Grammar/Grammar
===================================================================
RCS file: /cvsroot/python/python/dist/src/Grammar/Grammar,v
retrieving revision 1.45
diff -u -r1.45 Grammar
--- Grammar/Grammar	15 Oct 2001 15:44:04 -0000	1.45
+++ Grammar/Grammar	9 May 2002 13:36:24 -0000
@@ -100,3 +100,6 @@
 list_iter: list_for | list_if
 list_for: 'for' exprlist 'in' testlist_safe [list_iter]
 list_if: 'if' test [list_iter]
+
+# not used in grammar, but may appear in "node" passed from Parser to Compiler
+encoding_decl: NAME
Index: Include/errcode.h
===================================================================
RCS file: /cvsroot/python/python/dist/src/Include/errcode.h,v
retrieving revision 2.14
diff -u -r2.14 errcode.h
--- Include/errcode.h	1 Sep 2000 23:29:26 -0000	2.14
+++ Include/errcode.h	9 May 2002 13:36:24 -0000
@@ -25,6 +25,7 @@
 #define E_OVERFLOW	19	/* Node had too many children */
 #define E_TOODEEP	20	/* Too many indentation levels */
 #define E_DEDENT	21	/* No matching outer block for dedent */
+#define E_DECODE	22	/* Error in decoding into Unicode */
 
 #ifdef __cplusplus
 }
Index: Include/graminit.h
===================================================================
RCS file: /cvsroot/python/python/dist/src/Include/graminit.h,v
retrieving revision 2.18
diff -u -r2.18 graminit.h
--- Include/graminit.h	15 Oct 2001 15:44:04 -0000	2.18
+++ Include/graminit.h	9 May 2002 13:36:24 -0000
@@ -64,3 +64,4 @@
 #define list_iter 319
 #define list_for 320
 #define list_if 321
+#define encoding_decl 322
Index: Parser/parsetok.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Parser/parsetok.c,v
retrieving revision 2.30
diff -u -r2.30 parsetok.c
--- Parser/parsetok.c	22 Mar 2002 23:53:03 -0000	2.30
+++ Parser/parsetok.c	9 May 2002 13:36:26 -0000
@@ -8,6 +8,7 @@
 #include "parser.h"
 #include "parsetok.h"
 #include "errcode.h"
+#include "graminit.h"
 
 int Py_TabcheckFlag;
 
@@ -36,8 +37,8 @@
 		return NULL;
 	}
 
+	tok->filename = "<string>";
 	if (Py_TabcheckFlag || Py_VerboseFlag) {
-		tok->filename = "<string>";
 		tok->altwarning = (tok->filename != NULL);
 		if (Py_TabcheckFlag >= 2)
 			tok->alterror++;
@@ -69,8 +70,8 @@
 		err_ret->error = E_NOMEM;
 		return NULL;
 	}
+	tok->filename = filename;
 	if (Py_TabcheckFlag || Py_VerboseFlag) {
-		tok->filename = filename;
 		tok->altwarning = (filename != NULL);
 		if (Py_TabcheckFlag >= 2)
 			tok->alterror++;
@@ -176,6 +177,13 @@
 				err_ret->text[len] = '\0';
 			}
 		}
-	}
+	} else if (tok->encoding != NULL) {
+		node* r = PyNode_New(encoding_decl);
+		r->n_str = tok->encoding;
+		r->n_nchildren = 1;
+		r->n_child = n;
+		tok->encoding = NULL;
+		n = r;
+	}
 
 	PyTokenizer_Free(tok);
Index: Parser/tokenizer.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Parser/tokenizer.c,v
retrieving revision 2.54
diff -u -r2.54 tokenizer.c
--- Parser/tokenizer.c	14 Apr 2002 20:12:41 -0000	2.54
+++ Parser/tokenizer.c	9 May 2002 13:36:26 -0000
@@ -5,10 +5,19 @@
 #include "pgenheaders.h"
 
 #include <ctype.h>
+#include <assert.h>
 #include "tokenizer.h"
 #include "errcode.h"
 
+#ifndef PGEN
+#include "unicodeobject.h"
+#include "stringobject.h"
+#include "fileobject.h"
+#include "codecs.h"
+#include "abstract.h"
+#endif /* PGEN */
+
 extern char *PyOS_Readline(char *);
 /* Return malloc'ed string including trailing \n;
    empty malloc'ed string for EOF;
@@ -114,9 +123,351 @@
 	tok->alterror = 0;
 	tok->alttabsize = 1;
 	tok->altindstack[0] = 0;
+	tok->decoding_state = 0;
+	tok->decoding_erred = 0;
+	tok->read_coding_spec = 0;
+	tok->issued_encoding_warning = 0;
+	tok->encoding = NULL;
+	tok->decoding_readline = NULL;
+	tok->decoding_buffer = NULL;
 	return tok;
 }
 
+#ifdef PGEN
+
+static char *
+decoding_fgets(char *s, int size, struct tok_state *tok)
+{
+	return fgets(s, size, tok->fp);
+}
+
+static int
+decoding_feof(struct tok_state *tok)
+{
+	return feof(tok->fp);
+}
+
+static const char *
+decode_str(const char *str, struct tok_state *tok)
+{
+	return str;
+}
+
+#else /* PGEN */
+
+static char *
+error_ret(struct tok_state *tok) /* XXX */
+{
+	tok->decoding_erred = 1;
+	if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
+		PyMem_DEL(tok->buf);
+	tok->buf = NULL;
+	return NULL;		/* as if it were EOF */
+}
+
+static char *
+new_string(const char *s, int len)
+{
+	char* result = PyMem_NEW(char, len + 1);
+	if (result != NULL) {
+		memcpy(result, s, len);
+		result[len] = '\0';
+	}
+	return result;
+}
+
+static char *
+get_normal_name(char *s)	/* for utf-8 and latin-1 */
+{
+	char buf[13];
+	int i;
+	for (i = 0; i < 12; i++) {
+		int c = s[i];
+		if (c == '\0') break;
+		else if (c == '_') buf[i] = '-';
+		else buf[i] = tolower(c);
+	}
+	buf[i] = '\0';
+	if (strcmp(buf, "utf-8") == 0 ||
+	    strncmp(buf, "utf-8-", 6) == 0) return "utf-8";
+	else if (strcmp(buf, "latin-1") == 0 ||
+		 strcmp(buf, "iso-8859-1") == 0 ||
+		 strcmp(buf, "iso-latin-1") == 0 ||
+		 strncmp(buf, "latin-1-", 8) == 0 ||
+		 strncmp(buf, "iso-8859-1-", 11) == 0 ||
+		 strncmp(buf, "iso-latin-1-", 12) == 0) return "iso-8859-1";
+	else return s;
+}
+
+static char *
+get_coding_spec(const char *s, int size)
+{
+	int i;
+	for (i = 0; i < size - 6; i++) { /* XXX inefficient search */
+		const char* t = s + i;
+		if (strncmp(t, "coding", 6) == 0) {
+			const char* begin = NULL;
+			t += 6;
+			if (t[0] != ':' && t[0] != '=') continue;
+			do t++; while (t[0] == '\x20' || t[0] == '\t');
+			begin = t;
+			while (isalnum(t[0]) || t[0] == '-' || t[0] == '_' ||
+			       t[0] == '.') t++;
+			if (begin < t) {
+				char* r = new_string(begin, t - begin);
+				char* q = get_normal_name(r);
+				if (r != q) {
+					assert(strlen(r) >= strlen(q));
+					strcpy(r, q);
+				}
+				return r;
+			}
+		}
+	}
+	return NULL;
+}
+
+static int
+check_coding_spec(const char* line, int size, struct tok_state *tok,
+		  int set_readline(struct tok_state *, const char *))
+{
+	int r = 1;
+	char* cs = get_coding_spec(line, size);
+	if (cs != NULL) {
+		tok->read_coding_spec = 1;
+		if (tok->encoding == NULL) {
+			assert(tok->decoding_state == 1); /* raw */
+			if (strcmp(cs, "utf-8") == 0 ||
+			    strcmp(cs, "iso-8859-1") == 0) {
+				tok->encoding = cs;
+			} else {
+				r = set_readline(tok, cs);
+				if (r) {
+					tok->encoding = cs;
+					tok->decoding_state = -1;
+				}
+			}
+		} else {	/* then, compare cs with BOM */
+			r = (strcmp(tok->encoding, cs) == 0);
+			PyMem_DEL(cs);
+		}
+	}
+	return r;
+}
+
+static int
+check_bom(int get_char(struct tok_state *),
+	  void unget_char(int, struct tok_state *),
+	  int set_readline(struct tok_state *, const char *),
+	  struct tok_state *tok)
+{
+	int ch = get_char(tok);
+	tok->decoding_state = 1;
+	if (ch == EOF) {
+		return 1;
+	} else if (ch == 0xEF) {
+		ch = get_char(tok); if (ch != 0xBB) goto NON_BOM;
+		ch = get_char(tok); if (ch != 0xBF) goto NON_BOM;
+	} else if (ch == 0xFE) {
+		ch = get_char(tok); if (ch != 0xFF) goto NON_BOM;
+		if (!set_readline(tok, "utf-16-be")) return 0;
+		tok->decoding_state = -1;
+	} else if (ch == 0xFF) {
+		ch = get_char(tok); if (ch != 0xFE) goto NON_BOM;
+		if (!set_readline(tok, "utf-16-le")) return 0;
+		tok->decoding_state = -1;
+	} else {
+		unget_char(ch, tok);
+		return 1;
+	}
+	tok->encoding = new_string("utf-8", 5);	/* resulting string is in utf-8 */
+	return 1;
+  NON_BOM:
+	/* any token beginning with '\xEF', '\xFE', '\xFF' is a bad token */
+	unget_char(0xFF, tok);	/* XXX this will cause a syntax error */
+	return 1;
+}
+
+static char *
+fp_readl(char *s, int size, struct tok_state *tok)
+{
+	PyObject* utf8;
+	PyObject* bf = tok->decoding_buffer;
+	if (bf == NULL) {
+		bf = PyObject_CallObject(tok->decoding_readline, NULL);
+		if (bf == NULL) return error_ret(tok);
+	} else {
+		tok->decoding_buffer = NULL;
+	}
+	utf8 = PyUnicode_AsUTF8String(bf);
+	Py_DECREF(bf);
+	if (utf8 == NULL) return error_ret(tok);
+	else {
+		const char* str = PyString_AsString(utf8);
+		assert(strlen(str) < size); /* XXX */
+		strcpy(s, str);
+		Py_DECREF(utf8);
+		if (s[0] == '\0') return NULL; /* EOF */
+		return s;
+	}
+}
+
+static int
+fp_setreadl(struct tok_state *tok, const char* enc)
+{
+	PyObject *reader, *stream, *readline;
+
+	stream = PyFile_FromFile(tok->fp, tok->filename, "rb", NULL);
+	if (stream == NULL) return 0;
+
+	reader = PyCodec_StreamReader(enc, stream, NULL);
+	Py_DECREF(stream);
+	if (reader == NULL) return 0;
+
+	readline = PyObject_GetAttrString(reader, "readline");
+	Py_DECREF(reader);
+	if (readline == NULL) return 0;
+
+	tok->decoding_readline = readline;
+	return 1;
+}
+
+static int fp_getc(struct tok_state *tok) {
+	return getc(tok->fp);
+}
+
+static void fp_ungetc(int c, struct tok_state *tok) {
+	ungetc(c, tok->fp);
+}
+
+static char *
+decoding_fgets(char *s, int size, struct tok_state *tok)
+{
+	char *line;
+	int warn = 0, badchar = 0;
+	for (;;)
+		if (tok->decoding_state < 0) {
+			line = fp_readl(s, size, tok);
+			break;
+		} else if (tok->decoding_state > 0) {
+			line = Py_UniversalNewlineFgets(s, size,
+							tok->fp, NULL);
+			warn = 1;
+			break;
+		} else {
+			if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
+				return error_ret(tok);
+			assert(tok->decoding_state != 0);
+		}
+	if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
+		if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
+			return error_ret(tok);
+		}
+	}
+#ifndef PGEN
+	if (warn && line && !tok->issued_encoding_warning && !tok->encoding) {
+		unsigned char *c;
+		for (c = line; *c; c++)
+			if (*c > 127) {
+				badchar = *c;
+				break;
+			}
+	}
+	if (badchar) {
+		char buf[200];
+		sprintf(buf, "Non-ASCII character '\\x%.2x', "
+			"but no declared encoding", badchar);
+		PyErr_WarnExplicit(PyExc_DeprecationWarning,
+				   buf, tok->filename, tok->lineno,
+				   NULL, NULL);
+		tok->issued_encoding_warning = 1;
+	}
+#endif
+	return line;
+}
+
+static int
+decoding_feof(struct tok_state *tok)
+{
+	if (tok->decoding_state >= 0) {
+		return feof(tok->fp);
+	} else {
+		PyObject* bf = tok->decoding_buffer;
+		if (bf == NULL) {
+			bf = PyObject_CallObject(tok->decoding_readline, NULL);
+			if (bf == NULL) {
+				error_ret(tok);
+				return 1;
+			} else {
+				tok->decoding_buffer = bf;
+			}
+		}
+		return PyObject_Length(bf) == 0;
+	}
+}
+
+static int bf_getc(struct tok_state *tok) {
+	return *tok->str++;
+}
+static void bf_ungetc(int c, struct tok_state *tok) {
+	tok->str--;
+	assert(*tok->str == c);	/* tok->cur may point to read-only segment */
+}
+static int bf_setreadl(struct tok_state *tok, const char* enc) {
+	tok->enc = enc;
+	return 1;
+}
+static PyObject *
+translate_into_utf8(const char* str, const char* enc) {
+	PyObject *utf8;
+	PyObject* bf = PyUnicode_Decode(str, strlen(str), enc, NULL);
+	if (bf == NULL)
+		return NULL;
+	utf8 = PyUnicode_AsUTF8String(bf);
+	Py_DECREF(bf);
+	return utf8;
+}
+static const char *
+decode_str(const char *str, struct tok_state *tok)
+{
+	PyObject* utf8 = NULL;
+	const char *s;
+	int lineno = 0;
+	tok->enc = NULL;
+	tok->str = str;
+	if (!check_bom(bf_getc, bf_ungetc, bf_setreadl, tok))
+		return NULL;
+	str = tok->str;		/* string after BOM if any */
+	assert(str);
+	if (tok->enc != NULL) {
+		utf8 = translate_into_utf8(str, tok->enc);
+		if (utf8 == NULL)
+			return NULL;
+		str = PyString_AsString(utf8);
+	}
+	for (s = str;; s++) {
+		if (*s == '\0') break;
+		else if (*s == '\n') {
+			lineno++;
+			if (lineno == 2) break;
+		}
+	}
+	tok->enc = NULL;
+	if (!check_coding_spec(str, s - str, tok, bf_setreadl))
+		return NULL;
+	if (tok->enc != NULL) {
+		assert(utf8 == NULL);
+		utf8 = translate_into_utf8(str, tok->enc);
+		if (utf8 == NULL)
+			return NULL;
+		str = PyString_AsString(utf8);
+	}
+	assert(tok->decoding_buffer == NULL);
+	tok->decoding_buffer = utf8; /* CAUTION */
+	return str;
+}
+
+#endif /* PGEN */
 
 /* Set up tokenizer for string */
 
@@ -126,6 +477,9 @@
 	struct tok_state *tok = tok_new();
 	if (tok == NULL)
 		return NULL;
+	str = (char *)decode_str(str, tok);
+	if (str == NULL)
+		return NULL;
 	tok->buf = tok->cur = tok->end = tok->inp = str;
 	return tok;
 }
@@ -157,6 +511,10 @@
 void
 PyTokenizer_Free(struct tok_state *tok)
 {
+	if (tok->encoding != NULL)
+		PyMem_DEL(tok->encoding);
+	Py_XDECREF(tok->decoding_readline);
+	Py_XDECREF(tok->decoding_buffer);
 	if (tok->fp != NULL && tok->buf != NULL)
 		PyMem_DEL(tok->buf);
 	PyMem_DEL(tok);
@@ -246,8 +604,8 @@
 			}
 			tok->end = tok->buf + BUFSIZ;
 		}
-		if (Py_UniversalNewlineFgets(tok->buf, (int)(tok->end - tok->buf),
-					  tok->fp, NULL) == NULL) {
+		if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
+			       tok) == NULL) {
 			tok->done = E_EOF;
 			done = 1;
 		}
@@ -259,7 +617,7 @@
 		}
 		else {
 			cur = tok->cur - tok->buf;
-			if (feof(tok->fp)) {
+			if (decoding_feof(tok)) {
 				tok->done = E_EOF;
 				done = 1;
 			}
@@ -285,9 +643,9 @@
 			tok->end = tok->buf + newsize;
 			tok->start = curstart < 0 ? NULL :
 				     tok->buf + curstart;
-			if (Py_UniversalNewlineFgets(tok->inp,
+			if (decoding_fgets(tok->inp,
 				       (int)(tok->end - tok->inp),
-				       tok->fp, NULL) == NULL) {
+				       tok) == NULL) {
 				/* Last line does not end in \n,
 				   fake one */
 				strcpy(tok->inp, "\n");
@@ -506,9 +864,8 @@
 
 /* Get next token, after space stripping etc. */
 
-int
-PyTokenizer_Get(register struct tok_state *tok, char **p_start,
-	   char **p_end)
+static int
+tok_get(register struct tok_state *tok, char **p_start, char **p_end)
 {
 	register int c;
 	int blankline;
@@ -915,6 +1272,16 @@
 	return PyToken_OneChar(c);
 }
 
+int
+PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
+{
+	int result = tok_get(tok, p_start, p_end);
+	if (tok->decoding_erred) {
+		result = ERRORTOKEN;
+		tok->done = E_DECODE;
+	}
+	return result;
+}
 
 #ifdef Py_DEBUG
 
Index: Parser/tokenizer.h
===================================================================
RCS file: /cvsroot/python/python/dist/src/Parser/tokenizer.h,v
retrieving revision 2.16
diff -u -r2.16 tokenizer.h
--- Parser/tokenizer.h	1 Sep 2000 23:29:28 -0000	2.16
+++ Parser/tokenizer.h	9 May 2002 13:36:26 -0000
@@ -4,6 +4,7 @@
 extern "C" {
 #endif
 
+#include "object.h"
 
 /* Tokenizer interface */
 
@@ -38,6 +39,16 @@
 	int alterror;	/* Issue error if alternate tabs don't match */
 	int alttabsize;	/* Alternate tab spacing */
 	int altindstack[MAXINDENT];	/* Stack of alternate indents */
+	/* Stuff for PEP 0263 */
+	int decoding_state;	/* -1:decoding, 0:init, 1:raw */
+	int decoding_erred;	/* whether erred in decoding */
+	int read_coding_spec;	/* whether 'coding:...' has been read */
+	int issued_encoding_warning; /* whether non-ASCII warning was issued */
+	char *encoding;
+	PyObject *decoding_readline; /* codecs.open(...).readline */
+	PyObject *decoding_buffer;
+	const char* enc;
+	const char* str;
 };
 
 extern struct tok_state *PyTokenizer_FromString(char *);
Index: Python/bltinmodule.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Python/bltinmodule.c,v
retrieving revision 2.254
diff -u -r2.254 bltinmodule.c
--- Python/bltinmodule.c	29 Apr 2002 21:27:32 -0000	2.254
+++ Python/bltinmodule.c	9 May 2002 13:36:27 -0000
@@ -1453,19 +1453,26 @@
 			result = PyString_FromStringAndSize(s, (int)(len-1));
 		}
 	}
-		PyMem_FREE(s);
-		return result;
-	}
-	if (v != NULL) {
 		f = PySys_GetObject("stdout");
 		if (f == NULL) {
 			PyErr_SetString(PyExc_RuntimeError, "lost sys.stdout");
 			return NULL;
 		}
+		PyFile_SoftSpace(f, 0);
+		PyMem_FREE(s);
+		return result;
+	}
+	f = PySys_GetObject("stdout");
+	if (f == NULL) {
+		PyErr_SetString(PyExc_RuntimeError, "lost sys.stdout");
+		return NULL;
+	}
+	if (v != NULL) {
 		if (Py_FlushLine() != 0 ||
 		    PyFile_WriteObject(v, f, Py_PRINT_RAW) != 0)
 			return NULL;
 	}
+	PyFile_SoftSpace(f, 0);
 	f = PySys_GetObject("stdin");
 	if (f == NULL) {
 		PyErr_SetString(PyExc_RuntimeError, "lost sys.stdin");
 		return NULL;
 	}
Index: Python/compile.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Python/compile.c,v
retrieving revision 2.242
diff -u -r2.242 compile.c
--- Python/compile.c	26 Apr 2002 01:57:19 -0000	2.242
+++ Python/compile.c	9 May 2002 13:36:29 -0000
@@ -422,6 +422,7 @@
 	int c_closure;		/* Is nested w/freevars? */
 	struct symtable *c_symtable; /* pointer to module symbol table */
 	PyFutureFeatures *c_future; /* pointer to module's __future__ */
+	char *c_encoding;	/* source encoding (a borrowed reference) */
 };
 
 static int
@@ -1119,6 +1120,23 @@
 }
 
 static PyObject *
+decode_utf8(char **sPtr, char *end, char* encoding)
+{
+	PyObject *u, *v;
+	char *s, *t;
+	t = s = *sPtr;
+	/* while (s < end && *s != '\\') s++; */ /* inefficient for u".." */
+	while (s < end && (*s & 0x80)) s++;
+	*sPtr = s;
+	u = PyUnicode_DecodeUTF8(t, s - t, NULL);
+	if (u == NULL)
+		return NULL;
+	v = PyUnicode_AsEncodedString(u, encoding, NULL);
+	Py_DECREF(u);
+	return v;
+}
+
+static PyObject *
 parsestr(struct compiling *com, char *s)
 {
 	PyObject *v;
@@ -1130,6 +1148,8 @@
 	int first = *s;
 	int quote = first;
 	int rawmode = 0;
+	char* encoding = ((com == NULL) ? NULL : com->c_encoding);
+	int need_encoding;
 #ifdef Py_USING_UNICODE
 	int unicode = 0;
 #endif
@@ -1174,28 +1194,101 @@
 	}
 #ifdef Py_USING_UNICODE
 	if (unicode || Py_UnicodeFlag) {
+		PyObject *u, *w;
+		if (encoding == NULL) {
+			buf = s;
+			u = NULL;
+		} else if (strcmp(encoding, "iso-8859-1") == 0) {
+			buf = s;
+			u = NULL;
+		} else {
+			/* "\XX" may become "\u005c\uHHLL" (12 bytes) */
+			u = PyString_FromStringAndSize((char *)NULL, len * 4);
+			if (u == NULL)
+				return NULL;
+			p = buf = PyString_AsString(u);
+			end = s + len;
+			while (s < end) {
+				if (*s == '\\') {
+					*p++ = *s++;
+					if (*s & 0x80) {
+						strcpy(p, "u005c");
+						p += 5;
+					}
+				}
+				if (*s & 0x80) { /* XXX inefficient */
+					char *r;
+					int rn, i;
+					w = decode_utf8(&s, end, "utf-16-be");
+					if (w == NULL) {
+						Py_DECREF(u);
+						return NULL;
+					}
+					r = PyString_AsString(w);
+					rn = PyString_Size(w);
+					assert(rn % 2 == 0);
+					for (i = 0; i < rn; i += 2) {
+						sprintf(p, "\\u%02x%02x",
+							r[i + 0] & 0xFF,
+							r[i + 1] & 0xFF);
+						p += 6;
+					}
+					Py_DECREF(w);
+				} else {
+					*p++ = *s++;
+				}
+			}
+			len = p - buf;
+		}
 		if (rawmode)
-			v = PyUnicode_DecodeRawUnicodeEscape(
-				s, len, NULL);
+			v = PyUnicode_DecodeRawUnicodeEscape(buf, len, NULL);
 		else
-			v = PyUnicode_DecodeUnicodeEscape(
-				s, len, NULL);
+			v = PyUnicode_DecodeUnicodeEscape(buf, len, NULL);
+		Py_XDECREF(u);
 		if (v == NULL)
 			PyErr_SyntaxLocation(com->c_filename, com->c_lineno);
 		return v;
 	}
 #endif
-	if (rawmode || strchr(s, '\\') == NULL)
-		return PyString_FromStringAndSize(s, len);
-	v = PyString_FromStringAndSize((char *)NULL, len);
+	need_encoding = (encoding != NULL &&
+			 strcmp(encoding, "utf-8") != 0 &&
+			 strcmp(encoding, "iso-8859-1") != 0);
+	if (rawmode || strchr(s, '\\') == NULL) {
+		if (need_encoding) {
+			PyObject* u = PyUnicode_DecodeUTF8(s, len, NULL);
+			if (u == NULL)
+				return NULL;
+			v = PyUnicode_AsEncodedString(u, encoding, NULL);
+			Py_DECREF(u);
+			return v;
+		} else {
+			return PyString_FromStringAndSize(s, len);
+		}
+	}
+	v = PyString_FromStringAndSize((char *)NULL, /* XXX 4 is enough? */
+				       need_encoding ? len * 4 : len);
 	if (v == NULL)
 		return NULL;
 	p = buf = PyString_AsString(v);
 	end = s + len;
 	while (s < end) {
 		if (*s != '\\') {
-			*p++ = *s++;
+		  ORDINAL:
+			if (need_encoding && (*s & 0x80)) {
+				char *r;
+				int rn;
+				PyObject* w = decode_utf8(&s, end, encoding);
+				if (w == NULL)
+					return NULL;
+				r = PyString_AsString(w);
+				rn = PyString_Size(w);
+				memcpy(p, r, rn);
+				p += rn;
+				Py_DECREF(w);
+			} else {
+				*p++ = *s++;
+			}
 			continue;
 		}
 		s++;
@@ -1252,8 +1345,8 @@
 			return NULL;
 		default:
 			*p++ = '\\';
-			*p++ = s[-1];
-			break;
+			s--;
+			goto ORDINAL;
 		}
 	}
 	_PyString_Resize(&v, (int)(p - buf));
@@ -4075,6 +4168,12 @@
 	PyCodeObject *co;
 	if (!com_init(&sc, filename))
 		return NULL;
+	if (TYPE(n) == encoding_decl) {
+		sc.c_encoding = STR(n);
+		n = CHILD(n, 0);
+	} else {
+		sc.c_encoding = NULL;
+	}
 	if (base) {
 		sc.c_private = base->c_private;
 		sc.c_symtable = base->c_symtable;
@@ -4083,6 +4182,10 @@
 		    || (sc.c_symtable->st_cur->ste_type == TYPE_FUNCTION))
 			sc.c_nested = 1;
 		sc.c_flags |= base->c_flags & PyCF_MASK;
+		if (base->c_encoding != NULL) {
+			assert(sc.c_encoding == NULL);
+			sc.c_encoding = base->c_encoding;
+		}
 	} else {
 		sc.c_private = NULL;
 		sc.c_future = PyNode_Future(n, filename);
Index: Python/pythonrun.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Python/pythonrun.c,v
retrieving revision 2.160
diff -u -r2.160 pythonrun.c
--- Python/pythonrun.c	23 Apr 2002 20:31:01 -0000	2.160
+++ Python/pythonrun.c	9 May 2002 13:36:30 -0000
@@ -1201,6 +1201,7 @@
 err_input(perrdetail *err)
 {
 	PyObject *v, *w, *errtype;
+	PyObject* u = NULL;
 	char *msg = NULL;
 	errtype = PyExc_SyntaxError;
 	v = Py_BuildValue("(ziiz)", err->filename,
@@ -1252,12 +1253,24 @@
 		errtype = PyExc_IndentationError;
 		msg = "too many levels of indentation";
 		break;
+	case E_DECODE: {	/* XXX */
+		PyThreadState* tstate = PyThreadState_Get();
+		PyObject* value = tstate->curexc_value;
+		if (value != NULL) {
+			u = PyObject_Repr(value);
+			if (u != NULL) {
+				msg = PyString_AsString(u);
+				break;
+			}
+		}
+	}
 	default:
 		fprintf(stderr, "error=%d\n", err->error);
 		msg = "unknown parsing error";
 		break;
 	}
 	w = Py_BuildValue("(sO)", msg, v);
+	Py_XDECREF(u);
 	Py_XDECREF(v);
 	PyErr_SetObject(errtype, w);
 	Py_XDECREF(w);
--- /dev/null	Sat Mar 23 20:46:34 2002
+++ Parser/tokenizer_pgen.c	Thu May  9 13:03:27 2002
@@ -0,0 +1,2 @@
+#define PGEN
+#include "tokenizer.c"
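
For illustration only (not part of the patch): the kind of source this change is meant to accept, written as a small Python 2 module. The file contents and literals below are made up; the comments only summarize the mechanism in the patch, under the assumption that iso-8859-1 is the declared coding.

# -*- coding: iso-8859-1 -*-
# The tokenizer finds the "coding:" spec in one of the first two lines
# (get_coding_spec/check_coding_spec), parsetok.c wraps the parse tree in an
# encoding_decl node carrying the name, and compile.c's parsestr() then uses
# it: u"..." literals are decoded via the declared coding, while plain 8-bit
# literals are left untouched for utf-8/iso-8859-1 and re-encoded otherwise.

title  = "Fußball"        # plain string: kept as iso-8859-1 bytes
utitle = u"Fußball"       # Unicode literal: decoded using the declared coding
print title, utitle

A UTF-8 or UTF-16 byte order mark at the start of the file (handled by check_bom) would have the same effect as an explicit utf-8 declaration; without either, non-ASCII bytes only trigger the DeprecationWarning added in decoding_fgets.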