Index: Grammar/Grammar =================================================================== RCS file: /cvsroot/python/python/dist/src/Grammar/Grammar,v retrieving revision 1.45 diff -c -r1.45 Grammar *** Grammar/Grammar 15 Oct 2001 15:44:04 -0000 1.45 --- Grammar/Grammar 21 Mar 2002 10:16:54 -0000 *************** *** 100,102 **** --- 100,105 ---- list_iter: list_for | list_if list_for: 'for' exprlist 'in' testlist_safe [list_iter] list_if: 'if' test [list_iter] + + # not used in grammar, but may appear as the root of the tree + encoding_decl: NAME \ No newline at end of file Index: Include/unicodeobject.h =================================================================== RCS file: /cvsroot/python/python/dist/src/Include/unicodeobject.h,v retrieving revision 2.36 diff -c -r2.36 unicodeobject.h *** Include/unicodeobject.h 19 Oct 2001 02:01:31 -0000 2.36 --- Include/unicodeobject.h 21 Mar 2002 10:16:55 -0000 *************** *** 718,723 **** --- 718,730 ---- const char *errors /* error handling */ ); + extern DL_IMPORT(PyObject*) PyUnicode_DecodeUnicodeEscapeSize( + const char *string, /* Unicode-Escape encoded string */ + int length, /* size of string */ + const char *errors, /* error handling */ + int itemsize + ); + extern DL_IMPORT(PyObject*) PyUnicode_AsUnicodeEscapeString( PyObject *unicode /* Unicode object */ ); *************** *** 733,738 **** --- 740,752 ---- const char *string, /* Raw-Unicode-Escape encoded string */ int length, /* size of string */ const char *errors /* error handling */ + ); + + extern DL_IMPORT(PyObject*) PyUnicode_DecodeRawUnicodeEscapeSize( + const char *string, /* Raw-Unicode-Escape encoded string */ + int length, /* size of string */ + const char *errors, /* error handling */ + int itemsize ); extern DL_IMPORT(PyObject*) PyUnicode_AsRawUnicodeEscapeString( Index: Objects/unicodeobject.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v retrieving 
revision 2.131 diff -c -r2.131 unicodeobject.c *** Objects/unicodeobject.c 21 Mar 2002 08:55:28 -0000 2.131 --- Objects/unicodeobject.c 21 Mar 2002 10:16:58 -0000 *************** *** 1548,1559 **** --- 1548,1572 ---- int size, const char *errors) { + return PyUnicode_DecodeUnicodeEscapeSize(s, size, errors, 1); + } + + PyObject *PyUnicode_DecodeUnicodeEscapeSize(const char *s, + int size, + const char *errors, + int itemsize) + { PyUnicodeObject *v; Py_UNICODE *p, *buf; const char *end; char* message; + char* narrow = NULL; Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ + #define ITEM(p) ((itemsize == 1) ? (*(unsigned char*)(p)) : (*(Py_UNICODE*)(p))) + #define ITEMINC(p) (((p)+=itemsize),ITEM((p)-itemsize)) + + assert(itemsize == 1 || itemsize == sizeof(Py_UNICODE)); /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the length after conversion to the true value. */ *************** *** 1564,1585 **** return (PyObject *)v; p = buf = PyUnicode_AS_UNICODE(v); ! end = s + size; while (s < end) { ! unsigned char c; Py_UNICODE x; int i, digits; /* Non-escape characters are interpreted as Unicode ordinals */ ! if (*s != '\\') { ! *p++ = (unsigned char) *s++; continue; } /* \ - Escapes */ ! s++; ! switch (*s++) { /* \x escapes */ case '\n': break; --- 1577,1598 ---- return (PyObject *)v; p = buf = PyUnicode_AS_UNICODE(v); ! end = s + size * itemsize; while (s < end) { ! Py_UNICODE c; Py_UNICODE x; int i, digits; /* Non-escape characters are interpreted as Unicode ordinals */ ! if (ITEM(s) != '\\') { ! *p++ = ITEMINC(s); continue; } /* \ - Escapes */ ! s += itemsize; ! switch (ITEMINC(s)) { /* \x escapes */ case '\n': break; *************** *** 1597,1607 **** /* \OOO (octal) escapes */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': ! x = s[-1] - '0'; ! if ('0' <= *s && *s <= '7') { ! x = (x<<3) + *s++ - '0'; ! if ('0' <= *s && *s <= '7') ! 
x = (x<<3) + *s++ - '0';
              }
              *p++ = x;
              break;
--- 1610,1620 ----
          /* \OOO (octal) escapes */
          case '0': case '1': case '2': case '3':
          case '4': case '5': case '6': case '7':
!             x = ITEM(s-itemsize) - '0';
!             if ('0' <= ITEM(s) && ITEM(s) <= '7') {
!                 x = (x<<3) + ITEMINC(s) - '0';
!                 if ('0' <= ITEM(s) && ITEM(s) <= '7')
!                     x = (x<<3) + ITEMINC(s) - '0';
              }
              *p++ = x;
              break;
***************
*** 1626,1648 ****
          hexescape:
              chr = 0;
              for (i = 0; i < digits; i++) {
!                 c = (unsigned char) s[i];
!                 if (!isxdigit(c)) {
!                     if (unicodeescape_decoding_error(&p, errors, message))
!                         goto onError;
!                     chr = 0xffffffff;
!                     i++;
!                     break;
!                 }
                  chr = (chr<<4) & ~0xF;
                  if (c >= '0' && c <= '9')
                      chr += c - '0';
                  else if (c >= 'a' && c <= 'f')
                      chr += 10 + c - 'a';
!                 else
                      chr += 10 + c - 'A';
!             }
!             s += i;
              if (chr == 0xffffffff)
                  /* _decoding_error will have already written into the
                     target buffer. */
--- 1639,1661 ----
          hexescape:
              chr = 0;
              for (i = 0; i < digits; i++) {
!                 c = ITEM(s + i * itemsize); /* s[i] */
                  chr = (chr<<4) & ~0xF;
                  if (c >= '0' && c <= '9')
                      chr += c - '0';
                  else if (c >= 'a' && c <= 'f')
                      chr += 10 + c - 'a';
!                 else if (c >= 'A' && c <= 'F')
                      chr += 10 + c - 'A';
!                 else {
!                     if (unicodeescape_decoding_error(&p, errors, message))
!                         goto onError;
!                     chr = 0xffffffff;
!                     i++;
!                     break;
!                 }
!             }
!             s += i * itemsize;
              if (chr == 0xffffffff)
                  /* _decoding_error will have already written into the
                     target buffer. */
***************
*** 1690,1707 ****
                      goto ucnhashError;
              }
!             if (*s == '{') {
!                 const char *start = s+1;
                  /* look for the closing brace */
!                 while (*s != '}' && s < end)
!                     s++;
!                 if (s > start && s < end && *s == '}') {
                      /* found a name. look it up in the unicode database */
                      message = "unknown Unicode character name";
!                     s++;
!                     if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
                          goto store;
                  }
              }
              if (unicodeescape_decoding_error(&p, errors, message))
                  goto onError;
              break;
--- 1703,1738 ----
                      goto ucnhashError;
              }
!             if (ITEM(s) == '{') {
!                 const char *start = s + itemsize;
                  /* look for the closing brace */
!                 while (s < end && ITEM(s) != '}')
!                     s += itemsize;
!                 if (s > start && s < end && ITEM(s) == '}') {
                      /* found a name. look it up in the unicode database */
+                     i = (s - start) / itemsize; /* name length in items, taken before start is redirected */
+                     if (itemsize != 1) { /* wide input: getcode() is handed a char buffer */
+                         int k;
+                         if (narrow)
+                             PyMem_DEL(narrow);
+                         narrow = PyMem_NEW(char, i);
+                         if (!narrow) {
+                             message = "out of memory";
+                             goto malformed_ucn;
+                         }
+                         for (k = 0; k < i; k++) {
+                             if (ITEM(start + k * itemsize) > 127) /* non-ASCII cannot be narrowed */
+                                 goto malformed_ucn;
+                             narrow[k] = (char) ITEM(start + k * itemsize);
+                         }
+                         start = narrow;
+                     }
                      message = "unknown Unicode character name";
!                     s += itemsize;
!                     if (ucnhash_CAPI->getcode(start, i, &chr))
                          goto store;
                  }
              }
+         malformed_ucn:
              if (unicodeescape_decoding_error(&p, errors, message))
                  goto onError;
              break;
***************
*** 1713,1725 ****
              }
              else {
                  *p++ = '\\';
!                 *p++ = (unsigned char)s[-1];
              }
              break;
          }
      }
      if (_PyUnicode_Resize(&v, (int)(p - buf)))
!         goto onError;
      return (PyObject *)v;
  ucnhashError:
--- 1744,1758 ----
              }
              else {
                  *p++ = '\\';
!                 *p++ = ITEM(s - itemsize);
              }
              break;
          }
      }
      if (_PyUnicode_Resize(&v, (int)(p - buf)))
!         goto onError; /* onError releases narrow, so it must still be live here */
!     if (narrow)
!         PyMem_DEL(narrow);
      return (PyObject *)v;
  ucnhashError:
***************
*** 1731,1737 ****
--- 1764,1773 ----
  onError:
      Py_XDECREF(v);
+     if (narrow)
+         PyMem_DEL(narrow);
      return NULL;
+ #undef ITEM
  }
  /* Return a Unicode-Escape string version of the Unicode object.
***************
*** 1900,1910 ****
--- 1936,1957 ----
                                               int size,
                                               const char *errors)
  {
+     return PyUnicode_DecodeRawUnicodeEscapeSize(s, size, errors, 1);
+ }
+
+
+ PyObject *PyUnicode_DecodeRawUnicodeEscapeSize(const char *s,
+                                                int size,
+                                                const char *errors,
+                                                int itemsize)
+ {
      PyUnicodeObject *v;
      Py_UNICODE *p, *buf;
      const char *end;
      const char *bs;
+ #define ITEM(p) ((itemsize == 1) ?
(*(unsigned char*)(p)) : (*(Py_UNICODE*)(p))) + assert(itemsize == 1 || itemsize == sizeof(Py_UNICODE)); /* Escaped strings will always be longer than the resulting Unicode string, so we start with size here and then reduce the length after conversion to the true value. */ *************** *** 1914,1928 **** if (size == 0) return (PyObject *)v; p = buf = PyUnicode_AS_UNICODE(v); ! end = s + size; while (s < end) { ! unsigned char c; Py_UCS4 x; int i; /* Non-escape characters are interpreted as Unicode ordinals */ ! if (*s != '\\') { ! *p++ = (unsigned char)*s++; continue; } --- 1961,1976 ---- if (size == 0) return (PyObject *)v; p = buf = PyUnicode_AS_UNICODE(v); ! end = s + size * itemsize; while (s < end) { ! Py_UNICODE c; Py_UCS4 x; int i; /* Non-escape characters are interpreted as Unicode ordinals */ ! if (ITEM(s) != '\\') { ! *p++ = ITEM(s); ! s += itemsize; continue; } *************** *** 1930,1951 **** backslashes if odd */ bs = s; for (;s < end;) { ! if (*s != '\\') break; ! *p++ = (unsigned char)*s++; } ! if (((s - bs) & 1) == 0 || s >= end || ! *s != 'u') { continue; } p--; ! s++; /* \uXXXX with 4 hex digits */ for (x = 0, i = 0; i < 4; i++) { ! c = (unsigned char)s[i]; ! if (!isxdigit(c)) { if (unicodeescape_decoding_error(&p, errors, "truncated \\uXXXX")) goto onError; --- 1978,2007 ---- backslashes if odd */ bs = s; for (;s < end;) { ! if (ITEM(s) != '\\') break; ! *p++ = ITEM(s); ! s += itemsize; } ! if ((((s - bs) / itemsize) & 1) == 0 || s >= end || ! ITEM(s) != 'u') { continue; } p--; ! s += itemsize; /* \uXXXX with 4 hex digits */ for (x = 0, i = 0; i < 4; i++) { ! c = ITEM(s + i*itemsize); /* s[i] */ ! x = (x<<4) & ~0xF; ! if (c >= '0' && c <= '9') ! x += c - '0'; ! else if (c >= 'a' && c <= 'f') ! x += 10 + c - 'a'; ! else if (c >= 'A' && c <= 'F') ! x += 10 + c - 'A'; ! 
else { if (unicodeescape_decoding_error(&p, errors, "truncated \\uXXXX")) goto onError; *************** *** 1953,1967 **** i++; break; } - x = (x<<4) & ~0xF; - if (c >= '0' && c <= '9') - x += c - '0'; - else if (c >= 'a' && c <= 'f') - x += 10 + c - 'a'; - else - x += 10 + c - 'A'; } ! s += i; if (x != 0xffffffff) *p++ = x; } --- 2009,2016 ---- i++; break; } } ! s += i * itemsize; if (x != 0xffffffff) *p++ = x; } *************** *** 1972,1977 **** --- 2021,2027 ---- onError: Py_XDECREF(v); return NULL; + #undef ITEM } PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, Index: Parser/parsetok.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Parser/parsetok.c,v retrieving revision 2.29 diff -c -r2.29 parsetok.c *** Parser/parsetok.c 5 Jan 2002 21:40:08 -0000 2.29 --- Parser/parsetok.c 21 Mar 2002 10:16:58 -0000 *************** *** 36,43 **** return NULL; } if (Py_TabcheckFlag || Py_VerboseFlag) { - tok->filename = ""; tok->altwarning = (tok->filename != NULL); if (Py_TabcheckFlag >= 2) tok->alterror++; --- 36,43 ---- return NULL; } + tok->filename = ""; if (Py_TabcheckFlag || Py_VerboseFlag) { tok->altwarning = (tok->filename != NULL); if (Py_TabcheckFlag >= 2) tok->alterror++; *************** *** 69,76 **** err_ret->error = E_NOMEM; return NULL; } if (Py_TabcheckFlag || Py_VerboseFlag) { - tok->filename = filename; tok->altwarning = (filename != NULL); if (Py_TabcheckFlag >= 2) tok->alterror++; --- 69,76 ---- err_ret->error = E_NOMEM; return NULL; } + tok->filename = filename; if (Py_TabcheckFlag || Py_VerboseFlag) { tok->altwarning = (filename != NULL); if (Py_TabcheckFlag >= 2) tok->alterror++; *************** *** 170,175 **** --- 170,180 ---- err_ret->text[len] = '\0'; } } + } + else if (tok->encoding) { + node* _Py_AddEncoding(node*, char *); + n = _Py_AddEncoding(n, tok->encoding); + tok->encoding = 0; /* XXX PyMem */ } PyTokenizer_Free(tok); Index: Parser/pgenmain.c 
=================================================================== RCS file: /cvsroot/python/python/dist/src/Parser/pgenmain.c,v retrieving revision 2.24 diff -c -r2.24 pgenmain.c *** Parser/pgenmain.c 11 Sep 2001 16:43:16 -0000 2.24 --- Parser/pgenmain.c 21 Mar 2002 10:16:58 -0000 *************** *** 22,27 **** --- 22,39 ---- int Py_DebugFlag; int Py_VerboseFlag; + struct node* + _Py_AddEncoding(struct node* n, char* encoding) + { + return n; + } + + int + _Py_WarnBadEncoding(char* file, int line, char* start, char* end, char* enc) + { + return 0; + } + /* Forward */ grammar *getgrammar(char *filename); #ifdef THINK_C Index: Parser/tokenizer.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Parser/tokenizer.c,v retrieving revision 2.53 diff -c -r2.53 tokenizer.c *** Parser/tokenizer.c 30 Aug 2001 20:51:59 -0000 2.53 --- Parser/tokenizer.c 21 Mar 2002 10:16:58 -0000 *************** *** 9,14 **** --- 9,16 ---- #include "errcode.h" extern char *PyOS_Readline(char *); + extern int _Py_WarnBadEncoding(char *file, int line, + char *text, char *end, char *encoding); /* Return malloc'ed string including trailing \n; empty malloc'ed string for EOF; NULL if interrupted */ *************** *** 28,33 **** --- 30,36 ---- static struct tok_state *tok_new(void); static int tok_nextc(struct tok_state *tok); static void tok_backup(struct tok_state *tok, int c); + static void check_encoding(struct tok_state *tok); /* Token names */ *************** *** 113,118 **** --- 116,123 ---- tok->alterror = 0; tok->alttabsize = 1; tok->altindstack[0] = 0; + tok->encoding = 0; + tok->issued_encoding_warning = 0; return tok; } *************** *** 294,299 **** --- 299,305 ---- tok->inp = strchr(tok->inp, '\0'); done = tok->inp[-1] == '\n'; } + check_encoding(tok); tok->cur = tok->buf + cur; #ifndef macintosh /* replace "\r\n" with "\n" */ *************** *** 502,507 **** --- 508,525 ---- return 0; } + static void + 
check_encoding(struct tok_state *tok) + { + int res; + if (tok->issued_encoding_warning) + return; + res = _Py_WarnBadEncoding(tok->filename, tok->lineno, + tok->buf, tok->end, tok->encoding); + if (res) { + tok->issued_encoding_warning = 1; + } + } /* Get next token, after space stripping etc. */ *************** *** 522,527 **** --- 540,556 ---- register int col = 0; register int altcol = 0; tok->atbol = 0; + /* UTF-8 signature: EF BB BF */ + c = tok_nextc(tok); + if (c == Py_CHARMASK('\xef') && tok->lineno == 1 && + ((tok->inp - tok->cur) > 2) && + tok->cur[0] == '\xbb' && tok->cur[1] == '\xbf') { + tok->encoding = strdup("utf-8"); + tok->cur += 2; + } + else { + tok_backup(tok, c); + } for (;;) { c = tok_nextc(tok); if (c == ' ') *************** *** 626,633 **** "set tabsize=", /* will vi never die? */ /* more templates can be added here to support other editors */ }; char cbuf[80]; ! char *tp, **cp; tp = cbuf; do { *tp++ = c = tok_nextc(tok); --- 655,667 ---- "set tabsize=", /* will vi never die? */ /* more templates can be added here to support other editors */ }; + static char *codings[] = { + /* From PEP 263 */ + "coding:", + "coding=", + }; char cbuf[80]; ! char *tp, **cp, *end, *coding; tp = cbuf; do { *tp++ = c = tok_nextc(tok); *************** *** 649,656 **** } } } ! while (c != EOF && c != '\n') c = tok_nextc(tok); } /* Check for EOF and errors now */ --- 683,723 ---- } } } ! coding = NULL; ! for (cp = codings; ! cp < codings + sizeof(codings)/sizeof(codings[0]); ! cp++) { ! if ((tp = strstr(cbuf, *cp))) { ! tp += strlen(*cp); ! while (*tp == ' ' || *tp == '\t') ! tp++; ! end = coding = tp; ! while ((*end >= 'a' && *end <='z') || ! (*end >= 'A' && *end <='Z') || ! (*end >= '0' && *end <='9') || ! *end == '-' || *end == '_' || ! *end == '.') ! end++; ! if (end == tp) ! coding = 0; ! else ! *end = '\0'; ! } ! } ! /* Accept coding declarations only on the first two lines. */ ! if (coding && tok->lineno < 3) { ! for (tp = coding; *tp; tp++) ! 
if (*tp >= 'A' && *tp <= 'Z')
!                     *tp = *tp - 'A' + 'a';
!             if (tok->encoding == NULL) /* keep the first declaration instead of leaking it */
!                 tok->encoding = strdup(coding);
!             else if (strcmp(tok->encoding, coding) != 0) {
!                 /* XXX: error: duplicate encoding */
!             }
!         }
!         while (c != EOF && c != '\n') {
              c = tok_nextc(tok);
+         }
      }
      /* Check for EOF and errors now */
Index: Parser/tokenizer.h
===================================================================
RCS file: /cvsroot/python/python/dist/src/Parser/tokenizer.h,v
retrieving revision 2.16
diff -c -r2.16 tokenizer.h
*** Parser/tokenizer.h 1 Sep 2000 23:29:28 -0000 2.16
--- Parser/tokenizer.h 21 Mar 2002 10:16:58 -0000
***************
*** 38,43 ****
--- 38,46 ----
      int alterror; /* Issue error if alternate tabs don't match */
      int alttabsize; /* Alternate tab spacing */
      int altindstack[MAXINDENT]; /* Stack of alternate indents */
+     /* Source encodings. */
+     char *encoding;
+     int issued_encoding_warning;
  };
  extern struct tok_state *PyTokenizer_FromString(char *);
Index: Python/compile.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Python/compile.c,v
retrieving revision 2.239
diff -c -r2.239 compile.c
*** Python/compile.c 3 Mar 2002 21:30:27 -0000 2.239
--- Python/compile.c 21 Mar 2002 10:17:00 -0000
***************
*** 426,431 ****
--- 426,432 ----
      int c_closure; /* Is nested w/freevars? */
      struct symtable *c_symtable; /* pointer to module symbol table */
      PyFutureFeatures *c_future; /* pointer to module's __future__ */
+     const char *c_encoding; /* Source encoding, if any; borrowed memory. */
  };
  static int
***************
*** 1178,1191 ****
      }
  #ifdef Py_USING_UNICODE
      if (unicode || Py_UnicodeFlag) {
!         if (rawmode)
!             v = PyUnicode_DecodeRawUnicodeEscape(
!                 s, len, NULL);
!         else
!             v = PyUnicode_DecodeUnicodeEscape(
!                 s, len, NULL);
          if (v == NULL)
              PyErr_SyntaxLocation(com->c_filename,
                                   com->c_lineno);
          return v;
      }
--- 1179,1238 ----
      }
  #ifdef Py_USING_UNICODE
      if (unicode || Py_UnicodeFlag) {
!         int i, allascii = 1, noescapes = 1, elemsize = 0;
!         PyObject *u = NULL;
!         for (i = 0; i < len; i++) {
!             if (s[i] == '\\')
!                 noescapes = 0;
!             if (Py_CHARMASK(s[i]) > 127) {
!                 allascii = 0;
!             }
!         }
!         if (allascii && noescapes) {
!             v = PyUnicode_DecodeASCII (s, len, NULL);
!         }
!         else if (allascii) {
!             /* Decode escapes from ASCII buffer. */
!             elemsize = 1;
!         }
!         else {
!             /* Need to decode to charset first,
!                then decode escapes. */
!             u = PyUnicode_Decode(s, len, com->c_encoding, NULL);
!             if (u) {
!                 if(noescapes) {
!                     v = u;
!                     u = NULL;
!                 }
!                 else {
!                     elemsize = sizeof(Py_UNICODE);
!                     s = (char*)PyUnicode_AS_UNICODE(u);
!                     len = PyUnicode_GET_SIZE(u); /* items of the decoded buffer, not source bytes */
!                 }
!             }
!             else {
!                 /* If we got an ASCII decoding error,
!                    make the error message more precise. */
!                 if (strcmp(com->c_encoding, "ascii") == 0 &&
!                     PyErr_ExceptionMatches(PyExc_UnicodeError)) {
!                     PyErr_Clear();
!                     PyErr_SetString(PyExc_UnicodeError,
!                         "Non-ASCII characters but no declared encoding");
!                 }
!                 v = NULL;
!             }
!         }
!         if (elemsize) {
!             if (rawmode)
!                 v = PyUnicode_DecodeRawUnicodeEscapeSize(
!                     s, len, NULL, elemsize);
!             else
!                 v = PyUnicode_DecodeUnicodeEscapeSize(
!                     s, len, NULL, elemsize);
!         }
          if (v == NULL)
              PyErr_SyntaxLocation(com->c_filename,
                                   com->c_lineno);
+         Py_XDECREF(u);
          return v;
      }
***************
*** 4087,4092 ****
--- 4134,4146 ----
      PyCodeObject *co;
      if (!com_init(&sc, filename))
          return NULL;
+     if (TYPE(n) == encoding_decl) {
+         sc.c_encoding = STR(n);
+         n = CHILD(n, 0);
+     }
+     else {
+         sc.c_encoding = "ascii";
+     }
      if (base) {
          sc.c_private = base->c_private;
          sc.c_symtable = base->c_symtable;
***************
*** 4095,4100 ****
--- 4149,4158 ----
              || (sc.c_symtable->st_cur->ste_type == TYPE_FUNCTION))
              sc.c_nested = 1;
          sc.c_flags |= base->c_flags & PyCF_MASK;
+         if (base->c_encoding) {
+             /* take over the encoding of the enclosing compilation */
+             sc.c_encoding = base->c_encoding;
+         }
      } else {
          sc.c_private = NULL;
          sc.c_future = PyNode_Future(n, filename);
***************
*** 5514,5517 ****
--- 5572,5627 ----
          if (TYPE(CHILD(n, i)) >= single_input)
              symtable_assign(st, CHILD(n, i), def_flag);
      }
+ }
+
+ /* Wrap the parse tree n in a new encoding_decl root node whose string is
+    encoding, and return the new root.  If the root node cannot be
+    allocated, n is returned unchanged. */
+ node*
+ _Py_AddEncoding(node* n, char *encoding)
+ {
+     node *r = PyNode_New(encoding_decl);
+     if (r == NULL) /* NOTE(review): assumes PyNode_New reports failure as NULL -- confirm */
+         return n;
+     r->n_str = encoding;
+     r->n_nchildren = 1;
+     r->n_child = n;
+     return r;
+ }
+
+ /* Issue a DeprecationWarning if the bytes in [start, end) cannot be decoded
+    with encoding or, when encoding is NULL, are not pure ASCII.
+    Returns 1 if a warning was issued, 0 otherwise. */
+ int
+ _Py_WarnBadEncoding(char* file, int line,
+                     char *start, char *end, char *encoding)
+ {
+     char *c;
+     char buf[200];
+     if (encoding != NULL) {
+ #ifdef Py_USING_UNICODE
+         PyObject *uni = PyUnicode_Decode(start, end-start,
+                                          encoding, "strict");
+         if (!uni) {
+             PyErr_Clear();
+             PyErr_WarnExplicit(PyExc_DeprecationWarning,
+                                "line violates declared encoding",
+                                file, line, NULL, NULL);
+             return 1;
+         }
+         Py_DECREF(uni);
+ #endif
+         return 0;
+     }
+     /* Default to ASCII. */
+     for (c = start; c < end; c++)
+         if (Py_CHARMASK(*c) > 127)
+             break;
+     if (c == end)
+         return 0;
+     sprintf(buf, "Non-ASCII character '\\x%2x', but no declared encoding",
+             Py_CHARMASK(*c));
+     PyErr_WarnExplicit(PyExc_DeprecationWarning,
+                        buf, file, line, NULL, NULL);
+     return 1;
  }