diff --git a/Include/bytesobject.h b/Include/bytesobject.h --- a/Include/bytesobject.h +++ b/Include/bytesobject.h @@ -1,4 +1,3 @@ - /* Bytes (String) object interface */ #ifndef Py_BYTESOBJECT_H @@ -45,7 +44,7 @@ PyAPI_DATA(PyTypeObject) PyBytesIter_Type; #define PyBytes_Check(op) \ - PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_BYTES_SUBCLASS) + PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_BYTES_SUBCLASS) #define PyBytes_CheckExact(op) (Py_TYPE(op) == &PyBytes_Type) PyAPI_FUNC(PyObject *) PyBytes_FromStringAndSize(const char *, Py_ssize_t); @@ -74,11 +73,16 @@ PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t, const char *, Py_ssize_t, const char *); +/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */ +PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t, + const char *, Py_ssize_t, + const char *, + const char**); /* Macro, trading safety for speed */ #ifndef Py_LIMITED_API #define PyBytes_AS_STRING(op) (assert(PyBytes_Check(op)), \ - (((PyBytesObject *)(op))->ob_sval)) + (((PyBytesObject *)(op))->ob_sval)) #define PyBytes_GET_SIZE(op) (assert(PyBytes_Check(op)),Py_SIZE(op)) #endif @@ -97,8 +101,8 @@ PyObject *obj, /* string or Unicode object */ char **s, /* pointer to buffer variable */ Py_ssize_t *len /* pointer to length variable or NULL - (only possible for 0-terminated - strings) */ + (only possible for 0-terminated + strings) */ ); /* Using the current locale, insert the thousands grouping @@ -106,21 +110,21 @@ see Objects/stringlib/localeutil.h */ #ifndef Py_LIMITED_API PyAPI_FUNC(Py_ssize_t) _PyBytes_InsertThousandsGroupingLocale(char *buffer, - Py_ssize_t n_buffer, - char *digits, - Py_ssize_t n_digits, - Py_ssize_t min_width); + Py_ssize_t n_buffer, + char *digits, + Py_ssize_t n_digits, + Py_ssize_t min_width); /* Using explicit passed-in values, insert the thousands grouping into the string pointed to by buffer. 
For the argument descriptions, see Objects/stringlib/localeutil.h */ PyAPI_FUNC(Py_ssize_t) _PyBytes_InsertThousandsGrouping(char *buffer, - Py_ssize_t n_buffer, - char *digits, - Py_ssize_t n_digits, - Py_ssize_t min_width, - const char *grouping, - const char *thousands_sep); + Py_ssize_t n_buffer, + char *digits, + Py_ssize_t n_digits, + Py_ssize_t min_width, + const char *grouping, + const char *thousands_sep); #endif /* Flags used by string formatting */ diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1486,6 +1486,17 @@ const char *errors /* error handling */ ); +/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape + chars. */ +PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape( + const char *string, /* Unicode-Escape encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + const char **first_invalid_escape /* on return, points to first + invalid escaped char in + string. */ +); + PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( PyObject *unicode /* Unicode object */ ); diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1105,11 +1105,12 @@ return p; } -PyObject *PyBytes_DecodeEscape(const char *s, +PyObject *_PyBytes_DecodeEscape(const char *s, Py_ssize_t len, const char *errors, Py_ssize_t unicode, - const char *recode_encoding) + const char *recode_encoding, + const char **first_invalid_escape) { int c; char *p; @@ -1123,6 +1124,8 @@ return NULL; writer.overallocate = 1; + *first_invalid_escape = NULL; + end = s + len; while (s < end) { if (*s != '\\') { @@ -1207,8 +1210,10 @@ break; default: - if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "invalid escape sequence '\\%c'", *(--s)) < 0) - goto failed; + if (*first_invalid_escape == NULL) { + *first_invalid_escape = s-1; /* Back up one char, since we've + already incremented s. 
*/ + } *p++ = '\\'; goto non_esc; /* an arbitrary number of unescaped UTF-8 bytes may follow. */ @@ -1222,6 +1227,34 @@ return NULL; } +/* Decode an escaped byte string through _PyBytes_DecodeEscape() and, if it + reported an invalid escape, issue a DeprecationWarning naming the first + one. Returns the decoded object, or NULL if decoding fails or the + warning call reports failure. */ +PyObject *PyBytes_DecodeEscape(const char *s, + Py_ssize_t len, + const char *errors, + Py_ssize_t unicode, + const char *recode_encoding) +{ + const char* first_invalid_escape; + PyObject *result = _PyBytes_DecodeEscape(s, len, errors, unicode, + recode_encoding, + &first_invalid_escape); + if (result == NULL) + return NULL; + if (first_invalid_escape != NULL) { + /* Plain char may be signed; cast so a byte >= 0x80 is not passed to + %c as a negative ordinal. */ + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "invalid escape sequence '\\%c'", + (unsigned char)*first_invalid_escape) < 0) { + Py_DECREF(result); + return NULL; + } + } + return result; +} /* -------------------------------------------------------------------- */ /* object api */ diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5896,9 +5896,10 @@ static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; PyObject * -PyUnicode_DecodeUnicodeEscape(const char *s, - Py_ssize_t size, - const char *errors) +_PyUnicode_DecodeUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors, + const char **first_invalid_escape) { const char *starts = s; _PyUnicodeWriter writer; @@ -5906,6 +5907,9 @@ PyObject *errorHandler = NULL; PyObject *exc = NULL; + // so we can remember if we've seen an invalid escape char or not + *first_invalid_escape = NULL; + if (size == 0) { _Py_RETURN_UNICODE_EMPTY(); } @@ -6080,9 +6084,10 @@ goto error; default: - if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, - "invalid escape sequence '\\%c'", c) < 0) - goto onError; + if (*first_invalid_escape == NULL) { + *first_invalid_escape = s-1; /* Back up one char, since we've + already incremented s. 
*/ + } WRITE_ASCII_CHAR('\\'); WRITE_CHAR(c); continue; @@ -6117,6 +6122,33 @@ return NULL; } +/* Decode a Unicode-Escape string through _PyUnicode_DecodeUnicodeEscape() + and, if it reported an invalid escape, issue a DeprecationWarning naming + the first one. Returns the decoded object, or NULL if decoding fails or + the warning call reports failure. */ +PyObject * +PyUnicode_DecodeUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors) +{ + const char *first_invalid_escape; + PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors, + &first_invalid_escape); + if (result == NULL) + return NULL; + if (first_invalid_escape != NULL) { + /* Plain char may be signed; cast so a byte >= 0x80 is not passed to + %c as a negative ordinal. */ + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "invalid escape sequence '\\%c'", + (unsigned char)*first_invalid_escape) < 0) { + Py_DECREF(result); + return NULL; + } + } + return result; +} + /* Return a Unicode-Escape string version of the Unicode object. If quotes is true, the string is enclosed in u"" or u'' quotes as diff --git a/Python/ast.c b/Python/ast.c --- a/Python/ast.c +++ b/Python/ast.c @@ -4114,7 +4114,8 @@ } static PyObject * -decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len) +decode_unicode_with_escapes(struct compiling *c, const node *n, const char *s, + size_t len) { PyObject *v, *u; char *buf; @@ -4167,11 +4168,53 @@ len = p - buf; s = buf; - v = PyUnicode_DecodeUnicodeEscape(s, len, NULL); + const char *first_invalid_escape; + char first_invalid_escape_char = 0; + v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape); + + /* We have to remember this before we decref u, because it points + inside u. 
*/ + if (v != NULL && first_invalid_escape != NULL) { + first_invalid_escape_char = *first_invalid_escape; + } + Py_XDECREF(u); + + if (v != NULL && first_invalid_escape != NULL) { + char msg[300]; + PyOS_snprintf(msg, sizeof(msg), + "invalid escape sequence \\%c", + first_invalid_escape_char); + ast_error(c, n, msg); /* NOTE(review): presumably reports the error at node n -- confirm */ + Py_DECREF(v); /* discard the decoded string; NULL is returned instead */ + return NULL; + } return v; } +static PyObject * +decode_bytes_with_escapes(struct compiling *c, const node *n, const char *s, + size_t len) +{ /* Decode s with _PyBytes_DecodeEscape(); if it reports an invalid escape, hand a message to ast_error() and return NULL. */ + const char *first_invalid_escape; + PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, 0, NULL, + &first_invalid_escape); + if (result == NULL) + return NULL; + + if (first_invalid_escape != NULL) { + char first_invalid_escape_char = *first_invalid_escape; /* character recorded for the first invalid escape */ + char msg[300]; + PyOS_snprintf(msg, sizeof(msg), + "invalid escape sequence \\%c", + first_invalid_escape_char); + ast_error(c, n, msg); /* NOTE(review): presumably reports the error at node n -- confirm */ + Py_DECREF(result); /* discard the decoded bytes; NULL is returned instead */ + return NULL; + } + return result; +} + /* Compile this expression in to an expr_ty. Add parens around the expression, in order to allow leading spaces in the expression. */ static expr_ty @@ -4310,7 +4353,7 @@ literal_end-literal_start, NULL, NULL); else - *literal = decode_unicode_with_escapes(c, literal_start, + *literal = decode_unicode_with_escapes(c, n, literal_start, literal_end-literal_start); if (!*literal) return -1; @@ -5048,12 +5091,12 @@ if (*rawmode) *result = PyBytes_FromStringAndSize(s, len); else - *result = PyBytes_DecodeEscape(s, len, NULL, /* ignored */ 0, NULL); + *result = decode_bytes_with_escapes(c, n, s, len); } else { if (*rawmode) *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL); else - *result = decode_unicode_with_escapes(c, s, len); + *result = decode_unicode_with_escapes(c, n, s, len); } return *result == NULL ? -1 : 0; }