diff -r fb64c7a81010 Include/bytesobject.h --- a/Include/bytesobject.h Mon Oct 31 08:31:13 2016 +0200 +++ b/Include/bytesobject.h Mon Oct 31 14:01:26 2016 +0200 @@ -74,6 +74,11 @@ PyAPI_FUNC(PyObject*) _PyBytes_FromHex( PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t, const char *, Py_ssize_t, const char *); +/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */ +PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t, + const char *, Py_ssize_t, + const char *, + const char **); /* Macro, trading safety for speed */ #ifndef Py_LIMITED_API diff -r fb64c7a81010 Include/unicodeobject.h --- a/Include/unicodeobject.h Mon Oct 31 08:31:13 2016 +0200 +++ b/Include/unicodeobject.h Mon Oct 31 14:01:26 2016 +0200 @@ -1486,6 +1486,17 @@ PyAPI_FUNC(PyObject*) PyUnicode_DecodeUn const char *errors /* error handling */ ); +/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape + chars. */ +PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscape( + const char *string, /* Unicode-Escape encoded string */ + Py_ssize_t length, /* size of string */ + const char *errors, /* error handling */ + const char **first_invalid_escape /* on return, points to first + invalid escaped char in + string. */ +); + PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString( PyObject *unicode /* Unicode object */ ); diff -r fb64c7a81010 Lib/test/test_string_literals.py --- a/Lib/test/test_string_literals.py Mon Oct 31 08:31:13 2016 +0200 +++ b/Lib/test/test_string_literals.py Mon Oct 31 14:01:26 2016 +0200 @@ -31,6 +31,7 @@ import os import sys import shutil import tempfile +import warnings import unittest @@ -104,6 +105,19 @@ class TestLiterals(unittest.TestCase): self.assertRaises(SyntaxError, eval, r""" '\U000000' """) self.assertRaises(SyntaxError, eval, r""" '\U0000000' """) + def test_eval_str_invalid_escape(self): + for b in range(1, 128): + if b in b"""\n\r"'01234567NU\\abfnrtuvx""": + continue + with self.assertWarns(DeprecationWarning): + self.assertEqual(eval(r"'\%c'" % b), '\\' + chr(b)) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always', category=DeprecationWarning) + eval("'''\n\\z'''") + self.assertEqual(len(w), 1) + self.assertEqual(w[0].filename, '') + self.assertEqual(w[0].lineno, 2) + def test_eval_str_raw(self): self.assertEqual(eval(""" r'x' """), 'x') self.assertEqual(eval(r""" r'\x01' """), '\\' + 'x01') @@ -130,6 +144,19 @@ class TestLiterals(unittest.TestCase): self.assertRaises(SyntaxError, eval, r""" b'\x' """) self.assertRaises(SyntaxError, eval, r""" b'\x0' """) + def test_eval_bytes_invalid_escape(self): + for b in range(1, 128): + if b in b"""\n\r"'01234567\\abfnrtvx""": + continue + with self.assertWarns(DeprecationWarning): + self.assertEqual(eval(r"b'\%c'" % b), b'\\' + bytes([b])) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always', category=DeprecationWarning) + eval("b'''\n\\z'''") + self.assertEqual(len(w), 1) + self.assertEqual(w[0].filename, '') + self.assertEqual(w[0].lineno, 2) + def test_eval_bytes_raw(self): self.assertEqual(eval(""" br'x' """), b'x') self.assertEqual(eval(""" rb'x' """), b'x') diff -r fb64c7a81010 Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py Mon Oct 31 08:31:13 2016 +0200 +++ b/Lib/test/test_unicode.py Mon Oct 31 14:01:26 2016 +0200 @@ -2413,13 +2413,6 @@ class UnicodeTest(string_tests.CommonTes support.check_free_after_iterating(self, iter, str) support.check_free_after_iterating(self, reversed, str) - def test_invalid_sequences(self): - for letter in string.ascii_letters + "89": # 0-7 are octal escapes - if letter in "abfnrtuvxNU": - continue - with self.assertWarns(DeprecationWarning): - eval(r"'\%s'" % letter) - class CAPITest(unittest.TestCase): diff -r fb64c7a81010 Objects/bytesobject.c --- a/Objects/bytesobject.c Mon Oct 31 08:31:13 2016 +0200 +++ b/Objects/bytesobject.c Mon Oct 31 14:01:26 2016 +0200 @@ -1105,11 +1105,12 @@ static char * return p; } -PyObject *PyBytes_DecodeEscape(const char *s, +PyObject *_PyBytes_DecodeEscape(const char *s, Py_ssize_t len, const char *errors, Py_ssize_t unicode, - const char *recode_encoding) + const char *recode_encoding, + const char **first_invalid_escape) { int c; char *p; @@ -1123,6 +1124,8 @@ PyObject *PyBytes_DecodeEscape(const cha return NULL; writer.overallocate = 1; + *first_invalid_escape = NULL; + end = s + len; while (s < end) { if (*s != '\\') { @@ -1207,9 +1210,12 @@ PyObject *PyBytes_DecodeEscape(const cha break; default: - if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, "invalid escape sequence '\\%c'", *(--s)) < 0) - goto failed; + if (*first_invalid_escape == NULL) { + *first_invalid_escape = s-1; /* Back up one char, since we've + already incremented s. */ + } *p++ = '\\'; + s--; goto non_esc; /* an arbitrary number of unescaped UTF-8 bytes may follow. */ } @@ -1222,6 +1228,29 @@ PyObject *PyBytes_DecodeEscape(const cha return NULL; } +PyObject *PyBytes_DecodeEscape(const char *s, + Py_ssize_t len, + const char *errors, + Py_ssize_t unicode, + const char *recode_encoding) +{ + const char* first_invalid_escape; + PyObject *result = _PyBytes_DecodeEscape(s, len, errors, unicode, + recode_encoding, + &first_invalid_escape); + if (result == NULL) + return NULL; + if (first_invalid_escape != NULL) { + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "invalid escape sequence '\\%c'", + *first_invalid_escape) < 0) { + Py_DECREF(result); + return NULL; + } + } + return result; + +} /* -------------------------------------------------------------------- */ /* object api */ diff -r fb64c7a81010 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Mon Oct 31 08:31:13 2016 +0200 +++ b/Objects/unicodeobject.c Mon Oct 31 14:01:26 2016 +0200 @@ -5896,9 +5896,10 @@ PyUnicode_AsUTF16String(PyObject *unicod static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL; PyObject * -PyUnicode_DecodeUnicodeEscape(const char *s, - Py_ssize_t size, - const char *errors) +_PyUnicode_DecodeUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors, + const char **first_invalid_escape) { const char *starts = s; _PyUnicodeWriter writer; @@ -5906,6 +5907,9 @@ PyUnicode_DecodeUnicodeEscape(const char PyObject *errorHandler = NULL; PyObject *exc = NULL; + // so we can remember if we've seen an invalid escape char or not + *first_invalid_escape = NULL; + if (size == 0) { _Py_RETURN_UNICODE_EMPTY(); } @@ -6080,9 +6084,10 @@ PyUnicode_DecodeUnicodeEscape(const char goto error; default: - if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, - "invalid escape sequence '\\%c'", c) < 0) - goto onError; + if (*first_invalid_escape == NULL) { + *first_invalid_escape = s-1; /* Back up one char, since we've + already incremented s. */ + } WRITE_ASCII_CHAR('\\'); WRITE_CHAR(c); continue; @@ -6117,6 +6122,27 @@ PyUnicode_DecodeUnicodeEscape(const char return NULL; } +PyObject * +PyUnicode_DecodeUnicodeEscape(const char *s, + Py_ssize_t size, + const char *errors) +{ + const char *first_invalid_escape; + PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors, + &first_invalid_escape); + if (result == NULL) + return NULL; + if (first_invalid_escape != NULL) { + if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1, + "invalid escape sequence '\\%c'", + *first_invalid_escape) < 0) { + Py_DECREF(result); + return NULL; + } + } + return result; +} + /* Return a Unicode-Escape string version of the Unicode object. If quotes is true, the string is enclosed in u"" or u'' quotes as diff -r fb64c7a81010 Python/ast.c --- a/Python/ast.c Mon Oct 31 08:31:13 2016 +0200 +++ b/Python/ast.c Mon Oct 31 14:01:26 2016 +0200 @@ -4113,8 +4113,26 @@ decode_utf8(struct compiling *c, const c return PyUnicode_DecodeUTF8(t, s - t, NULL); } +static int +warn_invalid_escape_sequence(struct compiling *c, const node *n, + char first_invalid_escape_char) +{ + int res; + PyObject *msg = PyUnicode_FromFormat("invalid escape sequence \\%c", + first_invalid_escape_char); + if (msg == NULL) { + return -1; + } + res = PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, + c->c_filename, LINENO(n), + NULL, NULL); + Py_DECREF(msg); + return res; +} + static PyObject * -decode_unicode_with_escapes(struct compiling *c, const char *s, size_t len) +decode_unicode_with_escapes(struct compiling *c, const node *n, const char *s, + size_t len) { PyObject *v, *u; char *buf; @@ -4167,11 +4185,41 @@ decode_unicode_with_escapes(struct compi len = p - buf; s = buf; - v = PyUnicode_DecodeUnicodeEscape(s, len, NULL); + const char *first_invalid_escape; + v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape); + + if (v != NULL && first_invalid_escape != NULL) { + if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) { + /* We have not decref u before because first_invalid_escape points + inside u. */ + Py_XDECREF(u); + Py_DECREF(v); + return NULL; + } + } Py_XDECREF(u); return v; } +static PyObject * +decode_bytes_with_escapes(struct compiling *c, const node *n, const char *s, + size_t len) +{ + const char *first_invalid_escape; + PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, 0, NULL, + &first_invalid_escape); + if (result == NULL) + return NULL; + + if (first_invalid_escape != NULL) { + if (warn_invalid_escape_sequence(c, n, *first_invalid_escape) < 0) { + Py_DECREF(result); + return NULL; + } + } + return result; +} + /* Compile this expression in to an expr_ty. Add parens around the expression, in order to allow leading spaces in the expression. */ static expr_ty @@ -4310,7 +4358,7 @@ done: literal_end-literal_start, NULL, NULL); else - *literal = decode_unicode_with_escapes(c, literal_start, + *literal = decode_unicode_with_escapes(c, n, literal_start, literal_end-literal_start); if (!*literal) return -1; @@ -5048,12 +5096,12 @@ parsestr(struct compiling *c, const node if (*rawmode) *result = PyBytes_FromStringAndSize(s, len); else - *result = PyBytes_DecodeEscape(s, len, NULL, /* ignored */ 0, NULL); + *result = decode_bytes_with_escapes(c, n, s, len); } else { if (*rawmode) *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL); else - *result = decode_unicode_with_escapes(c, s, len); + *result = decode_unicode_with_escapes(c, n, s, len); } return *result == NULL ? -1 : 0; }