diff -r 8f0d5ecca524 Include/unicodeobject.h --- a/Include/unicodeobject.h Sat Oct 27 12:56:30 2012 +0200 +++ b/Include/unicodeobject.h Sun Oct 28 00:44:57 2012 -0300 @@ -882,11 +882,11 @@ ); PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV( - const char *format, /* ASCII-encoded string */ + const char *format, /* UTF-8 encoded string */ va_list vargs ); PyAPI_FUNC(PyObject *) PyUnicode_FromFormat( - const char *format, /* ASCII-encoded string */ + const char *format, /* UTF-8 encoded string */ ... ); diff -r 8f0d5ecca524 Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py Sat Oct 27 12:56:30 2012 +0200 +++ b/Lib/test/test_unicode.py Sun Oct 28 00:44:57 2012 -0300 @@ -1734,13 +1734,29 @@ text = PyUnicode_FromFormat(b'ascii\x7f=%U', 'unicode\xe9') self.assertEqual(text, 'ascii\x7f=unicode\xe9') - # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() + # valid utf-8 format + text = PyUnicode_FromFormat(b'unicode\xc3\xad') + text = text.encode("utf-8") + self.assertEqual(text, b'unicode\xc3\xad') + + # invalid utf8 format: ensure that PyUnicode_FromFormatV() # raises an error + # case 1: invalid start byte self.assertRaisesRegex(ValueError, - '^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format ' - 'string, got a non-ASCII byte: 0xe9$', - PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') - + '^PyUnicode_FromFormatV\(\) expects UTF-8 encoding, ' + 'got an invalid starting byte: 0xff$', + PyUnicode_FromFormat, b'unicode\xff') + # case 2: invalid continuation byte + self.assertRaisesRegex(ValueError, + '^PyUnicode_FromFormatV\(\) expects UTF-8 encoding, ' + 'got an invalid continuation byte: 0xfe$', + PyUnicode_FromFormat, b'unicode\xce\xfe') + # case 3: unexpected end of data + self.assertRaisesRegex(ValueError, + '^PyUnicode_FromFormatV\(\) expects UTF-8 encoding, ' + 'got an invalid continuation byte: 0x00$', + PyUnicode_FromFormat, b'unicode\xce') + # test "%c" self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd') self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff') diff -r 8f0d5ecca524 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Sat Oct 27 12:56:30 2012 +0200 +++ b/Objects/unicodeobject.c Sun Oct 28 00:44:57 2012 -0300 @@ -2662,29 +2662,62 @@ } else { const char *p; + char mb_char_len; Py_ssize_t len; p = f; do { - if ((unsigned char)*p > 127) { + /* check start byte to detect multi-byte sequences */ + if (((unsigned char)*p & 0x80) == 0) + mb_char_len = 1; + else if (((unsigned char)*p & 0xE0) == 0xC0) + mb_char_len = 2; + else if (((unsigned char)*p & 0xF0) == 0xE0) + mb_char_len = 3; + else if (((unsigned char)*p & 0xF8) == 0xF0) + mb_char_len = 4; + else { PyErr_Format(PyExc_ValueError, - "PyUnicode_FromFormatV() expects an ASCII-encoded format " - "string, got a non-ASCII byte: 0x%02x", + "PyUnicode_FromFormatV() expects UTF-8 encoding, " + "got an invalid starting byte: 0x%02x", (unsigned char)*p); return NULL; } - p++; + /* sanity utf8 multibyte sequences checks */ + char i; + for(i=1; i