diff -r a64a3da996ed Doc/c-api/unicode.rst --- a/Doc/c-api/unicode.rst Sun May 05 11:35:15 2013 -0500 +++ b/Doc/c-api/unicode.rst Mon May 06 00:57:17 2013 +0200 @@ -526,12 +526,23 @@ APIs: The `"%lld"` and `"%llu"` format specifiers are only available when :const:`HAVE_LONG_LONG` is defined. + .. note:: + The width formatter unit is a number of characters rather than bytes. + The precision formatter unit is a number of bytes for ``"%s"`` and + ``"%V"`` (if the ``PyObject*`` argument is NULL), and a number of + characters for ``"%A"``, ``"%U"``, ``"%S"``, ``"%R"`` and ``"%V"`` + (if the ``PyObject*`` argument is not NULL). + .. versionchanged:: 3.2 Support for ``"%lld"`` and ``"%llu"`` added. .. versionchanged:: 3.3 Support for ``"%li"``, ``"%lli"`` and ``"%zi"`` added. + .. versionchanged:: 3.4 + Support for width and precision formatters for ``"%s"``, ``"%A"``, ``"%U"``, + ``"%V"``, ``"%S"``, ``"%R"`` added. + .. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs) diff -r a64a3da996ed Include/unicodeobject.h --- a/Include/unicodeobject.h Sun May 05 11:35:15 2013 -0500 +++ b/Include/unicodeobject.h Mon May 06 00:57:17 2013 +0200 @@ -962,6 +962,15 @@ PyAPI_FUNC(int) Py_ssize_t end ); +/* Append a substring of a Unicode string. + Return 0 on success, raise an exception and return -1 on error. */ +PyAPI_FUNC(int) +_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, + PyObject *str, /* Unicode string */ + Py_ssize_t start, /* index of the first character */ + Py_ssize_t length /* length in characters */ + ); + /* Append a latin1-encoded byte string. Return 0 on success, raise an exception and return -1 on error. 
*/ PyAPI_FUNC(int) diff -r a64a3da996ed Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py Sun May 05 11:35:15 2013 -0500 +++ b/Lib/test/test_unicode.py Mon May 06 00:57:17 2013 +0200 @@ -2007,9 +2007,13 @@ class UnicodeTest(string_tests.CommonTes for arg in args) return _PyUnicode_FromFormat(format, *cargs) + def check_format(expected, format, *args): + text = PyUnicode_FromFormat(format, *args) + self.assertEqual(expected, text) + # ascii format, non-ascii argument - text = PyUnicode_FromFormat(b'ascii\x7f=%U', 'unicode\xe9') - self.assertEqual(text, 'ascii\x7f=unicode\xe9') + check_format('ascii\x7f=unicode\xe9', + b'ascii\x7f=%U', 'unicode\xe9') # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV() # raises an error @@ -2019,83 +2023,200 @@ class UnicodeTest(string_tests.CommonTes PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii') # test "%c" - self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0xabcd)), '\uabcd') - self.assertEqual(PyUnicode_FromFormat(b'%c', c_int(0x10ffff)), '\U0010ffff') + check_format('\uabcd', + b'%c', c_int(0xabcd)) + check_format('\U0010ffff', + b'%c', c_int(0x10ffff)) # test "%" - self.assertEqual(PyUnicode_FromFormat(b'%'), '%') - self.assertEqual(PyUnicode_FromFormat(b'%%'), '%') - self.assertEqual(PyUnicode_FromFormat(b'%%s'), '%s') - self.assertEqual(PyUnicode_FromFormat(b'[%%]'), '[%]') - self.assertEqual(PyUnicode_FromFormat(b'%%%s', b'abc'), '%abc') + check_format('%', + b'%') + check_format('%', + b'%%') + check_format('%s', + b'%%s') + check_format('[%]', + b'[%%]') + check_format('%abc', + b'%%%s', b'abc') + + # truncated string + check_format('abc', + b'%.3s', b'abcdef') + check_format('abc[\ufffd', + b'%.5s', 'abc[\u20ac]'.encode('utf8')) + check_format("'\\u20acABC'", + b'%A', '\u20acABC') + check_format("'\\u20", + b'%.5A', '\u20acABCDEF') + check_format("'\u20acABC'", + b'%R', '\u20acABC') + check_format("'\u20acA", + b'%.3R', '\u20acABCDEF') + check_format('\u20acAB', + b'%.3S', '\u20acABCDEF') + 
check_format('\u20acAB', + b'%.3U', '\u20acABCDEF') + check_format('\u20acAB', + b'%.3V', '\u20acABCDEF', None) + check_format('abc[\ufffd', + b'%.5V', None, 'abc[\u20ac]'.encode('utf8')) + + # following tests comes from #7330 + # test width modifier and precision modifier with %S + check_format("repr= xx", + b'repr=%5S', 'xx') + check_format("repr=xx", + b'repr=%.2S', 'xxx') + check_format("repr= xx", + b'repr=%5.2S', 'xxx') + + # test width modifier and precision modifier with %R + check_format("repr= 'xx'", + b'repr=%5R', 'xx') + check_format("repr='x", + b'repr=%.2R', 'xxx') + check_format("repr= 'x", + b'repr=%5.2R', 'xxx') + + # test width modifier and precision modifier with %A + check_format("repr= 'xx'", + b'repr=%5A', 'xx') + check_format("repr='x", + b'repr=%.2A', 'xxx') + check_format("repr= 'x", + b'repr=%5.2A', 'xxx') + + # test width modifier and precision modifier with %s + check_format("repr= xx", + b'repr=%5s', b'xx') + check_format("repr=xx", + b'repr=%.2s', b'xxx') + check_format("repr= xx", + b'repr=%5.2s', b'xxx') + + # test width modifier and precision modifier with %U + check_format("repr= xx", + b'repr=%5U', 'xx') + check_format("repr=xx", + b'repr=%.2U', 'xxx') + check_format("repr= xx", + b'repr=%5.2U', 'xxx') + + # test width modifier and precision modifier with %V + check_format("repr= xx", + b'repr=%5V', 'xx', b'yy') + check_format("repr=xx", + b'repr=%.2V', 'xxx', b'yyy') + check_format("repr= xx", + b'repr=%5.2V', 'xxx', b'yyy') + check_format("repr= yy", + b'repr=%5V', None, b'yy') + check_format("repr=yy", + b'repr=%.2V', None, b'yyy') + check_format("repr= yy", + b'repr=%5.2V', None, b'yyy') # test integer formats (%i, %d, %u) - self.assertEqual(PyUnicode_FromFormat(b'%03i', c_int(10)), '010') - self.assertEqual(PyUnicode_FromFormat(b'%0.4i', c_int(10)), '0010') - self.assertEqual(PyUnicode_FromFormat(b'%i', c_int(-123)), '-123') - self.assertEqual(PyUnicode_FromFormat(b'%li', c_long(-123)), '-123') - 
self.assertEqual(PyUnicode_FromFormat(b'%lli', c_longlong(-123)), '-123') - self.assertEqual(PyUnicode_FromFormat(b'%zi', c_ssize_t(-123)), '-123') + check_format('010', + b'%03i', c_int(10)) + check_format('0010', + b'%0.4i', c_int(10)) + check_format('-123', + b'%i', c_int(-123)) + check_format('-123', + b'%li', c_long(-123)) + check_format('-123', + b'%lli', c_longlong(-123)) + check_format('-123', + b'%zi', c_ssize_t(-123)) - self.assertEqual(PyUnicode_FromFormat(b'%d', c_int(-123)), '-123') - self.assertEqual(PyUnicode_FromFormat(b'%ld', c_long(-123)), '-123') - self.assertEqual(PyUnicode_FromFormat(b'%lld', c_longlong(-123)), '-123') - self.assertEqual(PyUnicode_FromFormat(b'%zd', c_ssize_t(-123)), '-123') + check_format('-123', + b'%d', c_int(-123)) + check_format('-123', + b'%ld', c_long(-123)) + check_format('-123', + b'%lld', c_longlong(-123)) + check_format('-123', + b'%zd', c_ssize_t(-123)) - self.assertEqual(PyUnicode_FromFormat(b'%u', c_uint(123)), '123') - self.assertEqual(PyUnicode_FromFormat(b'%lu', c_ulong(123)), '123') - self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(123)), '123') - self.assertEqual(PyUnicode_FromFormat(b'%zu', c_size_t(123)), '123') + check_format('123', + b'%u', c_uint(123)) + check_format('123', + b'%lu', c_ulong(123)) + check_format('123', + b'%llu', c_ulonglong(123)) + check_format('123', + b'%zu', c_size_t(123)) # test long output min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1)) max_longlong = -min_longlong - 1 - self.assertEqual(PyUnicode_FromFormat(b'%lld', c_longlong(min_longlong)), str(min_longlong)) - self.assertEqual(PyUnicode_FromFormat(b'%lld', c_longlong(max_longlong)), str(max_longlong)) + check_format(str(min_longlong), + b'%lld', c_longlong(min_longlong)) + check_format(str(max_longlong), + b'%lld', c_longlong(max_longlong)) max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1 - self.assertEqual(PyUnicode_FromFormat(b'%llu', c_ulonglong(max_ulonglong)), str(max_ulonglong)) + 
check_format(str(max_ulonglong), + b'%llu', c_ulonglong(max_ulonglong)) PyUnicode_FromFormat(b'%p', c_void_p(-1)) # test padding (width and/or precision) - self.assertEqual(PyUnicode_FromFormat(b'%010i', c_int(123)), '123'.rjust(10, '0')) - self.assertEqual(PyUnicode_FromFormat(b'%100i', c_int(123)), '123'.rjust(100)) - self.assertEqual(PyUnicode_FromFormat(b'%.100i', c_int(123)), '123'.rjust(100, '0')) - self.assertEqual(PyUnicode_FromFormat(b'%100.80i', c_int(123)), '123'.rjust(80, '0').rjust(100)) + check_format('123'.rjust(10, '0'), + b'%010i', c_int(123)) + check_format('123'.rjust(100), + b'%100i', c_int(123)) + check_format('123'.rjust(100, '0'), + b'%.100i', c_int(123)) + check_format('123'.rjust(80, '0').rjust(100), + b'%100.80i', c_int(123)) - self.assertEqual(PyUnicode_FromFormat(b'%010u', c_uint(123)), '123'.rjust(10, '0')) - self.assertEqual(PyUnicode_FromFormat(b'%100u', c_uint(123)), '123'.rjust(100)) - self.assertEqual(PyUnicode_FromFormat(b'%.100u', c_uint(123)), '123'.rjust(100, '0')) - self.assertEqual(PyUnicode_FromFormat(b'%100.80u', c_uint(123)), '123'.rjust(80, '0').rjust(100)) + check_format('123'.rjust(10, '0'), + b'%010u', c_uint(123)) + check_format('123'.rjust(100), + b'%100u', c_uint(123)) + check_format('123'.rjust(100, '0'), + b'%.100u', c_uint(123)) + check_format('123'.rjust(80, '0').rjust(100), + b'%100.80u', c_uint(123)) - self.assertEqual(PyUnicode_FromFormat(b'%010x', c_int(0x123)), '123'.rjust(10, '0')) - self.assertEqual(PyUnicode_FromFormat(b'%100x', c_int(0x123)), '123'.rjust(100)) - self.assertEqual(PyUnicode_FromFormat(b'%.100x', c_int(0x123)), '123'.rjust(100, '0')) - self.assertEqual(PyUnicode_FromFormat(b'%100.80x', c_int(0x123)), '123'.rjust(80, '0').rjust(100)) + check_format('123'.rjust(10, '0'), + b'%010x', c_int(0x123)) + check_format('123'.rjust(100), + b'%100x', c_int(0x123)) + check_format('123'.rjust(100, '0'), + b'%.100x', c_int(0x123)) + check_format('123'.rjust(80, '0').rjust(100), + b'%100.80x', 
c_int(0x123)) # test %A - text = PyUnicode_FromFormat(b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') - self.assertEqual(text, r"%A:'abc\xe9\uabcd\U0010ffff'") + check_format(r"%A:'abc\xe9\uabcd\U0010ffff'", + b'%%A:%A', 'abc\xe9\uabcd\U0010ffff') # test %V - text = PyUnicode_FromFormat(b'repr=%V', 'abc', b'xyz') - self.assertEqual(text, 'repr=abc') + check_format('repr=abc', + b'repr=%V', 'abc', b'xyz') # Test string decode from parameter of %s using utf-8. # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of # '\u4eba\u6c11' - text = PyUnicode_FromFormat(b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91') - self.assertEqual(text, 'repr=\u4eba\u6c11') + check_format('repr=\u4eba\u6c11', + b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91') #Test replace error handler. - text = PyUnicode_FromFormat(b'repr=%V', None, b'abc\xff') - self.assertEqual(text, 'repr=abc\ufffd') + check_format('repr=abc\ufffd', + b'repr=%V', None, b'abc\xff') # not supported: copy the raw format string. these tests are just here # to check for crashs and should not be considered as specifications - self.assertEqual(PyUnicode_FromFormat(b'%1%s', b'abc'), '%s') - self.assertEqual(PyUnicode_FromFormat(b'%1abc'), '%1abc') - self.assertEqual(PyUnicode_FromFormat(b'%+i', c_int(10)), '%+i') - self.assertEqual(PyUnicode_FromFormat(b'%.%s', b'abc'), '%.%s') + check_format('%s', + b'%1%s', b'abc') + check_format('%1abc', + b'%1abc') + check_format('%+i', + b'%+i', c_int(10)) + check_format('%.%s', + b'%.%s', b'abc') # Test PyUnicode_AsWideChar() def test_aswidechar(self): diff -r a64a3da996ed Objects/unicodeobject.c --- a/Objects/unicodeobject.c Sun May 05 11:35:15 2013 -0500 +++ b/Objects/unicodeobject.c Mon May 06 00:57:17 2013 +0200 @@ -2346,6 +2346,69 @@ makefmt(char *fmt, int longflag, int lon plus 1 for the sign. 53/22 is an upper bound for log10(256). 
*/
 #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
 
+/* Write str to writer, truncated to precision characters and left-padded
+   with spaces to width characters (-1: unset). Return 0, or -1 on error. */
+static int
+unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
+                             Py_ssize_t width, Py_ssize_t precision)
+{
+    Py_ssize_t length, fill, totallen;
+    Py_UCS4 maxchar;
+
+    if (PyUnicode_READY(str) == -1)
+        return -1;
+
+    length = PyUnicode_GET_LENGTH(str);
+    if ((precision == -1 || precision >= length)
+        && width <= length)
+        return _PyUnicodeWriter_WriteStr(writer, str);
+
+    if (precision != -1)
+        length = Py_MIN(precision, length);
+
+    totallen = Py_MAX(length, width);
+    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
+        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
+    else
+        maxchar = writer->maxchar;
+
+    if (_PyUnicodeWriter_Prepare(writer, totallen, maxchar) == -1)
+        return -1;
+
+    if (width > length) {
+        fill = width - length;
+        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
+            return -1;
+        writer->pos += fill;
+    }
+
+    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
+                                  str, 0, length);
+    writer->pos += length;
+    return 0;
+}
+
+static int
+unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
+                              Py_ssize_t width, Py_ssize_t precision)
+{
+    /* str is UTF-8: precision truncates bytes, width pads characters */
+    Py_ssize_t length;
+    PyObject *unicode;
+    int res;
+
+    length = strlen(str);
+    if (precision != -1)
+        length = Py_MIN(length, precision);
+    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
+    if (unicode == NULL)
+        return -1;
+
+    res = unicode_fromformat_write_str(writer, unicode, width, -1);
+    Py_DECREF(unicode);
+    return res;
+}
+
 static const char*
 unicode_fromformat_arg(_PyUnicodeWriter *writer,
                        const char *f, va_list *vargs)
@@ -2353,12 +2416,12 @@ unicode_fromformat_arg(_PyUnicodeWriter
     const char *p;
     Py_ssize_t len;
     int zeropad;
-    int width;
-    int precision;
+    Py_ssize_t width;
+    Py_ssize_t precision;
     int longflag;
     int longlongflag;
     int size_tflag;
-    int fill;
+    Py_ssize_t fill;
 
     p =
f; f++; @@ -2369,27 +2432,35 @@ unicode_fromformat_arg(_PyUnicodeWriter } /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */ - width = 0; - while (Py_ISDIGIT((unsigned)*f)) { - if (width > (INT_MAX - ((int)*f - '0')) / 10) { - PyErr_SetString(PyExc_ValueError, - "width too big"); - return NULL; - } - width = (width*10) + (*f - '0'); + width = -1; + if (Py_ISDIGIT((unsigned)*f)) { + width = *f - '0'; f++; - } - precision = 0; + while (Py_ISDIGIT((unsigned)*f)) { + if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { + PyErr_SetString(PyExc_ValueError, + "width too big"); + return NULL; + } + width = (width * 10) + (*f - '0'); + f++; + } + } + precision = -1; if (*f == '.') { f++; - while (Py_ISDIGIT((unsigned)*f)) { - if (precision > (INT_MAX - ((int)*f - '0')) / 10) { - PyErr_SetString(PyExc_ValueError, - "precision too big"); - return NULL; - } - precision = (precision*10) + (*f - '0'); + if (Py_ISDIGIT((unsigned)*f)) { + precision = (*f - '0'); f++; + while (Py_ISDIGIT((unsigned)*f)) { + if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { + PyErr_SetString(PyExc_ValueError, + "precision too big"); + return NULL; + } + precision = (precision * 10) + (*f - '0'); + f++; + } } if (*f == '%') { /* "%.3%s" => f points to "3" */ @@ -2449,6 +2520,7 @@ unicode_fromformat_arg(_PyUnicodeWriter /* used by sprintf */ char fmt[10]; /* should be enough for "%0lld\0" */ char buffer[MAX_LONG_LONG_CHARS]; + Py_ssize_t preparelen; if (*f == 'u') { makefmt(fmt, longflag, longlongflag, size_tflag, *f); @@ -2494,26 +2566,29 @@ unicode_fromformat_arg(_PyUnicodeWriter if (precision < len) precision = len; + + preparelen = Py_MAX(precision, width); + assert(ucs1lib_find_max_char((Py_UCS1*)buffer, (Py_UCS1*)buffer + len) <= 127); + if (_PyUnicodeWriter_Prepare(writer, preparelen, 127) == -1) + return NULL; + if (width > precision) { Py_UCS4 fillchar; fill = width - precision; fillchar = zeropad?'0':' '; - if (_PyUnicodeWriter_Prepare(writer, fill, 
fillchar) == -1) - return NULL; if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1) return NULL; writer->pos += fill; } if (precision > len) { fill = precision - len; - if (_PyUnicodeWriter_Prepare(writer, fill, '0') == -1) - return NULL; if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1) return NULL; writer->pos += fill; } - if (_PyUnicodeWriter_WriteCstr(writer, buffer, len) == -1) - return NULL; + + unicode_write_cstr(writer->buffer, writer->pos, buffer, len); + writer->pos += len; break; } @@ -2535,8 +2610,11 @@ unicode_fromformat_arg(_PyUnicodeWriter len += 2; } - if (_PyUnicodeWriter_WriteCstr(writer, number, len) == -1) + assert(ucs1lib_find_max_char((Py_UCS1*)number, (Py_UCS1*)number + len) <= 127); + if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) return NULL; + unicode_write_cstr(writer->buffer, writer->pos, number, len); + writer->pos += len; break; } @@ -2544,14 +2622,8 @@ unicode_fromformat_arg(_PyUnicodeWriter { /* UTF-8 */ const char *s = va_arg(*vargs, const char*); - PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); - if (!str) + if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { - Py_DECREF(str); - return NULL; - } - Py_DECREF(str); break; } @@ -2560,7 +2632,7 @@ unicode_fromformat_arg(_PyUnicodeWriter PyObject *obj = va_arg(*vargs, PyObject *); assert(obj && _PyUnicode_CHECK(obj)); - if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) + if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) return NULL; break; } @@ -2569,22 +2641,15 @@ unicode_fromformat_arg(_PyUnicodeWriter { PyObject *obj = va_arg(*vargs, PyObject *); const char *str = va_arg(*vargs, const char *); - PyObject *str_obj; - assert(obj || str); if (obj) { assert(_PyUnicode_CHECK(obj)); - if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) + if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) return 
NULL; } else { - str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); - if (!str_obj) + assert(str != NULL); + if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) { - Py_DECREF(str_obj); - return NULL; - } - Py_DECREF(str_obj); } break; } @@ -2597,7 +2662,7 @@ unicode_fromformat_arg(_PyUnicodeWriter str = PyObject_Str(obj); if (!str) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { + if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { Py_DECREF(str); return NULL; } @@ -2613,7 +2678,7 @@ unicode_fromformat_arg(_PyUnicodeWriter repr = PyObject_Repr(obj); if (!repr) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) { + if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { Py_DECREF(repr); return NULL; } @@ -2629,7 +2694,7 @@ unicode_fromformat_arg(_PyUnicodeWriter ascii = PyObject_ASCII(obj); if (!ascii) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) { + if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { Py_DECREF(ascii); return NULL; }