diff -r dc721353b2e2 Doc/c-api/unicode.rst --- a/Doc/c-api/unicode.rst Sat Oct 06 23:55:33 2012 +0200 +++ b/Doc/c-api/unicode.rst Sun Oct 07 22:17:57 2012 +0200 @@ -524,12 +524,23 @@ APIs: The `"%lld"` and `"%llu"` format specifiers are only available when :const:`HAVE_LONG_LONG` is defined. + .. note:: + The width formatter unit is a number of characters rather than bytes. + The precision formatter unit is a number of bytes for ``"%s"`` and + ``"%V"`` (if the ``PyObject*`` argument is NULL), and a number of + characters for ``"%A"``, ``"%U"``, ``"%S"``, ``"%R"`` and ``"%V"`` + (if the ``PyObject*`` argument is not NULL). + .. versionchanged:: 3.2 Support for ``"%lld"`` and ``"%llu"`` added. .. versionchanged:: 3.3 Support for ``"%li"``, ``"%lli"`` and ``"%zi"`` added. + .. versionchanged:: 3.4 + Support for width and precision formatters for ``"%s"``, ``"%A"``, + ``"%U"``, ``"%V"``, ``"%S"`` and ``"%R"`` added. + .. c:function:: PyObject* PyUnicode_FromFormatV(const char *format, va_list vargs) diff -r dc721353b2e2 Include/unicodeobject.h --- a/Include/unicodeobject.h Sat Oct 06 23:55:33 2012 +0200 +++ b/Include/unicodeobject.h Sun Oct 07 22:17:57 2012 +0200 @@ -940,6 +940,15 @@ PyAPI_FUNC(int) PyObject *str /* Unicode string */ ); +/* Append a substring of a Unicode string. + Return 0 on success, raise an exception and return -1 on error. */ +PyAPI_FUNC(int) +_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, + PyObject *str, /* Unicode string */ + Py_ssize_t start, /* index of the first character */ + Py_ssize_t length /* length in characters */ + ); + /* Append a latin1-encoded byte string. Return 0 on success, raise an exception and return -1 on error. 
*/ PyAPI_FUNC(int) diff -r dc721353b2e2 Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py Sat Oct 06 23:55:33 2012 +0200 +++ b/Lib/test/test_unicode.py Sun Oct 07 22:17:57 2012 +0200 @@ -1752,6 +1752,83 @@ class UnicodeTest(string_tests.CommonTes self.assertEqual(PyUnicode_FromFormat(b'[%%]'), '[%]') self.assertEqual(PyUnicode_FromFormat(b'%%%s', b'abc'), '%abc') + # truncated string + self.assertEqual(PyUnicode_FromFormat(b'%.3s', b'abcdef'), + 'abc') + self.assertEqual(PyUnicode_FromFormat(b'%.5s', 'abc[\u20ac]'.encode('utf8')), + 'abc[\ufffd') + self.assertEqual(PyUnicode_FromFormat(b'%A', '\u20acABC'), + "'\\u20acABC'") + self.assertEqual(PyUnicode_FromFormat(b'%.5A', '\u20acABCDEF'), + "'\\u20") + self.assertEqual(PyUnicode_FromFormat(b'%R', '\u20acABC'), + "'\u20acABC'") + self.assertEqual(PyUnicode_FromFormat(b'%.3R', '\u20acABCDEF'), + "'\u20acA") + self.assertEqual(PyUnicode_FromFormat(b'%.3S', '\u20acABCDEF'), + '\u20acAB') + self.assertEqual(PyUnicode_FromFormat(b'%.3U', '\u20acABCDEF'), + '\u20acAB') + self.assertEqual(PyUnicode_FromFormat(b'%.3V', '\u20acABCDEF', None), + '\u20acAB') + self.assertEqual(PyUnicode_FromFormat(b'%.5V', None, 'abc[\u20ac]'.encode('utf8')), + 'abc[\ufffd') + + # following tests comes from #7330 + # test width modifier and precision modifier with %S + text = PyUnicode_FromFormat(b'repr=%5S', 'xx') + self.assertEqual(text, "repr= xx") + text = PyUnicode_FromFormat(b'repr=%.2S', 'xxx') + self.assertEqual(text, "repr=xx") + text = PyUnicode_FromFormat(b'repr=%5.2S', 'xxx') + self.assertEqual(text, "repr= xx") + + # test width modifier and precision modifier with %R + text = PyUnicode_FromFormat(b'repr=%5R', 'xx') + self.assertEqual(text, "repr= 'xx'") + text = PyUnicode_FromFormat(b'repr=%.2R', 'xxx') + self.assertEqual(text, "repr='x") + text = PyUnicode_FromFormat(b'repr=%5.2R', 'xxx') + self.assertEqual(text, "repr= 'x") + + # test width modifier and precision modifier with %A + text = PyUnicode_FromFormat(b'repr=%5A', 
'xx') + self.assertEqual(text, "repr= 'xx'") + text = PyUnicode_FromFormat(b'repr=%.2A', 'xxx') + self.assertEqual(text, "repr='x") + text = PyUnicode_FromFormat(b'repr=%5.2A', 'xxx') + self.assertEqual(text, "repr= 'x") + + # test width modifier and precision modifier with %s + text = PyUnicode_FromFormat(b'repr=%5s', b'xx') + self.assertEqual(text, "repr= xx") + text = PyUnicode_FromFormat(b'repr=%.2s', b'xxx') + self.assertEqual(text, "repr=xx") + text = PyUnicode_FromFormat(b'repr=%5.2s', b'xxx') + self.assertEqual(text, "repr= xx") + + # test width modifier and precision modifier with %U + text = PyUnicode_FromFormat(b'repr=%5U', 'xx') + self.assertEqual(text, "repr= xx") + text = PyUnicode_FromFormat(b'repr=%.2U', 'xxx') + self.assertEqual(text, "repr=xx") + text = PyUnicode_FromFormat(b'repr=%5.2U', 'xxx') + self.assertEqual(text, "repr= xx") + + # test width modifier and precision modifier with %V + text = PyUnicode_FromFormat(b'repr=%5V', 'xx', b'yy') + self.assertEqual(text, "repr= xx") + text = PyUnicode_FromFormat(b'repr=%.2V', 'xxx', b'yyy') + self.assertEqual(text, "repr=xx") + text = PyUnicode_FromFormat(b'repr=%5.2V', 'xxx', b'yyy') + self.assertEqual(text, "repr= xx") + text = PyUnicode_FromFormat(b'repr=%5V', None, b'yy') + self.assertEqual(text, "repr= yy") + text = PyUnicode_FromFormat(b'repr=%.2V', None, b'yyy') + self.assertEqual(text, "repr=yy") + text = PyUnicode_FromFormat(b'repr=%5.2V', None, b'yyy') + self.assertEqual(text, "repr= yy") + # test integer formats (%i, %d, %u) self.assertEqual(PyUnicode_FromFormat(b'%03i', c_int(10)), '010') self.assertEqual(PyUnicode_FromFormat(b'%0.4i', c_int(10)), '0010') diff -r dc721353b2e2 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Sat Oct 06 23:55:33 2012 +0200 +++ b/Objects/unicodeobject.c Sun Oct 07 22:17:57 2012 +0200 @@ -2333,6 +2333,31 @@ makefmt(char *fmt, int longflag, int lon plus 1 for the sign. 53/22 is an upper bound for log10(256). 
*/ #define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22) +static int +unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str, + Py_ssize_t width, Py_ssize_t precision) +{ + Py_ssize_t length, fill; + /* Write str truncated to precision and left-padded to width (-1: off). */ + if (PyUnicode_READY(str) == -1) + return -1; + + length = PyUnicode_GET_LENGTH(str); + if (precision != -1) + length = Py_MIN(precision, length); + /* Emit the padding first so the text ends up right-aligned. */ + if (width > length) { + fill = width - length; + if (_PyUnicodeWriter_Prepare(writer, fill, ' ') == -1) + return -1; + if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1) + return -1; + writer->pos += fill; + } + /* Copy only the first `length` characters of str. */ + return _PyUnicodeWriter_WriteSubstring(writer, str, 0, length); +} + static const char* unicode_fromformat_arg(_PyUnicodeWriter *writer, const char *f, va_list *vargs) @@ -2340,12 +2365,12 @@ unicode_fromformat_arg(_PyUnicodeWriter const char *p; Py_ssize_t len; int zeropad; - int width; - int precision; + Py_ssize_t width; + Py_ssize_t precision; int longflag; int longlongflag; int size_tflag; - int fill; + Py_ssize_t fill; p = f; f++; @@ -2356,26 +2381,34 @@ unicode_fromformat_arg(_PyUnicodeWriter } /* parse the width.precision part, e.g. 
"%2.5s" => width=2, precision=5 */ - width = 0; - while (Py_ISDIGIT((unsigned)*f)) { - if (width > (INT_MAX - ((int)*f - '0')) / 10) { - PyErr_SetString(PyExc_ValueError, - "width too big"); - return NULL; - } - width = (width*10) + (*f - '0'); + width = -1; + if (Py_ISDIGIT((unsigned)*f)) { + width = *f - '0'; f++; - } - precision = 0; + while (Py_ISDIGIT((unsigned)*f)) { + if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { + PyErr_SetString(PyExc_ValueError, + "width too big"); + return NULL; + } + width = (width * 10) + (*f - '0'); + f++; + } + } + precision = -1; if (*f == '.') { f++; + if (Py_ISDIGIT((unsigned)*f)) { + precision = (*f - '0'); + f++; + } while (Py_ISDIGIT((unsigned)*f)) { - if (precision > (INT_MAX - ((int)*f - '0')) / 10) { + if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) { PyErr_SetString(PyExc_ValueError, "precision too big"); return NULL; } - precision = (precision*10) + (*f - '0'); + precision = (precision * 10) + (*f - '0'); f++; } if (*f == '%') { @@ -2533,10 +2566,15 @@ unicode_fromformat_arg(_PyUnicodeWriter { /* UTF-8 */ const char *s = va_arg(*vargs, const char*); - PyObject *str = PyUnicode_DecodeUTF8Stateful(s, strlen(s), "replace", NULL); + Py_ssize_t length; + PyObject *str; + length = strlen(s); + if (precision != -1) + length = Py_MIN(length, precision); + str = PyUnicode_DecodeUTF8Stateful(s, length, "replace", NULL); if (!str) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { + if (unicode_fromformat_write_str(writer, str, width, -1) == -1) { Py_DECREF(str); return NULL; } @@ -2549,7 +2587,7 @@ unicode_fromformat_arg(_PyUnicodeWriter PyObject *obj = va_arg(*vargs, PyObject *); assert(obj && _PyUnicode_CHECK(obj)); - if (_PyUnicodeWriter_WriteStr(writer, obj) == -1) + if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) return NULL; break; } @@ -2562,14 +2600,18 @@ unicode_fromformat_arg(_PyUnicodeWriter assert(obj || str); if (obj) { assert(_PyUnicode_CHECK(obj)); - if 
(_PyUnicodeWriter_WriteStr(writer, obj) == -1) + if (unicode_fromformat_write_str(writer, obj, width, precision) == -1) return NULL; } else { - str_obj = PyUnicode_DecodeUTF8Stateful(str, strlen(str), "replace", NULL); + Py_ssize_t length; + length = strlen(str); + if (precision != -1) + length = Py_MIN(length, precision); + str_obj = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL); if (!str_obj) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, str_obj) == -1) { + if (unicode_fromformat_write_str(writer, str_obj, width, -1) == -1) { Py_DECREF(str_obj); return NULL; } @@ -2586,7 +2628,7 @@ unicode_fromformat_arg(_PyUnicodeWriter str = PyObject_Str(obj); if (!str) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, str) == -1) { + if (unicode_fromformat_write_str(writer, str, width, precision) == -1) { Py_DECREF(str); return NULL; } @@ -2602,7 +2644,7 @@ unicode_fromformat_arg(_PyUnicodeWriter repr = PyObject_Repr(obj); if (!repr) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, repr) == -1) { + if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) { Py_DECREF(repr); return NULL; } @@ -2618,7 +2660,7 @@ unicode_fromformat_arg(_PyUnicodeWriter ascii = PyObject_ASCII(obj); if (!ascii) return NULL; - if (_PyUnicodeWriter_WriteStr(writer, ascii) == -1) { + if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) { Py_DECREF(ascii); return NULL; } @@ -12785,6 +12827,32 @@ int } int +_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str, + Py_ssize_t start, Py_ssize_t length) +{ + Py_UCS4 maxchar; + + if (PyUnicode_READY(str) == -1) + return -1; + + assert(0 <= start); + assert(start + length <= PyUnicode_GET_LENGTH(str)); + + if (length == 0) + return 0; + if (start == 0 && length == PyUnicode_GET_LENGTH(str)) + return _PyUnicodeWriter_WriteStr(writer, str); + + maxchar = _PyUnicode_FindMaxChar(str, start, start + length); + if (_PyUnicodeWriter_Prepare(writer, length, maxchar) == -1) + return -1; + 
_PyUnicode_FastCopyCharacters(writer->buffer, writer->pos, + str, start, start + length); + writer->pos += length; + return 0; +} + +int _PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len) { Py_UCS4 maxchar;