Message382681
In https://github.com/python/cpython/blob/master/Objects/unicodeobject.c#L12930, unicode_repeat does string multiplication with an integer in 3 different ways:
1) a single memset call, for 1-byte (UCS1/Latin-1) strings of length 1
2) linear 'for' loops, for 2-byte (UCS2) and 4-byte (UCS4) strings of length 1
3) a logarithmic 'while' loop of memcpy calls, for strings of any kind (1-, 2- or 4-byte) whose length is > 1
Is there a performance or correctness reason why we can't also use the third approach for the second case? I realise that, depending on the architecture, the second case could benefit from vectorization, but the memcpy calls will also be hardware-optimised.
An example of using just the 1st and 3rd methods:
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -12954,31 +12954,16 @@ unicode_repeat(PyObject *str, Py_ssize_t len)
assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
- if (PyUnicode_GET_LENGTH(str) == 1) {
- int kind = PyUnicode_KIND(str);
- Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
- if (kind == PyUnicode_1BYTE_KIND) {
- void *to = PyUnicode_DATA(u);
- memset(to, (unsigned char)fill_char, len);
- }
- else if (kind == PyUnicode_2BYTE_KIND) {
- Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
- for (n = 0; n < len; ++n)
- ucs2[n] = fill_char;
- } else {
- Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
- assert(kind == PyUnicode_4BYTE_KIND);
- for (n = 0; n < len; ++n)
- ucs4[n] = fill_char;
- }
- }
- else {
+ Py_ssize_t char_size = PyUnicode_KIND(str);
+ char *to = (char *) PyUnicode_DATA(u);
+ if (PyUnicode_GET_LENGTH(str) == 1 && char_size == PyUnicode_1BYTE_KIND) {
+ Py_UCS4 fill_char = PyUnicode_READ(char_size, PyUnicode_DATA(str), 0);
+ memset(to, fill_char, len);
+ } else {
/* number of characters copied this far */
Py_ssize_t done = PyUnicode_GET_LENGTH(str);
- Py_ssize_t char_size = PyUnicode_KIND(str);
- char *to = (char *) PyUnicode_DATA(u);
memcpy(to, PyUnicode_DATA(str),
PyUnicode_GET_LENGTH(str) * char_size);
while (done < nchars) {... |
|
Date |
User |
Action |
Args |
2020-12-07 20:18:22 | syl-nktaylor | set | recipients:
+ syl-nktaylor, vstinner, ezio.melotti |
2020-12-07 20:18:22 | syl-nktaylor | set | messageid: <1607372302.54.0.511899594666.issue42593@roundup.psfhosted.org> |
2020-12-07 20:18:22 | syl-nktaylor | link | issue42593 messages |
2020-12-07 20:18:21 | syl-nktaylor | create | |
|