diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h index 0dfe06f..d7f1468 100644 --- a/Include/unicodeobject.h +++ b/Include/unicodeobject.h @@ -1898,7 +1898,8 @@ PyAPI_FUNC(void) PyUnicode_AppendAndDel( PyAPI_FUNC(PyObject*) PyUnicode_Split( PyObject *s, /* String to split */ PyObject *sep, /* String separator */ - Py_ssize_t maxsplit /* Maxsplit count */ + Py_ssize_t maxsplit, /* Maxsplit count */ + int prune /* Whether to remove empty strings */ ); /* Dito, but split at line breaks. @@ -1942,7 +1943,8 @@ PyAPI_FUNC(PyObject*) PyUnicode_RPartition( PyAPI_FUNC(PyObject*) PyUnicode_RSplit( PyObject *s, /* String to split */ PyObject *sep, /* String separator */ - Py_ssize_t maxsplit /* Maxsplit count */ + Py_ssize_t maxsplit, /* Maxsplit count */ + int prune /* Whether to remove empty strings */ ); /* Translate a string by applying a character mapping table to it and diff --git a/Modules/_pickle.c b/Modules/_pickle.c index 947069a..6f7eda7 100644 --- a/Modules/_pickle.c +++ b/Modules/_pickle.c @@ -1566,7 +1566,7 @@ get_dotted_path(PyObject *obj, PyObject *name) { PyObject *dotted_path; Py_ssize_t i, n; - dotted_path = PyUnicode_Split(name, _PyUnicode_FromId(&PyId_dot), -1); + dotted_path = PyUnicode_Split(name, _PyUnicode_FromId(&PyId_dot), -1, 0); if (dotted_path == NULL) return NULL; n = PyList_GET_SIZE(dotted_path); diff --git a/Objects/bytearrayobject.c b/Objects/bytearrayobject.c index 16b4fd7..814fedd 100644 --- a/Objects/bytearrayobject.c +++ b/Objects/bytearrayobject.c @@ -1335,25 +1335,40 @@ bytearray.split maxsplit: Py_ssize_t = -1 Maximum number of splits to do. -1 (the default value) means no limit. + prune: object = None + Determines whether or not to keep empty strings in the final list Return a list of the sections in the bytearray, using sep as the delimiter. 
[clinic start generated code]*/ static PyObject * bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit) -/*[clinic end generated code: output=833e2cf385d9a04d input=24f82669f41bf523]*/ + Py_ssize_t maxsplit, PyObject *prune) +/*[clinic end generated code: output=62a007e24098bdb0 input=87203683b22bc53d]*/ { Py_ssize_t len = PyByteArray_GET_SIZE(self), n; const char *s = PyByteArray_AS_STRING(self), *sub; PyObject *list; Py_buffer vsub; + int prune_value; + + if (prune == Py_None) { + if (sep == Py_None) + prune_value = 1; + else + prune_value = 0; + } else { + prune_value = PyObject_IsTrue(prune); + if (prune_value < 0) + return NULL; + } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit); + return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, + prune_value); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; @@ -1361,7 +1376,7 @@ bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, n = vsub.len; list = stringlib_split( - (PyObject*) self, s, len, sub, n, maxsplit + (PyObject*) self, s, len, sub, n, maxsplit, prune_value ); PyBuffer_Release(&vsub); return list; @@ -1451,19 +1466,32 @@ Splitting is done starting at the end of the bytearray and working to the front. 
static PyObject * bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit) -/*[clinic end generated code: output=a55e0b5a03cb6190 input=a68286e4dd692ffe]*/ + Py_ssize_t maxsplit, PyObject *prune) +/*[clinic end generated code: output=ef399a20ad6c8b71 input=a68286e4dd692ffe]*/ { Py_ssize_t len = PyByteArray_GET_SIZE(self), n; const char *s = PyByteArray_AS_STRING(self), *sub; PyObject *list; Py_buffer vsub; + int prune_value; + + if (prune == Py_None) { + if (sep == Py_None) + prune_value = 1; + else + prune_value = 0; + } else { + prune_value = PyObject_IsTrue(prune); + if (prune_value < 0) + return NULL; + } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit); + return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, + prune_value); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; @@ -1471,7 +1499,7 @@ bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, n = vsub.len; list = stringlib_rsplit( - (PyObject*) self, s, len, sub, n, maxsplit + (PyObject*) self, s, len, sub, n, maxsplit, prune_value ); PyBuffer_Release(&vsub); return list; diff --git a/Objects/bytesobject.c b/Objects/bytesobject.c index 0a5c0ae..66fd7b5 100644 --- a/Objects/bytesobject.c +++ b/Objects/bytesobject.c @@ -1758,29 +1758,46 @@ bytes.split maxsplit: Py_ssize_t = -1 Maximum number of splits to do. -1 (the default value) means no limit. + prune: object = None + Determines whether or not to keep empty strings in the final list Return a list of the sections in the bytes, using sep as the delimiter. 
[clinic start generated code]*/ static PyObject * -bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit) -/*[clinic end generated code: output=52126b5844c1d8ef input=8b809b39074abbfa]*/ +bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *prune) +/*[clinic end generated code: output=cc9c523f3392cbe0 input=06605e7d3430ff7e]*/ { Py_ssize_t len = PyBytes_GET_SIZE(self), n; const char *s = PyBytes_AS_STRING(self), *sub; Py_buffer vsub; PyObject *list; + int prune_value; + + if (prune == Py_None) { + if (sep == Py_None) + prune_value = 1; + else + prune_value = 0; + } else { + prune_value = PyObject_IsTrue(prune); + if (prune_value < 0) + return NULL; + } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit); + return stringlib_split_whitespace((PyObject*) self, s, len, maxsplit, + prune_value); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; sub = vsub.buf; n = vsub.len; - list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit); + list = stringlib_split((PyObject*) self, s, len, sub, n, maxsplit, + prune_value); PyBuffer_Release(&vsub); return list; } @@ -1848,24 +1865,39 @@ Splitting is done starting at the end of the bytes and working to the front. 
[clinic start generated code]*/ static PyObject * -bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit) -/*[clinic end generated code: output=ba698d9ea01e1c8f input=0f86c9f28f7d7b7b]*/ +bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *prune) +/*[clinic end generated code: output=372b333ea8e35927 input=0f86c9f28f7d7b7b]*/ { Py_ssize_t len = PyBytes_GET_SIZE(self), n; const char *s = PyBytes_AS_STRING(self), *sub; Py_buffer vsub; PyObject *list; + int prune_value; + + if (prune == Py_None) { + if (sep == Py_None) + prune_value = 1; + else + prune_value = 0; + } else { + prune_value = PyObject_IsTrue(prune); + if (prune_value < 0) + return NULL; + } if (maxsplit < 0) maxsplit = PY_SSIZE_T_MAX; if (sep == Py_None) - return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit); + return stringlib_rsplit_whitespace((PyObject*) self, s, len, maxsplit, + prune_value); if (PyObject_GetBuffer(sep, &vsub, PyBUF_SIMPLE) != 0) return NULL; sub = vsub.buf; n = vsub.len; - list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit); + list = stringlib_rsplit((PyObject*) self, s, len, sub, n, maxsplit, + prune_value); PyBuffer_Release(&vsub); return list; } diff --git a/Objects/clinic/bytearrayobject.c.h b/Objects/clinic/bytearrayobject.c.h index c75acb7..a5486ce 100644 --- a/Objects/clinic/bytearrayobject.c.h +++ b/Objects/clinic/bytearrayobject.c.h @@ -167,7 +167,7 @@ exit: } PyDoc_STRVAR(bytearray_split__doc__, -"split($self, /, sep=None, maxsplit=-1)\n" +"split($self, /, sep=None, maxsplit=-1, prune=None)\n" "--\n" "\n" "Return a list of the sections in the bytearray, using sep as the delimiter.\n" @@ -178,29 +178,32 @@ PyDoc_STRVAR(bytearray_split__doc__, " (space, tab, return, newline, formfeed, vertical tab).\n" " maxsplit\n" " Maximum number of splits to do.\n" -" -1 (the default value) means no limit."); +" -1 (the default value) means no limit.\n" +" prune\n" +" Determines whether or not to 
keep empty strings in the final list"); #define BYTEARRAY_SPLIT_METHODDEF \ {"split", (PyCFunction)bytearray_split, METH_FASTCALL, bytearray_split__doc__}, static PyObject * bytearray_split_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit); + Py_ssize_t maxsplit, PyObject *prune); static PyObject * bytearray_split(PyByteArrayObject *self, PyObject **args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; - static _PyArg_Parser _parser = {"|On:split", _keywords, 0}; + static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; + static _PyArg_Parser _parser = {"|OnO:split", _keywords, 0}; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *prune = Py_None; if (!_PyArg_ParseStack(args, nargs, kwnames, &_parser, - &sep, &maxsplit)) { + &sep, &maxsplit, &prune)) { goto exit; } - return_value = bytearray_split_impl(self, sep, maxsplit); + return_value = bytearray_split_impl(self, sep, maxsplit, prune); exit: return return_value; @@ -239,7 +242,7 @@ PyDoc_STRVAR(bytearray_rpartition__doc__, {"rpartition", (PyCFunction)bytearray_rpartition, METH_O, bytearray_rpartition__doc__}, PyDoc_STRVAR(bytearray_rsplit__doc__, -"rsplit($self, /, sep=None, maxsplit=-1)\n" +"rsplit($self, /, sep=None, maxsplit=-1, prune=None)\n" "--\n" "\n" "Return a list of the sections in the bytearray, using sep as the delimiter.\n" @@ -251,6 +254,8 @@ PyDoc_STRVAR(bytearray_rsplit__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" +" prune\n" +" Determines whether or not to keep empty strings in the final list\n" "\n" "Splitting is done starting at the end of the bytearray and working to the front."); @@ -259,22 +264,23 @@ PyDoc_STRVAR(bytearray_rsplit__doc__, static PyObject * bytearray_rsplit_impl(PyByteArrayObject *self, PyObject *sep, - Py_ssize_t maxsplit); + Py_ssize_t maxsplit, PyObject *prune); static 
PyObject * bytearray_rsplit(PyByteArrayObject *self, PyObject **args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; - static _PyArg_Parser _parser = {"|On:rsplit", _keywords, 0}; + static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; + static _PyArg_Parser _parser = {"|OnO:rsplit", _keywords, 0}; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *prune = Py_None; if (!_PyArg_ParseStack(args, nargs, kwnames, &_parser, - &sep, &maxsplit)) { + &sep, &maxsplit, &prune)) { goto exit; } - return_value = bytearray_rsplit_impl(self, sep, maxsplit); + return_value = bytearray_rsplit_impl(self, sep, maxsplit, prune); exit: return return_value; @@ -711,4 +717,4 @@ bytearray_sizeof(PyByteArrayObject *self, PyObject *Py_UNUSED(ignored)) { return bytearray_sizeof_impl(self); } -/*[clinic end generated code: output=225342a680391b9c input=a9049054013a1b77]*/ +/*[clinic end generated code: output=332a77da15a3719f input=a9049054013a1b77]*/ diff --git a/Objects/clinic/bytesobject.c.h b/Objects/clinic/bytesobject.c.h index a11ebd2..fea5c73 100644 --- a/Objects/clinic/bytesobject.c.h +++ b/Objects/clinic/bytesobject.c.h @@ -3,7 +3,7 @@ preserve [clinic start generated code]*/ PyDoc_STRVAR(bytes_split__doc__, -"split($self, /, sep=None, maxsplit=-1)\n" +"split($self, /, sep=None, maxsplit=-1, prune=None)\n" "--\n" "\n" "Return a list of the sections in the bytes, using sep as the delimiter.\n" @@ -14,28 +14,32 @@ PyDoc_STRVAR(bytes_split__doc__, " (space, tab, return, newline, formfeed, vertical tab).\n" " maxsplit\n" " Maximum number of splits to do.\n" -" -1 (the default value) means no limit."); +" -1 (the default value) means no limit.\n" +" prune\n" +" Determines whether or not to keep empty strings in the final list"); #define BYTES_SPLIT_METHODDEF \ {"split", (PyCFunction)bytes_split, METH_FASTCALL, bytes_split__doc__}, static PyObject * 
-bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit); +bytes_split_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *prune); static PyObject * bytes_split(PyBytesObject *self, PyObject **args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] = {"sep", "maxsplit", NULL}; - static _PyArg_Parser _parser = {"|On:split", _keywords, 0}; + static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; + static _PyArg_Parser _parser = {"|OnO:split", _keywords, 0}; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *prune = Py_None; if (!_PyArg_ParseStack(args, nargs, kwnames, &_parser, - &sep, &maxsplit)) { + &sep, &maxsplit, &prune)) { goto exit; } - return_value = bytes_split_impl(self, sep, maxsplit); + return_value = bytes_split_impl(self, sep, maxsplit, prune); exit: return return_value; @@ -120,7 +124,7 @@ exit: } PyDoc_STRVAR(bytes_rsplit__doc__, -"rsplit($self, /, sep=None, maxsplit=-1)\n" +"rsplit($self, /, sep=None, maxsplit=-1, prune=None)\n" "--\n" "\n" "Return a list of the sections in the bytes, using sep as the delimiter.\n" @@ -132,6 +136,8 @@ PyDoc_STRVAR(bytes_rsplit__doc__, " maxsplit\n" " Maximum number of splits to do.\n" " -1 (the default value) means no limit.\n" +" prune\n" +" Determines whether or not to keep empty strings in the final list\n" "\n" "Splitting is done starting at the end of the bytes and working to the front."); @@ -139,22 +145,24 @@ PyDoc_STRVAR(bytes_rsplit__doc__, {"rsplit", (PyCFunction)bytes_rsplit, METH_FASTCALL, bytes_rsplit__doc__}, static PyObject * -bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit); +bytes_rsplit_impl(PyBytesObject *self, PyObject *sep, Py_ssize_t maxsplit, + PyObject *prune); static PyObject * bytes_rsplit(PyBytesObject *self, PyObject **args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - static const char * const _keywords[] 
= {"sep", "maxsplit", NULL}; - static _PyArg_Parser _parser = {"|On:rsplit", _keywords, 0}; + static const char * const _keywords[] = {"sep", "maxsplit", "prune", NULL}; + static _PyArg_Parser _parser = {"|OnO:rsplit", _keywords, 0}; PyObject *sep = Py_None; Py_ssize_t maxsplit = -1; + PyObject *prune = Py_None; if (!_PyArg_ParseStack(args, nargs, kwnames, &_parser, - &sep, &maxsplit)) { + &sep, &maxsplit, &prune)) { goto exit; } - return_value = bytes_rsplit_impl(self, sep, maxsplit); + return_value = bytes_rsplit_impl(self, sep, maxsplit, prune); exit: return return_value; @@ -499,4 +507,4 @@ bytes_fromhex(PyTypeObject *type, PyObject *arg) exit: return return_value; } -/*[clinic end generated code: output=2dc3c93cfd2dc440 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=d59838871754e7c1 input=a9049054013a1b77]*/ diff --git a/Objects/stringlib/split.h b/Objects/stringlib/split.h index 31f77a7..04aa51e 100644 --- a/Objects/stringlib/split.h +++ b/Objects/stringlib/split.h @@ -53,9 +53,9 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(split_whitespace)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { - Py_ssize_t i, j, count=0; + Py_ssize_t i, j, k, count=0; PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); PyObject *sub; @@ -64,10 +64,16 @@ STRINGLIB(split_whitespace)(PyObject* str_obj, i = j = 0; while (maxcount-- > 0) { + k = i; while (i < str_len && STRINGLIB_ISSPACE(str[i])) i++; - if (i == str_len) break; - j = i; i++; + for (; prune == 0 && k < i-1; k++) { + SPLIT_ADD(str, k, k); + } + if (i == str_len) + break; + j = i; + i++; while (i < str_len && !STRINGLIB_ISSPACE(str[i])) i++; #ifndef STRINGLIB_MUTABLE @@ -102,11 +108,12 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(split_char)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR ch, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, count=0; PyObject *list = 
PyList_New(PREALLOC_SIZE(maxcount)); PyObject *sub; + int pruned = 0; if (list == NULL) return NULL; @@ -116,21 +123,25 @@ STRINGLIB(split_char)(PyObject* str_obj, for(; j < str_len; j++) { /* I found that using memchr makes no difference */ if (str[j] == ch) { - SPLIT_ADD(str, i, j); + if (prune == 0 || i < j) { + SPLIT_ADD(str, i, j); + } else { + pruned = 1; + } i = j = j + 1; break; } } } #ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + if (count == 0 && pruned == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { /* ch not in str_obj, so just use str_obj as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); count++; } else #endif - if (i <= str_len) { + if (i < str_len || (prune == 0 && i == str_len)) { SPLIT_ADD(str, i, str_len); } FIX_PREALLOC_SIZE(list); @@ -145,17 +156,18 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(split)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, pos, count=0; PyObject *list, *sub; + int pruned = 0; if (sep_len == 0) { PyErr_SetString(PyExc_ValueError, "empty separator"); return NULL; } else if (sep_len == 1) - return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount); + return STRINGLIB(split_char)(str_obj, str, str_len, sep[0], maxcount, prune); list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) @@ -166,12 +178,18 @@ STRINGLIB(split)(PyObject* str_obj, pos = FASTSEARCH(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH); if (pos < 0) break; + if (prune && pos == 0) { /* Empty string; ignore */ + i += sep_len; + pruned = 1; + maxcount++; /* Don't count pruned strings in the max count */ + continue; + } j = i + pos; SPLIT_ADD(str, i, j); i = j + sep_len; } #ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + if (count == 0 && pruned == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { /* No match in str_obj, so 
just use it as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); @@ -179,7 +197,8 @@ STRINGLIB(split)(PyObject* str_obj, } else #endif { - SPLIT_ADD(str, i, str_len); + if (prune == 0 || i < str_len) + SPLIT_ADD(str, i, str_len); } FIX_PREALLOC_SIZE(list); return list; @@ -192,9 +211,9 @@ STRINGLIB(split)(PyObject* str_obj, Py_LOCAL_INLINE(PyObject *) STRINGLIB(rsplit_whitespace)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { - Py_ssize_t i, j, count=0; + Py_ssize_t i, j, k, count=0; PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); PyObject *sub; @@ -203,10 +222,16 @@ STRINGLIB(rsplit_whitespace)(PyObject* str_obj, i = j = str_len - 1; while (maxcount-- > 0) { + k = i; while (i >= 0 && STRINGLIB_ISSPACE(str[i])) i--; - if (i < 0) break; - j = i; i--; + for (; prune == 0 && k > i+1; k--) { + SPLIT_ADD(str, k+1, k+1); + } + if (i < 0) + break; + j = i; + i--; while (i >= 0 && !STRINGLIB_ISSPACE(str[i])) i--; #ifndef STRINGLIB_MUTABLE @@ -226,8 +251,9 @@ STRINGLIB(rsplit_whitespace)(PyObject* str_obj, /* Skip any remaining whitespace and copy to beginning of string */ while (i >= 0 && STRINGLIB_ISSPACE(str[i])) i--; - if (i >= 0) + if (i >= 0) { SPLIT_ADD(str, 0, i + 1); + } } FIX_PREALLOC_SIZE(list); if (PyList_Reverse(list) < 0) @@ -243,11 +269,12 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(rsplit_char)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR ch, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { Py_ssize_t i, j, count=0; PyObject *list = PyList_New(PREALLOC_SIZE(maxcount)); PyObject *sub; + int pruned = 0; if (list == NULL) return NULL; @@ -256,21 +283,25 @@ STRINGLIB(rsplit_char)(PyObject* str_obj, while ((i >= 0) && (maxcount-- > 0)) { for(; i >= 0; i--) { if (str[i] == ch) { - SPLIT_ADD(str, i + 1, j + 1); + if (prune == 0 || i < j) { + SPLIT_ADD(str, i + 1, j + 1); + } else { + pruned = 1; + } j = i = 
i - 1; break; } } } #ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + if (count == 0 && pruned == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { /* ch not in str_obj, so just use str_obj as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); count++; } else #endif - if (j >= -1) { + if (j > -1 || (prune == 0 && j == -1)) { SPLIT_ADD(str, 0, j + 1); } FIX_PREALLOC_SIZE(list); @@ -287,17 +318,18 @@ Py_LOCAL_INLINE(PyObject *) STRINGLIB(rsplit)(PyObject* str_obj, const STRINGLIB_CHAR* str, Py_ssize_t str_len, const STRINGLIB_CHAR* sep, Py_ssize_t sep_len, - Py_ssize_t maxcount) + Py_ssize_t maxcount, int prune) { Py_ssize_t j, pos, count=0; PyObject *list, *sub; + int pruned = 0; if (sep_len == 0) { PyErr_SetString(PyExc_ValueError, "empty separator"); return NULL; } else if (sep_len == 1) - return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount); + return STRINGLIB(rsplit_char)(str_obj, str, str_len, sep[0], maxcount, prune); list = PyList_New(PREALLOC_SIZE(maxcount)); if (list == NULL) @@ -308,11 +340,17 @@ STRINGLIB(rsplit)(PyObject* str_obj, pos = FASTSEARCH(str, j, sep, sep_len, -1, FAST_RSEARCH); if (pos < 0) break; + if (prune && pos == j - sep_len) { /* Separator ends at j: empty string; ignore */ + j = pos; + pruned = 1; + maxcount++; /* Don't count pruned strings in the max count */ + continue; + } SPLIT_ADD(str, pos + sep_len, j); j = pos; } #ifndef STRINGLIB_MUTABLE - if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { + if (count == 0 && pruned == 0 && STRINGLIB_CHECK_EXACT(str_obj)) { /* No match in str_obj, so just use it as list[0] */ Py_INCREF(str_obj); PyList_SET_ITEM(list, 0, (PyObject *)str_obj); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 7f58129..d003b14 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -10204,7 +10204,8 @@ PyUnicode_Splitlines(PyObject *string, int keepends) static PyObject * split(PyObject *self, PyObject *substring, - Py_ssize_t maxcount) + Py_ssize_t maxcount, + int prune) { int kind1, kind2; void *buf1, *buf2; @@ -10223,22 
+10224,22 @@ split(PyObject *self, if (PyUnicode_IS_ASCII(self)) return asciilib_split_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); else return ucs1lib_split_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_2BYTE_KIND: return ucs2lib_split_whitespace( self, PyUnicode_2BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_4BYTE_KIND: return ucs4lib_split_whitespace( self, PyUnicode_4BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); default: assert(0); @@ -10272,18 +10273,18 @@ split(PyObject *self, case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) out = asciilib_split( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); else out = ucs1lib_split( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_2BYTE_KIND: out = ucs2lib_split( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_4BYTE_KIND: out = ucs4lib_split( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); break; default: out = NULL; @@ -10296,7 +10297,8 @@ split(PyObject *self, static PyObject * rsplit(PyObject *self, PyObject *substring, - Py_ssize_t maxcount) + Py_ssize_t maxcount, + int prune) { int kind1, kind2; void *buf1, *buf2; @@ -10315,22 +10317,22 @@ rsplit(PyObject *self, if (PyUnicode_IS_ASCII(self)) return asciilib_rsplit_whitespace( self, PyUnicode_1BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); else return ucs1lib_rsplit_whitespace( self, PyUnicode_1BYTE_DATA(self), - 
PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_2BYTE_KIND: return ucs2lib_rsplit_whitespace( self, PyUnicode_2BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); case PyUnicode_4BYTE_KIND: return ucs4lib_rsplit_whitespace( self, PyUnicode_4BYTE_DATA(self), - PyUnicode_GET_LENGTH(self), maxcount + PyUnicode_GET_LENGTH(self), maxcount, prune ); default: assert(0); @@ -10364,18 +10366,18 @@ rsplit(PyObject *self, case PyUnicode_1BYTE_KIND: if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring)) out = asciilib_rsplit( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); else out = ucs1lib_rsplit( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_2BYTE_KIND: out = ucs2lib_rsplit( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); break; case PyUnicode_4BYTE_KIND: out = ucs4lib_rsplit( - self, buf1, len1, buf2, len2, maxcount); + self, buf1, len1, buf2, len2, maxcount, prune); break; default: out = NULL; @@ -12777,39 +12779,54 @@ unicode_rjust(PyObject *self, PyObject *args) } PyObject * -PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) +PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int prune) { if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - return split(s, sep, maxsplit); + return split(s, sep, maxsplit, prune); } PyDoc_STRVAR(split__doc__, - "S.split(sep=None, maxsplit=-1) -> list of strings\n\ + "S.split(sep=None, maxsplit=-1, *, prune=None) -> list of strings\n\ \n\ Return a list of the words in S, using sep as the\n\ delimiter string. If maxsplit is given, at most maxsplit\n\ splits are done. 
If sep is not specified or is None, any\n\ -whitespace string is a separator and empty strings are\n\ -removed from the result."); +whitespace string is a separator. If prune is given and True,\n\ +empty strings are removed from the result. If it is not given\n\ +or None, the default behaviour is used: it is set to True if\n\ +sep is None, False otherwise."); static PyObject* unicode_split(PyObject *self, PyObject *args, PyObject *kwds) { - static char *kwlist[] = {"sep", "maxsplit", 0}; + static char *kwlist[] = {"sep", "maxsplit", "prune", 0}; PyObject *substring = Py_None; Py_ssize_t maxcount = -1; + PyObject *prune_obj = Py_None; + int prune; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split", - kwlist, &substring, &maxcount)) + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OnO:split", kwlist, + &substring, &maxcount, &prune_obj)) return NULL; + if (prune_obj == Py_None) { + if (substring == Py_None) + prune = 1; + else + prune = 0; + } else { + prune = PyObject_IsTrue(prune_obj); + if (prune < 0) + return NULL; + } + if (substring == Py_None) - return split(self, NULL, maxcount); + return split(self, NULL, maxcount, prune); if (PyUnicode_Check(substring)) - return split(self, substring, maxcount); + return split(self, substring, maxcount, prune); PyErr_Format(PyExc_TypeError, "must be str or None, not %.100s", @@ -12959,39 +12976,54 @@ unicode_rpartition(PyObject *self, PyObject *separator) } PyObject * -PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit) +PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit, int prune) { if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0)) return NULL; - return rsplit(s, sep, maxsplit); + return rsplit(s, sep, maxsplit, prune); } PyDoc_STRVAR(rsplit__doc__, - "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\ + "S.rsplit(sep=None, maxsplit=-1, *, prune=None) -> list of strings\n\ \n\ Return a list of the words in S, using sep as the\n\ delimiter string, starting at the 
end of the string and\n\ working to the front. If maxsplit is given, at most maxsplit\n\ splits are done. If sep is not specified, any whitespace string\n\ -is a separator."); +is a separator. If prune is given and True, empty strings are\n\ +removed from the result. If it is not given or None, the default\n\ +behaviour is used: it is set to True if sep is None, False otherwise."); static PyObject* unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds) { - static char *kwlist[] = {"sep", "maxsplit", 0}; + static char *kwlist[] = {"sep", "maxsplit", "prune", 0}; PyObject *substring = Py_None; Py_ssize_t maxcount = -1; + PyObject *prune_obj = Py_None; + int prune; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit", - kwlist, &substring, &maxcount)) + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OnO:rsplit", kwlist, + &substring, &maxcount, &prune_obj)) return NULL; + if (prune_obj == Py_None) { + if (substring == Py_None) + prune = 1; + else + prune = 0; + } else { + prune = PyObject_IsTrue(prune_obj); + if (prune < 0) + return NULL; + } + if (substring == Py_None) - return rsplit(self, NULL, maxcount); + return rsplit(self, NULL, maxcount, prune); if (PyUnicode_Check(substring)) - return rsplit(self, substring, maxcount); + return rsplit(self, substring, maxcount, prune); PyErr_Format(PyExc_TypeError, "must be str or None, not %.100s",