diff -r 2abd48a47f3b Include/unicodeobject.h --- a/Include/unicodeobject.h Wed Oct 12 00:54:35 2011 +0200 +++ b/Include/unicodeobject.h Tue Oct 11 20:36:53 2011 -0400 @@ -1812,15 +1812,6 @@ PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); -#ifndef Py_LIMITED_API -/* Externally visible for str.strip(unicode) */ -PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( - PyUnicodeObject *self, - int striptype, - PyObject *sepobj - ); -#endif - /* Using the current locale, insert the thousands grouping into the string pointed to by buffer. For the argument descriptions, see Objects/stringlib/localeutil.h */ diff -r 2abd48a47f3b Lib/test/test_bytes.py --- a/Lib/test/test_bytes.py Wed Oct 12 00:54:35 2011 +0200 +++ b/Lib/test/test_bytes.py Tue Oct 11 20:36:53 2011 -0400 @@ -978,6 +978,16 @@ self.assertRaises(BufferError, delslice) self.assertEqual(b, orig) + def test_strip_identity(self): + # make sure strip returns a new object even when no + # modification is done as bytearray is a mutable type. + b = bytearray(b'asdf') + sep = b' \t' + strips = (b.strip, b.lstrip, b.rstrip) + for fn in strips: + self.assertIsNot(fn(), b) + self.assertIsNot(fn(sep), b) + class AssortedBytesTest(unittest.TestCase): # diff -r 2abd48a47f3b Makefile.pre.in --- a/Makefile.pre.in Wed Oct 12 00:54:35 2011 +0200 +++ b/Makefile.pre.in Tue Oct 11 20:36:53 2011 -0400 @@ -630,6 +630,7 @@ $(srcdir)/Objects/stringlib/partition.h \ $(srcdir)/Objects/stringlib/split.h \ $(srcdir)/Objects/stringlib/stringdefs.h \ + $(srcdir)/Objects/stringlib/strip.h \ $(srcdir)/Objects/stringlib/transmogrify.h \ $(srcdir)/Objects/stringlib/unicodedefs.h \ $(srcdir)/Objects/stringlib/localeutil.h diff -r 2abd48a47f3b Objects/bytearrayobject.c --- a/Objects/bytearrayobject.c Wed Oct 12 00:54:35 2011 +0200 +++ b/Objects/bytearrayobject.c Tue Oct 11 20:36:53 2011 -0400 @@ -1039,6 +1039,7 @@ #define STRINGLIB_ISSPACE Py_ISSPACE #define STRINGLIB_ISLINEBREAK(x) ((x == '\n') || (x == '\r')) #define STRINGLIB_CHECK_EXACT 
PyByteArray_CheckExact +#define STRINGLIB_OBJECT PyByteArrayObject #define STRINGLIB_MUTABLE 1 #include "stringlib/fastsearch.h" @@ -1046,6 +1047,7 @@ #include "stringlib/find.h" #include "stringlib/partition.h" #include "stringlib/split.h" +#include "stringlib/strip.h" #include "stringlib/ctype.h" #include "stringlib/transmogrify.h" @@ -2361,132 +2363,6 @@ Py_RETURN_NONE; } -/* XXX These two helpers could be optimized if argsize == 1 */ - -static Py_ssize_t -lstrip_helper(unsigned char *myptr, Py_ssize_t mysize, - void *argptr, Py_ssize_t argsize) -{ - Py_ssize_t i = 0; - while (i < mysize && memchr(argptr, myptr[i], argsize)) - i++; - return i; -} - -static Py_ssize_t -rstrip_helper(unsigned char *myptr, Py_ssize_t mysize, - void *argptr, Py_ssize_t argsize) -{ - Py_ssize_t i = mysize - 1; - while (i >= 0 && memchr(argptr, myptr[i], argsize)) - i--; - return i + 1; -} - -PyDoc_STRVAR(strip__doc__, -"B.strip([bytes]) -> bytearray\n\ -\n\ -Strip leading and trailing bytes contained in the argument\n\ -and return the result as a new bytearray.\n\ -If the argument is omitted, strip ASCII whitespace."); -static PyObject * -bytearray_strip(PyByteArrayObject *self, PyObject *args) -{ - Py_ssize_t left, right, mysize, argsize; - void *myptr, *argptr; - PyObject *arg = Py_None; - Py_buffer varg; - if (!PyArg_ParseTuple(args, "|O:strip", &arg)) - return NULL; - if (arg == Py_None) { - argptr = "\t\n\r\f\v "; - argsize = 6; - } - else { - if (_getbuffer(arg, &varg) < 0) - return NULL; - argptr = varg.buf; - argsize = varg.len; - } - myptr = self->ob_bytes; - mysize = Py_SIZE(self); - left = lstrip_helper(myptr, mysize, argptr, argsize); - if (left == mysize) - right = left; - else - right = rstrip_helper(myptr, mysize, argptr, argsize); - if (arg != Py_None) - PyBuffer_Release(&varg); - return PyByteArray_FromStringAndSize(self->ob_bytes + left, right - left); -} - -PyDoc_STRVAR(lstrip__doc__, -"B.lstrip([bytes]) -> bytearray\n\ -\n\ -Strip leading bytes contained in the 
argument\n\ -and return the result as a new bytearray.\n\ -If the argument is omitted, strip leading ASCII whitespace."); -static PyObject * -bytearray_lstrip(PyByteArrayObject *self, PyObject *args) -{ - Py_ssize_t left, right, mysize, argsize; - void *myptr, *argptr; - PyObject *arg = Py_None; - Py_buffer varg; - if (!PyArg_ParseTuple(args, "|O:lstrip", &arg)) - return NULL; - if (arg == Py_None) { - argptr = "\t\n\r\f\v "; - argsize = 6; - } - else { - if (_getbuffer(arg, &varg) < 0) - return NULL; - argptr = varg.buf; - argsize = varg.len; - } - myptr = self->ob_bytes; - mysize = Py_SIZE(self); - left = lstrip_helper(myptr, mysize, argptr, argsize); - right = mysize; - if (arg != Py_None) - PyBuffer_Release(&varg); - return PyByteArray_FromStringAndSize(self->ob_bytes + left, right - left); -} - -PyDoc_STRVAR(rstrip__doc__, -"B.rstrip([bytes]) -> bytearray\n\ -\n\ -Strip trailing bytes contained in the argument\n\ -and return the result as a new bytearray.\n\ -If the argument is omitted, strip trailing ASCII whitespace."); -static PyObject * -bytearray_rstrip(PyByteArrayObject *self, PyObject *args) -{ - Py_ssize_t right, mysize, argsize; - void *myptr, *argptr; - PyObject *arg = Py_None; - Py_buffer varg; - if (!PyArg_ParseTuple(args, "|O:rstrip", &arg)) - return NULL; - if (arg == Py_None) { - argptr = "\t\n\r\f\v "; - argsize = 6; - } - else { - if (_getbuffer(arg, &varg) < 0) - return NULL; - argptr = varg.buf; - argsize = varg.len; - } - myptr = self->ob_bytes; - mysize = Py_SIZE(self); - right = rstrip_helper(myptr, mysize, argptr, argsize); - if (arg != Py_None) - PyBuffer_Release(&varg); - return PyByteArray_FromStringAndSize(self->ob_bytes, right); -} - PyDoc_STRVAR(decode_doc, "B.decode(encoding='utf-8', errors='strict') -> str\n\ \n\ @@ -2793,7 +2669,7 @@ {"join", (PyCFunction)bytearray_join, METH_O, join_doc}, {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, ljust__doc__}, {"lower", (PyCFunction)stringlib_lower, METH_NOARGS, _Py_lower__doc__}, 
- {"lstrip", (PyCFunction)bytearray_lstrip, METH_VARARGS, lstrip__doc__}, + {"lstrip", (PyCFunction)stringlib_lstrip, METH_VARARGS, lstrip__doc__}, {"maketrans", (PyCFunction)bytearray_maketrans, METH_VARARGS|METH_STATIC, _Py_maketrans__doc__}, {"partition", (PyCFunction)bytearray_partition, METH_O, partition__doc__}, @@ -2806,13 +2682,13 @@ {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, rjust__doc__}, {"rpartition", (PyCFunction)bytearray_rpartition, METH_O, rpartition__doc__}, {"rsplit", (PyCFunction)bytearray_rsplit, METH_VARARGS, rsplit__doc__}, - {"rstrip", (PyCFunction)bytearray_rstrip, METH_VARARGS, rstrip__doc__}, + {"rstrip", (PyCFunction)stringlib_rstrip, METH_VARARGS, rstrip__doc__}, {"split", (PyCFunction)bytearray_split, METH_VARARGS, split__doc__}, {"splitlines", (PyCFunction)bytearray_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, {"startswith", (PyCFunction)bytearray_startswith, METH_VARARGS , startswith__doc__}, - {"strip", (PyCFunction)bytearray_strip, METH_VARARGS, strip__doc__}, + {"strip", (PyCFunction)stringlib_strip, METH_VARARGS, strip__doc__}, {"swapcase", (PyCFunction)stringlib_swapcase, METH_NOARGS, _Py_swapcase__doc__}, {"title", (PyCFunction)stringlib_title, METH_NOARGS, _Py_title__doc__}, diff -r 2abd48a47f3b Objects/bytesobject.c --- a/Objects/bytesobject.c Wed Oct 12 00:54:35 2011 +0200 +++ b/Objects/bytesobject.c Tue Oct 11 20:36:53 2011 -0400 @@ -557,6 +557,7 @@ #include "stringlib/find.h" #include "stringlib/partition.h" #include "stringlib/split.h" +#include "stringlib/strip.h" #include "stringlib/ctype.h" #include "stringlib/transmogrify.h" @@ -974,15 +975,6 @@ }; -#define LEFTSTRIP 0 -#define RIGHTSTRIP 1 -#define BOTHSTRIP 2 - -/* Arrays indexed by above */ -static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; - -#define STRIPNAME(i) (stripformat[i]+3) - PyDoc_STRVAR(split__doc__, "B.split([sep[, maxsplit]]) -> list of bytes\n\ \n\ @@ -1337,137 +1329,6 @@ } -Py_LOCAL_INLINE(PyObject 
*) -do_xstrip(PyBytesObject *self, int striptype, PyObject *sepobj) -{ - Py_buffer vsep; - char *s = PyBytes_AS_STRING(self); - Py_ssize_t len = PyBytes_GET_SIZE(self); - char *sep; - Py_ssize_t seplen; - Py_ssize_t i, j; - - if (_getbuffer(sepobj, &vsep) < 0) - return NULL; - sep = vsep.buf; - seplen = vsep.len; - - i = 0; - if (striptype != RIGHTSTRIP) { - while (i < len && memchr(sep, Py_CHARMASK(s[i]), seplen)) { - i++; - } - } - - j = len; - if (striptype != LEFTSTRIP) { - do { - j--; - } while (j >= i && memchr(sep, Py_CHARMASK(s[j]), seplen)); - j++; - } - - PyBuffer_Release(&vsep); - - if (i == 0 && j == len && PyBytes_CheckExact(self)) { - Py_INCREF(self); - return (PyObject*)self; - } - else - return PyBytes_FromStringAndSize(s+i, j-i); -} - - -Py_LOCAL_INLINE(PyObject *) -do_strip(PyBytesObject *self, int striptype) -{ - char *s = PyBytes_AS_STRING(self); - Py_ssize_t len = PyBytes_GET_SIZE(self), i, j; - - i = 0; - if (striptype != RIGHTSTRIP) { - while (i < len && Py_ISSPACE(s[i])) { - i++; - } - } - - j = len; - if (striptype != LEFTSTRIP) { - do { - j--; - } while (j >= i && Py_ISSPACE(s[j])); - j++; - } - - if (i == 0 && j == len && PyBytes_CheckExact(self)) { - Py_INCREF(self); - return (PyObject*)self; - } - else - return PyBytes_FromStringAndSize(s+i, j-i); -} - - -Py_LOCAL_INLINE(PyObject *) -do_argstrip(PyBytesObject *self, int striptype, PyObject *args) -{ - PyObject *sep = NULL; - - if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) - return NULL; - - if (sep != NULL && sep != Py_None) { - return do_xstrip(self, striptype, sep); - } - return do_strip(self, striptype); -} - - -PyDoc_STRVAR(strip__doc__, -"B.strip([bytes]) -> bytes\n\ -\n\ -Strip leading and trailing bytes contained in the argument.\n\ -If the argument is omitted, strip trailing ASCII whitespace."); -static PyObject * -bytes_strip(PyBytesObject *self, PyObject *args) -{ - if (PyTuple_GET_SIZE(args) == 0) - return do_strip(self, BOTHSTRIP); /* Common case */ - 
else - return do_argstrip(self, BOTHSTRIP, args); -} - - -PyDoc_STRVAR(lstrip__doc__, -"B.lstrip([bytes]) -> bytes\n\ -\n\ -Strip leading bytes contained in the argument.\n\ -If the argument is omitted, strip leading ASCII whitespace."); -static PyObject * -bytes_lstrip(PyBytesObject *self, PyObject *args) -{ - if (PyTuple_GET_SIZE(args) == 0) - return do_strip(self, LEFTSTRIP); /* Common case */ - else - return do_argstrip(self, LEFTSTRIP, args); -} - - -PyDoc_STRVAR(rstrip__doc__, -"B.rstrip([bytes]) -> bytes\n\ -\n\ -Strip trailing bytes contained in the argument.\n\ -If the argument is omitted, strip trailing ASCII whitespace."); -static PyObject * -bytes_rstrip(PyBytesObject *self, PyObject *args) -{ - if (PyTuple_GET_SIZE(args) == 0) - return do_strip(self, RIGHTSTRIP); /* Common case */ - else - return do_argstrip(self, RIGHTSTRIP, args); -} - - PyDoc_STRVAR(count__doc__, "B.count(sub[, start[, end]]) -> int\n\ \n\ @@ -2446,7 +2307,7 @@ {"join", (PyCFunction)bytes_join, METH_O, join__doc__}, {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, ljust__doc__}, {"lower", (PyCFunction)stringlib_lower, METH_NOARGS, _Py_lower__doc__}, - {"lstrip", (PyCFunction)bytes_lstrip, METH_VARARGS, lstrip__doc__}, + {"lstrip", (PyCFunction)stringlib_lstrip, METH_VARARGS, lstrip__doc__}, {"maketrans", (PyCFunction)bytes_maketrans, METH_VARARGS|METH_STATIC, _Py_maketrans__doc__}, {"partition", (PyCFunction)bytes_partition, METH_O, partition__doc__}, @@ -2457,13 +2318,13 @@ {"rpartition", (PyCFunction)bytes_rpartition, METH_O, rpartition__doc__}, {"rsplit", (PyCFunction)bytes_rsplit, METH_VARARGS, rsplit__doc__}, - {"rstrip", (PyCFunction)bytes_rstrip, METH_VARARGS, rstrip__doc__}, + {"rstrip", (PyCFunction)stringlib_rstrip, METH_VARARGS, rstrip__doc__}, {"split", (PyCFunction)bytes_split, METH_VARARGS, split__doc__}, {"splitlines", (PyCFunction)bytes_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__}, {"startswith", (PyCFunction)bytes_startswith, METH_VARARGS, 
startswith__doc__}, - {"strip", (PyCFunction)bytes_strip, METH_VARARGS, strip__doc__}, + {"strip", (PyCFunction)stringlib_strip, METH_VARARGS, strip__doc__}, {"swapcase", (PyCFunction)stringlib_swapcase, METH_NOARGS, _Py_swapcase__doc__}, {"title", (PyCFunction)stringlib_title, METH_NOARGS, _Py_title__doc__}, diff -r 2abd48a47f3b Objects/stringlib/asciilib.h --- a/Objects/stringlib/asciilib.h Wed Oct 12 00:54:35 2011 +0200 +++ b/Objects/stringlib/asciilib.h Tue Oct 11 20:36:53 2011 -0400 @@ -12,7 +12,7 @@ #define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_EMPTY unicode_empty #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE -#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK +#define STRINGLIB_ISLINEBREAK STRINGLIB_BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL #define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL #define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER diff -r 2abd48a47f3b Objects/stringlib/fastsearch.h --- a/Objects/stringlib/fastsearch.h Wed Oct 12 00:54:35 2011 +0200 +++ b/Objects/stringlib/fastsearch.h Tue Oct 11 20:36:53 2011 -0400 @@ -27,11 +27,42 @@ #error "LONG_BIT is smaller than 32" #endif +/* --- Bloom Filters ----------------------------------------------------- */ + +/* stuff to implement simple "bloom filters" for Unicode characters. + to keep things simple, we use a single bitmask, using the least 5 + bits from each unicode characters as the bit index. */ + +/* the linebreak mask is set up by Unicode_Init */ + +#define STRINGLIB_BLOOM_MASK unsigned long + #define STRINGLIB_BLOOM_ADD(mask, ch) \ ((mask |= (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) + #define STRINGLIB_BLOOM(mask, ch) \ ((mask & (1UL << ((ch) & (STRINGLIB_BLOOM_WIDTH -1))))) +#define STRINGLIB_BLOOM_MEMBER(mask, chr, str, len) \ + (STRINGLIB_BLOOM(mask, chr) \ + && (STRINGLIB(fastsearch_memchr_1char)(str, len, chr, chr, 1, FAST_SEARCH) >= 0)) + +#define STRINGLIB_BLOOM_LINEBREAK(ch) \ + ((ch) < 128U ? 
ascii_linebreak[(ch)] : \
+     (STRINGLIB_BLOOM(bloom_linebreak, (ch)) && \
+      Py_UNICODE_ISLINEBREAK(ch)))
+
+Py_LOCAL_INLINE(STRINGLIB_BLOOM_MASK)
+STRINGLIB(make_bloom_mask)(const STRINGLIB_CHAR *ptr, Py_ssize_t len)
+{
+    Py_ssize_t i;
+    STRINGLIB_BLOOM_MASK mask = 0;
+
+    for (i = 0; i < len; i++)
+        STRINGLIB_BLOOM_ADD(mask, ptr[i]);
+
+    return mask;
+}
 
 Py_LOCAL_INLINE(Py_ssize_t)
 STRINGLIB(fastsearch_memchr_1char)(const STRINGLIB_CHAR* s, Py_ssize_t n,
diff -r 2abd48a47f3b Objects/stringlib/strip.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Objects/stringlib/strip.h Tue Oct 11 20:36:53 2011 -0400
@@ -0,0 +1,316 @@
+
+#ifndef STRINGLIB_FASTSEARCH_H
+#error must include "stringlib/fastsearch.h" before including this module
+#endif
+
+#ifndef STRINGLIB_STRIP_H
+#define STRINGLIB_STRIP_H /* begin include only once */
+
+#define STRINGLIB_LEFTSTRIP 0
+#define STRINGLIB_RIGHTSTRIP 1
+#define STRINGLIB_BOTHSTRIP 2
+
+/* One slot per possible Py_CHARMASK() value (0..255). */
+#define STRINGLIB_STRIP_CHARMASK_SIZE 256
+
+/* PyArg_ParseTuple formats, indexed by the direction constants above. */
+static const char *_STRINGLIB_stripformat[] = {
+    "|O:lstrip", "|O:rstrip", "|O:strip"
+};
+
+#define STRINGLIB_STRIP_ARGFORMAT(i) ((char *)(_STRINGLIB_stripformat[i]))
+/* Skip the leading "|O:" to get the bare method name for error messages. */
+#define STRINGLIB_STRIP_ARGNAME(i) ((char *)(_STRINGLIB_stripformat[i] + 3))
+
+/* shared docstrings (NOTE: the wording is str-centric even though the
+   bytes and bytearray method tables reuse these strings) */
+
+PyDoc_STRVAR(strip__doc__,
+"S.strip([chars]) -> str\n\
+\n\
+Return a copy of the string S with leading and trailing\n\
+whitespace removed.\n\
+If chars is given and not None, remove characters in chars instead.");
+
+
+PyDoc_STRVAR(lstrip__doc__,
+"S.lstrip([chars]) -> str\n\
+\n\
+Return a copy of the string S with leading whitespace removed.\n\
+If chars is given and not None, remove characters in chars instead.");
+
+
+PyDoc_STRVAR(rstrip__doc__,
+"S.rstrip([chars]) -> str\n\
+\n\
+Return a copy of the string S with trailing whitespace removed.\n\
+If chars is given and not None, remove characters in chars instead.");
+
+
+/* Fill `charmask` (STRINGLIB_STRIP_CHARMASK_SIZE entries) so that entry c
+   is 1 iff byte value c occurs in the `len` bytes starting at `str`. */
+Py_LOCAL_INLINE(void)
+make_char_mask(char *charmask, const char *str, Py_ssize_t len)
+{
+    const char *end;
+
+    memset(charmask, 0, STRINGLIB_STRIP_CHARMASK_SIZE);
+
+    for (end = str + len; str < end; str++) {
+        charmask[Py_CHARMASK(*str)] = 1;
+    }
+}
+#endif /* end of include only once */
+
+#if !STRINGLIB_IS_UNICODE
+/* forward declaration */
+Py_LOCAL_INLINE(PyObject *)
+STRINGLIB(argstrip)(STRINGLIB_OBJECT *self, PyObject *args, int direction);
+
+/* METH_VARARGS entry points referenced by the bytes and bytearray method
+   tables; each one only selects the strip direction. */
+static PyObject *
+stringlib_strip(STRINGLIB_OBJECT *self, PyObject *args)
+{
+    return STRINGLIB(argstrip)(self, args, STRINGLIB_BOTHSTRIP);
+}
+
+static PyObject *
+stringlib_lstrip(STRINGLIB_OBJECT *self, PyObject *args)
+{
+    return STRINGLIB(argstrip)(self, args, STRINGLIB_LEFTSTRIP);
+}
+
+static PyObject *
+stringlib_rstrip(STRINGLIB_OBJECT *self, PyObject *args)
+{
+    return STRINGLIB(argstrip)(self, args, STRINGLIB_RIGHTSTRIP);
+}
+#endif
+
+/**
+ * Strips whitespace characters from start and/or end
+ * based on `direction` using predefined whitespace characters.
+ *
+ * Returns the new offset from the beginning of the string and
+ * updates value pointed to by `len` pointer.
+ */
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(rawstrip)(const STRINGLIB_CHAR *buf, Py_ssize_t *len, int direction)
+{
+    /* declarations precede the asserts to stay valid C89 */
+    Py_ssize_t i, j;
+
+    assert(buf != NULL);
+    assert(len != NULL && *len >= 0);
+    assert(direction == STRINGLIB_LEFTSTRIP ||
+           direction == STRINGLIB_RIGHTSTRIP ||
+           direction == STRINGLIB_BOTHSTRIP);
+
+    if (*len == 0)
+        return 0;
+
+    i = 0; j = *len;
+    if (direction != STRINGLIB_RIGHTSTRIP) {
+        while (i < j && STRINGLIB_ISSPACE(buf[i])) {
+            i++;
+        }
+    }
+    /* i == j means everything was stripped from the left already */
+    if (direction != STRINGLIB_LEFTSTRIP && i != j) {
+        do {
+            j--;
+        } while (j >= i && STRINGLIB_ISSPACE(buf[j]));
+        j++;
+    }
+    *len = j - i;
+    return i;
+}
+
+/**
+ * Strips characters from start and/or end based on `direction` using
+ * caller supplied characters in `sep`.  An empty `sep` strips nothing.
+ *
+ * Returns the new offset from the beginning of the string and
+ * updates value pointed to by `len` pointer.
+ */
+Py_LOCAL_INLINE(Py_ssize_t)
+STRINGLIB(xstrip)(const STRINGLIB_CHAR *buf, Py_ssize_t *len,
+                  const STRINGLIB_CHAR *sep, Py_ssize_t seplen,
+                  int direction)
+{
+    Py_ssize_t i, j;
+#if STRINGLIB_IS_UNICODE && STRINGLIB_SIZEOF_CHAR != 1
+    STRINGLIB_BLOOM_MASK bloom_mask;
+#else
+    char charmask[STRINGLIB_STRIP_CHARMASK_SIZE];
+#endif
+
+    assert(buf != NULL);
+    assert(len != NULL && *len >= 0);
+    assert(sep != NULL || seplen == 0);
+    assert(seplen >= 0);
+    assert(direction == STRINGLIB_LEFTSTRIP ||
+           direction == STRINGLIB_RIGHTSTRIP ||
+           direction == STRINGLIB_BOTHSTRIP);
+
+    /* An empty separator set matches no character: leave *len untouched
+       instead of falling back to whitespace stripping. */
+    if (*len == 0 || seplen == 0)
+        return 0;
+
+    /* use bloom mask for ucs2 and ucs4 */
+#if STRINGLIB_IS_UNICODE && STRINGLIB_SIZEOF_CHAR != 1
+    bloom_mask = STRINGLIB(make_bloom_mask)(sep, seplen);
+    #define STRIPCOND(chr) (STRINGLIB_BLOOM_MEMBER(bloom_mask, (chr), \
+                                                   sep, seplen))
+#else
+    make_char_mask(charmask, (const char *)sep, seplen);
+    #define STRIPCOND(chr) (charmask[Py_CHARMASK(chr)] == 1)
+#endif
+
+    i = 0; j = *len;
+    if (direction != STRINGLIB_RIGHTSTRIP) {
+        while (i < j && STRIPCOND(buf[i])) {
+            i++;
+        }
+    }
+    if (direction != STRINGLIB_LEFTSTRIP && i != j) {
+        do {
+            j--;
+        } while (j >= i && STRIPCOND(buf[j]));
+        j++;
+    }
+    *len = j - i;
+    return i;
+#undef STRIPCOND
+}
+
+
+/*
+ * A generic helper for objects with strip() API. This will handle
+ * parsing the argument tuple and call the correct implementation of
+ * lower-level strip implementation, either rawstrip() or xstrip().
+ *
+ * Returns the stripped object, or NULL on error.
+ */
+Py_LOCAL_INLINE(PyObject *)
+STRINGLIB(argstrip)(STRINGLIB_OBJECT *self, PyObject *args, int direction)
+{
+    PyObject *sepobj = NULL;
+    Py_ssize_t len, offset = 0;
+    STRINGLIB_CHAR *buf;
+    const char *format;
+
+#if STRINGLIB_IS_UNICODE
+    Py_ssize_t seplen;
+    STRINGLIB_CHAR *sep;
+    unsigned int kind, sepkind;
+
+    if (PyUnicode_READY(self) < 0)
+        return NULL;
+#else
+    Py_buffer vsep;
+#endif
+
+    buf = STRINGLIB_STR(self);
+    len = STRINGLIB_LEN(self);
+
+    /* no argument at all: strip whitespace without parsing anything */
+    if (args == NULL || PyTuple_GET_SIZE(args) == 0) {
+        offset = STRINGLIB(rawstrip)(buf, &len, direction);
+        goto end;
+    }
+    format = STRINGLIB_STRIP_ARGFORMAT(direction);
+    if (!PyArg_ParseTuple(args, format, &sepobj)) {
+        return NULL;
+    }
+    if (sepobj == NULL || sepobj == Py_None) {
+        offset = STRINGLIB(rawstrip)(buf, &len, direction);
+        goto end;
+    }
+
+#if STRINGLIB_IS_UNICODE
+    /* Check the type before PyUnicode_READY(), which is only meaningful
+       for str objects; str subclasses are accepted, matching the
+       PyUnicode_Check() test in the removed do_argstrip(). */
+    if (!PyUnicode_Check(sepobj)) {
+        PyErr_Format(PyExc_TypeError,
+                     "%s arg must be None or str",
+                     STRINGLIB_STRIP_ARGNAME(direction));
+        return NULL;
+    }
+    if (PyUnicode_READY(sepobj) < 0)
+        return NULL;
+
+    kind = PyUnicode_KIND(self);
+    sepkind = PyUnicode_KIND(sepobj);
+    seplen = STRINGLIB_LEN(sepobj);
+    if (sepkind == kind) {
+        sep = STRINGLIB_STR(sepobj);
+        offset = STRINGLIB(xstrip)(buf, &len, sep, seplen, direction);
+    }
+    else if (sepkind > kind) {
+        Py_ssize_t i, j;
+        Py_UCS4 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
+        void *sepdata = PyUnicode_DATA(sepobj);
+
+        /* Build a separator buffer of the same kind as `self`, dropping
+         * code points above `maxchar` since they *cannot* match.
+         *
+         * XXX: maybe this should all be in a separate routine.
+         */
+        sep = (STRINGLIB_CHAR *)PyMem_Malloc(seplen * sizeof(STRINGLIB_CHAR));
+        if (sep == NULL) {
+            /* PyMem_Malloc() does not set an exception on failure */
+            PyErr_NoMemory();
+            return NULL;
+        }
+        for (i = 0, j = 0; j < seplen; j++) {
+            Py_UCS4 c = PyUnicode_READ(sepkind, sepdata, j);
+            if (c <= maxchar) {
+                PyUnicode_WRITE(kind, sep, i, c);
+                i++;
+            }
+        }
+        seplen = i;
+        offset = STRINGLIB(xstrip)(buf, &len, sep, seplen, direction);
+        PyMem_Free((void *)sep);
+    }
+    else { /* sepkind < kind */
+        sep = _PyUnicode_AsKind(sepobj, kind);
+        if (sep == NULL)
+            return NULL;
+        offset = STRINGLIB(xstrip)(buf, &len, sep, seplen, direction);
+        PyMem_Free((void *)sep);
+    }
+#else
+    if (PyObject_GetBuffer(sepobj, &vsep, PyBUF_SIMPLE) < 0) {
+        PyErr_Clear();
+        PyErr_Format(PyExc_TypeError,
+                     "Type %.100s doesn't support the buffer API",
+                     Py_TYPE(sepobj)->tp_name);
+        return NULL;
+    }
+    offset = STRINGLIB(xstrip)(buf, &len, vsep.buf, vsep.len, direction);
+    PyBuffer_Release(&vsep);
+#endif
+
+end:
+#if !STRINGLIB_MUTABLE
+    /* immutable exact types may hand back `self` when nothing was stripped */
+    if (offset == 0 && len == STRINGLIB_LEN(self) &&
+        STRINGLIB_CHECK_EXACT(self)) {
+        Py_INCREF(self);
+        return (PyObject *)self;
+    }
+#endif
+
+#if STRINGLIB_IS_UNICODE
+    /* XXX: maybe abstract this with STRINGLIB_SUBSTRING()? */
+    return PyUnicode_Substring((PyObject *)self, offset, offset + len);
+#else
+    return STRINGLIB_NEW(buf + offset, len);
+#endif
+}
+
diff -r 2abd48a47f3b Objects/stringlib/ucs1lib.h
--- a/Objects/stringlib/ucs1lib.h Wed Oct 12 00:54:35 2011 +0200
+++ b/Objects/stringlib/ucs1lib.h Tue Oct 11 20:36:53 2011 -0400
@@ -12,7 +12,7 @@
 #define STRINGLIB_PARSE_CODE "U"
 #define STRINGLIB_EMPTY unicode_empty
 #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
-#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
+#define STRINGLIB_ISLINEBREAK STRINGLIB_BLOOM_LINEBREAK
 #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
 #define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL
 #define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER
diff -r 2abd48a47f3b Objects/stringlib/ucs2lib.h
--- a/Objects/stringlib/ucs2lib.h Wed Oct 12 00:54:35 2011 +0200
+++ b/Objects/stringlib/ucs2lib.h Tue Oct 11 20:36:53 2011 -0400
@@ -12,7 +12,7 @@
 #define STRINGLIB_PARSE_CODE "U"
 #define STRINGLIB_EMPTY unicode_empty
 #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
-#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
+#define STRINGLIB_ISLINEBREAK STRINGLIB_BLOOM_LINEBREAK
 #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
 #define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL
 #define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER
diff -r 2abd48a47f3b Objects/stringlib/ucs4lib.h
--- a/Objects/stringlib/ucs4lib.h Wed Oct 12 00:54:35 2011 +0200
+++ b/Objects/stringlib/ucs4lib.h Tue Oct 11 20:36:53 2011 -0400
@@ -12,7 +12,7 @@
 #define STRINGLIB_PARSE_CODE "U"
 #define STRINGLIB_EMPTY unicode_empty
 #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
-#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
+#define STRINGLIB_ISLINEBREAK STRINGLIB_BLOOM_LINEBREAK
 #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
 #define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL
 #define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER
diff -r 2abd48a47f3b Objects/stringlib/unicodedefs.h
--- a/Objects/stringlib/unicodedefs.h Wed Oct 12 00:54:35 2011 +0200
+++ b/Objects/stringlib/unicodedefs.h Tue Oct 11 20:36:53
2011 -0400 @@ -15,7 +15,7 @@ #define STRINGLIB_PARSE_CODE "U" #define STRINGLIB_EMPTY unicode_empty #define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE -#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK +#define STRINGLIB_ISLINEBREAK STRINGLIB_BLOOM_LINEBREAK #define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL #define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL #define STRINGLIB_TOUPPER Py_UNICODE_TOUPPER diff -r 2abd48a47f3b Objects/unicodeobject.c --- a/Objects/unicodeobject.c Wed Oct 12 00:54:35 2011 +0200 +++ b/Objects/unicodeobject.c Tue Oct 11 20:36:53 2011 -0400 @@ -420,53 +420,7 @@ } #endif -/* --- Bloom Filters ----------------------------------------------------- */ - -/* stuff to implement simple "bloom filters" for Unicode characters. - to keep things simple, we use a single bitmask, using the least 5 - bits from each unicode characters as the bit index. */ - -/* the linebreak mask is set up by Unicode_Init below */ - -#if LONG_BIT >= 128 -#define BLOOM_WIDTH 128 -#elif LONG_BIT >= 64 -#define BLOOM_WIDTH 64 -#elif LONG_BIT >= 32 -#define BLOOM_WIDTH 32 -#else -#error "LONG_BIT is smaller than 32" -#endif - -#define BLOOM_MASK unsigned long - -static BLOOM_MASK bloom_linebreak; - -#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) -#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) - -#define BLOOM_LINEBREAK(ch) \ - ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ - (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) - -Py_LOCAL_INLINE(BLOOM_MASK) -make_bloom_mask(int kind, void* ptr, Py_ssize_t len) -{ - /* calculate simple bloom-style bitmask for a given unicode string */ - - BLOOM_MASK mask; - Py_ssize_t i; - - mask = 0; - for (i = 0; i < len; i++) - BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i)); - - return mask; -} - -#define BLOOM_MEMBER(mask, chr, str) \ - (BLOOM(mask, chr) \ - && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0)) +static unsigned long bloom_linebreak; /* --- Unicode Object ----------------------------------------------------- */ @@ -8512,6 +8466,7 @@ #include "stringlib/fastsearch.h" #include "stringlib/partition.h" #include "stringlib/split.h" +#include "stringlib/strip.h" #include "stringlib/count.h" #include "stringlib/find.h" #include "stringlib/localeutil.h" @@ -8521,6 +8476,7 @@ #include "stringlib/fastsearch.h" #include "stringlib/partition.h" #include "stringlib/split.h" +#include "stringlib/strip.h" #include "stringlib/count.h" #include "stringlib/find.h" #include "stringlib/localeutil.h" @@ -8530,6 +8486,7 @@ #include "stringlib/fastsearch.h" #include "stringlib/partition.h" #include "stringlib/split.h" +#include "stringlib/strip.h" #include "stringlib/count.h" #include "stringlib/find.h" #include "stringlib/localeutil.h" @@ -8539,6 +8496,7 @@ #include "stringlib/fastsearch.h" #include "stringlib/partition.h" #include "stringlib/split.h" +#include "stringlib/strip.h" #include "stringlib/count.h" #include "stringlib/find.h" #include "stringlib/localeutil.h" @@ -8647,7 +8605,6 @@ return -1; } - #include "stringlib/unicodedefs.h" #include "stringlib/fastsearch.h" @@ -11236,54 +11193,6 @@ return fixup(self, fixlower); } -#define LEFTSTRIP 0 -#define RIGHTSTRIP 1 -#define BOTHSTRIP 2 - -/* Arrays indexed by above */ -static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; - -#define STRIPNAME(i) (stripformat[i]+3) - -/* 
externally visible for str.strip(unicode) */ -PyObject * -_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) -{ - void *data; - int kind; - Py_ssize_t i, j, len; - BLOOM_MASK sepmask; - - if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1) - return NULL; - - kind = PyUnicode_KIND(self); - data = PyUnicode_DATA(self); - len = PyUnicode_GET_LENGTH(self); - sepmask = make_bloom_mask(PyUnicode_KIND(sepobj), - PyUnicode_DATA(sepobj), - PyUnicode_GET_LENGTH(sepobj)); - - i = 0; - if (striptype != RIGHTSTRIP) { - while (i < len && - BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) { - i++; - } - } - - j = len; - if (striptype != LEFTSTRIP) { - do { - j--; - } while (j >= i && - BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj)); - j++; - } - - return PyUnicode_Substring((PyObject*)self, i, j); -} - PyObject* PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end) { @@ -11330,109 +11239,47 @@ } static PyObject * -do_strip(PyUnicodeObject *self, int striptype) -{ - int kind; - void *data; - Py_ssize_t len, i, j; - - if (PyUnicode_READY(self) == -1) - return NULL; - - kind = PyUnicode_KIND(self); - data = PyUnicode_DATA(self); - len = PyUnicode_GET_LENGTH(self); - - i = 0; - if (striptype != RIGHTSTRIP) { - while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) { - i++; - } - } - - j = len; - if (striptype != LEFTSTRIP) { - do { - j--; - } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j))); - j++; - } - - return PyUnicode_Substring((PyObject*)self, i, j); -} - - -static PyObject * -do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) -{ - PyObject *sep = NULL; - - if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) - return NULL; - - if (sep != NULL && sep != Py_None) { - if (PyUnicode_Check(sep)) - return _PyUnicode_XStrip(self, striptype, sep); - else { - PyErr_Format(PyExc_TypeError, - "%s arg must be None or str", - STRIPNAME(striptype)); 
- return NULL; - } - } - - return do_strip(self, striptype); -} - - -PyDoc_STRVAR(strip__doc__, - "S.strip([chars]) -> str\n\ -\n\ -Return a copy of the string S with leading and trailing\n\ -whitespace removed.\n\ -If chars is given and not None, remove characters in chars instead."); +any_strip(PyUnicodeObject *self, PyObject *args, int direction) +{ + PyObject *result = NULL; + + if (PyUnicode_READY(self) < 0) + return NULL; + + switch (PyUnicode_KIND(self)) { + case PyUnicode_1BYTE_KIND: + result = ucs1lib_argstrip(self, args, direction); + break; + case PyUnicode_2BYTE_KIND: + result = ucs2lib_argstrip(self, args, direction); + break; + case PyUnicode_4BYTE_KIND: + result = ucs4lib_argstrip(self, args, direction); + break; + default: + assert(0); + break; + } + return result; +} static PyObject * unicode_strip(PyUnicodeObject *self, PyObject *args) { - if (PyTuple_GET_SIZE(args) == 0) - return do_strip(self, BOTHSTRIP); /* Common case */ - else - return do_argstrip(self, BOTHSTRIP, args); -} - - -PyDoc_STRVAR(lstrip__doc__, - "S.lstrip([chars]) -> str\n\ -\n\ -Return a copy of the string S with leading whitespace removed.\n\ -If chars is given and not None, remove characters in chars instead."); + return any_strip(self, args, STRINGLIB_BOTHSTRIP); +} static PyObject * unicode_lstrip(PyUnicodeObject *self, PyObject *args) { - if (PyTuple_GET_SIZE(args) == 0) - return do_strip(self, LEFTSTRIP); /* Common case */ - else - return do_argstrip(self, LEFTSTRIP, args); -} - - -PyDoc_STRVAR(rstrip__doc__, - "S.rstrip([chars]) -> str\n\ -\n\ -Return a copy of the string S with trailing whitespace removed.\n\ -If chars is given and not None, remove characters in chars instead."); + return any_strip(self, args, STRINGLIB_LEFTSTRIP); +} static PyObject * unicode_rstrip(PyUnicodeObject *self, PyObject *args) { - if (PyTuple_GET_SIZE(args) == 0) - return do_strip(self, RIGHTSTRIP); /* Common case */ - else - return do_argstrip(self, RIGHTSTRIP, args); -} - + return 
any_strip(self, args, STRINGLIB_RIGHTSTRIP); +} static PyObject* unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) @@ -13475,8 +13322,7 @@ Py_FatalError("Can't initialize 'unicode'"); /* initialize the linebreak bloom filter */ - bloom_linebreak = make_bloom_mask( - PyUnicode_2BYTE_KIND, linebreak, + bloom_linebreak = ucs2lib_make_bloom_mask(linebreak, Py_ARRAY_LENGTH(linebreak)); PyType_Ready(&EncodingMapType);