diff -r 1de6619733d9 Include/unicodeobject.h --- a/Include/unicodeobject.h Thu Sep 08 19:29:07 2011 -0700 +++ b/Include/unicodeobject.h Fri Sep 09 20:51:07 2011 -0400 @@ -1517,15 +1517,6 @@ PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s); -#ifndef Py_LIMITED_API -/* Externally visible for str.strip(unicode) */ -PyAPI_FUNC(PyObject *) _PyUnicode_XStrip( - PyUnicodeObject *self, - int striptype, - PyObject *sepobj - ); -#endif - /* Using the current locale, insert the thousands grouping into the string pointed to by buffer. For the argument descriptions, see Objects/stringlib/localeutil.h */ diff -r 1de6619733d9 Lib/test/test_bytes.py --- a/Lib/test/test_bytes.py Thu Sep 08 19:29:07 2011 -0700 +++ b/Lib/test/test_bytes.py Fri Sep 09 20:51:07 2011 -0400 @@ -978,6 +978,14 @@ self.assertRaises(BufferError, delslice) self.assertEqual(b, orig) + def test_strip_identity(self): + b = bytearray(b'asdf') + sep = b' \t' + strips = (b.strip, b.lstrip, b.rstrip) + for fn in strips: + self.assertFalse(fn() is b) + self.assertFalse(fn(sep) is b) + class AssortedBytesTest(unittest.TestCase): # diff -r 1de6619733d9 Objects/bytearrayobject.c --- a/Objects/bytearrayobject.c Thu Sep 08 19:29:07 2011 -0700 +++ b/Objects/bytearrayobject.c Fri Sep 09 20:51:07 2011 -0400 @@ -1048,6 +1048,7 @@ #include "stringlib/find.h" #include "stringlib/partition.h" #include "stringlib/split.h" +#include "stringlib/strip.h" #include "stringlib/ctype.h" #include "stringlib/transmogrify.h" @@ -1149,7 +1150,7 @@ \n\ Remove all items from B."); -static PyObject * +static PyObject * bytearray_clear(PyByteArrayObject *self) { if (PyByteArray_Resize((PyObject *)self, 0) < 0) @@ -2363,132 +2364,6 @@ Py_RETURN_NONE; } -/* XXX These two helpers could be optimized if argsize == 1 */ - -static Py_ssize_t -lstrip_helper(unsigned char *myptr, Py_ssize_t mysize, - void *argptr, Py_ssize_t argsize) -{ - Py_ssize_t i = 0; - while (i < mysize && memchr(argptr, myptr[i], argsize)) - i++; - return i; -} - -static 
Py_ssize_t -rstrip_helper(unsigned char *myptr, Py_ssize_t mysize, - void *argptr, Py_ssize_t argsize) -{ - Py_ssize_t i = mysize - 1; - while (i >= 0 && memchr(argptr, myptr[i], argsize)) - i--; - return i + 1; -} - -PyDoc_STRVAR(strip__doc__, -"B.strip([bytes]) -> bytearray\n\ -\n\ -Strip leading and trailing bytes contained in the argument\n\ -and return the result as a new bytearray.\n\ -If the argument is omitted, strip ASCII whitespace."); -static PyObject * -bytearray_strip(PyByteArrayObject *self, PyObject *args) -{ - Py_ssize_t left, right, mysize, argsize; - void *myptr, *argptr; - PyObject *arg = Py_None; - Py_buffer varg; - if (!PyArg_ParseTuple(args, "|O:strip", &arg)) - return NULL; - if (arg == Py_None) { - argptr = "\t\n\r\f\v "; - argsize = 6; - } - else { - if (_getbuffer(arg, &varg) < 0) - return NULL; - argptr = varg.buf; - argsize = varg.len; - } - myptr = self->ob_bytes; - mysize = Py_SIZE(self); - left = lstrip_helper(myptr, mysize, argptr, argsize); - if (left == mysize) - right = left; - else - right = rstrip_helper(myptr, mysize, argptr, argsize); - if (arg != Py_None) - PyBuffer_Release(&varg); - return PyByteArray_FromStringAndSize(self->ob_bytes + left, right - left); -} - -PyDoc_STRVAR(lstrip__doc__, -"B.lstrip([bytes]) -> bytearray\n\ -\n\ -Strip leading bytes contained in the argument\n\ -and return the result as a new bytearray.\n\ -If the argument is omitted, strip leading ASCII whitespace."); -static PyObject * -bytearray_lstrip(PyByteArrayObject *self, PyObject *args) -{ - Py_ssize_t left, right, mysize, argsize; - void *myptr, *argptr; - PyObject *arg = Py_None; - Py_buffer varg; - if (!PyArg_ParseTuple(args, "|O:lstrip", &arg)) - return NULL; - if (arg == Py_None) { - argptr = "\t\n\r\f\v "; - argsize = 6; - } - else { - if (_getbuffer(arg, &varg) < 0) - return NULL; - argptr = varg.buf; - argsize = varg.len; - } - myptr = self->ob_bytes; - mysize = Py_SIZE(self); - left = lstrip_helper(myptr, mysize, argptr, argsize); - right 
= mysize; - if (arg != Py_None) - PyBuffer_Release(&varg); - return PyByteArray_FromStringAndSize(self->ob_bytes + left, right - left); -} - -PyDoc_STRVAR(rstrip__doc__, -"B.rstrip([bytes]) -> bytearray\n\ -\n\ -Strip trailing bytes contained in the argument\n\ -and return the result as a new bytearray.\n\ -If the argument is omitted, strip trailing ASCII whitespace."); -static PyObject * -bytearray_rstrip(PyByteArrayObject *self, PyObject *args) -{ - Py_ssize_t right, mysize, argsize; - void *myptr, *argptr; - PyObject *arg = Py_None; - Py_buffer varg; - if (!PyArg_ParseTuple(args, "|O:rstrip", &arg)) - return NULL; - if (arg == Py_None) { - argptr = "\t\n\r\f\v "; - argsize = 6; - } - else { - if (_getbuffer(arg, &varg) < 0) - return NULL; - argptr = varg.buf; - argsize = varg.len; - } - myptr = self->ob_bytes; - mysize = Py_SIZE(self); - right = rstrip_helper(myptr, mysize, argptr, argsize); - if (arg != Py_None) - PyBuffer_Release(&varg); - return PyByteArray_FromStringAndSize(self->ob_bytes, right); -} - PyDoc_STRVAR(decode_doc, "B.decode(encoding='utf-8', errors='strict') -> str\n\ \n\ @@ -2786,7 +2661,7 @@ {"join", (PyCFunction)bytearray_join, METH_O, join_doc}, {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, ljust__doc__}, {"lower", (PyCFunction)stringlib_lower, METH_NOARGS, _Py_lower__doc__}, - {"lstrip", (PyCFunction)bytearray_lstrip, METH_VARARGS, lstrip__doc__}, + {"lstrip", (PyCFunction)stringlib_lstrip, METH_VARARGS, lstrip__doc__}, {"maketrans", (PyCFunction)bytearray_maketrans, METH_VARARGS|METH_STATIC, _Py_maketrans__doc__}, {"partition", (PyCFunction)bytearray_partition, METH_O, partition__doc__}, @@ -2799,13 +2674,13 @@ {"rjust", (PyCFunction)stringlib_rjust, METH_VARARGS, rjust__doc__}, {"rpartition", (PyCFunction)bytearray_rpartition, METH_O, rpartition__doc__}, {"rsplit", (PyCFunction)bytearray_rsplit, METH_VARARGS, rsplit__doc__}, - {"rstrip", (PyCFunction)bytearray_rstrip, METH_VARARGS, rstrip__doc__}, + {"rstrip", 
(PyCFunction)stringlib_rstrip, METH_VARARGS, rstrip__doc__}, {"split", (PyCFunction)bytearray_split, METH_VARARGS, split__doc__}, {"splitlines", (PyCFunction)bytearray_splitlines, METH_VARARGS, splitlines__doc__}, {"startswith", (PyCFunction)bytearray_startswith, METH_VARARGS , startswith__doc__}, - {"strip", (PyCFunction)bytearray_strip, METH_VARARGS, strip__doc__}, + {"strip", (PyCFunction)stringlib_strip, METH_VARARGS, strip__doc__}, {"swapcase", (PyCFunction)stringlib_swapcase, METH_NOARGS, _Py_swapcase__doc__}, {"title", (PyCFunction)stringlib_title, METH_NOARGS, _Py_title__doc__}, diff -r 1de6619733d9 Objects/bytesobject.c --- a/Objects/bytesobject.c Thu Sep 08 19:29:07 2011 -0700 +++ b/Objects/bytesobject.c Fri Sep 09 20:51:07 2011 -0400 @@ -557,6 +557,7 @@ #include "stringlib/find.h" #include "stringlib/partition.h" #include "stringlib/split.h" +#include "stringlib/strip.h" #include "stringlib/ctype.h" #include "stringlib/transmogrify.h" @@ -980,15 +981,6 @@ }; -#define LEFTSTRIP 0 -#define RIGHTSTRIP 1 -#define BOTHSTRIP 2 - -/* Arrays indexed by above */ -static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; - -#define STRIPNAME(i) (stripformat[i]+3) - PyDoc_STRVAR(split__doc__, "B.split([sep[, maxsplit]]) -> list of bytes\n\ \n\ @@ -1343,137 +1335,6 @@ } -Py_LOCAL_INLINE(PyObject *) -do_xstrip(PyBytesObject *self, int striptype, PyObject *sepobj) -{ - Py_buffer vsep; - char *s = PyBytes_AS_STRING(self); - Py_ssize_t len = PyBytes_GET_SIZE(self); - char *sep; - Py_ssize_t seplen; - Py_ssize_t i, j; - - if (_getbuffer(sepobj, &vsep) < 0) - return NULL; - sep = vsep.buf; - seplen = vsep.len; - - i = 0; - if (striptype != RIGHTSTRIP) { - while (i < len && memchr(sep, Py_CHARMASK(s[i]), seplen)) { - i++; - } - } - - j = len; - if (striptype != LEFTSTRIP) { - do { - j--; - } while (j >= i && memchr(sep, Py_CHARMASK(s[j]), seplen)); - j++; - } - - PyBuffer_Release(&vsep); - - if (i == 0 && j == len && PyBytes_CheckExact(self)) { - 
Py_INCREF(self); - return (PyObject*)self; - } - else - return PyBytes_FromStringAndSize(s+i, j-i); -} - - -Py_LOCAL_INLINE(PyObject *) -do_strip(PyBytesObject *self, int striptype) -{ - char *s = PyBytes_AS_STRING(self); - Py_ssize_t len = PyBytes_GET_SIZE(self), i, j; - - i = 0; - if (striptype != RIGHTSTRIP) { - while (i < len && Py_ISSPACE(s[i])) { - i++; - } - } - - j = len; - if (striptype != LEFTSTRIP) { - do { - j--; - } while (j >= i && Py_ISSPACE(s[j])); - j++; - } - - if (i == 0 && j == len && PyBytes_CheckExact(self)) { - Py_INCREF(self); - return (PyObject*)self; - } - else - return PyBytes_FromStringAndSize(s+i, j-i); -} - - -Py_LOCAL_INLINE(PyObject *) -do_argstrip(PyBytesObject *self, int striptype, PyObject *args) -{ - PyObject *sep = NULL; - - if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) - return NULL; - - if (sep != NULL && sep != Py_None) { - return do_xstrip(self, striptype, sep); - } - return do_strip(self, striptype); -} - - -PyDoc_STRVAR(strip__doc__, -"B.strip([bytes]) -> bytes\n\ -\n\ -Strip leading and trailing bytes contained in the argument.\n\ -If the argument is omitted, strip trailing ASCII whitespace."); -static PyObject * -bytes_strip(PyBytesObject *self, PyObject *args) -{ - if (PyTuple_GET_SIZE(args) == 0) - return do_strip(self, BOTHSTRIP); /* Common case */ - else - return do_argstrip(self, BOTHSTRIP, args); -} - - -PyDoc_STRVAR(lstrip__doc__, -"B.lstrip([bytes]) -> bytes\n\ -\n\ -Strip leading bytes contained in the argument.\n\ -If the argument is omitted, strip leading ASCII whitespace."); -static PyObject * -bytes_lstrip(PyBytesObject *self, PyObject *args) -{ - if (PyTuple_GET_SIZE(args) == 0) - return do_strip(self, LEFTSTRIP); /* Common case */ - else - return do_argstrip(self, LEFTSTRIP, args); -} - - -PyDoc_STRVAR(rstrip__doc__, -"B.rstrip([bytes]) -> bytes\n\ -\n\ -Strip trailing bytes contained in the argument.\n\ -If the argument is omitted, strip trailing ASCII whitespace."); -static PyObject 
* -bytes_rstrip(PyBytesObject *self, PyObject *args) -{ - if (PyTuple_GET_SIZE(args) == 0) - return do_strip(self, RIGHTSTRIP); /* Common case */ - else - return do_argstrip(self, RIGHTSTRIP, args); -} - - PyDoc_STRVAR(count__doc__, "B.count(sub[, start[, end]]) -> int\n\ \n\ @@ -2445,7 +2306,7 @@ {"join", (PyCFunction)bytes_join, METH_O, join__doc__}, {"ljust", (PyCFunction)stringlib_ljust, METH_VARARGS, ljust__doc__}, {"lower", (PyCFunction)stringlib_lower, METH_NOARGS, _Py_lower__doc__}, - {"lstrip", (PyCFunction)bytes_lstrip, METH_VARARGS, lstrip__doc__}, + {"lstrip", (PyCFunction)stringlib_lstrip, METH_VARARGS, lstrip__doc__}, {"maketrans", (PyCFunction)bytes_maketrans, METH_VARARGS|METH_STATIC, _Py_maketrans__doc__}, {"partition", (PyCFunction)bytes_partition, METH_O, partition__doc__}, @@ -2456,13 +2317,13 @@ {"rpartition", (PyCFunction)bytes_rpartition, METH_O, rpartition__doc__}, {"rsplit", (PyCFunction)bytes_rsplit, METH_VARARGS, rsplit__doc__}, - {"rstrip", (PyCFunction)bytes_rstrip, METH_VARARGS, rstrip__doc__}, + {"rstrip", (PyCFunction)stringlib_rstrip, METH_VARARGS, rstrip__doc__}, {"split", (PyCFunction)bytes_split, METH_VARARGS, split__doc__}, {"splitlines", (PyCFunction)bytes_splitlines, METH_VARARGS, splitlines__doc__}, {"startswith", (PyCFunction)bytes_startswith, METH_VARARGS, startswith__doc__}, - {"strip", (PyCFunction)bytes_strip, METH_VARARGS, strip__doc__}, + {"strip", (PyCFunction)stringlib_strip, METH_VARARGS, strip__doc__}, {"swapcase", (PyCFunction)stringlib_swapcase, METH_NOARGS, _Py_swapcase__doc__}, {"title", (PyCFunction)stringlib_title, METH_NOARGS, _Py_title__doc__}, diff -r 1de6619733d9 Objects/stringlib/bloom.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Objects/stringlib/bloom.h Fri Sep 09 20:51:07 2011 -0400 @@ -0,0 +1,75 @@ +/* --- Bloom Filters ----------------------------------------------------- */ + +/* stuff to implement simple "bloom filters" for Unicode characters. 
+   to keep things simple, we use a single bitmask, using the low
+   log2(BLOOM_WIDTH) bits of each character as the bit index. */
+
+/* shared by unicodeobject.c (linebreak mask) and stringlib/strip.h */
+
+#ifndef STRINGLIB_BLOOM_H
+#define STRINGLIB_BLOOM_H
+
+#if LONG_BIT >= 128
+#define BLOOM_WIDTH 128
+#elif LONG_BIT >= 64
+#define BLOOM_WIDTH 64
+#elif LONG_BIT >= 32
+#define BLOOM_WIDTH 32
+#else
+#error "LONG_BIT is smaller than 32"
+#endif
+
+#define BLOOM_MASK unsigned long
+
+#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
+#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
+
+#define BLOOM_MEMBER(mask, chr, set, setlen) \
+    (BLOOM(mask, chr) && _stringlib_member(chr, set, setlen))
+
+
+Py_LOCAL_INLINE(BLOOM_MASK)
+make_bloom_mask(const STRINGLIB_CHAR *ptr, Py_ssize_t len)
+{
+    /* calculate simple bloom-style bitmask for a given unicode string */
+
+    BLOOM_MASK mask;
+    Py_ssize_t i;
+
+    mask = 0;
+    for (i = 0; i < len; i++)
+        BLOOM_ADD(mask, ptr[i]);
+
+    return mask;
+}
+
+Py_LOCAL_INLINE(void)
+make_char_mask(char *charmask, const char *s, Py_ssize_t n)
+{
+    const char *end;
+    /* charmask needs 256 entries: Py_CHARMASK() indexes it with 0..255 */
+    memset(charmask, 0, 256);
+
+    for (end = s + n; s < end; s++)
+        charmask[Py_CHARMASK(*s)] = 1;
+}
+
+Py_LOCAL_INLINE(int)
+_stringlib_member(STRINGLIB_CHAR chr,
+                  const STRINGLIB_CHAR *set,
+                  Py_ssize_t setlen)
+{
+    Py_ssize_t i;
+    /* byte sets longer than 16 go through memchr, the rest is scanned */
+#if !STRINGLIB_IS_UNICODE
+    if (setlen > 16)
+        return memchr(set, chr, setlen) != NULL;
+#endif
+
+    for (i = 0; i < setlen; i++)
+        if (set[i] == chr)
+            return 1;
+
+    return 0;
+}
+#endif
diff -r 1de6619733d9 Objects/stringlib/strip.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Objects/stringlib/strip.h Fri Sep 09 20:51:07 2011 -0400
@@ -0,0 +1,252 @@
+
+#ifndef STRINGLIB_STRIP_H
+#define STRINGLIB_STRIP_H
+
+#include "bloom.h"
+
+#define STRINGLIB_LEFTSTRIP 0
+#define STRINGLIB_RIGHTSTRIP 1
+#define STRINGLIB_BOTHSTRIP 2
+
+static const char *_STRINGLIB_stripformat[] = {
+    "|O:lstrip", "|O:rstrip", "|O:strip"
+};
+
+#define STRINGLIB_STRIP_ARGFORMAT(i) ((char *)(_STRINGLIB_stripformat[i]))
+#define STRINGLIB_STRIP_ARGNAME(i) ((char *)(_STRINGLIB_stripformat[i] + 3))
+
+
+/* Strip whitespace (STRINGLIB_ISSPACE) from buf[0:*len]; direction selects
+   the end(s) to strip.  Returns a pointer into buf at the first character
+   kept and stores the number of characters kept in *len. */
+Py_LOCAL_INLINE(const STRINGLIB_CHAR *)
+strip(const STRINGLIB_CHAR *buf, Py_ssize_t *len, int direction)
+{
+    const STRINGLIB_CHAR *s, *e;
+
+    assert(buf != NULL);
+    assert(len != NULL);
+    assert(direction >= STRINGLIB_LEFTSTRIP &&
+           direction <= STRINGLIB_BOTHSTRIP);
+
+    s = buf;
+    e = buf + *len;
+    if (direction != STRINGLIB_RIGHTSTRIP) {
+        while (s < e && STRINGLIB_ISSPACE(*s)) {
+            s++;
+        }
+    }
+    if (direction != STRINGLIB_LEFTSTRIP) {
+        /* look at e[-1] so that e never has to step below s */
+        while (e > s && STRINGLIB_ISSPACE(e[-1])) {
+            e--;
+        }
+    }
+    *len = e - s;
+    return s;
+}
+
+
+/* Like strip(), but strip the characters found in sep[0:seplen] instead of
+   whitespace.  An empty sep strips nothing. */
+Py_LOCAL_INLINE(const STRINGLIB_CHAR *)
+xstrip(const STRINGLIB_CHAR *buf, Py_ssize_t *len,
+       const STRINGLIB_CHAR *sep, Py_ssize_t seplen,
+       int direction)
+{
+    const STRINGLIB_CHAR *s, *e;
+#if STRINGLIB_IS_UNICODE
+    BLOOM_MASK bloom_mask;
+#else
+    char charmask[256];     /* one flag per byte value, 0..255 */
+#endif
+
+    assert(buf != NULL);
+    assert(len != NULL);
+    assert(sep != NULL || seplen == 0);
+    assert(direction >= STRINGLIB_LEFTSTRIP &&
+           direction <= STRINGLIB_BOTHSTRIP);
+
+    /* x.strip('') must leave x alone, as the per-type code this replaces
+       did; it must not fall back to stripping whitespace. */
+    if (seplen == 0 || *len == 0) {
+        return buf;
+    }
+    s = buf;
+    e = buf + *len;
+
+#if STRINGLIB_IS_UNICODE
+    bloom_mask = make_bloom_mask(sep, seplen);
+    if (direction != STRINGLIB_RIGHTSTRIP) {
+        while (s < e && BLOOM_MEMBER(bloom_mask, *s, sep, seplen)) {
+            s++;
+        }
+    }
+    if (direction != STRINGLIB_LEFTSTRIP) {
+        while (e > s && BLOOM_MEMBER(bloom_mask, e[-1], sep, seplen)) {
+            e--;
+        }
+    }
+#else
+    make_char_mask(charmask, sep, seplen);
+    if (direction != STRINGLIB_RIGHTSTRIP) {
+        while (s < e && charmask[Py_CHARMASK(*s)] == 1) {
+            s++;
+        }
+    }
+    if (direction != STRINGLIB_LEFTSTRIP) {
+        while (e > s && charmask[Py_CHARMASK(e[-1])] == 1) {
+            e--;
+        }
+    }
+#endif
+    *len = e - s;
+    return s;
+}
+
+/* Shared body of strip/lstrip/rstrip: parse the optional argument, strip
+   self accordingly and build the result.  Immutable types get self back
+   (new reference) when nothing was stripped and self is an exact instance. */
+Py_LOCAL_INLINE(PyObject *)
+argstrip(PyObject *self, PyObject *args, int direction)
+{
+    const STRINGLIB_CHAR *buf = STRINGLIB_STR(self);
+    const STRINGLIB_CHAR *s;
+    Py_ssize_t len = STRINGLIB_LEN(self);
+    PyObject *sepobj = NULL;
+#if !STRINGLIB_IS_UNICODE
+    Py_buffer vsep;
+#endif
+#if !STRINGLIB_MUTABLE
+    Py_ssize_t orig_len = len;
+#endif
+
+    /* no arguments is the common case: skip the tuple parsing then */
+    if (PyTuple_GET_SIZE(args) > 0 &&
+        !PyArg_ParseTuple(args, STRINGLIB_STRIP_ARGFORMAT(direction),
+                          &sepobj))
+        return NULL;
+
+    if (sepobj == NULL || sepobj == Py_None) {
+        s = strip(buf, &len, direction);
+    }
+    else {
+#if STRINGLIB_IS_UNICODE
+        /* the do_argstrip() this replaces used PyUnicode_Check, so str
+           subclasses have to stay acceptable here */
+        if (!PyUnicode_Check(sepobj)) {
+            PyErr_Format(PyExc_TypeError,
+                         "%s arg must be None or str",
+                         STRINGLIB_STRIP_ARGNAME(direction));
+            return NULL;
+        }
+        s = xstrip(buf, &len, STRINGLIB_STR(sepobj), STRINGLIB_LEN(sepobj),
+                   direction);
+#else
+        if (PyObject_GetBuffer(sepobj, &vsep, PyBUF_SIMPLE) < 0) {
+            PyErr_Clear();
+            PyErr_Format(PyExc_TypeError,
+                         "Type %.100s doesn't support the buffer API",
+                         Py_TYPE(sepobj)->tp_name);
+            return NULL;
+        }
+        s = xstrip(buf, &len, vsep.buf, vsep.len, direction);
+        /* s is derived from buf, not from vsep, so release right away */
+        PyBuffer_Release(&vsep);
+#endif
+    }
+#if !STRINGLIB_MUTABLE
+    if (s == buf && len == orig_len && STRINGLIB_CHECK_EXACT(self)) {
+        Py_INCREF(self);
+        return (PyObject *)self;
+    }
+#endif
+    return STRINGLIB_NEW(s, len);
+}
+
+
+/* NOTE(review): STRINGLIB_MUTABLE is assumed to be set only for bytearray */
+#if STRINGLIB_IS_UNICODE
+PyDoc_STRVAR(strip__doc__,
+"S.strip([chars]) -> str\n\
+\n\
+Return a copy of the string S with leading and trailing\n\
+whitespace removed.\n\
+If chars is given and not None, remove characters in chars instead.");
+#elif STRINGLIB_MUTABLE
+PyDoc_STRVAR(strip__doc__,
+"B.strip([bytes]) -> bytearray\n\
+\n\
+Strip leading and trailing bytes contained in the argument\n\
+and return the result as a new bytearray.\n\
+If the argument is omitted, strip ASCII whitespace.");
+#else
+PyDoc_STRVAR(strip__doc__,
+"B.strip([bytes]) -> bytes\n\
+\n\
+Strip leading and trailing bytes contained in the argument.\n\
+If the argument is omitted, strip leading and trailing ASCII whitespace.");
+#endif
+
+static PyObject *
+stringlib_strip(PyObject *self, PyObject *args)
+{
+    return argstrip(self, args, STRINGLIB_BOTHSTRIP);
+}
+
+
+#if STRINGLIB_IS_UNICODE
+PyDoc_STRVAR(lstrip__doc__,
+"S.lstrip([chars]) -> str\n\
+\n\
+Return a copy of the string S with leading whitespace removed.\n\
+If chars is given and not None, remove characters in chars instead.");
+#elif STRINGLIB_MUTABLE
+PyDoc_STRVAR(lstrip__doc__,
+"B.lstrip([bytes]) -> bytearray\n\
+\n\
+Strip leading bytes contained in the argument\n\
+and return the result as a new bytearray.\n\
+If the argument is omitted, strip leading ASCII whitespace.");
+#else
+PyDoc_STRVAR(lstrip__doc__,
+"B.lstrip([bytes]) -> bytes\n\
+\n\
+Strip leading bytes contained in the argument.\n\
+If the argument is omitted, strip leading ASCII whitespace.");
+#endif
+
+static PyObject *
+stringlib_lstrip(PyObject *self, PyObject *args)
+{
+    return argstrip(self, args, STRINGLIB_LEFTSTRIP);
+}
+
+
+#if STRINGLIB_IS_UNICODE
+PyDoc_STRVAR(rstrip__doc__,
+"S.rstrip([chars]) -> str\n\
+\n\
+Return a copy of the string S with trailing whitespace removed.\n\
+If chars is given and not None, remove characters in chars instead.");
+#elif STRINGLIB_MUTABLE
+PyDoc_STRVAR(rstrip__doc__,
+"B.rstrip([bytes]) -> bytearray\n\
+\n\
+Strip trailing bytes contained in the argument\n\
+and return the result as a new bytearray.\n\
+If the argument is omitted, strip trailing ASCII whitespace.");
+#else
+PyDoc_STRVAR(rstrip__doc__,
+"B.rstrip([bytes]) -> bytes\n\
+\n\
+Strip trailing bytes contained in the argument.\n\
+If the argument is omitted, strip trailing ASCII whitespace.");
+#endif
+
+static PyObject *
+stringlib_rstrip(PyObject *self, PyObject *args)
+{
+    return argstrip(self, args, STRINGLIB_RIGHTSTRIP);
+}
+#endif
diff -r 1de6619733d9 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Thu Sep 08 19:29:07 2011 -0700 +++ b/Objects/unicodeobject.c Fri Sep 09 20:51:07 2011 -0400 @@ -196,64 +196,16 @@ #endif } -/* --- Bloom Filters ----------------------------------------------------- */ - -/* stuff to implement simple "bloom filters" for Unicode characters. - to keep things simple, we use a single bitmask, using the least 5 - bits from each unicode characters as the bit index. */ - -/* the linebreak mask is set up by Unicode_Init below */ - -#if LONG_BIT >= 128 -#define BLOOM_WIDTH 128 -#elif LONG_BIT >= 64 -#define BLOOM_WIDTH 64 -#elif LONG_BIT >= 32 -#define BLOOM_WIDTH 32 -#else -#error "LONG_BIT is smaller than 32" -#endif - -#define BLOOM_MASK unsigned long +#include "stringlib/unicodedefs.h" +#include "stringlib/bloom.h" + static BLOOM_MASK bloom_linebreak; -#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))) -#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1))))) - #define BLOOM_LINEBREAK(ch) \ ((ch) < 128U ? 
ascii_linebreak[(ch)] : \ (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch))) -Py_LOCAL_INLINE(BLOOM_MASK) -make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len) -{ - /* calculate simple bloom-style bitmask for a given unicode string */ - - BLOOM_MASK mask; - Py_ssize_t i; - - mask = 0; - for (i = 0; i < len; i++) - BLOOM_ADD(mask, ptr[i]); - - return mask; -} - -Py_LOCAL_INLINE(int) -unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen) -{ - Py_ssize_t i; - - for (i = 0; i < setlen; i++) - if (set[i] == chr) - return 1; - - return 0; -} - -#define BLOOM_MEMBER(mask, chr, set, setlen) \ - BLOOM(mask, chr) && unicode_member(chr, set, setlen) /* --- Unicode Object ----------------------------------------------------- */ @@ -6487,13 +6439,13 @@ /* --- Helpers ------------------------------------------------------------ */ -#include "stringlib/unicodedefs.h" #include "stringlib/fastsearch.h" #include "stringlib/count.h" #include "stringlib/find.h" #include "stringlib/partition.h" #include "stringlib/split.h" +#include "stringlib/strip.h" #define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping #define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale @@ -8188,153 +8140,6 @@ return fixup(self, fixlower); } -#define LEFTSTRIP 0 -#define RIGHTSTRIP 1 -#define BOTHSTRIP 2 - -/* Arrays indexed by above */ -static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"}; - -#define STRIPNAME(i) (stripformat[i]+3) - -/* externally visible for str.strip(unicode) */ -PyObject * -_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj) -{ - Py_UNICODE *s = PyUnicode_AS_UNICODE(self); - Py_ssize_t len = PyUnicode_GET_SIZE(self); - Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj); - Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj); - Py_ssize_t i, j; - - BLOOM_MASK sepmask = make_bloom_mask(sep, seplen); - - i = 0; - if (striptype != RIGHTSTRIP) { - while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, 
seplen)) { - i++; - } - } - - j = len; - if (striptype != LEFTSTRIP) { - do { - j--; - } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen)); - j++; - } - - if (i == 0 && j == len && PyUnicode_CheckExact(self)) { - Py_INCREF(self); - return (PyObject*)self; - } - else - return PyUnicode_FromUnicode(s+i, j-i); -} - - -static PyObject * -do_strip(PyUnicodeObject *self, int striptype) -{ - Py_UNICODE *s = PyUnicode_AS_UNICODE(self); - Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j; - - i = 0; - if (striptype != RIGHTSTRIP) { - while (i < len && Py_UNICODE_ISSPACE(s[i])) { - i++; - } - } - - j = len; - if (striptype != LEFTSTRIP) { - do { - j--; - } while (j >= i && Py_UNICODE_ISSPACE(s[j])); - j++; - } - - if (i == 0 && j == len && PyUnicode_CheckExact(self)) { - Py_INCREF(self); - return (PyObject*)self; - } - else - return PyUnicode_FromUnicode(s+i, j-i); -} - - -static PyObject * -do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args) -{ - PyObject *sep = NULL; - - if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep)) - return NULL; - - if (sep != NULL && sep != Py_None) { - if (PyUnicode_Check(sep)) - return _PyUnicode_XStrip(self, striptype, sep); - else { - PyErr_Format(PyExc_TypeError, - "%s arg must be None or str", - STRIPNAME(striptype)); - return NULL; - } - } - - return do_strip(self, striptype); -} - - -PyDoc_STRVAR(strip__doc__, - "S.strip([chars]) -> str\n\ -\n\ -Return a copy of the string S with leading and trailing\n\ -whitespace removed.\n\ -If chars is given and not None, remove characters in chars instead."); - -static PyObject * -unicode_strip(PyUnicodeObject *self, PyObject *args) -{ - if (PyTuple_GET_SIZE(args) == 0) - return do_strip(self, BOTHSTRIP); /* Common case */ - else - return do_argstrip(self, BOTHSTRIP, args); -} - - -PyDoc_STRVAR(lstrip__doc__, - "S.lstrip([chars]) -> str\n\ -\n\ -Return a copy of the string S with leading whitespace removed.\n\ -If chars is given and not None, remove characters in 
chars instead."); - -static PyObject * -unicode_lstrip(PyUnicodeObject *self, PyObject *args) -{ - if (PyTuple_GET_SIZE(args) == 0) - return do_strip(self, LEFTSTRIP); /* Common case */ - else - return do_argstrip(self, LEFTSTRIP, args); -} - - -PyDoc_STRVAR(rstrip__doc__, - "S.rstrip([chars]) -> str\n\ -\n\ -Return a copy of the string S with trailing whitespace removed.\n\ -If chars is given and not None, remove characters in chars instead."); - -static PyObject * -unicode_rstrip(PyUnicodeObject *self, PyObject *args) -{ - if (PyTuple_GET_SIZE(args) == 0) - return do_strip(self, RIGHTSTRIP); /* Common case */ - else - return do_argstrip(self, RIGHTSTRIP, args); -} - - static PyObject* unicode_repeat(PyUnicodeObject *str, Py_ssize_t len) { @@ -9267,14 +9072,14 @@ {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__}, {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__}, {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__}, - {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__}, + {"lstrip", (PyCFunction) stringlib_lstrip, METH_VARARGS, lstrip__doc__}, {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__}, {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__}, {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__}, - {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__}, + {"rstrip", (PyCFunction) stringlib_rstrip, METH_VARARGS, rstrip__doc__}, {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__}, {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__}, - {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__}, + {"strip", (PyCFunction) stringlib_strip, METH_VARARGS, strip__doc__}, {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__}, {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__}, {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},