Index: Doc/lib/libre.tex =================================================================== RCS file: /cvsroot/python/python/dist/src/Doc/lib/libre.tex,v retrieving revision 1.108 diff -c -r1.108 libre.tex *** Doc/lib/libre.tex 18 Oct 2003 15:28:22 -0000 1.108 --- Doc/lib/libre.tex 11 Jul 2004 03:12:48 -0000 *************** *** 528,534 **** \var{string}, use \method{search()} instead.} \end{funcdesc} ! \begin{funcdesc}{split}{pattern, string\optional{, maxsplit\code{ = 0}}} Split \var{string} by the occurrences of \var{pattern}. If capturing parentheses are used in \var{pattern}, then the text of all groups in the pattern are also returned as part of the resulting list. --- 528,535 ---- \var{string}, use \method{search()} instead.} \end{funcdesc} ! \begin{funcdesc}{split}{pattern, string\optional{, maxsplit\code{ = 0}, ! emptyok\code{ = False}}} Split \var{string} by the occurrences of \var{pattern}. If capturing parentheses are used in \var{pattern}, then the text of all groups in the pattern are also returned as part of the resulting list. *************** *** 538,543 **** --- 539,550 ---- 1.5 release, \var{maxsplit} was ignored. This has been fixed in later releases.) + Although not previously documented, this function ignores matches of + length zero. If \var{emptyok} (new in Python 2.X) is \code{True}, + zero-length matches are not ignored. In either case, overlapping + matches are ignored and only the first match that begins in each + position is used. 
+ \begin{verbatim} >>> re.split('\W+', 'Words, words, words.') ['Words', 'words', 'words', ''] *************** *** 545,550 **** --- 552,564 ---- ['Words', ', ', 'words', ', ', 'words', '.', ''] >>> re.split('\W+', 'Words, words, words.', 1) ['Words', 'words, words.'] + >>> re.split('(?=[Ww])', 'Words, words, words.', emptyok=True) + ['', 'Words, ', 'words, ', 'words.'] + + # split a Windows-style .ini file into a list of sections + >>> re.split(r'(?m)(?=^\[)', ';rem\n[sec1]\nfoo=bar\n\n[sec2]\nbaz\n', + emptyok=True) + [';rem\n', '[sec1]\nfoo=bar\n\n', '[sec2]\nbaz\n'] \end{verbatim} This function combines and extends the functionality of Index: Lib/sre.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/sre.py,v retrieving revision 1.48 diff -c -r1.48 sre.py *** Lib/sre.py 20 Apr 2004 21:11:11 -0000 1.48 --- Lib/sre.py 11 Jul 2004 03:12:49 -0000 *************** *** 150,159 **** substitutions that were made.""" return _compile(pattern, 0).subn(repl, string, count) ! def split(pattern, string, maxsplit=0): """Split the source string by the occurrences of the pattern, ! returning a list containing the resulting substrings.""" ! return _compile(pattern, 0).split(string, maxsplit) def findall(pattern, string): """Return a list of all non-overlapping matches in the string. --- 150,160 ---- substitutions that were made.""" return _compile(pattern, 0).subn(repl, string, count) ! def split(pattern, string, maxsplit=0, emptyok=False): """Split the source string by the occurrences of the pattern, ! returning a list containing the resulting substrings. ! Empty matches are ignored unless emptyok is True.""" ! return _compile(pattern, 0).split(string, maxsplit, emptyok) def findall(pattern, string): """Return a list of all non-overlapping matches in the string. 
Index: Lib/test/test_re.py =================================================================== RCS file: /cvsroot/python/python/dist/src/Lib/test/test_re.py,v retrieving revision 1.50 diff -c -r1.50 test_re.py *** Lib/test/test_re.py 31 May 2004 03:09:25 -0000 1.50 --- Lib/test/test_re.py 11 Jul 2004 03:12:51 -0000 *************** *** 137,142 **** --- 137,233 ---- self.assertEqual(re.split("(:*)", ":a:b::c", 2), ['', ':', 'a', ':', 'b::c']) + def test_re_split_emptyok(self): + + # These two test cases show a primary use-case for emptyok. It allows + # easy pattern-based splitting of strings, even when one doesn't want + # to peel off the separating pattern. + + self.assertEqual(re.split("(?=:)", ":aa:b::cc", emptyok=True), + ['', ':aa', ':b', ':', ':cc']) + self.assertEqual(re.split(r'(?m)(?=^\[)', + ";comment\n[sec1]\nfoo=bar\n\n" + "[sec2]\netc=etc\n", + emptyok=True), + [';comment\n', '[sec1]\nfoo=bar\n\n', + '[sec2]\netc=etc\n']) + + # This next may look odd at first, but it's correct. There are four + # non-overlapping matches (three of which are empty), which split the + # string into five pieces. + + self.assertEqual(re.split(":*", "a:b", emptyok=True), + ['', 'a', '', 'b', '']) + + # An obvious design question is whether we shouldn't disallow an empty + # match directly after a non-empty match. The rationale for not doing + # so is that it seems like it might be useful, and the initially + # surprising behavior shown below only occurs for patterns with both + # zero-length and non-zero-length matches. 
--mkc + + # bug 852532: original complaint + self.assertEqual(re.compile('^$', re.MULTILINE).split('foo\n\nbar', + emptyok=True), + ['foo\n', '\nbar']) + + self.assertEqual(re.split(":*", "a", emptyok=True), ['', 'a', '']) + self.assertEqual(re.split(":*", "ab", emptyok=True), + ['', 'a', 'b', '']) + self.assertEqual(re.split(":", "a", emptyok=True), ['a']) + self.assertEqual(re.split(":", ":a", emptyok=True), ['', 'a']) + self.assertEqual(re.split(":", ":a:b::c", emptyok=True), + ['', 'a', 'b', '', 'c']) + self.assertEqual(re.split(":*", ":a:b::c", emptyok=True), + ['', '', 'a', '', 'b', '', 'c', '']) + self.assertEqual(re.split(":+", ":a:b::c", emptyok=True), + ['', 'a', 'b', 'c']) + self.assertEqual(re.split(":*?", ":a:b::c", emptyok=True), + ['', ':', 'a', ':', 'b', ':', ':', 'c', '']) + self.assertEqual(re.split(":+?", ":a:b::c", emptyok=True), + ['', 'a', 'b', '', 'c']) + self.assertEqual(re.split(":*", ":aa:bb::cc", emptyok=True), + ['', '', 'a', 'a', '', 'b', 'b', '', 'c', 'c', '']) + self.assertEqual(re.split(":+", ":aa:bb::cc", emptyok=True), + ['', 'aa', 'bb', 'cc']) + self.assertEqual(re.split("(:*)", ":a:b::c", emptyok=True), + ['', ':', '', '', 'a', ':', '', '', 'b', '::', '', + '', 'c', '', '']) + self.assertEqual(re.split("(:+)", ":a:b::c", emptyok=True), + ['', ':', 'a', ':', 'b', '::', 'c']) + self.assertEqual(re.split("(:*)", ":aa:bb::cc", emptyok=True), + ['', ':', '', '', 'a', '', 'a', ':', '', '', 'b', '', + 'b', '::', '', '', 'c', '', 'c', '', '']) + self.assertEqual(re.split("(:+)", ":aa:bb::cc", emptyok=True), + ['', ':', 'aa', ':', 'bb', '::', 'cc']) + self.assertEqual(re.split("(?::*)", ":a:b::c", emptyok=True), + ['', '', 'a', '', 'b', '', 'c', '']) + self.assertEqual(re.split("(?::+)", ":a:b::c", emptyok=True), + ['', 'a', 'b', 'c']) + self.assertEqual(re.split("(:)*", ":a:b::c", emptyok=True), + ['', ':', '', None, 'a', ':', '', None, 'b', ':', '', + None, 'c', None, '']) + self.assertEqual(re.split("(:)+", ":a:b::c", emptyok=True), + 
['', ':', 'a', ':', 'b', ':', 'c']) + self.assertEqual(re.split("([b:]+)", ":a:b::c", emptyok=True), + ['', ':', 'a', ':b::', 'c']) + self.assertEqual(re.split("(b)|(:+)", ":a:b::c", emptyok=True), + ['', None, ':', 'a', None, ':', '', 'b', None, '', + None, '::', 'c']) + self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c", emptyok=True), + ['', 'a', '', '', 'c']) + + def test_qualified_re_split_emptyok(self): + self.assertEqual(re.split(":", ":a:b::c", 2, emptyok=True), + ['', 'a', 'b::c']) + self.assertEqual(re.split(':', 'a:b:c:d', 2, emptyok=True), + ['a', 'b', 'c:d']) + self.assertEqual(re.split("(:)", ":a:b::c", 2, emptyok=True), + ['', ':', 'a', ':', 'b::c']) + self.assertEqual(re.split("(:*)", ":a:b::c", 2, emptyok=True), + ['', ':', '', '', 'a:b::c']) + self.assertEqual(re.split("(:*)", "aa:b::c", 2, emptyok=True), + ['', '', 'a', '', 'a:b::c']) + def test_re_findall(self): self.assertEqual(re.findall(":+", "abc"), []) self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"]) Index: Misc/cheatsheet =================================================================== RCS file: /cvsroot/python/python/dist/src/Misc/cheatsheet,v retrieving revision 1.8 diff -c -r1.8 cheatsheet *** Misc/cheatsheet 13 Mar 2004 20:27:23 -0000 1.8 --- Misc/cheatsheet 11 Jul 2004 03:12:54 -0000 *************** *** 1691,1697 **** flags]) split(pattern, split by occurrences of . If capturing () are string[, used inpattern, then occurrences of patterns or subpatterns are ! maxsplit=0]) also returned. findall( return a list of non-overlapping matches in , either a pattern, list ofgroups or a list of tuples if the pattern has more than 1 string) group. --- 1691,1699 ---- flags]) split(pattern, split by occurrences of . If capturing () are string[, used inpattern, then occurrences of patterns or subpatterns are ! maxsplit=0, also returned. Empty matches are ignored unless emptyok is ! emptyok= True. ! 
False]) findall( return a list of non-overlapping matches in , either a pattern, list ofgroups or a list of tuples if the pattern has more than 1 string) group. Index: Modules/_sre.c =================================================================== RCS file: /cvsroot/python/python/dist/src/Modules/_sre.c,v retrieving revision 2.106 diff -c -r2.106 _sre.c *** Modules/_sre.c 17 Jun 2004 18:27:16 -0000 2.106 --- Modules/_sre.c 11 Jul 2004 03:12:57 -0000 *************** *** 543,549 **** break; case SRE_OP_ANY_ALL: ! /* repeated dot wildcare. skip to the end of the target string, and backtrack from there */ TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr)); ptr = end; --- 543,549 ---- break; case SRE_OP_ANY_ALL: ! /* repeated dot wildcard. skip to the end of the target string, and backtrack from there */ TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr)); ptr = end; *************** *** 2312,2320 **** PyObject* string; int maxsplit = 0; ! static char* kwlist[] = { "source", "maxsplit", NULL }; ! if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist, ! &string, &maxsplit)) return NULL; string = state_init(&state, self, string, 0, INT_MAX); --- 2312,2321 ---- PyObject* string; int maxsplit = 0; ! int emptyok = 0; ! static char* kwlist[] = { "source", "maxsplit", "emptyok", NULL }; ! if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:split", kwlist, ! &string, &maxsplit, &emptyok)) return NULL; string = state_init(&state, self, string, 0, INT_MAX); *************** *** 2350,2357 **** pattern_error(status); goto error; } ! ! if (state.start == state.ptr) { if (last == state.end) break; /* skip one character */ --- 2351,2358 ---- pattern_error(status); goto error; } ! ! 
if (!emptyok && state.start == state.ptr) { if (last == state.end) break; /* skip one character */ *************** *** 2384,2389 **** --- 2385,2398 ---- n = n + 1; + if (emptyok && state.start == state.ptr) { + if (last == state.end) + break; + /* skip one character */ + last = state.ptr; + state.start = (void*) ((char*) state.ptr + state.charsize); + continue; + } last = state.start = state.ptr; }