Index: Doc/lib/libre.tex
===================================================================
RCS file: /cvsroot/python/python/dist/src/Doc/lib/libre.tex,v
retrieving revision 1.108
diff -c -r1.108 libre.tex
*** Doc/lib/libre.tex	18 Oct 2003 15:28:22 -0000	1.108
--- Doc/lib/libre.tex	11 Jul 2004 03:12:48 -0000
***************
*** 528,534 ****
    \var{string}, use \method{search()} instead.}
  \end{funcdesc}
  
! \begin{funcdesc}{split}{pattern, string\optional{, maxsplit\code{ = 0}}}
    Split \var{string} by the occurrences of \var{pattern}.  If
    capturing parentheses are used in \var{pattern}, then the text of all
    groups in the pattern are also returned as part of the resulting list.
--- 528,535 ----
    \var{string}, use \method{search()} instead.}
  \end{funcdesc}
  
! \begin{funcdesc}{split}{pattern, string\optional{, maxsplit\code{ = 0},
!   emptyok\code{ = False}}}
    Split \var{string} by the occurrences of \var{pattern}.  If
    capturing parentheses are used in \var{pattern}, then the text of all
    groups in the pattern are also returned as part of the resulting list.
***************
*** 538,543 ****
--- 539,550 ----
    1.5 release, \var{maxsplit} was ignored.  This has been fixed in
    later releases.)
  
+   By default (and although not previously documented), this function
+   ignores matches of length zero.  If \var{emptyok} (new in Python 2.X) is
+   \code{True}, zero-length matches are not ignored.  In either case,
+   overlapping matches are ignored and only the first match that begins
+   in each position is used.
+ 
  \begin{verbatim}
  >>> re.split('\W+', 'Words, words, words.')
  ['Words', 'words', 'words', '']
***************
*** 545,550 ****
--- 552,564 ----
  ['Words', ', ', 'words', ', ', 'words', '.', '']
  >>> re.split('\W+', 'Words, words, words.', 1)
  ['Words', 'words, words.']
+ >>> re.split('(?=[Ww])', 'Words, words, words.', emptyok=True)
+ ['', 'Words, ', 'words, ', 'words.']
+ 
+ # split a Windows-style .ini file into a list of sections
+ >>> re.split(r'(?m)(?=^\[)', ';rem\n[sec1]\nfoo=bar\n\n[sec2]\nbaz\n',
+              emptyok=True)
+ [';rem\n', '[sec1]\nfoo=bar\n\n', '[sec2]\nbaz\n']
  \end{verbatim}
  
    This function combines and extends the functionality of
Index: Lib/sre.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/sre.py,v
retrieving revision 1.48
diff -c -r1.48 sre.py
*** Lib/sre.py	20 Apr 2004 21:11:11 -0000	1.48
--- Lib/sre.py	11 Jul 2004 03:12:49 -0000
***************
*** 150,159 ****
      substitutions that were made."""
      return _compile(pattern, 0).subn(repl, string, count)
  
! def split(pattern, string, maxsplit=0):
      """Split the source string by the occurrences of the pattern,
!     returning a list containing the resulting substrings."""
!     return _compile(pattern, 0).split(string, maxsplit)
  
  def findall(pattern, string):
      """Return a list of all non-overlapping matches in the string.
--- 150,160 ----
      substitutions that were made."""
      return _compile(pattern, 0).subn(repl, string, count)
  
! def split(pattern, string, maxsplit=0, emptyok=False):
      """Split the source string by the occurrences of the pattern,
!     returning a list containing the resulting substrings.
!     Empty matches are ignored unless emptyok is True."""
!     return _compile(pattern, 0).split(string, maxsplit, emptyok)
  
  def findall(pattern, string):
      """Return a list of all non-overlapping matches in the string.
Index: Lib/test/test_re.py
===================================================================
RCS file: /cvsroot/python/python/dist/src/Lib/test/test_re.py,v
retrieving revision 1.50
diff -c -r1.50 test_re.py
*** Lib/test/test_re.py	31 May 2004 03:09:25 -0000	1.50
--- Lib/test/test_re.py	11 Jul 2004 03:12:51 -0000
***************
*** 137,142 ****
--- 137,233 ----
          self.assertEqual(re.split("(:*)", ":a:b::c", 2),
                           ['', ':', 'a', ':', 'b::c'])
  
+     def test_re_split_emptyok(self):
+ 
+         # These two test cases show a primary use-case for emptyok.  It allows
+         # easy pattern-based splitting of strings, even when one doesn't want
+         # to peel off the separating pattern.
+ 
+         self.assertEqual(re.split("(?=:)", ":aa:b::cc", emptyok=True),
+                          ['', ':aa', ':b', ':', ':cc'])
+         self.assertEqual(re.split(r'(?m)(?=^\[)',
+                                   ";comment\n[sec1]\nfoo=bar\n\n"
+                                   "[sec2]\netc=etc\n",
+                                   emptyok=True),
+                          [';comment\n', '[sec1]\nfoo=bar\n\n',
+                           '[sec2]\netc=etc\n'])
+ 
+         # This next case may look odd at first, but it's correct.  There are
+         # four non-overlapping matches (three of which are empty), which split
+         # the string into five pieces.
+ 
+         self.assertEqual(re.split(":*", "a:b", emptyok=True),
+                          ['', 'a', '', 'b', ''])
+ 
+         # An obvious design question is whether we should disallow an empty
+         # match directly after a non-empty match.  The rationale for not doing
+         # so is that it seems like it might be useful, and the initially
+         # surprising behavior shown below only occurs for patterns with both
+         # zero-length and non-zero-length matches.  --mkc
+ 
+         # bug 852532: original complaint
+         self.assertEqual(re.compile('^$', re.MULTILINE).split('foo\n\nbar',
+                                                               emptyok=True),
+                          ['foo\n', '\nbar'])
+ 
+         self.assertEqual(re.split(":*", "a", emptyok=True), ['', 'a', ''])
+         self.assertEqual(re.split(":*", "ab", emptyok=True),
+                          ['', 'a', 'b', ''])
+         self.assertEqual(re.split(":", "a", emptyok=True), ['a'])
+         self.assertEqual(re.split(":", ":a", emptyok=True), ['', 'a'])
+         self.assertEqual(re.split(":", ":a:b::c", emptyok=True),
+                          ['', 'a', 'b', '', 'c'])
+         self.assertEqual(re.split(":*", ":a:b::c", emptyok=True),
+                          ['', '', 'a', '', 'b', '', 'c', ''])
+         self.assertEqual(re.split(":+", ":a:b::c", emptyok=True),
+                          ['', 'a', 'b', 'c'])
+         self.assertEqual(re.split(":*?", ":a:b::c", emptyok=True),
+                          ['', ':', 'a', ':', 'b', ':', ':', 'c', ''])
+         self.assertEqual(re.split(":+?", ":a:b::c", emptyok=True),
+                          ['', 'a', 'b', '', 'c'])
+         self.assertEqual(re.split(":*", ":aa:bb::cc", emptyok=True),
+                          ['', '', 'a', 'a', '', 'b', 'b', '', 'c', 'c', ''])
+         self.assertEqual(re.split(":+", ":aa:bb::cc", emptyok=True),
+                          ['', 'aa', 'bb', 'cc'])
+         self.assertEqual(re.split("(:*)", ":a:b::c", emptyok=True),
+                          ['', ':', '', '', 'a', ':', '', '', 'b', '::', '',
+                           '', 'c', '', ''])
+         self.assertEqual(re.split("(:+)", ":a:b::c", emptyok=True),
+                          ['', ':', 'a', ':', 'b', '::', 'c'])
+         self.assertEqual(re.split("(:*)", ":aa:bb::cc", emptyok=True),
+                          ['', ':', '', '', 'a', '', 'a', ':', '', '', 'b', '',
+                           'b', '::', '', '', 'c', '', 'c', '', '']) 
+         self.assertEqual(re.split("(:+)", ":aa:bb::cc", emptyok=True),
+                          ['', ':', 'aa', ':', 'bb', '::', 'cc'])
+         self.assertEqual(re.split("(?::*)", ":a:b::c", emptyok=True),
+                          ['', '', 'a', '', 'b', '', 'c', ''])
+         self.assertEqual(re.split("(?::+)", ":a:b::c", emptyok=True),
+                          ['', 'a', 'b', 'c'])
+         self.assertEqual(re.split("(:)*", ":a:b::c", emptyok=True),
+                          ['', ':', '', None, 'a', ':', '', None, 'b', ':', '',
+                           None, 'c', None, ''])
+         self.assertEqual(re.split("(:)+", ":a:b::c", emptyok=True),
+                          ['', ':', 'a', ':', 'b', ':', 'c'])
+         self.assertEqual(re.split("([b:]+)", ":a:b::c", emptyok=True),
+                          ['', ':', 'a', ':b::', 'c'])
+         self.assertEqual(re.split("(b)|(:+)", ":a:b::c", emptyok=True),
+                          ['', None, ':', 'a', None, ':', '', 'b', None, '',
+                           None, '::', 'c'])
+         self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c", emptyok=True),
+                          ['', 'a', '', '', 'c'])
+ 
+     def test_qualified_re_split_emptyok(self):
+         self.assertEqual(re.split(":", ":a:b::c", 2, emptyok=True),
+                          ['', 'a', 'b::c'])
+         self.assertEqual(re.split(':', 'a:b:c:d', 2, emptyok=True),
+                          ['a', 'b', 'c:d'])
+         self.assertEqual(re.split("(:)", ":a:b::c", 2, emptyok=True),
+                          ['', ':', 'a', ':', 'b::c'])
+         self.assertEqual(re.split("(:*)", ":a:b::c", 2, emptyok=True),
+                          ['', ':', '', '', 'a:b::c'])
+         self.assertEqual(re.split("(:*)", "aa:b::c", 2, emptyok=True),
+                          ['', '', 'a', '', 'a:b::c'])
+ 
      def test_re_findall(self):
          self.assertEqual(re.findall(":+", "abc"), [])
          self.assertEqual(re.findall(":+", "a:b::c:::d"), [":", "::", ":::"])
Index: Misc/cheatsheet
===================================================================
RCS file: /cvsroot/python/python/dist/src/Misc/cheatsheet,v
retrieving revision 1.8
diff -c -r1.8 cheatsheet
*** Misc/cheatsheet	13 Mar 2004 20:27:23 -0000	1.8
--- Misc/cheatsheet	11 Jul 2004 03:12:54 -0000
***************
*** 1691,1697 ****
  flags])
  split(pattern, split <string> by occurrences of <pattern>. If capturing () are
  string[,       used inpattern, then occurrences of patterns or subpatterns are
! maxsplit=0])   also returned.
  findall(       return a list of non-overlapping matches in <pattern>, either a
  pattern,       list ofgroups or a list of tuples if the pattern has more than 1
  string)        group.
--- 1691,1699 ----
  flags])
  split(pattern, split <string> by occurrences of <pattern>. If capturing () are
  string[,       used inpattern, then occurrences of patterns or subpatterns are
! maxsplit=0,    also returned.  Empty matches are ignored unless emptyok is
! emptyok=       True.
! False])
  findall(       return a list of non-overlapping matches in <pattern>, either a
  pattern,       list ofgroups or a list of tuples if the pattern has more than 1
  string)        group.
Index: Modules/_sre.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Modules/_sre.c,v
retrieving revision 2.106
diff -c -r2.106 _sre.c
*** Modules/_sre.c	17 Jun 2004 18:27:16 -0000	2.106
--- Modules/_sre.c	11 Jul 2004 03:12:57 -0000
***************
*** 543,549 ****
          break;
  
      case SRE_OP_ANY_ALL:
!         /* repeated dot wildcare.  skip to the end of the target
             string, and backtrack from there */
          TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
          ptr = end;
--- 543,549 ----
          break;
  
      case SRE_OP_ANY_ALL:
!         /* repeated dot wildcard.  skip to the end of the target
             string, and backtrack from there */
          TRACE(("|%p|%p|COUNT ANY_ALL\n", pattern, ptr));
          ptr = end;
***************
*** 2312,2320 ****
  
      PyObject* string;
      int maxsplit = 0;
!     static char* kwlist[] = { "source", "maxsplit", NULL };
!     if (!PyArg_ParseTupleAndKeywords(args, kw, "O|i:split", kwlist,
!                                      &string, &maxsplit))
          return NULL;
  
      string = state_init(&state, self, string, 0, INT_MAX);
--- 2312,2321 ----
  
      PyObject* string;
      int maxsplit = 0;
!     int emptyok = 0;
!     static char* kwlist[] = { "source", "maxsplit", "emptyok", NULL };
!     if (!PyArg_ParseTupleAndKeywords(args, kw, "O|ii:split", kwlist,
!                                      &string, &maxsplit, &emptyok))
          return NULL;
  
      string = state_init(&state, self, string, 0, INT_MAX);
***************
*** 2350,2357 ****
              pattern_error(status);
              goto error;
          }
!         
!         if (state.start == state.ptr) {
              if (last == state.end)
                  break;
              /* skip one character */
--- 2351,2358 ----
              pattern_error(status);
              goto error;
          }
! 
!         if (!emptyok && state.start == state.ptr) {
              if (last == state.end)
                  break;
              /* skip one character */
***************
*** 2384,2389 ****
--- 2385,2398 ----
  
          n = n + 1;
  
+         if (emptyok && state.start == state.ptr) {
+             if (last == state.end)
+                 break;
+             /* skip one character */
+             last = state.ptr;
+             state.start = (void*) ((char*) state.ptr + state.charsize);
+             continue;
+         }
          last = state.start = state.ptr;
  
      }