diff -r 5c3f5907b325 Doc/library/re.rst --- a/Doc/library/re.rst Mon Aug 22 15:30:06 2011 +1000 +++ b/Doc/library/re.rst Mon Aug 22 16:04:56 2011 +1000 @@ -329,13 +329,17 @@ ``\W`` character (or vice versa). By default Unicode alphanumerics are the ones used, but this can be changed by using the :const:`ASCII` flag. Inside a character range, ``\b`` represents the backspace - character, for compatibility with Python's string literals. + character, for compatibility with Python's string literals. + + ``\b`` also matches the boundary of a word at the start or end of a string. + It never matches when the string being searched is empty. ``\B`` - Matches the empty string, but only when it is *not* at the beginning or end of a - word. This is just the opposite of ``\b``, so word characters are - Unicode alphanumerics or the underscore, although this can be changed - by using the :const:`ASCII` flag. + Matches the empty string, but only between two ``\w`` characters or between + two ``\W`` characters. + + ``\B`` never matches when the string being searched is empty, but it can match next to a non-word + character at the start or end of a string. ``\d`` For Unicode (str) patterns: diff -r 5c3f5907b325 Lib/test/test_re.py --- a/Lib/test/test_re.py Mon Aug 22 15:30:06 2011 +1000 +++ b/Lib/test/test_re.py Mon Aug 22 16:04:56 2011 +1000 @@ -355,6 +355,32 @@ self.assertEqual(re.search(r"\d\D\w\W\s\S", "1aa! a", re.UNICODE).group(0), "1aa! a") + def test_string_boundaries(self): + # See http://bugs.python.org/issue10713 + self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), + "abc") + # There's a word boundary at the start of a string that begins with a word character. + self.assertTrue(re.match(r"\b", "abc")) + # A word of two or more characters includes a non-boundary zero-length match. + self.assertTrue(re.search(r"\B", "abc")) + # There is no non-boundary match at the start of a string that begins with a word character. + self.assertFalse(re.match(r"\B", "abc")) + # However, an empty string contains no word boundaries, and also no + # non-boundaries. 
+ self.assertEqual(re.search(r"\B", ""), None) + # This one is questionable and different from the perlre behaviour, + # but describes current behavior. + self.assertEqual(re.search(r"\b", ""), None) + # A single word-character string has two boundaries, but no + # non-boundary gaps. + self.assertEqual(len(re.findall(r"\b", "a")), 2) + self.assertEqual(len(re.findall(r"\B", "a")), 0) + # If there are no words, there are no boundaries + self.assertEqual(len(re.findall(r"\b", " ")), 0) + self.assertEqual(len(re.findall(r"\b", "   ")), 0) + # Can match around the whitespace. + self.assertEqual(len(re.findall(r"\B", " ")), 2) + def test_bigcharset(self): self.assertEqual(re.match("([\u2222\u2223])", "\u2222").group(1), "\u2222")