--- orig/textwrap.py 2008-10-21 20:23:45.000000000 +0200 +++ textwrap.py 2008-10-21 20:38:14.000000000 +0200 @@ -85,8 +85,9 @@ # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option! # (after stripping out empty strings). wordsep_re = re.compile( + r'(?u)' # Unicode \w r'(\s+|' # any whitespace - r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|' # hyphenated words + r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' # hyphenated words r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash # This less funky little regex just split on recognized spaces. E.g. --- test/orig/test_textwrap.py 2008-10-21 20:24:06.000000000 +0200 +++ test/test_textwrap.py 2008-10-21 20:31:12.000000000 +0200 @@ -174,7 +174,7 @@ text = ("Python 1.0.0 was released on 1994-01-26. Python 1.0.1 was\n" "released on 1994-02-15.") - self.check_wrap(text, 30, ['Python 1.0.0 was released on', + self.check_wrap(text, 34, ['Python 1.0.0 was released on', '1994-01-26. Python 1.0.1 was', 'released on 1994-02-15.']) self.check_wrap(text, 40, ['Python 1.0.0 was released on 1994-01-26.', @@ -353,6 +353,14 @@ otext = self.wrapper.fill(text) assert isinstance(otext, unicode) + def test_no_split_at_umlaut(self): + text = u"Die Empf\xe4nger-Auswahl" + self.check_wrap(text, 13, [u"Die", u"Empf\xe4nger-", u"Auswahl"]) + + def test_umlaut_is_alphabetic_character(self): + text = u"aa \xe4\xe4-\xe4\xe4" + self.check_wrap(text, 7, [u"aa \xe4\xe4-", u"\xe4\xe4"]) + def test_split(self): # Ensure that the standard _split() method works as advertised # in the comments