diff --git a/Lib/textwrap.py b/Lib/textwrap.py --- a/Lib/textwrap.py +++ b/Lib/textwrap.py @@ -73,22 +73,36 @@ class TextWrapper: for x in _whitespace: unicode_whitespace_trans[ord(x)] = uspace - # This funky little regex is just the trick for splitting - # text up into word-wrappable chunks. E.g. + # Word + punctuation + word_punct = r'[\w\!\"\'\&\.\,\?]' + + # This funky little regex is just the trick for matching word-wrappable + # chunks one at a time in a text. + # E.g. # "Hello there -- you goof-ball, use the -b option!" - # splits into - # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option! - # (after stripping out empty strings). - wordsep_re = re.compile( - r'(\s+|' # any whitespace - r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' # hyphenated words - r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash + # matches into + # /Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!/ + wordmatch_re = re.compile( + r'(' + # any whitespace + r'\s+' + # em-dash surrounded by whitespace + r'|(?<=\s)-{{2,}}(?=\s)' + # em-dash after a word/punctuation character + r'|(?<={wp})-{{2,}}(?=\w)' + # word, possibly punctuated or hyphenated + r'|\S*?\w{wp}*(-\d\w+)*([^-\w\s]+|-(?=\w)|--+([^-\w]*$))?' + # catch-all for other stuff, until next space + r'|\S+' + r')' + .format(wp=word_punct)) + del word_punct - # This less funky little regex just split on recognized spaces. E.g. + # This less funky little regex just matches on recognized spaces. E.g. # "Hello there -- you goof-ball, use the -b option!" - # splits into - # Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/ - wordsep_simple_re = re.compile(r'(\s+)') + # matches into + # /Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/ + wordmatch_simple_re = re.compile(r'(\s+|\S+)') # XXX this is not locale- or charset-aware -- string.lowercase # is US-ASCII only (and therefore English-only) @@ -159,10 +173,18 @@ class TextWrapper: otherwise. 
""" if self.break_on_hyphens is True: - chunks = self.wordsep_re.split(text) + wordmatch = self.wordmatch_re else: - chunks = self.wordsep_simple_re.split(text) - chunks = [c for c in chunks if c] + wordmatch = self.wordmatch_simple_re + i = 0 + n = len(text) + chunks = [] + while i < n: + m = wordmatch.match(text, i) + assert m is not None, repr(text[i:]) + c = m.group(1) + chunks.append(c) + i += len(c) return chunks def _fix_sentence_endings(self, chunks):