Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(11331)

Delta Between Two Patch Sets: Lib/textwrap.py

Issue 20491: textwrap: Non-breaking space not honored
Left Patch Set: Created 6 years, 3 months ago
Right Patch Set: Created 3 years, 9 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « Lib/test/test_textwrap.py ('k') | no next file » | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments | Hide Comments ('s')
LEFT | RIGHT
1 """Text wrapping and filling. 1 """Text wrapping and filling.
2 """ 2 """
3 3
4 # Copyright (C) 1999-2001 Gregory P. Ward. 4 # Copyright (C) 1999-2001 Gregory P. Ward.
5 # Copyright (C) 2002, 2003 Python Software Foundation. 5 # Copyright (C) 2002, 2003 Python Software Foundation.
6 # Written by Greg Ward <gward@python.net> 6 # Written by Greg Ward <gward@python.net>
7 7
8 import re 8 import re
9 9
10 __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten'] 10 __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
11 11
12 # Hardcode the recognized whitespace characters to the US-ASCII 12 # Hardcode the recognized whitespace characters to the US-ASCII
13 # whitespace characters. The main reason for doing this is that in 13 # whitespace characters. The main reason for doing this is that
14 # ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales 14 # some Unicode spaces (like \u00a0) are non-breaking whitespaces.
15 # that character winds up in string.whitespace. Respecting
16 # string.whitespace in those cases would 1) make textwrap treat 0xa0 the
17 # same as any other whitespace char, which is clearly wrong (it's a
18 # *non-breaking* space), 2) possibly cause problems with Unicode,
19 # since 0xa0 is not in range(128).
20 _whitespace = '\t\n\x0b\x0c\r ' 15 _whitespace = '\t\n\x0b\x0c\r '
eric.araujo 2014/03/18 23:26:56 Like I said on the ticket, this comment should be
21 16
22 class TextWrapper: 17 class TextWrapper:
23 """ 18 """
24 Object for wrapping/filling text. The public interface consists of 19 Object for wrapping/filling text. The public interface consists of
25 the wrap() and fill() methods; the other methods are just there for 20 the wrap() and fill() methods; the other methods are just there for
26 subclasses to override in order to tweak the default behaviour. 21 subclasses to override in order to tweak the default behaviour.
27 If you want to completely replace the main wrapping algorithm, 22 If you want to completely replace the main wrapping algorithm,
28 you'll probably have to override _wrap_chunks(). 23 you'll probably have to override _wrap_chunks().
29 24
30 Several instance attributes control various aspects of wrapping: 25 Several instance attributes control various aspects of wrapping:
(...skipping 41 matching lines...) Expand 10 before | Expand all | Expand 10 after
72 uspace = ord(' ') 67 uspace = ord(' ')
73 for x in _whitespace: 68 for x in _whitespace:
74 unicode_whitespace_trans[ord(x)] = uspace 69 unicode_whitespace_trans[ord(x)] = uspace
75 70
76 # This funky little regex is just the trick for splitting 71 # This funky little regex is just the trick for splitting
77 # text up into word-wrappable chunks. E.g. 72 # text up into word-wrappable chunks. E.g.
78 # "Hello there -- you goof-ball, use the -b option!" 73 # "Hello there -- you goof-ball, use the -b option!"
79 # splits into 74 # splits into
80 # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option! 75 # Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
81 # (after stripping out empty strings). 76 # (after stripping out empty strings).
82 wordsep_re = re.compile( 77 word_punct = r'[\w!"\'&.,?]'
83 # any whitespace 78 letter = r'[^\d\W]'
84 (r'([%s]+|' % _whitespace) + 79 whitespace = r'[%s]' % re.escape(_whitespace)
85 # hyphenated words 80 nowhitespace = '[^' + whitespace[1:]
86 (r'[^%s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])|' % _whitespace) + 81 wordsep_re = re.compile(r'''
87 # em-dash 82 ( # any whitespace
88 r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))' 83 %(ws)s+
89 ) 84 | # em-dash between words
eric.araujo 2014/03/18 23:26:56 As I recommended, this would be most readable with
85 (?<=%(wp)s) -{2,} (?=\w)
86 | # word, possibly hyphenated
87 %(nws)s+? (?:
88 # hyphenated word
89 -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
90 (?= %(lt)s -? %(lt)s)
91 | # end of word
92 (?=%(ws)s|\Z)
93 | # em-dash
94 (?<=%(wp)s) (?=-{2,}\w)
95 )
96 )''' % {'wp': word_punct, 'lt': letter,
97 'ws': whitespace, 'nws': nowhitespace},
98 re.VERBOSE)
99 del word_punct, letter, nowhitespace
90 100
91 # This less funky little regex just splits on recognized spaces. E.g. 101 # This less funky little regex just splits on recognized spaces. E.g.
92 # "Hello there -- you goof-ball, use the -b option!" 102 # "Hello there -- you goof-ball, use the -b option!"
93 # splits into 103 # splits into
94 # Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/ 104 # Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
95 wordsep_simple_re = re.compile(r'([%s]+)' % _whitespace) 105 wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
106 del whitespace
96 107
97 # XXX this is not locale- or charset-aware -- string.lowercase 108 # XXX this is not locale- or charset-aware -- string.lowercase
98 # is US-ASCII only (and therefore English-only) 109 # is US-ASCII only (and therefore English-only)
99 sentence_end_re = re.compile(r'[a-z]' # lowercase letter 110 sentence_end_re = re.compile(r'[a-z]' # lowercase letter
100 r'[\.\!\?]' # sentence-ending punct. 111 r'[\.\!\?]' # sentence-ending punct.
101 r'[\"\']?' # optional end-of-quote 112 r'[\"\']?' # optional end-of-quote
102 r'\Z') # end of chunk 113 r'\Z') # end of chunk
103
104 114
105 def __init__(self, 115 def __init__(self,
106 width=70, 116 width=70,
107 initial_indent="", 117 initial_indent="",
108 subsequent_indent="", 118 subsequent_indent="",
109 expand_tabs=True, 119 expand_tabs=True,
110 replace_whitespace=True, 120 replace_whitespace=True,
111 fix_sentence_endings=False, 121 fix_sentence_endings=False,
112 break_long_words=True, 122 break_long_words=True,
113 drop_whitespace=True, 123 drop_whitespace=True,
(...skipping 16 matching lines...) Expand all
130 self.placeholder = placeholder 140 self.placeholder = placeholder
131 141
132 142
133 # -- Private methods ----------------------------------------------- 143 # -- Private methods -----------------------------------------------
134 # (possibly useful for subclasses to override) 144 # (possibly useful for subclasses to override)
135 145
136 def _munge_whitespace(self, text): 146 def _munge_whitespace(self, text):
137 """_munge_whitespace(text : string) -> string 147 """_munge_whitespace(text : string) -> string
138 148
139 Munge whitespace in text: expand tabs and convert all other 149 Munge whitespace in text: expand tabs and convert all other
140 whitespace characters to spaces. Eg. " foo\tbar\n\nbaz" 150 whitespace characters to spaces. Eg. " foo\\tbar\\n\\nbaz"
141 becomes " foo bar baz". 151 becomes " foo bar baz".
142 """ 152 """
143 if self.expand_tabs: 153 if self.expand_tabs:
144 text = text.expandtabs(self.tabsize) 154 text = text.expandtabs(self.tabsize)
145 if self.replace_whitespace: 155 if self.replace_whitespace:
146 text = text.translate(self.unicode_whitespace_trans) 156 text = text.translate(self.unicode_whitespace_trans)
147 return text 157 return text
148 158
149 159
150 def _split(self, text): 160 def _split(self, text):
(...skipping 15 matching lines...) Expand all
166 chunks = self.wordsep_re.split(text) 176 chunks = self.wordsep_re.split(text)
167 else: 177 else:
168 chunks = self.wordsep_simple_re.split(text) 178 chunks = self.wordsep_simple_re.split(text)
169 chunks = [c for c in chunks if c] 179 chunks = [c for c in chunks if c]
170 return chunks 180 return chunks
171 181
172 def _fix_sentence_endings(self, chunks): 182 def _fix_sentence_endings(self, chunks):
173 """_fix_sentence_endings(chunks : [string]) 183 """_fix_sentence_endings(chunks : [string])
174 184
175 Correct for sentence endings buried in 'chunks'. Eg. when the 185 Correct for sentence endings buried in 'chunks'. Eg. when the
176 original text contains "... foo.\nBar ...", munge_whitespace() 186 original text contains "... foo.\\nBar ...", munge_whitespace()
177 and split() will convert that to [..., "foo.", " ", "Bar", ...] 187 and split() will convert that to [..., "foo.", " ", "Bar", ...]
178 which has one too few spaces; this method simply changes the one 188 which has one too few spaces; this method simply changes the one
179 space to two. 189 space to two.
180 """ 190 """
181 i = 0 191 i = 0
182 patsearch = self.sentence_end_re.search 192 patsearch = self.sentence_end_re.search
183 while i < len(chunks)-1: 193 while i < len(chunks)-1:
184 if chunks[i+1] == " " and patsearch(chunks[i]): 194 if chunks[i+1] == " " and patsearch(chunks[i]):
185 chunks[i+1] = " " 195 chunks[i+1] = " "
186 i += 2 196 i += 2
(...skipping 215 matching lines...) Expand 10 before | Expand all | Expand 10 after
402 _leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE) 412 _leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)
403 413
404 def dedent(text): 414 def dedent(text):
405 """Remove any common leading whitespace from every line in `text`. 415 """Remove any common leading whitespace from every line in `text`.
406 416
407 This can be used to make triple-quoted strings line up with the left 417 This can be used to make triple-quoted strings line up with the left
408 edge of the display, while still presenting them in the source code 418 edge of the display, while still presenting them in the source code
409 in indented form. 419 in indented form.
410 420
411 Note that tabs and spaces are both treated as whitespace, but they 421 Note that tabs and spaces are both treated as whitespace, but they
412 are not equal: the lines " hello" and "\thello" are 422 are not equal: the lines " hello" and "\\thello" are
413 considered to have no common leading whitespace. (This behaviour is 423 considered to have no common leading whitespace. (This behaviour is
414 new in Python 2.5; older versions of this module incorrectly 424 new in Python 2.5; older versions of this module incorrectly
415 expanded tabs before searching for common leading whitespace.) 425 expanded tabs before searching for common leading whitespace.)
416 """ 426 """
417 # Look for the longest leading string of spaces and tabs common to 427 # Look for the longest leading string of spaces and tabs common to
418 # all lines. 428 # all lines.
419 margin = None 429 margin = None
420 text = _whitespace_only_re.sub('', text) 430 text = _whitespace_only_re.sub('', text)
421 indents = _leading_whitespace_re.findall(text) 431 indents = _leading_whitespace_re.findall(text)
422 for indent in indents: 432 for indent in indents:
423 if margin is None: 433 if margin is None:
424 margin = indent 434 margin = indent
425 435
426 # Current line more deeply indented than previous winner: 436 # Current line more deeply indented than previous winner:
427 # no change (previous winner is still on top). 437 # no change (previous winner is still on top).
428 elif indent.startswith(margin): 438 elif indent.startswith(margin):
429 pass 439 pass
430 440
431 # Current line consistent with and no deeper than previous winner: 441 # Current line consistent with and no deeper than previous winner:
432 # it's the new winner. 442 # it's the new winner.
433 elif margin.startswith(indent): 443 elif margin.startswith(indent):
434 margin = indent 444 margin = indent
435 445
436 # Current line and previous winner have no common whitespace: 446 # Find the largest common whitespace between current line and previous
437 # there is no margin. 447 # winner.
438 else: 448 else:
439 margin = "" 449 for i, (x, y) in enumerate(zip(margin, indent)):
440 break 450 if x != y:
451 margin = margin[:i]
452 break
453 else:
454 margin = margin[:len(indent)]
441 455
442 # sanity check (testing/debugging only) 456 # sanity check (testing/debugging only)
443 if 0 and margin: 457 if 0 and margin:
444 for line in text.split("\n"): 458 for line in text.split("\n"):
445 assert not line or line.startswith(margin), \ 459 assert not line or line.startswith(margin), \
446 "line = %r, margin = %r" % (line, margin) 460 "line = %r, margin = %r" % (line, margin)
447 461
448 if margin: 462 if margin:
449 text = re.sub(r'(?m)^' + margin, '', text) 463 text = re.sub(r'(?m)^' + margin, '', text)
450 return text 464 return text
(...skipping 14 matching lines...) Expand all
465 def prefixed_lines(): 479 def prefixed_lines():
466 for line in text.splitlines(True): 480 for line in text.splitlines(True):
467 yield (prefix + line if predicate(line) else line) 481 yield (prefix + line if predicate(line) else line)
468 return ''.join(prefixed_lines()) 482 return ''.join(prefixed_lines())
469 483
470 484
471 if __name__ == "__main__": 485 if __name__ == "__main__":
472 #print dedent("\tfoo\n\tbar") 486 #print dedent("\tfoo\n\tbar")
473 #print dedent(" \thello there\n \t how are you?") 487 #print dedent(" \thello there\n \t how are you?")
474 print(dedent("Hello there.\n This is indented.")) 488 print(dedent("Hello there.\n This is indented."))
LEFT | RIGHT

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+