Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(35636)

Side by Side Diff: Lib/lib2to3/pgen2/tokenize.py

Issue 18873: "Encoding" detected in non-comment lines
Patch Set: Created 6 years, 1 month ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « Lib/idlelib/IOBinding.py ('k') | Lib/lib2to3/tests/data/false_encoding.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
OLD | NEW
1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. 1 # Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
2 # All rights reserved. 2 # All rights reserved.
3 3
4 """Tokenization help for Python programs. 4 """Tokenization help for Python programs.
5 5
6 generate_tokens(readline) is a generator that breaks a stream of 6 generate_tokens(readline) is a generator that breaks a stream of
7 text into Python tokens. It accepts a readline-like method which is called 7 text into Python tokens. It accepts a readline-like method which is called
8 repeatedly to get the next line of input (or "" for EOF). It generates 8 repeatedly to get the next line of input (or "" for EOF). It generates
9 5-tuples with these members: 9 5-tuples with these members:
10 10
(...skipping 218 matching lines...) Expand 10 before | Expand all | Expand 10 after
229 elif toknum == DEDENT: 229 elif toknum == DEDENT:
230 indents.pop() 230 indents.pop()
231 continue 231 continue
232 elif toknum in (NEWLINE, NL): 232 elif toknum in (NEWLINE, NL):
233 startline = True 233 startline = True
234 elif startline and indents: 234 elif startline and indents:
235 toks_append(indents[-1]) 235 toks_append(indents[-1])
236 startline = False 236 startline = False
237 toks_append(tokval) 237 toks_append(tokval)
238 238
239 cookie_re = re.compile("coding[:=]\s*([-\w.]+)") 239 cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
240 240
241 def _get_normal_name(orig_enc): 241 def _get_normal_name(orig_enc):
242 """Imitates get_normal_name in tokenizer.c.""" 242 """Imitates get_normal_name in tokenizer.c."""
243 # Only care about the first 12 characters. 243 # Only care about the first 12 characters.
244 enc = orig_enc[:12].lower().replace("_", "-") 244 enc = orig_enc[:12].lower().replace("_", "-")
245 if enc == "utf-8" or enc.startswith("utf-8-"): 245 if enc == "utf-8" or enc.startswith("utf-8-"):
246 return "utf-8" 246 return "utf-8"
247 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ 247 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
248 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): 248 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
249 return "iso-8859-1" 249 return "iso-8859-1"
(...skipping 24 matching lines...) Expand all
274 try: 274 try:
275 return readline() 275 return readline()
276 except StopIteration: 276 except StopIteration:
277 return bytes() 277 return bytes()
278 278
279 def find_cookie(line): 279 def find_cookie(line):
280 try: 280 try:
281 line_string = line.decode('ascii') 281 line_string = line.decode('ascii')
282 except UnicodeDecodeError: 282 except UnicodeDecodeError:
283 return None 283 return None
284 284 match = cookie_re.match(line_string)
285 matches = cookie_re.findall(line_string) 285 if not match:
286 if not matches:
287 return None 286 return None
288 encoding = _get_normal_name(matches[0]) 287 encoding = _get_normal_name(match.group(1))
289 try: 288 try:
290 codec = lookup(encoding) 289 codec = lookup(encoding)
291 except LookupError: 290 except LookupError:
292 # This behaviour mimics the Python interpreter 291 # This behaviour mimics the Python interpreter
293 raise SyntaxError("unknown encoding: " + encoding) 292 raise SyntaxError("unknown encoding: " + encoding)
294 293
295 if bom_found: 294 if bom_found:
296 if codec.name != 'utf-8': 295 if codec.name != 'utf-8':
297 # This behaviour mimics the Python interpreter 296 # This behaviour mimics the Python interpreter
298 raise SyntaxError('encoding problem: utf-8') 297 raise SyntaxError('encoding problem: utf-8')
(...skipping 192 matching lines...) Expand 10 before | Expand all | Expand 10 after
491 pos = pos + 1 490 pos = pos + 1
492 491
493 for indent in indents[1:]: # pop remaining indent levels 492 for indent in indents[1:]: # pop remaining indent levels
494 yield (DEDENT, '', (lnum, 0), (lnum, 0), '') 493 yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
495 yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') 494 yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
496 495
497 if __name__ == '__main__': # testing 496 if __name__ == '__main__': # testing
498 import sys 497 import sys
499 if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) 498 if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
500 else: tokenize(sys.stdin.readline) 499 else: tokenize(sys.stdin.readline)
OLD | NEW
« no previous file with comments | « Lib/idlelib/IOBinding.py ('k') | Lib/lib2to3/tests/data/false_encoding.py » ('j') | no next file with comments »

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+