Index: Lib/tokenize.py
===================================================================
--- Lib/tokenize.py	(révision 66750)
+++ Lib/tokenize.py	(copie de travail)
@@ -26,7 +26,7 @@
 
 import re, string, sys
 from token import *
-from codecs import lookup
+from codecs import lookup, BOM_UTF8
 from itertools import chain, repeat
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
 
@@ -251,7 +251,8 @@
 
     It detects the encoding from the presence of a utf-8 bom or an encoding
     cookie as specified in pep-0263. If both a bom and a cookie are present,
-    but disagree, a SyntaxError will be raised.
+    but disagree, a SyntaxError will be raised. If the encoding cookie is an
+    invalid charset, raise a SyntaxError.
 
     If no encoding is specified, then the default of 'utf-8' will be returned.
     """
@@ -268,18 +269,25 @@
         try:
             line_string = line.decode('ascii')
         except UnicodeDecodeError:
-            pass
-        else:
-            matches = cookie_re.findall(line_string)
-            if matches:
-                encoding = matches[0]
-                if bom_found and lookup(encoding).name != 'utf-8':
-                    # This behaviour mimics the Python interpreter
-                    raise SyntaxError('encoding problem: utf-8')
-                return encoding
+            return None
 
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = matches[0]
+        try:
+            codec = lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found and codec.name != 'utf-8':
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError('encoding problem: utf-8')
+        return encoding
+
     first = read_or_stop()
-    if first.startswith(utf8_bom):
+    if first.startswith(BOM_UTF8):
         bom_found = True
         first = first[3:]
     if not first: