diff -r eae2bb1930c1 Lib/test/test_grammar.py
--- a/Lib/test/test_grammar.py	Wed Feb 10 09:49:36 2016 -0800
+++ b/Lib/test/test_grammar.py	Thu Feb 11 12:03:45 2016 +0200
@@ -8,6 +8,66 @@ import sys
 # testing import *
 from sys import *
 
+# These are shared with test_tokenize.
+VALID_UNDERSCORE_LITERALS = [
+    '0_0_0',
+    '4_2',
+    '1_0000_0000',
+    '0b1001_0100',
+    '0xffff_ffff',
+    '0o5_7_7',
+    '1_00_00.5',
+    '1e1_0',
+    '.1_4',
+]
+INVALID_UNDERSCORE_LITERALS = [
+    # Trailing underscores:
+    '0_',
+    '42_',
+    '1.4j_',
+    '0b1_',
+    '0xf_',
+    '0o5_',
+    # Underscores in the base selector:
+    '0_b0',
+    '0_xf',
+    '0_o5',
+    # Underscore right after the base selector:
+    '0b_0',
+    '0x_f',
+    '0o_5',
+    # Old-style octal, still disallowed:
+    '0_7',
+    '09_99',
+    # Special case with exponent:
+    '0 if 1_Else 1',
+    # Underscore right before a dot:
+    '1_.4',
+    '1_.4j',
+    # Underscore right after a dot:
+    '1._4',
+    '1._4j',
+    '._5',
+    # Underscore right after a sign:
+    '1.0e+_1',
+    # Multiple consecutive underscores:
+    '4_______2',
+    '0.1__4',
+    '0b1001__0100',
+    '0xffff__ffff',
+    '0o5__77',
+    '1e1__0',
+    # Underscore right before j:
+    '1.4_j',
+    '1.4e5_j',
+    # Underscore right before e:
+    '1_e1',
+    '1.4_e1',
+    # Underscore right after e:
+    '1e_1',
+    '1.4e_1',
+]
+
 
 class TokenTests(unittest.TestCase):
 
@@ -87,6 +147,14 @@ class TokenTests(unittest.TestCase):
         self.assertEqual(1 if 0else 0, 0)
         self.assertRaises(SyntaxError, eval, "0 if 1Else 0")
 
+    def test_underscore_literals(self):
+        for lit in VALID_UNDERSCORE_LITERALS:
+            self.assertEqual(eval(lit), eval(lit.replace('_', '')))
+        for lit in INVALID_UNDERSCORE_LITERALS:
+            self.assertRaises(SyntaxError, eval, lit)
+        # Sanity check: no literal begins with an underscore
+        self.assertRaises(NameError, eval, "_0")
+
     def test_string_literals(self):
         x = ''; y = ""; self.assertTrue(len(x) == 0 and x == y)
         x = '\''; y = "'"; self.assertTrue(len(x) == 1 and x == y and ord(x) == 39)
diff -r eae2bb1930c1 Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py	Wed Feb 10 09:49:36 2016 -0800
+++ b/Lib/test/test_tokenize.py	Thu Feb 11 12:03:45 2016 +0200
@@ -4,6 +4,8 @@ from tokenize import (tokenize, _tokeniz
                      open as tokenize_open, Untokenizer)
 from io import BytesIO
 from unittest import TestCase, mock
+from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
+                               INVALID_UNDERSCORE_LITERALS)
 import os
 import token
 
@@ -185,6 +187,20 @@ def k(x):
     NUMBER     '3.14e159'    (1, 4) (1, 12)
     """)
 
+    def test_underscore_literals(self):
+        def number_token(s):
+            # Return the text of the first NUMBER token in s, or a
+            # sentinel that can never equal a literal if there is none.
+            f = BytesIO(s.encode('utf-8'))
+            for toktype, tokstr, start, end, line in tokenize(f.readline):
+                if tok_name[toktype] == 'NUMBER':
+                    return tokstr
+            return 'invalid token'
+        for lit in VALID_UNDERSCORE_LITERALS:
+            self.assertEqual(number_token(lit), lit)
+        for lit in INVALID_UNDERSCORE_LITERALS:
+            self.assertNotEqual(number_token(lit), lit)
+
     def test_string(self):
         # String literals
         self.check_tokenize("x = ''; y = \"\"", """\
diff -r eae2bb1930c1 Lib/tokenize.py
--- a/Lib/tokenize.py	Wed Feb 10 09:49:36 2016 -0800
+++ b/Lib/tokenize.py	Thu Feb 11 12:03:45 2016 +0200
@@ -120,16 +120,17 @@ Comment = r'#[^\r\n]*'
 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
 Name = r'\w+'
 
-Hexnumber = r'0[xX][0-9a-fA-F]+'
-Binnumber = r'0[bB][01]+'
-Octnumber = r'0[oO][0-7]+'
-Decnumber = r'(?:0+|[1-9][0-9]*)'
+Hexnumber = r'0[xX][0-9a-fA-F](?:_?[0-9a-fA-F])*'
+Binnumber = r'0[bB][01](?:_?[01])*'
+Octnumber = r'0[oO][0-7](?:_?[0-7])*'
+Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?[0-9]+'
-Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
-Expfloat = r'[0-9]+' + Exponent
+Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
+Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
+                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
+Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
 Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
+Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
 Number = group(Imagnumber, Floatnumber, Intnumber)
 
 # Return the empty string, plus all of the valid string prefixes.
diff -r eae2bb1930c1 Parser/tokenizer.c
--- a/Parser/tokenizer.c	Wed Feb 10 09:49:36 2016 -0800
+++ b/Parser/tokenizer.c	Thu Feb 11 12:03:45 2016 +0200
@@ -1332,6 +1332,32 @@ verify_identifier(struct tok_state *tok)
 }
 #endif
 
+/* Consume the rest of a run of decimal digits in which a single underscore
+   may separate two digits.  Return the first character read that is not
+   part of the run; it is NOT backed up, the caller must deal with it.  If
+   an underscore is not followed by a digit, set tok->done to E_TOKEN, back
+   up the offending character and return 0 (callers test for 0). */
+static int
+tok_decimal_tail(struct tok_state *tok)
+{
+    int c;
+    while (1) {
+        do {
+            c = tok_nextc(tok);
+        } while (isdigit(c));
+        if (c != '_') {
+            break;
+        }
+        c = tok_nextc(tok);
+        if (!isdigit(c)) {
+            tok->done = E_TOKEN;
+            tok_backup(tok, c);
+            return 0;
+        }
+    }
+    return c;
+}
+
 /* Get next token, after space stripping etc. */
 
 static int
@@ -1586,64 +1612,88 @@ tok_get(struct tok_state *tok, char **p_
         if (c == '0') {
             /* Hex, octal or binary -- maybe. */
             c = tok_nextc(tok);
-            if (c == '.')
+            if (c == '.') {
+                c = tok_nextc(tok);
                 goto fraction;
+            }
             if (c == 'j' || c == 'J')
                 goto imaginary;
             if (c == 'x' || c == 'X') {
-
                 /* Hex */
-                c = tok_nextc(tok);
-                if (!isxdigit(c)) {
-                    tok->done = E_TOKEN;
-                    tok_backup(tok, c);
-                    return ERRORTOKEN;
-                }
                 do {
                     c = tok_nextc(tok);
-                } while (isxdigit(c));
+                    if (!isxdigit(c)) {
+                        tok->done = E_TOKEN;
+                        tok_backup(tok, c);
+                        return ERRORTOKEN;
+                    }
+                    do {
+                        c = tok_nextc(tok);
+                    } while (isxdigit(c));
+                } while (c == '_');
             }
             else if (c == 'o' || c == 'O') {
                 /* Octal */
-                c = tok_nextc(tok);
-                if (c < '0' || c >= '8') {
-                    tok->done = E_TOKEN;
-                    tok_backup(tok, c);
-                    return ERRORTOKEN;
-                }
                 do {
                     c = tok_nextc(tok);
-                } while ('0' <= c && c < '8');
+                    if (c < '0' || c >= '8') {
+                        tok->done = E_TOKEN;
+                        tok_backup(tok, c);
+                        return ERRORTOKEN;
+                    }
+                    do {
+                        c = tok_nextc(tok);
+                    } while ('0' <= c && c < '8');
+                } while (c == '_');
             }
             else if (c == 'b' || c == 'B') {
                 /* Binary */
-                c = tok_nextc(tok);
-                if (c != '0' && c != '1') {
-                    tok->done = E_TOKEN;
-                    tok_backup(tok, c);
-                    return ERRORTOKEN;
-                }
                 do {
                     c = tok_nextc(tok);
-                } while (c == '0' || c == '1');
+                    if (c != '0' && c != '1') {
+                        tok->done = E_TOKEN;
+                        tok_backup(tok, c);
+                        return ERRORTOKEN;
+                    }
+                    do {
+                        c = tok_nextc(tok);
+                    } while (c == '0' || c == '1');
+                } while (c == '_');
             }
             else {
                 int nonzero = 0;
                 /* maybe old-style octal; c is first char of it */
                 /* in any case, allow '0' as a literal */
-                while (c == '0')
-                    c = tok_nextc(tok);
-                while (isdigit(c)) {
-                    nonzero = 1;
+                while (1) {
+                    if (c == '_') {
+                        c = tok_nextc(tok);
+                        if (!isdigit(c)) {
+                            tok->done = E_TOKEN;
+                            tok_backup(tok, c);
+                            return ERRORTOKEN;
+                        }
+                    }
+                    if (c != '0')
+                        break;
                     c = tok_nextc(tok);
                 }
-                if (c == '.')
+                if (isdigit(c)) {
+                    nonzero = 1;
+                    c = tok_decimal_tail(tok);
+                    if (c == 0) {
+                        return ERRORTOKEN;
+                    }
+                }
+                if (c == '.') {
+                    c = tok_nextc(tok);
                     goto fraction;
+                }
                 else if (c == 'e' || c == 'E')
                     goto exponent;
                 else if (c == 'j' || c == 'J')
                     goto imaginary;
                 else if (nonzero) {
+                    /* Old-style octal: now disallowed. */
                     tok->done = E_TOKEN;
                     tok_backup(tok, c);
                     return ERRORTOKEN;
@@ -1652,17 +1702,22 @@ tok_get(struct tok_state *tok, char **p_
         }
         else {
             /* Decimal */
-            do {
-                c = tok_nextc(tok);
-            } while (isdigit(c));
+            c = tok_decimal_tail(tok);
+            if (c == 0) {
+                return ERRORTOKEN;
+            }
             {
                 /* Accept floating point numbers. */
                 if (c == '.') {
+                    c = tok_nextc(tok);
         fraction:
                     /* Fraction */
-                    do {
-                        c = tok_nextc(tok);
-                    } while (isdigit(c));
+                    if (isdigit(c)) {
+                        c = tok_decimal_tail(tok);
+                        if (c == 0) {
+                            return ERRORTOKEN;
+                        }
+                    }
                 }
                 if (c == 'e' || c == 'E') {
                     int e;
@@ -1684,9 +1739,10 @@ tok_get(struct tok_state *tok, char **p_
                         *p_end = tok->cur;
                         return NUMBER;
                     }
-                    do {
-                        c = tok_nextc(tok);
-                    } while (isdigit(c));
+                    c = tok_decimal_tail(tok);
+                    if (c == 0) {
+                        return ERRORTOKEN;
+                    }
                 }
                 if (c == 'j' || c == 'J')
                     /* Imaginary part */
diff -r eae2bb1930c1 Python/ast.c
--- a/Python/ast.c	Wed Feb 10 09:49:36 2016 -0800
+++ b/Python/ast.c	Thu Feb 11 12:03:45 2016 +0200
@@ -3941,7 +3941,7 @@ ast_for_stmt(struct compiling *c, const 
 }
 
 static PyObject *
-parsenumber(struct compiling *c, const char *s)
+parsenumber_raw(struct compiling *c, const char *s)
 {
     const char *end;
     long x;
@@ -3984,6 +3984,37 @@ parsenumber(struct compiling *c, const c
 }
 
 static PyObject *
+parsenumber(struct compiling *c, const char *s)
+{
+    /* Strip underscores from the literal s (if it has any) and pass the
+       result to parsenumber_raw().  Returns what parsenumber_raw() returns,
+       or NULL with MemoryError set if the copy cannot be allocated. */
+    char *dup, *end;
+    PyObject *res = NULL;
+
+    assert(s != NULL);
+
+    if (strchr(s, '_') == NULL) {
+        return parsenumber_raw(c, s);
+    }
+    /* Create a duplicate without underscores. */
+    dup = PyMem_Malloc(strlen(s) + 1);
+    if (dup == NULL) {
+        return PyErr_NoMemory();
+    }
+    end = dup;
+    for (; *s; s++) {
+        if (*s != '_') {
+            *end++ = *s;
+        }
+    }
+    *end = '\0';
+    res = parsenumber_raw(c, dup);
+    PyMem_Free(dup);
+    return res;
+}
+
+static PyObject *
 decode_utf8(struct compiling *c, const char **sPtr, const char *end)
 {
     const char *s, *t;