diff -r c711c36cf988 Lib/test/test_grammar.py
--- a/Lib/test/test_grammar.py Wed Feb 10 10:31:43 2016 +0200
+++ b/Lib/test/test_grammar.py Wed Feb 10 18:47:35 2016 +0100
@@ -8,6 +8,54 @@ import sys
 # testing import *
 from sys import *
 
+# These are shared with test_tokenize.
+VALID_UNDERSCORE_LITERALS = [
+    '0_0_0',
+    '4_2',
+    '4_______2',
+    '1_0000_0000',
+    '0b_1001_0100',
+    '0x_ffff_ffff',
+    '0o_5_7_7',
+    '1__.4',
+    '1.4_j',
+    '1.4e5_j',
+    '1_00_00_.5',
+    '1_e10',
+    '1_E10',
+    '1_e1_0',
+    '1.4e_1',
+    '.1_4',
+    '.1_4e_1',
+    '0 if 1_____else 1',
+    '0 if 1.0_____else 1',
+]
+INVALID_UNDERSCORE_LITERALS = [
+    # Trailing underscores:
+    '0_',
+    '42_',
+    '1.4j_',
+    '0b1_',
+    '0xf_',
+    '0o5_',
+    # Underscores in the base selector:
+    '0_b0',
+    '0_xf',
+    '0_o5',
+    # Old-style octal, still disallowed:
+    '0_7',
+    '09_99',
+    # Special case with exponent:
+    '0 if 1_Else 1',
+    # Underscore right after a dot:
+    '1._4',
+    '1._4j',
+    '1._4e5_j',
+    '._5',
+    # Underscore right after a sign:
+    '1.0e+_1',
+]
+
 
 class TokenTests(unittest.TestCase):
 
@@ -87,6 +135,17 @@ class TokenTests(unittest.TestCase):
         self.assertEqual(1 if 0else 0, 0)
         self.assertRaises(SyntaxError, eval, "0 if 1Else 0")
 
+    def test_underscore_literals(self):
+        # Each valid literal must evaluate like its underscore-free form.
+        for lit in VALID_UNDERSCORE_LITERALS:
+            with self.subTest(lit=lit):
+                self.assertEqual(eval(lit), eval(lit.replace('_', '')))
+        for lit in INVALID_UNDERSCORE_LITERALS:
+            with self.subTest(lit=lit):
+                self.assertRaises(SyntaxError, eval, lit)
+        # Sanity check: no literal begins with an underscore
+        self.assertRaises(NameError, eval, "_0")
+
     def test_string_literals(self):
         x = ''; y = ""; self.assertTrue(len(x) == 0 and x == y)
         x = '\''; y = "'"; self.assertTrue(len(x) == 1 and x == y and ord(x) == 39)
diff -r c711c36cf988 Lib/test/test_tokenize.py
--- a/Lib/test/test_tokenize.py Wed Feb 10 10:31:43 2016 +0200
+++ b/Lib/test/test_tokenize.py Wed Feb 10 18:47:35 2016 +0100
@@ -4,6 +4,8 @@ from tokenize import (tokenize, _tokeniz
                      open as tokenize_open, Untokenizer)
 from io import BytesIO
 from unittest import TestCase, mock
+from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
+                               INVALID_UNDERSCORE_LITERALS)
 import os
 import token
 
@@ -185,6 +187,23 @@ def k(x):
     NUMBER     '3.14e159'    (1, 4) (1, 12)
     """)
 
+    def test_underscore_literals(self):
+        def number_token(s):
+            # Return the first NUMBER token of s, or a sentinel string
+            # that cannot equal any literal when there is none.
+            f = BytesIO(s.encode('utf-8'))
+            for toktype, tokstr, start, end, line in tokenize(f.readline):
+                if tok_name[toktype] == 'NUMBER':
+                    return tokstr
+            return 'invalid token'
+        for lit in VALID_UNDERSCORE_LITERALS:
+            if 'else' in lit:
+                # a whole expression, not a single NUMBER token
+                continue
+            self.assertEqual(number_token(lit), lit)
+        for lit in INVALID_UNDERSCORE_LITERALS:
+            self.assertNotEqual(number_token(lit), lit)
+
     def test_string(self):
         # String literals
         self.check_tokenize("x = ''; y = \"\"", """\
diff -r c711c36cf988 Lib/tokenize.py
--- a/Lib/tokenize.py Wed Feb 10 10:31:43 2016 +0200
+++ b/Lib/tokenize.py Wed Feb 10 18:47:35 2016 +0100
@@ -120,16 +120,18 @@ Comment = r'#[^\r\n]*'
 Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
 Name = r'\w+'
 
-Hexnumber = r'0[xX][0-9a-fA-F]+'
-Binnumber = r'0[bB][01]+'
-Octnumber = r'0[oO][0-7]+'
-Decnumber = r'(?:0+|[1-9][0-9]*)'
+# Underscores may appear inside a number, but no pattern can end in one.
+Hexnumber = r'0[xX][0-9a-fA-F_]*[0-9a-fA-F]'
+Binnumber = r'0[bB][01_]*[01]'
+Octnumber = r'0[oO][0-7_]*[0-7]'
+Decnumber = r'(?:0(?:[0_]*0)?|[1-9](?:[0-9_]*[0-9])?)'
 Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?[0-9]+'
-Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
-Expfloat = r'[0-9]+' + Exponent
+Exponent = r'[eE]_*(?:[-+][0-9](?:[0-9_]*[0-9])?|[0-9_]*[0-9])'
+Pointfloat = group(r'[0-9][0-9_]*\.(?:[0-9](?:[0-9_]*[0-9])?)?',
+                   r'\.[0-9](?:[0-9_]*[0-9])?') + maybe(Exponent)
+Expfloat = r'[0-9][0-9_]*' + Exponent
 Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
+Imagnumber = group(r'[0-9][0-9_]*[jJ]', Floatnumber + r'_*[jJ]')
 Number = group(Imagnumber, Floatnumber, Intnumber)
 
 # Return the empty string, plus
all of the valid string prefixes.
diff -r c711c36cf988 Parser/tokenizer.c
--- a/Parser/tokenizer.c Wed Feb 10 10:31:43 2016 +0200
+++ b/Parser/tokenizer.c Wed Feb 10 18:47:35 2016 +0100
@@ -1586,14 +1586,19 @@ tok_get(struct tok_state *tok, char **p_
         if (c == '0') {
             /* Hex, octal or binary -- maybe. */
             c = tok_nextc(tok);
-            if (c == '.')
+            if (c == '.') {
+                c = tok_nextc(tok);
                 goto fraction;
+            }
             if (c == 'j' || c == 'J')
                 goto imaginary;
+            /* Note: no underscore is allowed in the middle of
+               "0x", "0b" or "0o". */
             if (c == 'x' || c == 'X') {
 
-                /* Hex */
-                c = tok_nextc(tok);
+                do {
+                    c = tok_nextc(tok);
+                } while (c == '_');
                 if (!isxdigit(c)) {
                     tok->done = E_TOKEN;
                     tok_backup(tok, c);
@@ -1601,11 +1606,13 @@ tok_get(struct tok_state *tok, char **p_
                 }
                 do {
                     c = tok_nextc(tok);
-                } while (isxdigit(c));
+                } while (isxdigit(c) || c == '_');
             }
             else if (c == 'o' || c == 'O') {
                 /* Octal */
-                c = tok_nextc(tok);
+                do {
+                    c = tok_nextc(tok);
+                } while (c == '_');
                 if (c < '0' || c >= '8') {
                     tok->done = E_TOKEN;
                     tok_backup(tok, c);
@@ -1613,11 +1620,13 @@ tok_get(struct tok_state *tok, char **p_
                 }
                 do {
                     c = tok_nextc(tok);
-                } while ('0' <= c && c < '8');
+                } while (('0' <= c && c < '8') || c == '_');
             }
             else if (c == 'b' || c == 'B') {
                 /* Binary */
-                c = tok_nextc(tok);
+                do {
+                    c = tok_nextc(tok);
+                } while (c == '_');
                 if (c != '0' && c != '1') {
                     tok->done = E_TOKEN;
                     tok_backup(tok, c);
@@ -1625,25 +1634,28 @@ tok_get(struct tok_state *tok, char **p_
                 }
                 do {
                     c = tok_nextc(tok);
-                } while (c == '0' || c == '1');
+                } while (c == '0' || c == '1' || c == '_');
             }
             else {
                 int nonzero = 0;
                 /* maybe old-style octal; c is first char of it */
                 /* in any case, allow '0' as a literal */
-                while (c == '0')
+                while (c == '0' || c == '_')
                     c = tok_nextc(tok);
-                while (isdigit(c)) {
+                while (isdigit(c) || c == '_') {
                     nonzero = 1;
                     c = tok_nextc(tok);
                 }
-                if (c == '.')
+                if (c == '.') {
+                    c = tok_nextc(tok);
                     goto fraction;
+                }
                 else if (c == 'e' || c == 'E')
                     goto exponent;
                 else if (c == 'j' || c == 'J')
                     goto imaginary;
                 else if (nonzero) {
+                    /* Old-style octal: now disallowed. */
                     tok->done = E_TOKEN;
                     tok_backup(tok, c);
                     return ERRORTOKEN;
@@ -1654,22 +1666,33 @@ tok_get(struct tok_state *tok, char **p_
             /* Decimal */
             do {
                 c = tok_nextc(tok);
-            } while (isdigit(c));
+            } while (isdigit(c) || c == '_');
             {
                 /* Accept floating point numbers. */
                 if (c == '.') {
+                    c = tok_nextc(tok);
         fraction:
                     /* Fraction */
-                    do {
-                        c = tok_nextc(tok);
-                    } while (isdigit(c));
+                    /* Right after dot, an underscore is not allowed. */
+                    if (isdigit(c)) {
+                        do {
+                            c = tok_nextc(tok);
+                        } while (isdigit(c) || c == '_');
+                    }
                 }
                 if (c == 'e' || c == 'E') {
+                    int n;
                     int e;
                   exponent:
                     e = c;
+                    /* "goto exponent" jumps past initializers, so the
+                       read counter must be reset after the label. */
+                    n = 0;
                     /* Exponent part */
-                    c = tok_nextc(tok);
+                    do {
+                        c = tok_nextc(tok);
+                        n++;
+                    } while (c == '_');
                     if (c == '+' || c == '-') {
                         c = tok_nextc(tok);
                         if (!isdigit(c)) {
@@ -1679,6 +1702,10 @@ tok_get(struct tok_state *tok, char **p_
                         }
                     } else if (!isdigit(c)) {
                         tok_backup(tok, c);
+                        /* n reads were made; all but the last hit '_'. */
+                        for (int i = 0; i < n - 1; i++) {
+                            tok_backup(tok, '_');
+                        }
                         tok_backup(tok, e);
                         *p_start = tok->start;
                         *p_end = tok->cur;
@@ -1686,7 +1713,7 @@ tok_get(struct tok_state *tok, char **p_
                     }
                     do {
                         c = tok_nextc(tok);
-                    } while (isdigit(c));
+                    } while (isdigit(c) || c == '_');
                 }
                 if (c == 'j' || c == 'J')
                     /* Imaginary part */
@@ -1697,6 +1724,11 @@ tok_get(struct tok_state *tok, char **p_
         tok_backup(tok, c);
         *p_start = tok->start;
         *p_end = tok->cur;
+        if (*(tok->cur - 1) == '_') {
+            /* Literals may not end in an underscore. */
+            tok->done = E_TOKEN;
+            return ERRORTOKEN;
+        }
         return NUMBER;
     }
 
diff -r c711c36cf988 Python/ast.c
--- a/Python/ast.c Wed Feb 10 10:31:43 2016 +0200
+++ b/Python/ast.c Wed Feb 10 18:47:35 2016 +0100
@@ -3943,44 +3943,71 @@ ast_for_stmt(struct compiling *c, const
 static PyObject *
 parsenumber(struct compiling *c, const char *s)
 {
-    const char *end;
+    char *dup, *end;
     long x;
     double dx;
     Py_complex compl;
     int imflag;
+    PyObject *res = NULL;
 
     assert(s != NULL);
+
+    /* Create a duplicate without underscores. */
+    dup = _PyMem_RawStrdup(s);
+    if (dup == NULL) {
+        return PyErr_NoMemory();
+    }
+    end = dup;
+    for (const char *ch = s; *ch; ch++) {
+        if (*ch != '_') {
+            *end++ = *ch;
+        }
+    }
+    *end-- = '\0'; /* Now points at last character in dup. */
+
     errno = 0;
-    end = s + strlen(s) - 1;
     imflag = *end == 'j' || *end == 'J';
-    if (s[0] == '0') {
-        x = (long) PyOS_strtoul(s, (char **)&end, 0);
+
+    if (dup[0] == '0') {
+        x = (long) PyOS_strtoul(dup, (char **)&end, 0);
         if (x < 0 && errno == 0) {
-            return PyLong_FromString(s, (char **)0, 0);
+            res = PyLong_FromString(dup, (char **)0, 0);
+            goto exit;
         }
     }
     else
-        x = PyOS_strtol(s, (char **)&end, 0);
+    {
+        x = PyOS_strtol(dup, (char **)&end, 0);
+    }
+
     if (*end == '\0') {
-        if (errno != 0)
-            return PyLong_FromString(s, (char **)0, 0);
-        return PyLong_FromLong(x);
+        if (errno != 0) {
+            res = PyLong_FromString(dup, (char **)0, 0);
+        } else {
+            res = PyLong_FromLong(x);
+        }
+        goto exit;
     }
     /* XXX Huge floats may silently fail */
     if (imflag) {
         compl.real = 0.;
-        compl.imag = PyOS_string_to_double(s, (char **)&end, NULL);
+        compl.imag = PyOS_string_to_double(dup, (char **)&end, NULL);
         if (compl.imag == -1.0 && PyErr_Occurred())
-            return NULL;
-        return PyComplex_FromCComplex(compl);
+            goto exit;
+        res = PyComplex_FromCComplex(compl);
+        goto exit;
     }
     else
     {
-        dx = PyOS_string_to_double(s, NULL, NULL);
+        dx = PyOS_string_to_double(dup, NULL, NULL);
         if (dx == -1.0 && PyErr_Occurred())
-            return NULL;
-        return PyFloat_FromDouble(dx);
+            goto exit;
+        res = PyFloat_FromDouble(dx);
+        goto exit;
     }
+  exit:
+    PyMem_RawFree(dup);
+    return res;
 }
 
 static PyObject *