diff --git a/Lib/idlelib/HyperParser.py b/Lib/idlelib/HyperParser.py --- a/Lib/idlelib/HyperParser.py +++ b/Lib/idlelib/HyperParser.py @@ -6,11 +6,23 @@ """ import string -import keyword +from keyword import iskeyword from idlelib import PyParse + +# all ASCII chars that may be in an identifier +_ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_") +_IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)] +del _ASCII_ID_CHARS + +# all ASCII chars that may be the first char of an identifier +_ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_") +_IS_ASCII_FIRST_ID_CHAR = \ + [(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)] +del _ASCII_ID_FIRST_CHARS + + class HyperParser: - def __init__(self, editwin, index): "To initialize, analyze the surroundings of the given index." @@ -143,25 +155,72 @@ return beforeindex, afterindex - # Ascii chars that may be in a white space + _IS_ASCII_ID_CHAR = _IS_ASCII_ID_CHAR + _IS_ASCII_FIRST_ID_CHAR = _IS_ASCII_FIRST_ID_CHAR + + # the set of built-in identifiers which are also keywords, + # i.e. keyword.iskeyword() returns True for them + _ID_KEYWORDS = frozenset({"True", "False", "None"}) + + @classmethod + def _eat_identifier(cls, str, limit, pos): + """Given a string and pos, return the number of chars in the + identifier which ends at pos, or 0 if there is no such one. + + Keywords other than True, False and None are not identifiers. + """ + is_ascii_id_char = cls._IS_ASCII_ID_CHAR + + # Start at the end (pos) and work backwards. + i = pos + + # Go backwards as long as the characters are valid ASCII + # identifier characters. This is an optimization, since it + # is faster in the common case where most of the characters + # are ASCII. 
+ while i > limit and ( + ord(str[i - 1]) < 128 and + is_ascii_id_char[ord(str[i - 1])] + ): + i -= 1 + + # If the above loop ended due to reaching a non-ASCII + # character, continue going backwards using the most generic + # test for whether a string contains only valid identifier + # characters. + if i > limit and ord(str[i - 1]) >= 128: + while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier(): + i -= 4 + if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier(): + i -= 2 + if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier(): + i -= 1 + + # The identifier candidate starts here. If it isn't a valid + # identifier, don't eat anything. At this point that is only + # possible if the first character isn't a valid first + # character for an identifier. + if not str[i:pos].isidentifier(): + return 0 + else: + # All characters in str[i:pos] are valid ASCII identifier + # characters, so only the first one needs checking; skip the + # check when nothing was eaten, as str[i] may not be usable. + if i < pos and not cls._IS_ASCII_FIRST_ID_CHAR[ord(str[i])]: + return 0 + + # All keywords are valid identifiers, but should not be + # considered identifiers here, except for True, False and None. + if i < pos and ( + iskeyword(str[i:pos]) and + str[i:pos] not in cls._ID_KEYWORDS + ): + return 0 + + return pos - i + + # This string includes all chars that may be in a white space _whitespace_chars = " \t\n\\" - # Ascii chars that may be in an identifier - _id_chars = string.ascii_letters + string.digits + "_" - # Ascii chars that may be the first char of an identifier - _id_first_chars = string.ascii_letters + "_" - - # Given a string and pos, return the number of chars in the - # identifier which ends at pos, or 0 if there is no such one. Saved - # words are not identifiers. 
- def _eat_identifier(self, str, limit, pos): - i = pos - while i > limit and str[i-1] in self._id_chars: - i -= 1 - if (i < pos and (str[i] not in self._id_first_chars or - (keyword.iskeyword(str[i:pos]) and - str[i:pos] not in {'None', 'False', 'True'}))): - i = pos - return pos - i def get_expression(self): """Return a string with the Python expression which ends at the diff --git a/Lib/idlelib/idle_test/test_hyperparser.py b/Lib/idlelib/idle_test/test_hyperparser.py --- a/Lib/idlelib/idle_test/test_hyperparser.py +++ b/Lib/idlelib/idle_test/test_hyperparser.py @@ -30,6 +30,7 @@ "z = ((r'asdf')+('a')))\n" '[x for x in\n' 'for = False\n' + 'cliché = "this is a string with unicode, what a cliché"' ) @classmethod @@ -93,6 +94,8 @@ self.assertTrue(p.is_in_string()) p = get('4.6') self.assertTrue(p.is_in_string()) + p = get('12.54') + self.assertTrue(p.is_in_string()) def test_is_in_code(self): get = self.get_parser @@ -180,12 +183,55 @@ p = get('10.0') self.assertEqual(p.get_expression(), '') + p = get('10.6') + self.assertEqual(p.get_expression(), '') + + p = get('10.11') + self.assertEqual(p.get_expression(), '') + p = get('11.3') self.assertEqual(p.get_expression(), '') p = get('11.11') self.assertEqual(p.get_expression(), 'False') + p = get('12.6') + self.assertEqual(p.get_expression(), 'cliché') + + p = get('12.54') + self.assertEqual(p.get_expression(), '') + + def test_eat_identifier(self): + eat_id = HyperParser._eat_identifier + + # invalid first character which is valid elsewhere in an identifier + self.assertEqual(eat_id('2notid', 0, 6), 0) + + # ASCII-only valid identifiers + self.assertEqual(eat_id('valid_id', 0, 8), 8) + self.assertEqual(eat_id('_valid_id', 0, 9), 9) + self.assertEqual(eat_id('valid_id_', 0, 9), 9) + self.assertEqual(eat_id('_2valid_id', 0, 10), 10) + + # keywords which should be "eaten" + self.assertEqual(eat_id('True', 0, 4), 4) + self.assertEqual(eat_id('False', 0, 5), 5) + self.assertEqual(eat_id('None', 0, 4), 4) + + # 
keywords which should not be "eaten" + self.assertEqual(eat_id('for', 0, 3), 0) + self.assertEqual(eat_id('import', 0, 6), 0) + self.assertEqual(eat_id('return', 0, 6), 0) + + # valid unicode identifiers + self.assertEqual(eat_id('cliche', 0, 6), 6) + self.assertEqual(eat_id('cliché', 0, 6), 6) + self.assertEqual(eat_id('a٢', 0, 2), 2) + + # invalid unicode identifiers + self.assertEqual(eat_id('2cliché_ascii_after_unicode', 0, 27), 0) + self.assertEqual(eat_id('٢cliché_ascii_after_unicode', 0, 27), 0) + self.assertEqual(eat_id('a²', 0, 2), 0) if __name__ == '__main__': unittest.main(verbosity=2)