diff -r 180f5bf7d1b9 Lib/sre_parse.py --- a/Lib/sre_parse.py Thu Sep 11 14:33:02 2014 +0300 +++ b/Lib/sre_parse.py Fri Sep 12 10:17:18 2014 +0300 @@ -87,12 +87,12 @@ class SubPattern: # a subpattern, in intermediate form + width = None def __init__(self, pattern, data=None): self.pattern = pattern if data is None: data = [] self.data = data - self.width = None def dump(self, level=0): nl = 1 seqtypes = (tuple, list) @@ -142,8 +142,6 @@ if self.width: return self.width lo = hi = 0 - UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) - REPEATCODES = (MIN_REPEAT, MAX_REPEAT) for op, av in self.data: if op is BRANCH: i = MAXREPEAT - 1 @@ -162,11 +160,11 @@ i, j = av[1].getwidth() lo = lo + i hi = hi + j - elif op in REPEATCODES: + elif op in _REPEATCODES: i, j = av[2].getwidth() lo = lo + i * av[0] hi = hi + j * av[1] - elif op in UNITCODES: + elif op in _UNITCODES: lo = lo + 1 hi = hi + 1 elif op == SUCCESS: @@ -177,34 +175,31 @@ class Tokenizer: def __init__(self, string): self.istext = isinstance(string, str) + if not self.istext: + string = str(string, 'latin1') self.string = string self.index = 0 self.__next() def __next(self): - if self.index >= len(self.string): + index = self.index + try: + char = self.string[index] + except IndexError: self.next = None return - char = self.string[self.index:self.index+1] - # Special case for the str8, since indexing returns a integer - # XXX This is only needed for test_bug_926075 in test_re.py - if char and not self.istext: - char = chr(char[0]) if char == "\\": try: - c = self.string[self.index + 1] + char += self.string[index + 1] except IndexError: raise error("bogus escape (end of line)") - if not self.istext: - c = chr(c) - char = char + c - self.index = self.index + len(char) + index += 1 + self.index = index + 1 self.next = char - def match(self, char, skip=1): + def match(self, char): if char == self.next: - if skip: - self.__next() - return 1 - return 0 + self.__next() + return True + return False def get(self): this = self.next self.__next() @@ -256,7 +251,7 @@ if code: return code code = CATEGORIES.get(escape) - if code and code[0] == IN: + if code and code[0] is IN: return code try: c = escape[1:2] @@ -265,7 +260,7 @@ escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff + return LITERAL, int(escape[2:], 16) elif c == "u" and source.istext: # unicode escape (exactly four digits) escape += source.getwhile(4, HEXDIGITS) @@ -307,7 +302,7 @@ escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff + return LITERAL, int(escape[2:], 16) elif c == "u" and source.istext: # unicode escape (exactly four digits) escape += source.getwhile(4, HEXDIGITS) @@ -325,15 +320,15 @@ elif c == "0": # octal escape escape += source.getwhile(2, OCTDIGITS) - return LITERAL, int(escape[1:], 8) & 0xff + return LITERAL, int(escape[1:], 8) elif c in DIGITS: # octal escape *or* decimal group reference (sigh) if source.next in DIGITS: - escape = escape + source.get() + escape += source.get() if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and source.next in OCTDIGITS): # got three octal digits; this is an octal escape - escape = escape + source.get() + escape += source.get() return LITERAL, int(escape[1:], 8) & 0xff # not an octal escape, so this is a group reference group = int(escape[1:]) @@ -348,22 +343,18 @@ pass raise error("bogus escape: %s" % repr(escape)) -def _parse_sub(source, state, nested=1): +def _parse_sub(source, state, nested=True): # parse an alternation: a|b|c items = [] itemsappend = items.append sourcematch = source.match - while 1: + while True: itemsappend(_parse(source, state)) - if sourcematch("|"): - continue - if not nested: + if not sourcematch("|"): break - if not source.next or sourcematch(")", 0): - break - else: - raise error("pattern not properly closed") + if nested and source.next and source.next != ")": + raise error("pattern not properly closed") if len(items) == 1: return items[0] @@ -372,7 +363,7 @@ subpatternappend = subpattern.append # check if all items share a common prefix - while 1: + while True: prefix = None for item in items: if not item: @@ -392,16 +383,12 @@ # check if the branch can be replaced by a character set for item in items: - if len(item) != 1 or item[0][0] != LITERAL: + if len(item) != 1 or item[0][0] is not LITERAL: break else: # we can store this as a character set instead of a # branch (the compiler may optimize this even more) - set = [] - setappend = set.append - for item in items: - setappend(item[0]) - subpatternappend((IN, set)) + subpatternappend((IN, [item[0] for item in items])) return subpattern subpattern.append((BRANCH, (None, items))) @@ -411,20 +398,18 @@ item_yes = _parse(source, state) if source.match("|"): item_no = _parse(source, state) - if source.match("|"): + if source.next == "|": raise error("conditional backref with more than two branches") else: item_no = None - if source.next and not source.match(")", 0): + if source.next and source.next != ")": raise error("pattern not properly closed") subpattern = SubPattern(state) subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) return subpattern -_PATTERNENDERS = set("|)") -_ASSERTCHARS = set("=!<") -_LOOKBEHINDASSERTCHARS = set("=!") -_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) +_REPEATCODES = (MIN_REPEAT, MAX_REPEAT) +_UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) def _parse(source, state): # parse a simple pattern @@ -435,32 +420,35 @@ sourceget = source.get sourcematch = source.match _len = len - PATTERNENDERS = _PATTERNENDERS - ASSERTCHARS = _ASSERTCHARS - LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS - REPEATCODES = _REPEATCODES + _ord = ord + verbose = state.flags & SRE_FLAG_VERBOSE - while 1: + while True: - if source.next in PATTERNENDERS: + this = source.next + if not this: + break # end of pattern + if this in "|)": break # end of subpattern - this = sourceget() - if this is None: - break # end of pattern + sourceget() - if state.flags & SRE_FLAG_VERBOSE: + if verbose: # skip whitespace and comments if this in WHITESPACE: continue if this == "#": - while 1: + while True: this = sourceget() - if this in (None, "\n"): + if not this or this == "\n": break continue - if this and this[0] not in SPECIAL_CHARS: - subpatternappend((LITERAL, ord(this))) + if this[0] == "\\": + code = _escape(source, this, state) + subpatternappend(code) + + elif this not in SPECIAL_CHARS: + subpatternappend((LITERAL, _ord(this))) elif this == "[": # character set @@ -472,39 +460,38 @@ setappend((NEGATE, None)) # check remaining characters start = set[:] - while 1: + while True: this = sourceget() + if not this: + raise error("unexpected end of regular expression") if this == "]" and set != start: break - elif this and this[0] == "\\": + elif this[0] == "\\": code1 = _class_escape(source, this) - elif this: - code1 = LITERAL, ord(this) else: - raise error("unexpected end of regular expression") + code1 = LITERAL, _ord(this) if sourcematch("-"): # potential range this = sourceget() + if not this: + raise error("unexpected end of regular expression") if this == "]": if code1[0] is IN: code1 = code1[1][0] setappend(code1) - setappend((LITERAL, ord("-"))) + setappend((LITERAL, _ord("-"))) break - elif this: - if this[0] == "\\": - code2 = _class_escape(source, this) - else: - code2 = LITERAL, ord(this) - if code1[0] != LITERAL or code2[0] != LITERAL: - raise error("bad character range") - lo = code1[1] - hi = code2[1] - if hi < lo: - raise error("bad character range") - setappend((RANGE, (lo, hi))) + if this[0] == "\\": + code2 = _class_escape(source, this) else: - raise error("unexpected end of regular expression") + code2 = LITERAL, _ord(this) + if code1[0] != LITERAL or code2[0] != LITERAL: + raise error("bad character range") + lo = code1[1] + hi = code2[1] + if hi < lo: + raise error("bad character range") + setappend((RANGE, (lo, hi))) else: if code1[0] is IN: code1 = code1[1][0] @@ -519,7 +506,7 @@ # XXX: should add charmap optimization here subpatternappend((IN, set)) - elif this and this[0] in REPEAT_CHARS: + elif this in REPEAT_CHARS: # repeat previous item if this == "?": min, max = 0, 1 @@ -530,20 +517,20 @@ min, max = 1, MAXREPEAT elif this == "{": if source.next == "}": - subpatternappend((LITERAL, ord(this))) + subpatternappend((LITERAL, _ord(this))) continue here = source.tell() min, max = 0, MAXREPEAT lo = hi = "" while source.next in DIGITS: - lo = lo + source.get() + lo += sourceget() if sourcematch(","): while source.next in DIGITS: - hi = hi + sourceget() + hi += sourceget() else: hi = lo if not sourcematch("}"): - subpatternappend((LITERAL, ord(this))) + subpatternappend((LITERAL, _ord(this))) source.seek(here) continue if lo: @@ -565,7 +552,7 @@ item = None if not item or (_len(item) == 1 and item[0][0] == AT): raise error("nothing to repeat") - if item[0][0] in REPEATCODES: + if item[0][0] in _REPEATCODES: raise error("multiple repeat") if sourcematch("?"): subpattern[-1] = (MIN_REPEAT, (min, max, item)) @@ -582,18 +569,21 @@ if sourcematch("?"): group = 0 # options - if sourcematch("P"): + char = sourceget() + if not char: + raise error("unexpected end of pattern") + if char == "P": # python extensions if sourcematch("<"): # named group: skip forward to end of name name = "" - while 1: + while True: char = sourceget() - if char is None: + if not char: raise error("unterminated name") if char == ">": break - name = name + char + name += char group = 1 if not name: raise error("missing group name") @@ -602,50 +592,48 @@ elif sourcematch("="): # named backreference name = "" - while 1: + while True: char = sourceget() - if char is None: + if not char: raise error("unterminated name") if char == ")": break - name = name + char + name += char if not name: raise error("missing group name") if not name.isidentifier(): raise error("bad character in backref group name " "%r" % name) gid = state.groupdict.get(name) - if gid is None: + if not gid: msg = "unknown group name: {0!r}".format(name) raise error(msg) subpatternappend((GROUPREF, gid)) continue else: char = sourceget() - if char is None: + if not char: raise error("unexpected end of pattern") raise error("unknown specifier: ?P%s" % char) - elif sourcematch(":"): + elif char == ":": # non-capturing group group = 2 - elif sourcematch("#"): + elif char == "#": # comment - while 1: - if source.next is None or source.next == ")": + while True: + if not source.next: + raise error("unbalanced parenthesis") + if sourceget() == ")": break - sourceget() - if not sourcematch(")"): - raise error("unbalanced parenthesis") continue - elif source.next in ASSERTCHARS: + elif char in "=!<": # lookahead assertions - char = sourceget() dir = 1 if char == "<": - if source.next not in LOOKBEHINDASSERTCHARS: + char = sourceget() + if not char or char not in "=!": raise error("syntax error") dir = -1 # lookbehind - char = sourceget() p = _parse_sub(source, state) if not sourcematch(")"): raise error("unbalanced parenthesis") @@ -654,22 +642,22 @@ else: subpatternappend((ASSERT_NOT, (dir, p))) continue - elif sourcematch("("): + elif char == "(": # conditional backreference group condname = "" - while 1: + while True: char = sourceget() - if char is None: + if not char: raise error("unterminated name") if char == ")": break - condname = condname + char + condname += char group = 2 if not condname: raise error("missing group name") if condname.isidentifier(): condgroup = state.groupdict.get(condname) - if condgroup is None: + if not condgroup: msg = "unknown group name: {0!r}".format(condname) raise error(msg) else: @@ -677,12 +665,14 @@ condgroup = int(condname) except ValueError: raise error("bad character in group name") + elif char in FLAGS: + # flags + state.flags |= FLAGS[char] + while source.next in FLAGS: + state.flags |= FLAGS[sourceget()] + verbose = state.flags & SRE_FLAG_VERBOSE else: - # flags - if not source.next in FLAGS: - raise error("unexpected end of pattern") - while source.next in FLAGS: - state.flags = state.flags | FLAGS[sourceget()] + raise error("unexpected end of pattern " + char) if group: # parse group contents if group == 2: @@ -696,13 +686,13 @@ p = _parse_sub(source, state) if not sourcematch(")"): raise error("unbalanced parenthesis") - if group is not None: + if group: state.closegroup(group) subpatternappend((SUBPATTERN, (group, p))) else: - while 1: + while True: char = sourceget() - if char is None: + if not char: raise error("unexpected end of pattern") if char == ")": break @@ -714,10 +704,6 @@ elif this == "$": subpattern.append((AT, AT_END)) - elif this and this[0] == "\\": - code = _escape(source, this, state) - subpatternappend(code) - else: raise error("parser error") @@ -748,11 +734,11 @@ p = _parse_sub(source, pattern, 0) p.pattern.flags = fix_flags(str, p.pattern.flags) - tail = source.get() - if tail == ")": - raise error("unbalanced parenthesis") - elif tail: - raise error("bogus characters at end of regular expression") + if source.next: + if source.next == ")": + raise error("unbalanced parenthesis") + else: + raise error("bogus characters at end of regular expression") if flags & SRE_FLAG_DEBUG: p.dump() @@ -781,7 +767,7 @@ literals.append(None) while True: this = sget() - if this is None: + if not this: break # end of replacement string if this[0] == "\\": # group @@ -791,7 +777,7 @@ if s.match("<"): while True: char = sget() - if char is None: + if not char: raise error("unterminated group name") if char == ">": break