diff -r 10081a0ca4bd Lib/sre_parse.py --- a/Lib/sre_parse.py Thu Oct 24 22:04:37 2013 +0300 +++ b/Lib/sre_parse.py Thu Oct 24 23:12:26 2013 +0300 @@ -89,12 +89,12 @@ class SubPattern: # a subpattern, in intermediate form + width = None def __init__(self, pattern, data=None): self.pattern = pattern if data is None: data = [] self.data = data - self.width = None def dump(self, level=0): nl = 1 seqtypes = (tuple, list) @@ -112,7 +112,7 @@ if i > 0: print(level*" " + "or") a.dump(level+1); nl = 1 - i = i + 1 + i += 1 elif isinstance(av, seqtypes): for a in av: if isinstance(a, SubPattern): @@ -141,11 +141,9 @@ self.data.append(code) def getwidth(self): # determine the width (min, max) for this subpattern - if self.width: + if self.width is not None: return self.width lo = hi = 0 - UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) - REPEATCODES = (MIN_REPEAT, MAX_REPEAT) for op, av in self.data: if op is BRANCH: i = MAXREPEAT - 1 @@ -154,23 +152,23 @@ l, h = av.getwidth() i = min(i, l) j = max(j, h) - lo = lo + i - hi = hi + j + lo += i + hi += j elif op is CALL: i, j = av.getwidth() - lo = lo + i - hi = hi + j + lo += i + hi += j elif op is SUBPATTERN: i, j = av[1].getwidth() - lo = lo + i - hi = hi + j - elif op in REPEATCODES: + lo += i + hi += j + elif op in _REPEATCODES: i, j = av[2].getwidth() - lo = lo + i * av[0] - hi = hi + j * av[1] - elif op in UNITCODES: - lo = lo + 1 - hi = hi + 1 + lo += i * av[0] + hi += j * av[1] + elif op in _UNITCODES: + lo += 1 + hi += 1 elif op == SUCCESS: break self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) @@ -179,34 +177,31 @@ class Tokenizer: def __init__(self, string): self.istext = isinstance(string, str) + if not self.istext: + string = str(string, 'latin1') self.string = string self.index = 0 self.__next() def __next(self): - if self.index >= len(self.string): + index = self.index + try: + char = self.string[index] + except IndexError: self.next = None return - char = self.string[self.index:self.index+1] - # Special case for the str8, since indexing returns a integer - # XXX This is only needed for test_bug_926075 in test_re.py - if char and not self.istext: - char = chr(char[0]) if char == "\\": try: - c = self.string[self.index + 1] + char += self.string[index + 1] except IndexError: raise error("bogus escape (end of line)") - if not self.istext: - c = chr(c) - char = char + c - self.index = self.index + len(char) + index += 1 + self.index = index + 1 self.next = char - def match(self, char, skip=1): + def match(self, char): if char == self.next: - if skip: - self.__next() - return 1 - return 0 + self.__next() + return True + return False def get(self): this = self.next self.__next() @@ -255,10 +250,10 @@ def _class_escape(source, escape): # handle escape code inside character class code = ESCAPES.get(escape) - if code: + if code is not None: return code code = CATEGORIES.get(escape) - if code and code[0] == IN: + if code is not None and code[0] is IN: return code try: c = escape[1:2] @@ -267,7 +262,7 @@ escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff + return LITERAL, int(escape[2:], 16) elif c == "u" and source.istext: # unicode escape (exactly four digits) escape += source.getwhile(4, HEXDIGITS) @@ -297,10 +292,10 @@ def _escape(source, escape, state): # handle escape code in expression code = CATEGORIES.get(escape) - if code: + if code is not None: return code code = ESCAPES.get(escape) - if code: + if code is not None: return code try: c = escape[1:2] @@ -309,7 +304,7 @@ escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError - return LITERAL, int(escape[2:], 16) & 0xff + return LITERAL, int(escape[2:], 16) elif c == "u" and source.istext: # unicode escape (exactly four digits) escape += source.getwhile(4, HEXDIGITS) @@ -331,11 +326,11 @@ elif c in DIGITS: # octal escape *or* decimal group reference (sigh) if source.next in DIGITS: - escape = escape + source.get() + escape += source.get() if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and source.next in OCTDIGITS): # got three octal digits; this is an octal escape - escape = escape + source.get() + escape += source.get() return LITERAL, int(escape[1:], 8) & 0xff # not an octal escape, so this is a group reference group = int(escape[1:]) @@ -350,22 +345,18 @@ pass raise error("bogus escape: %s" % repr(escape)) -def _parse_sub(source, state, nested=1): +def _parse_sub(source, state, nested=True): # parse an alternation: a|b|c items = [] itemsappend = items.append sourcematch = source.match - while 1: + while True: itemsappend(_parse(source, state)) - if sourcematch("|"): - continue - if not nested: + if not sourcematch("|"): break - if not source.next or sourcematch(")", 0): - break - else: - raise error("pattern not properly closed") + if nested and source.next is not None and source.next != ")": + raise error("pattern not properly closed") if len(items) == 1: return items[0] @@ -374,7 +365,7 @@ subpatternappend = subpattern.append # check if all items share a common prefix - while 1: + while True: prefix = None for item in items: if not item: @@ -394,16 +385,12 @@ # check if the branch can be replaced by a character set for item in items: - if len(item) != 1 or item[0][0] != LITERAL: + if len(item) != 1 or item[0][0] is not LITERAL: break else: # we can store this as a character set instead of a # branch (the compiler may optimize this even more) - set = [] - setappend = set.append - for item in items: - setappend(item[0]) - subpatternappend((IN, set)) + subpatternappend((IN, [item[0] for item in items])) return subpattern subpattern.append((BRANCH, (None, items))) @@ -413,20 +400,18 @@ item_yes = _parse(source, state) if source.match("|"): item_no = _parse(source, state) - if source.match("|"): + if source.next == "|": raise error("conditional backref with more than two branches") else: item_no = None - if source.next and not source.match(")", 0): + if source.next is not None and source.next != ")": raise error("pattern not properly closed") subpattern = SubPattern(state) subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) return subpattern -_PATTERNENDERS = set("|)") -_ASSERTCHARS = set("=!<") -_LOOKBEHINDASSERTCHARS = set("=!") -_REPEATCODES = set([MIN_REPEAT, MAX_REPEAT]) +_REPEATCODES = (MIN_REPEAT, MAX_REPEAT) +_UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) def _parse(source, state): # parse a simple pattern @@ -437,32 +422,35 @@ sourceget = source.get sourcematch = source.match _len = len - PATTERNENDERS = _PATTERNENDERS - ASSERTCHARS = _ASSERTCHARS - LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS - REPEATCODES = _REPEATCODES + _ord = ord + verbose = state.flags & SRE_FLAG_VERBOSE - while 1: + while True: - if source.next in PATTERNENDERS: - break # end of subpattern - this = sourceget() + this = source.next if this is None: break # end of pattern + if this in "|)": + break # end of subpattern + sourceget() - if state.flags & SRE_FLAG_VERBOSE: + if verbose: # skip whitespace and comments if this in WHITESPACE: continue if this == "#": - while 1: + while True: this = sourceget() - if this in (None, "\n"): + if this is None or this == "\n": break continue - if this and this[0] not in SPECIAL_CHARS: - subpatternappend((LITERAL, ord(this))) + if this[0] == "\\": + code = _escape(source, this, state) + subpatternappend(code) + + elif this not in SPECIAL_CHARS: + subpatternappend((LITERAL, _ord(this))) elif this == "[": # character set @@ -474,39 +462,38 @@ setappend((NEGATE, None)) # check remaining characters start = set[:] - while 1: + while True: this = sourceget() + if this is None: + raise error("unexpected end of regular expression") if this == "]" and set != start: break - elif this and this[0] == "\\": + elif this[0] == "\\": code1 = _class_escape(source, this) - elif this: - code1 = LITERAL, ord(this) else: - raise error("unexpected end of regular expression") + code1 = LITERAL, _ord(this) if sourcematch("-"): # potential range this = sourceget() + if this is None: + raise error("unexpected end of regular expression") if this == "]": if code1[0] is IN: code1 = code1[1][0] setappend(code1) - setappend((LITERAL, ord("-"))) + setappend((LITERAL, _ord("-"))) break - elif this: - if this[0] == "\\": - code2 = _class_escape(source, this) - else: - code2 = LITERAL, ord(this) - if code1[0] != LITERAL or code2[0] != LITERAL: - raise error("bad character range") - lo = code1[1] - hi = code2[1] - if hi < lo: - raise error("bad character range") - setappend((RANGE, (lo, hi))) + if this[0] == "\\": + code2 = _class_escape(source, this) else: - raise error("unexpected end of regular expression") + code2 = LITERAL, _ord(this) + if code1[0] != LITERAL or code2[0] != LITERAL: + raise error("bad character range") + lo = code1[1] + hi = code2[1] + if hi < lo: + raise error("bad character range") + setappend((RANGE, (lo, hi))) else: if code1[0] is IN: code1 = code1[1][0] @@ -521,7 +508,7 @@ # XXX: should add charmap optimization here subpatternappend((IN, set)) - elif this and this[0] in REPEAT_CHARS: + elif this in REPEAT_CHARS: # repeat previous item if this == "?": min, max = 0, 1 @@ -532,20 +519,20 @@ min, max = 1, MAXREPEAT elif this == "{": if source.next == "}": - subpatternappend((LITERAL, ord(this))) + subpatternappend((LITERAL, _ord(this))) continue here = source.tell() min, max = 0, MAXREPEAT lo = hi = "" while source.next in DIGITS: - lo = lo + source.get() + lo += sourceget() if sourcematch(","): while source.next in DIGITS: - hi = hi + sourceget() + hi += sourceget() else: hi = lo if not sourcematch("}"): - subpatternappend((LITERAL, ord(this))) + subpatternappend((LITERAL, _ord(this))) source.seek(here) continue if lo: @@ -567,7 +554,7 @@ item = None if not item or (_len(item) == 1 and item[0][0] == AT): raise error("nothing to repeat") - if item[0][0] in REPEATCODES: + if item[0][0] in _REPEATCODES: raise error("multiple repeat") if sourcematch("?"): subpattern[-1] = (MIN_REPEAT, (min, max, item)) @@ -584,18 +571,21 @@ if sourcematch("?"): group = 0 # options - if sourcematch("P"): + char = sourceget() + if char is None: + raise error("unexpected end of pattern") + if char == "P": # python extensions if sourcematch("<"): # named group: skip forward to end of name name = "" - while 1: + while True: char = sourceget() if char is None: raise error("unterminated name") if char == ">": break - name = name + char + name += char group = 1 if not name: raise error("missing group name") @@ -604,13 +594,13 @@ elif sourcematch("="): # named backreference name = "" - while 1: + while True: char = sourceget() if char is None: raise error("unterminated name") if char == ")": break - name = name + char + name += char if not name: raise error("missing group name") if not name.isidentifier(): @@ -626,27 +616,25 @@ if char is None: raise error("unexpected end of pattern") raise error("unknown specifier: ?P%s" % char) - elif sourcematch(":"): + elif char == ":": # non-capturing group group = 2 - elif sourcematch("#"): + elif char == "#": # comment - while 1: - if source.next is None or source.next == ")": + while True: + if source.next is None: + raise error("unbalanced parenthesis") + if sourceget() == ")": break - sourceget() - if not sourcematch(")"): - raise error("unbalanced parenthesis") continue - elif source.next in ASSERTCHARS: + elif char in "=!<": # lookahead assertions - char = sourceget() dir = 1 if char == "<": - if source.next not in LOOKBEHINDASSERTCHARS: + char = sourceget() + if char is None or char not in "=!": raise error("syntax error") dir = -1 # lookbehind - char = sourceget() p = _parse_sub(source, state) if not sourcematch(")"): raise error("unbalanced parenthesis") @@ -655,16 +643,16 @@ else: subpatternappend((ASSERT_NOT, (dir, p))) continue - elif sourcematch("("): + elif char == "(": # conditional backreference group condname = "" - while 1: + while True: char = sourceget() if char is None: raise error("unterminated name") if char == ")": break - condname = condname + char + condname += char group = 2 if not condname: raise error("missing group name") @@ -677,12 +665,14 @@ condgroup = int(condname) except ValueError: raise error("bad character in group name") + elif char in FLAGS: + # flags + state.flags |= FLAGS[char] + while source.next in FLAGS: + state.flags |= FLAGS[sourceget()] + verbose = state.flags & SRE_FLAG_VERBOSE else: - # flags - if not source.next in FLAGS: - raise error("unexpected end of pattern") - while source.next in FLAGS: - state.flags = state.flags | FLAGS[sourceget()] + raise error("unexpected end of pattern " + char) if group: # parse group contents if group == 2: @@ -700,7 +690,7 @@ state.closegroup(group) subpatternappend((SUBPATTERN, (group, p))) else: - while 1: + while True: char = sourceget() if char is None: raise error("unexpected end of pattern") @@ -714,10 +704,6 @@ elif this == "$": subpattern.append((AT, AT_END)) - elif this and this[0] == "\\": - code = _escape(source, this, state) - subpatternappend(code) - else: raise error("parser error") @@ -748,11 +734,11 @@ p = _parse_sub(source, pattern, 0) p.pattern.flags = fix_flags(str, p.pattern.flags) - tail = source.get() - if tail == ")": - raise error("unbalanced parenthesis") - elif tail: - raise error("bogus characters at end of regular expression") + if source.next is not None: + if source.next == ")": + raise error("unbalanced parenthesis") + else: + raise error("bogus characters at end of regular expression") if flags & SRE_FLAG_DEBUG: p.dump()