Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(38234)

Delta Between Two Patch Sets: Lib/sre_parse.py

Issue 22578: Add additional attributes to re.error
Left Patch Set: Created 5 years ago
Right Patch Set: Created 5 years ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
Left: Side by side diff | Download
Right: Side by side diff | Download
« no previous file with change/comment | « Lib/sre_constants.py ('k') | Lib/test/test_re.py » ('j') | no next file with change/comment »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
LEFT | RIGHT
1 # 1 #
2 # Secret Labs' Regular Expression Engine 2 # Secret Labs' Regular Expression Engine
3 # 3 #
4 # convert re-style regular expression to sre pattern 4 # convert re-style regular expression to sre pattern
5 # 5 #
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
7 # 7 #
8 # See the sre.py file for information on usage and redistribution. 8 # See the sre.py file for information on usage and redistribution.
9 # 9 #
10 10
11 """Internal support module for sre""" 11 """Internal support module for sre"""
12 12
13 # XXX: show string offset and offending character for all errors 13 # XXX: show string offset and offending character for all errors
14 14
15 from sre_constants import * 15 from sre_constants import *
16 from _sre import MAXREPEAT 16 from _sre import MAXREPEAT
17 17
18 SPECIAL_CHARS = ".\\[{()*+?^$|" 18 SPECIAL_CHARS = ".\\[{()*+?^$|"
19 REPEAT_CHARS = "*+?{" 19 REPEAT_CHARS = "*+?{"
20 20
21 DIGITS = set("0123456789") 21 DIGITS = frozenset("0123456789")
22 22
23 OCTDIGITS = set("01234567") 23 OCTDIGITS = frozenset("01234567")
24 HEXDIGITS = set("0123456789abcdefABCDEF") 24 HEXDIGITS = frozenset("0123456789abcdefABCDEF")
25 25
26 WHITESPACE = set(" \t\n\r\v\f") 26 WHITESPACE = frozenset(" \t\n\r\v\f")
27
28 _REPEATCODES = frozenset((MIN_REPEAT, MAX_REPEAT))
29 _UNITCODES = frozenset((ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY))
27 30
28 ESCAPES = { 31 ESCAPES = {
29 r"\a": (LITERAL, ord("\a")), 32 r"\a": (LITERAL, ord("\a")),
30 r"\b": (LITERAL, ord("\b")), 33 r"\b": (LITERAL, ord("\b")),
31 r"\f": (LITERAL, ord("\f")), 34 r"\f": (LITERAL, ord("\f")),
32 r"\n": (LITERAL, ord("\n")), 35 r"\n": (LITERAL, ord("\n")),
33 r"\r": (LITERAL, ord("\r")), 36 r"\r": (LITERAL, ord("\r")),
34 r"\t": (LITERAL, ord("\t")), 37 r"\t": (LITERAL, ord("\t")),
35 r"\v": (LITERAL, ord("\v")), 38 r"\v": (LITERAL, ord("\v")),
36 r"\\": (LITERAL, ord("\\")) 39 r"\\": (LITERAL, ord("\\"))
(...skipping 109 matching lines...) Expand 10 before | Expand all | Expand 10 after
146 return SubPattern(self.pattern, self.data[index]) 149 return SubPattern(self.pattern, self.data[index])
147 return self.data[index] 150 return self.data[index]
148 def __setitem__(self, index, code): 151 def __setitem__(self, index, code):
149 self.data[index] = code 152 self.data[index] = code
150 def insert(self, index, code): 153 def insert(self, index, code):
151 self.data.insert(index, code) 154 self.data.insert(index, code)
152 def append(self, code): 155 def append(self, code):
153 self.data.append(code) 156 self.data.append(code)
154 def getwidth(self): 157 def getwidth(self):
155 # determine the width (min, max) for this subpattern 158 # determine the width (min, max) for this subpattern
156 if self.width: 159 if self.width is not None:
157 return self.width 160 return self.width
158 lo = hi = 0 161 lo = hi = 0
159 UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)
160 REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
161 for op, av in self.data: 162 for op, av in self.data:
162 if op is BRANCH: 163 if op is BRANCH:
163 i = MAXREPEAT - 1 164 i = MAXREPEAT - 1
164 j = 0 165 j = 0
165 for av in av[1]: 166 for av in av[1]:
166 l, h = av.getwidth() 167 l, h = av.getwidth()
167 i = min(i, l) 168 i = min(i, l)
168 j = max(j, h) 169 j = max(j, h)
169 lo = lo + i 170 lo = lo + i
170 hi = hi + j 171 hi = hi + j
171 elif op is CALL: 172 elif op is CALL:
172 i, j = av.getwidth() 173 i, j = av.getwidth()
173 lo = lo + i 174 lo = lo + i
174 hi = hi + j 175 hi = hi + j
175 elif op is SUBPATTERN: 176 elif op is SUBPATTERN:
176 i, j = av[1].getwidth() 177 i, j = av[1].getwidth()
177 lo = lo + i 178 lo = lo + i
178 hi = hi + j 179 hi = hi + j
179 elif op in REPEATCODES: 180 elif op in _REPEATCODES:
180 i, j = av[2].getwidth() 181 i, j = av[2].getwidth()
181 lo = lo + i * av[0] 182 lo = lo + i * av[0]
182 hi = hi + j * av[1] 183 hi = hi + j * av[1]
183 elif op in UNITCODES: 184 elif op in _UNITCODES:
184 lo = lo + 1 185 lo = lo + 1
185 hi = hi + 1 186 hi = hi + 1
186 elif op == SUCCESS: 187 elif op == SUCCESS:
187 break 188 break
188 self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) 189 self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
189 return self.width 190 return self.width
190 191
191 class Tokenizer: 192 class Tokenizer:
192 def __init__(self, string): 193 def __init__(self, string):
193 self.istext = isinstance(string, str) 194 self.istext = isinstance(string, str)
194 self.string = string 195 self.string = string
196 if not self.istext:
197 string = str(string, 'latin1')
198 self.decoded_string = string
195 self.index = 0 199 self.index = 0
196 self.__next() 200 self.__next()
197 def __next(self): 201 def __next(self):
198 if self.index >= len(self.string): 202 index = self.index
203 try:
204 char = self.decoded_string[index]
205 except IndexError:
199 self.next = None 206 self.next = None
200 return 207 return
201 char = self.string[self.index:self.index+1]
202 # Special case for the str8, since indexing returns a integer
203 # XXX This is only needed for test_bug_926075 in test_re.py
204 if char and not self.istext:
205 char = chr(char[0])
206 if char == "\\": 208 if char == "\\":
209 index += 1
207 try: 210 try:
208 c = self.string[self.index + 1] 211 char += self.decoded_string[index]
209 except IndexError: 212 except IndexError:
210 self.next = None 213 raise self.error("bogus escape (end of line)") from None
211 raise self.error("bogus escape (end of line)", 0) 214 self.index = index + 1
212 if not self.istext:
213 c = chr(c)
214 char = char + c
215 self.index = self.index + len(char)
216 self.next = char 215 self.next = char
217 def match(self, char, skip=1): 216 def match(self, char):
218 if char == self.next: 217 if char == self.next:
219 if skip: 218 self.__next()
220 self.__next() 219 return True
221 return 1 220 return False
222 return 0
223 def get(self): 221 def get(self):
224 this = self.next 222 this = self.next
225 self.__next() 223 self.__next()
226 return this 224 return this
227 def getwhile(self, n, charset): 225 def getwhile(self, n, charset):
228 result = '' 226 result = ''
229 for _ in range(n): 227 for _ in range(n):
230 c = self.next 228 c = self.next
231 if c not in charset: 229 if c not in charset:
232 break 230 break
233 result += c 231 result += c
234 self.__next() 232 self.__next()
235 return result 233 return result
234 def getuntil(self, terminator):
235 result = ''
236 while True:
237 c = self.next
238 self.__next()
239 if c is None:
240 raise self.error("unterminated name")
241 if c == terminator:
242 break
243 result += c
244 return result
236 def tell(self): 245 def tell(self):
237 return self.index - len(self.next or '') 246 return self.index - len(self.next or '')
238 def seek(self, index): 247 def seek(self, index):
239 self.index = index 248 self.index = index
240 self.__next() 249 self.__next()
241 250
242 def error(self, msg, offset): 251 def error(self, msg, offset=0):
243 return error(msg, self.string, self.tell() - offset) 252 return error(msg, self.string, self.tell() - offset)
244 253
245 # The following three functions are not used in this module anymore, but we keep 254 # The following three functions are not used in this module anymore, but we keep
246 # them here (with DeprecationWarnings) for backwards compatibility. 255 # them here (with DeprecationWarnings) for backwards compatibility.
247 256
248 def isident(char): 257 def isident(char):
249 import warnings 258 import warnings
250 warnings.warn('sre_parse.isident() will be removed in 3.5', 259 warnings.warn('sre_parse.isident() will be removed in 3.5',
251 DeprecationWarning, stacklevel=2) 260 DeprecationWarning, stacklevel=2)
252 return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" 261 return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
(...skipping 15 matching lines...) Expand all
268 if not isident(char) and not isdigit(char): 277 if not isident(char) and not isdigit(char):
269 return False 278 return False
270 return True 279 return True
271 280
272 def _class_escape(source, escape): 281 def _class_escape(source, escape):
273 # handle escape code inside character class 282 # handle escape code inside character class
274 code = ESCAPES.get(escape) 283 code = ESCAPES.get(escape)
275 if code: 284 if code:
276 return code 285 return code
277 code = CATEGORIES.get(escape) 286 code = CATEGORIES.get(escape)
278 if code and code[0] == IN: 287 if code and code[0] is IN:
279 return code 288 return code
280 try: 289 try:
281 c = escape[1:2] 290 c = escape[1:2]
282 if c == "x": 291 if c == "x":
283 # hexadecimal escape (exactly two digits) 292 # hexadecimal escape (exactly two digits)
284 escape += source.getwhile(2, HEXDIGITS) 293 escape += source.getwhile(2, HEXDIGITS)
285 if len(escape) != 4: 294 if len(escape) != 4:
286 raise ValueError 295 raise ValueError
287 return LITERAL, int(escape[2:], 16) & 0xff 296 return LITERAL, int(escape[2:], 16)
288 elif c == "u" and source.istext: 297 elif c == "u" and source.istext:
289 # unicode escape (exactly four digits) 298 # unicode escape (exactly four digits)
290 escape += source.getwhile(4, HEXDIGITS) 299 escape += source.getwhile(4, HEXDIGITS)
291 if len(escape) != 6: 300 if len(escape) != 6:
292 raise ValueError 301 raise ValueError
293 return LITERAL, int(escape[2:], 16) 302 return LITERAL, int(escape[2:], 16)
294 elif c == "U" and source.istext: 303 elif c == "U" and source.istext:
295 # unicode escape (exactly eight digits) 304 # unicode escape (exactly eight digits)
296 escape += source.getwhile(8, HEXDIGITS) 305 escape += source.getwhile(8, HEXDIGITS)
297 if len(escape) != 10: 306 if len(escape) != 10:
(...skipping 25 matching lines...) Expand all
323 code = ESCAPES.get(escape) 332 code = ESCAPES.get(escape)
324 if code: 333 if code:
325 return code 334 return code
326 try: 335 try:
327 c = escape[1:2] 336 c = escape[1:2]
328 if c == "x": 337 if c == "x":
329 # hexadecimal escape 338 # hexadecimal escape
330 escape += source.getwhile(2, HEXDIGITS) 339 escape += source.getwhile(2, HEXDIGITS)
331 if len(escape) != 4: 340 if len(escape) != 4:
332 raise ValueError 341 raise ValueError
333 return LITERAL, int(escape[2:], 16) & 0xff 342 return LITERAL, int(escape[2:], 16)
334 elif c == "u" and source.istext: 343 elif c == "u" and source.istext:
335 # unicode escape (exactly four digits) 344 # unicode escape (exactly four digits)
336 escape += source.getwhile(4, HEXDIGITS) 345 escape += source.getwhile(4, HEXDIGITS)
337 if len(escape) != 6: 346 if len(escape) != 6:
338 raise ValueError 347 raise ValueError
339 return LITERAL, int(escape[2:], 16) 348 return LITERAL, int(escape[2:], 16)
340 elif c == "U" and source.istext: 349 elif c == "U" and source.istext:
341 # unicode escape (exactly eight digits) 350 # unicode escape (exactly eight digits)
342 escape += source.getwhile(8, HEXDIGITS) 351 escape += source.getwhile(8, HEXDIGITS)
343 if len(escape) != 10: 352 if len(escape) != 10:
344 raise ValueError 353 raise ValueError
345 c = int(escape[2:], 16) 354 c = int(escape[2:], 16)
346 chr(c) # raise ValueError for invalid code 355 chr(c) # raise ValueError for invalid code
347 return LITERAL, c 356 return LITERAL, c
348 elif c == "0": 357 elif c == "0":
349 # octal escape 358 # octal escape
350 escape += source.getwhile(2, OCTDIGITS) 359 escape += source.getwhile(2, OCTDIGITS)
351 return LITERAL, int(escape[1:], 8) 360 return LITERAL, int(escape[1:], 8)
352 elif c in DIGITS: 361 elif c in DIGITS:
353 # octal escape *or* decimal group reference (sigh) 362 # octal escape *or* decimal group reference (sigh)
354 if source.next in DIGITS: 363 if source.next in DIGITS:
355 escape = escape + source.get() 364 escape += source.get()
356 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and 365 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
357 source.next in OCTDIGITS): 366 source.next in OCTDIGITS):
358 # got three octal digits; this is an octal escape 367 # got three octal digits; this is an octal escape
359 escape = escape + source.get() 368 escape += source.get()
360 c = int(escape[1:], 8) 369 c = int(escape[1:], 8)
361 if c > 0o377: 370 if c > 0o377:
362 raise source.error('octal escape value %r outside of ' 371 raise source.error('octal escape value %r outside of '
363 'range 0-0o377' % escape, 372 'range 0-0o377' % escape,
364 len(escape)) 373 len(escape))
365 return LITERAL, c 374 return LITERAL, c
366 # not an octal escape, so this is a group reference 375 # not an octal escape, so this is a group reference
367 group = int(escape[1:]) 376 group = int(escape[1:])
368 if group < state.groups: 377 if group < state.groups:
369 if not state.checkgroup(group): 378 if not state.checkgroup(group):
370 raise source.error("cannot refer to open group", 379 raise source.error("cannot refer to open group",
371 len(escape)) 380 len(escape))
372 return GROUPREF, group 381 return GROUPREF, group
373 raise ValueError 382 raise ValueError
374 if len(escape) == 2: 383 if len(escape) == 2:
375 return LITERAL, ord(escape[1]) 384 return LITERAL, ord(escape[1])
376 except ValueError: 385 except ValueError:
377 pass 386 pass
378 raise source.error("bogus escape: %s" % repr(escape), len(escape)) 387 raise source.error("bogus escape: %s" % repr(escape), len(escape))
379 388
380 def _parse_sub(source, state, nested=1): 389 def _parse_sub(source, state, nested=True):
381 # parse an alternation: a|b|c 390 # parse an alternation: a|b|c
382 391
383 items = [] 392 items = []
384 itemsappend = items.append 393 itemsappend = items.append
385 sourcematch = source.match 394 sourcematch = source.match
386 while 1: 395 while True:
387 itemsappend(_parse(source, state)) 396 itemsappend(_parse(source, state))
388 if sourcematch("|"): 397 if not sourcematch("|"):
389 continue
390 if not nested:
391 break 398 break
392 if not source.next or sourcematch(")", 0): 399 if nested and source.next is not None and source.next != ")":
393 break 400 raise source.error("pattern not properly closed")
394 else:
395 raise source.error("pattern not properly closed", 0)
396 401
397 if len(items) == 1: 402 if len(items) == 1:
398 return items[0] 403 return items[0]
399 404
400 subpattern = SubPattern(state) 405 subpattern = SubPattern(state)
401 subpatternappend = subpattern.append 406 subpatternappend = subpattern.append
402 407
403 # check if all items share a common prefix 408 # check if all items share a common prefix
404 while 1: 409 while True:
405 prefix = None 410 prefix = None
406 for item in items: 411 for item in items:
407 if not item: 412 if not item:
408 break 413 break
409 if prefix is None: 414 if prefix is None:
410 prefix = item[0] 415 prefix = item[0]
411 elif item[0] != prefix: 416 elif item[0] != prefix:
412 break 417 break
413 else: 418 else:
414 # all subitems start with a common "prefix". 419 # all subitems start with a common "prefix".
415 # move it out of the branch 420 # move it out of the branch
416 for item in items: 421 for item in items:
417 del item[0] 422 del item[0]
418 subpatternappend(prefix) 423 subpatternappend(prefix)
419 continue # check next one 424 continue # check next one
420 break 425 break
421 426
422 # check if the branch can be replaced by a character set 427 # check if the branch can be replaced by a character set
423 for item in items: 428 for item in items:
424 if len(item) != 1 or item[0][0] != LITERAL: 429 if len(item) != 1 or item[0][0] is not LITERAL:
425 break 430 break
426 else: 431 else:
427 # we can store this as a character set instead of a 432 # we can store this as a character set instead of a
428 # branch (the compiler may optimize this even more) 433 # branch (the compiler may optimize this even more)
429 set = [] 434 subpatternappend((IN, [item[0] for item in items]))
430 setappend = set.append
431 for item in items:
432 setappend(item[0])
433 subpatternappend((IN, set))
434 return subpattern 435 return subpattern
435 436
436 subpattern.append((BRANCH, (None, items))) 437 subpattern.append((BRANCH, (None, items)))
437 return subpattern 438 return subpattern
438 439
439 def _parse_sub_cond(source, state, condgroup): 440 def _parse_sub_cond(source, state, condgroup):
440 item_yes = _parse(source, state) 441 item_yes = _parse(source, state)
441 if source.match("|"): 442 if source.match("|"):
442 item_no = _parse(source, state) 443 item_no = _parse(source, state)
443 if source.match("|"): 444 if source.next == "|":
444 raise source.error("conditional backref with more than two branches" , 445 raise source.error("conditional backref with more than two branches" )
445 1)
446 else: 446 else:
447 item_no = None 447 item_no = None
448 if source.next and not source.match(")", 0): 448 if source.next is not None and source.next != ")":
449 raise source.error("pattern not properly closed", 0) 449 raise source.error("pattern not properly closed")
450 subpattern = SubPattern(state) 450 subpattern = SubPattern(state)
451 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) 451 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
452 return subpattern 452 return subpattern
453
454 _PATTERNENDERS = set("|)")
455 _ASSERTCHARS = set("=!<")
456 _LOOKBEHINDASSERTCHARS = set("=!")
457 _REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])
458 453
459 def _parse(source, state): 454 def _parse(source, state):
460 # parse a simple pattern 455 # parse a simple pattern
461 subpattern = SubPattern(state) 456 subpattern = SubPattern(state)
462 457
463 # precompute constants into local variables 458 # precompute constants into local variables
464 subpatternappend = subpattern.append 459 subpatternappend = subpattern.append
465 sourceget = source.get 460 sourceget = source.get
466 sourcematch = source.match 461 sourcematch = source.match
467 _len = len 462 _len = len
468 PATTERNENDERS = _PATTERNENDERS 463 _ord = ord
469 ASSERTCHARS = _ASSERTCHARS 464 verbose = state.flags & SRE_FLAG_VERBOSE
470 LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS 465
471 REPEATCODES = _REPEATCODES 466 while True:
472 467
473 while 1: 468 this = source.next
474
475 if source.next in PATTERNENDERS:
476 break # end of subpattern
477 this = sourceget()
478 if this is None: 469 if this is None:
479 break # end of pattern 470 break # end of pattern
480 471 if this in "|)":
481 if state.flags & SRE_FLAG_VERBOSE: 472 break # end of subpattern
473 sourceget()
474
475 if verbose:
482 # skip whitespace and comments 476 # skip whitespace and comments
483 if this in WHITESPACE: 477 if this in WHITESPACE:
484 continue 478 continue
485 if this == "#": 479 if this == "#":
486 while 1: 480 while True:
487 this = sourceget() 481 this = sourceget()
488 if this in (None, "\n"): 482 if this is None or this == "\n":
489 break 483 break
490 continue 484 continue
491 485
492 if this and this[0] not in SPECIAL_CHARS: 486 if this[0] == "\\":
493 subpatternappend((LITERAL, ord(this))) 487 code = _escape(source, this, state)
488 subpatternappend(code)
489
490 elif this not in SPECIAL_CHARS:
491 subpatternappend((LITERAL, _ord(this)))
494 492
495 elif this == "[": 493 elif this == "[":
496 # character set 494 # character set
497 set = [] 495 set = []
498 setappend = set.append 496 setappend = set.append
499 ## if sourcematch(":"): 497 ## if sourcematch(":"):
500 ## pass # handle character classes 498 ## pass # handle character classes
501 if sourcematch("^"): 499 if sourcematch("^"):
502 setappend((NEGATE, None)) 500 setappend((NEGATE, None))
503 # check remaining characters 501 # check remaining characters
504 start = set[:] 502 start = set[:]
505 while 1: 503 while True:
506 this = sourceget() 504 this = sourceget()
505 if this is None:
506 raise source.error("unexpected end of regular expression")
507 if this == "]" and set != start: 507 if this == "]" and set != start:
508 break 508 break
509 elif this and this[0] == "\\": 509 elif this[0] == "\\":
510 code1 = _class_escape(source, this) 510 code1 = _class_escape(source, this)
511 elif this:
512 code1 = LITERAL, ord(this)
513 else: 511 else:
514 raise source.error("unexpected end of regular expression", 0 ) 512 code1 = LITERAL, _ord(this)
515 if sourcematch("-"): 513 if sourcematch("-"):
516 # potential range 514 # potential range
517 this = sourceget() 515 this = sourceget()
516 if this is None:
517 raise source.error("unexpected end of regular expression ")
518 if this == "]": 518 if this == "]":
519 if code1[0] is IN: 519 if code1[0] is IN:
520 code1 = code1[1][0] 520 code1 = code1[1][0]
521 setappend(code1) 521 setappend(code1)
522 setappend((LITERAL, ord("-"))) 522 setappend((LITERAL, _ord("-")))
523 break 523 break
524 elif this: 524 if this[0] == "\\":
525 if this[0] == "\\": 525 code2 = _class_escape(source, this)
526 code2 = _class_escape(source, this)
527 else:
528 code2 = LITERAL, ord(this)
529 if code1[0] != LITERAL or code2[0] != LITERAL:
530 raise source.error("bad character range", len(this))
531 lo = code1[1]
532 hi = code2[1]
533 if hi < lo:
534 raise source.error("bad character range", len(this))
535 setappend((RANGE, (lo, hi)))
536 else: 526 else:
537 raise source.error("unexpected end of regular expression ", 0) 527 code2 = LITERAL, _ord(this)
528 if code1[0] != LITERAL or code2[0] != LITERAL:
529 raise source.error("bad character range", len(this))
530 lo = code1[1]
531 hi = code2[1]
532 if hi < lo:
533 raise source.error("bad character range", len(this))
534 setappend((RANGE, (lo, hi)))
538 else: 535 else:
539 if code1[0] is IN: 536 if code1[0] is IN:
540 code1 = code1[1][0] 537 code1 = code1[1][0]
541 setappend(code1) 538 setappend(code1)
542 539
543 # XXX: <fl> should move set optimization to compiler! 540 # XXX: <fl> should move set optimization to compiler!
544 if _len(set)==1 and set[0][0] is LITERAL: 541 if _len(set)==1 and set[0][0] is LITERAL:
545 subpatternappend(set[0]) # optimization 542 subpatternappend(set[0]) # optimization
546 elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: 543 elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
547 subpatternappend((NOT_LITERAL, set[1][1])) # optimization 544 subpatternappend((NOT_LITERAL, set[1][1])) # optimization
548 else: 545 else:
549 # XXX: <fl> should add charmap optimization here 546 # XXX: <fl> should add charmap optimization here
550 subpatternappend((IN, set)) 547 subpatternappend((IN, set))
551 548
552 elif this and this[0] in REPEAT_CHARS: 549 elif this in REPEAT_CHARS:
553 # repeat previous item 550 # repeat previous item
554 here = source.tell() 551 here = source.tell()
555 if this == "?": 552 if this == "?":
556 min, max = 0, 1 553 min, max = 0, 1
557 elif this == "*": 554 elif this == "*":
558 min, max = 0, MAXREPEAT 555 min, max = 0, MAXREPEAT
559 556
560 elif this == "+": 557 elif this == "+":
561 min, max = 1, MAXREPEAT 558 min, max = 1, MAXREPEAT
562 elif this == "{": 559 elif this == "{":
563 if source.next == "}": 560 if source.next == "}":
564 subpatternappend((LITERAL, ord(this))) 561 subpatternappend((LITERAL, _ord(this)))
565 continue 562 continue
566 min, max = 0, MAXREPEAT 563 min, max = 0, MAXREPEAT
567 lo = hi = "" 564 lo = hi = ""
568 while source.next in DIGITS: 565 while source.next in DIGITS:
569 lo = lo + source.get() 566 lo += sourceget()
570 if sourcematch(","): 567 if sourcematch(","):
571 while source.next in DIGITS: 568 while source.next in DIGITS:
572 hi = hi + sourceget() 569 hi += sourceget()
573 else: 570 else:
574 hi = lo 571 hi = lo
575 if not sourcematch("}"): 572 if not sourcematch("}"):
576 subpatternappend((LITERAL, ord(this))) 573 subpatternappend((LITERAL, _ord(this)))
577 source.seek(here) 574 source.seek(here)
578 continue 575 continue
579 if lo: 576 if lo:
580 min = int(lo) 577 min = int(lo)
581 if min >= MAXREPEAT: 578 if min >= MAXREPEAT:
582 raise OverflowError("the repetition number is too large" ) 579 raise OverflowError("the repetition number is too large" )
583 if hi: 580 if hi:
584 max = int(hi) 581 max = int(hi)
585 if max >= MAXREPEAT: 582 if max >= MAXREPEAT:
586 raise OverflowError("the repetition number is too large" ) 583 raise OverflowError("the repetition number is too large" )
587 if max < min: 584 if max < min:
588 raise source.error("bad repeat interval", 585 raise source.error("bad repeat interval",
589 source.tell() - here) 586 source.tell() - here)
590 else: 587 else:
591 raise source.error("not supported", len(this)) 588 raise source.error("not supported", len(this))
592 # figure out which item to repeat 589 # figure out which item to repeat
593 if subpattern: 590 if subpattern:
594 item = subpattern[-1:] 591 item = subpattern[-1:]
595 else: 592 else:
596 item = None 593 item = None
597 if not item or (_len(item) == 1 and item[0][0] == AT): 594 if not item or (_len(item) == 1 and item[0][0] == AT):
598 raise source.error("nothing to repeat", 595 raise source.error("nothing to repeat",
599 source.tell() - here + len(this)) 596 source.tell() - here + len(this))
600 if item[0][0] in REPEATCODES: 597 if item[0][0] in _REPEATCODES:
601 raise source.error("multiple repeat", 598 raise source.error("multiple repeat",
602 source.tell() - here + len(this)) 599 source.tell() - here + len(this))
603 if sourcematch("?"): 600 if sourcematch("?"):
604 subpattern[-1] = (MIN_REPEAT, (min, max, item)) 601 subpattern[-1] = (MIN_REPEAT, (min, max, item))
605 else: 602 else:
606 subpattern[-1] = (MAX_REPEAT, (min, max, item)) 603 subpattern[-1] = (MAX_REPEAT, (min, max, item))
607 604
608 elif this == ".": 605 elif this == ".":
609 subpatternappend((ANY, None)) 606 subpatternappend((ANY, None))
610 607
611 elif this == "(": 608 elif this == "(":
612 group = 1 609 group = 1
613 name = None 610 name = None
614 condgroup = None 611 condgroup = None
615 if sourcematch("?"): 612 if sourcematch("?"):
616 group = 0 613 group = 0
617 # options 614 # options
618 if sourcematch("P"): 615 char = sourceget()
616 if char is None:
617 raise self.error("unexpected end of pattern")
618 if char == "P":
619 # python extensions 619 # python extensions
620 if sourcematch("<"): 620 if sourcematch("<"):
621 # named group: skip forward to end of name 621 # named group: skip forward to end of name
622 name = "" 622 name = source.getuntil(">")
623 while 1:
624 char = sourceget()
625 if char is None:
626 raise source.error("unterminated name", 0)
627 if char == ">":
628 break
629 name = name + char
630 group = 1 623 group = 1
631 if not name: 624 if not name:
632 raise source.error("missing group name", 1) 625 raise source.error("missing group name", 1)
633 if not name.isidentifier(): 626 if not name.isidentifier():
634 raise source.error("bad character in group name " 627 raise source.error("bad character in group name "
635 "%r" % name, 628 "%r" % name,
636 len(name) + 1) 629 len(name) + 1)
637 elif sourcematch("="): 630 elif sourcematch("="):
638 # named backreference 631 # named backreference
639 name = "" 632 name = source.getuntil(")")
640 while 1:
641 char = sourceget()
642 if char is None:
643 raise source.error("unterminated name", 0)
644 if char == ")":
645 break
646 name = name + char
647 if not name: 633 if not name:
648 raise source.error("missing group name", 1) 634 raise source.error("missing group name", 1)
649 if not name.isidentifier(): 635 if not name.isidentifier():
650 raise source.error("bad character in backref " 636 raise source.error("bad character in backref "
651 "group name %r" % name, 637 "group name %r" % name,
652 len(name) + 1) 638 len(name) + 1)
653 gid = state.groupdict.get(name) 639 gid = state.groupdict.get(name)
654 if gid is None: 640 if gid is None:
655 msg = "unknown group name: {0!r}".format(name) 641 msg = "unknown group name: {0!r}".format(name)
656 raise source.error(msg, len(name) + 1) 642 raise source.error(msg, len(name) + 1)
657 subpatternappend((GROUPREF, gid)) 643 subpatternappend((GROUPREF, gid))
658 continue 644 continue
659 else: 645 else:
660 char = sourceget() 646 char = sourceget()
661 if char is None: 647 if char is None:
662 raise source.error("unexpected end of pattern", 0) 648 raise source.error("unexpected end of pattern")
663 raise source.error("unknown specifier: ?P%s" % char, 649 raise source.error("unknown specifier: ?P%s" % char,
664 len(char)) 650 len(char))
665 elif sourcematch(":"): 651 elif char == ":":
666 # non-capturing group 652 # non-capturing group
667 group = 2 653 group = 2
668 elif sourcematch("#"): 654 elif char == "#":
669 # comment 655 # comment
670 while 1: 656 while True:
671 if source.next is None or source.next == ")": 657 if source.next is None:
658 raise source.error("unbalanced parenthesis")
659 if sourceget() == ")":
672 break 660 break
673 sourceget()
674 if not sourcematch(")"):
675 raise source.error("unbalanced parenthesis", 0)
676 continue 661 continue
677 elif source.next in ASSERTCHARS: 662 elif char in "=!<":
678 # lookahead assertions 663 # lookahead assertions
679 char = sourceget()
680 dir = 1 664 dir = 1
681 if char == "<": 665 if char == "<":
682 if source.next not in LOOKBEHINDASSERTCHARS: 666 char = sourceget()
683 raise source.error("syntax error", 0) 667 if char is None or char not in "=!":
668 raise source.error("syntax error")
684 dir = -1 # lookbehind 669 dir = -1 # lookbehind
685 char = sourceget()
686 p = _parse_sub(source, state) 670 p = _parse_sub(source, state)
687 if not sourcematch(")"): 671 if not sourcematch(")"):
688 raise source.error("unbalanced parenthesis", 0) 672 raise source.error("unbalanced parenthesis")
689 if char == "=": 673 if char == "=":
690 subpatternappend((ASSERT, (dir, p))) 674 subpatternappend((ASSERT, (dir, p)))
691 else: 675 else:
692 subpatternappend((ASSERT_NOT, (dir, p))) 676 subpatternappend((ASSERT_NOT, (dir, p)))
693 continue 677 continue
694 elif sourcematch("("): 678 elif char == "(":
695 # conditional backreference group 679 # conditional backreference group
696 condname = "" 680 condname = source.getuntil(")")
697 while 1:
698 char = sourceget()
699 if char is None:
700 raise source.error("unterminated name", 0)
701 if char == ")":
702 break
703 condname = condname + char
704 group = 2 681 group = 2
705 if not condname: 682 if not condname:
706 raise source.error("missing group name", 1) 683 raise source.error("missing group name", 1)
707 if condname.isidentifier(): 684 if condname.isidentifier():
708 condgroup = state.groupdict.get(condname) 685 condgroup = state.groupdict.get(condname)
709 if condgroup is None: 686 if condgroup is None:
710 msg = "unknown group name: {0!r}".format(condname) 687 msg = "unknown group name: {0!r}".format(condname)
711 raise source.error(msg, len(condname) + 1) 688 raise source.error(msg, len(condname) + 1)
712 else: 689 else:
713 try: 690 try:
714 condgroup = int(condname) 691 condgroup = int(condname)
715 if condgroup < 0: 692 if condgroup < 0:
716 raise ValueError 693 raise ValueError
717 except ValueError: 694 except ValueError:
718 raise source.error("bad character in group name", 695 raise source.error("bad character in group name",
719 len(condname) + 1) 696 len(condname) + 1)
720 if not condgroup: 697 if not condgroup:
721 raise source.error("bad group number", 698 raise source.error("bad group number",
722 len(condname) + 1) 699 len(condname) + 1)
723 if condgroup >= MAXGROUPS: 700 if condgroup >= MAXGROUPS:
724 raise source.error("the group number is too large", 701 raise source.error("the group number is too large",
725 len(condname) + 1) 702 len(condname) + 1)
703 elif char in FLAGS:
704 # flags
705 state.flags |= FLAGS[char]
706 while source.next in FLAGS:
707 state.flags |= FLAGS[sourceget()]
708 verbose = state.flags & SRE_FLAG_VERBOSE
726 else: 709 else:
727 # flags 710 raise source.error("unexpected end of pattern")
728 if not source.next in FLAGS:
729 raise source.error("unexpected end of pattern", 0)
730 while source.next in FLAGS:
731 state.flags = state.flags | FLAGS[sourceget()]
732 if group: 711 if group:
733 # parse group contents 712 # parse group contents
734 if group == 2: 713 if group == 2:
735 # anonymous group 714 # anonymous group
736 group = None 715 group = None
737 else: 716 else:
738 try: 717 try:
739 group = state.opengroup(name) 718 group = state.opengroup(name)
740 except error as err: 719 except error as err:
741 raise source.error(err.msg, len(name) + 1) 720 raise source.error(err.msg, len(name) + 1)
742 if condgroup: 721 if condgroup:
743 p = _parse_sub_cond(source, state, condgroup) 722 p = _parse_sub_cond(source, state, condgroup)
744 else: 723 else:
745 p = _parse_sub(source, state) 724 p = _parse_sub(source, state)
746 if not sourcematch(")"): 725 if not sourcematch(")"):
747 raise source.error("unbalanced parenthesis", 0) 726 raise source.error("unbalanced parenthesis")
748 if group is not None: 727 if group is not None:
749 state.closegroup(group) 728 state.closegroup(group)
750 subpatternappend((SUBPATTERN, (group, p))) 729 subpatternappend((SUBPATTERN, (group, p)))
751 else: 730 else:
752 while 1: 731 while True:
753 char = sourceget() 732 char = sourceget()
754 if char is None: 733 if char is None:
755 raise source.error("unexpected end of pattern", 0) 734 raise source.error("unexpected end of pattern")
756 if char == ")": 735 if char == ")":
757 break 736 break
758 raise source.error("unknown extension", len(char)) 737 raise source.error("unknown extension", len(char))
759 738
760 elif this == "^": 739 elif this == "^":
761 subpatternappend((AT, AT_BEGINNING)) 740 subpatternappend((AT, AT_BEGINNING))
762 741
763 elif this == "$": 742 elif this == "$":
764 subpattern.append((AT, AT_END)) 743 subpattern.append((AT, AT_END))
765
766 elif this and this[0] == "\\":
767 code = _escape(source, this, state)
768 subpatternappend(code)
769 744
770 else: 745 else:
771 raise source.error("parser error", len(this)) 746 raise source.error("parser error", len(this))
772 747
773 return subpattern 748 return subpattern
774 749
775 def fix_flags(src, flags): 750 def fix_flags(src, flags):
776 # Check and fix flags according to the type of pattern (str or bytes) 751 # Check and fix flags according to the type of pattern (str or bytes)
777 if isinstance(src, str): 752 if isinstance(src, str):
778 if not flags & SRE_FLAG_ASCII: 753 if not flags & SRE_FLAG_ASCII:
(...skipping 11 matching lines...) Expand all
790 source = Tokenizer(str) 765 source = Tokenizer(str)
791 766
792 if pattern is None: 767 if pattern is None:
793 pattern = Pattern() 768 pattern = Pattern()
794 pattern.flags = flags 769 pattern.flags = flags
795 pattern.str = str 770 pattern.str = str
796 771
797 p = _parse_sub(source, pattern, 0) 772 p = _parse_sub(source, pattern, 0)
798 p.pattern.flags = fix_flags(str, p.pattern.flags) 773 p.pattern.flags = fix_flags(str, p.pattern.flags)
799 774
800 tail = source.get() 775 if source.next is not None:
801 if tail == ")": 776 if source.next == ")":
802 raise source.error("unbalanced parenthesis", 1) 777 raise source.error("unbalanced parenthesis")
803 elif tail: 778 else:
804 raise source.error("bogus characters at end of regular expression", 779 raise source.error("bogus characters at end of regular expression",
805 len(tail)) 780 len(tail))
806 781
807 if flags & SRE_FLAG_DEBUG: 782 if flags & SRE_FLAG_DEBUG:
808 p.dump() 783 p.dump()
809 784
810 if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: 785 if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
811 # the VERBOSE flag was switched on inside the pattern. to be 786 # the VERBOSE flag was switched on inside the pattern. to be
812 # on the safe side, we'll parse the whole thing again... 787 # on the safe side, we'll parse the whole thing again...
813 return parse(str, p.pattern.flags) 788 return parse(str, p.pattern.flags)
814 789
815 return p 790 return p
(...skipping 16 matching lines...) Expand all
832 while True: 807 while True:
833 this = sget() 808 this = sget()
834 if this is None: 809 if this is None:
835 break # end of replacement string 810 break # end of replacement string
836 if this[0] == "\\": 811 if this[0] == "\\":
837 # group 812 # group
838 c = this[1] 813 c = this[1]
839 if c == "g": 814 if c == "g":
840 name = "" 815 name = ""
841 if s.match("<"): 816 if s.match("<"):
842 while True: 817 name = s.getuntil(">")
843 char = sget()
844 if char is None:
845 raise s.error("unterminated group name", 0)
846 if char == ">":
847 break
848 name += char
849 if not name: 818 if not name:
850 raise s.error("missing group name", 1) 819 raise s.error("missing group name", 1)
851 try: 820 try:
852 index = int(name) 821 index = int(name)
853 if index < 0: 822 if index < 0:
854 raise s.error("negative group number", len(name) + 1) 823 raise s.error("negative group number", len(name) + 1)
855 if index >= MAXGROUPS: 824 if index >= MAXGROUPS:
856 raise s.error("the group number is too large", 825 raise s.error("the group number is too large",
857 len(name) + 1) 826 len(name) + 1)
858 except ValueError: 827 except ValueError:
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
897 if literal: 866 if literal:
898 literals.append(''.join(literal)) 867 literals.append(''.join(literal))
899 if not isinstance(source, str): 868 if not isinstance(source, str):
900 # The tokenizer implicitly decodes bytes objects as latin-1, we must 869 # The tokenizer implicitly decodes bytes objects as latin-1, we must
901 # therefore re-encode the final representation. 870 # therefore re-encode the final representation.
902 literals = [None if s is None else s.encode('latin-1') for s in literals ] 871 literals = [None if s is None else s.encode('latin-1') for s in literals ]
903 return groups, literals 872 return groups, literals
904 873
905 def expand_template(template, match): 874 def expand_template(template, match):
906 g = match.group 875 g = match.group
907 sep = match.string[:0] 876 empty = match.string[:0]
908 groups, literals = template 877 groups, literals = template
909 literals = literals[:] 878 literals = literals[:]
910 try: 879 try:
911 for index, group in groups: 880 for index, group in groups:
912 literals[index] = s = g(group) 881 literals[index] = g(group) or empty
913 if s is None:
914 raise error("unmatched group")
915 except IndexError: 882 except IndexError:
916 raise error("invalid group reference") 883 raise error("invalid group reference")
917 return sep.join(literals) 884 return empty.join(literals)
LEFTRIGHT

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+