Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(169008)

Side by Side Diff: Lib/sre_parse.py

Issue 22578: Add additional attributes to re.error
Patch Set: Created 5 years, 2 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« no previous file with comments | « Lib/sre_constants.py ('k') | Lib/test/test_re.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
OLDNEW
1 # 1 #
2 # Secret Labs' Regular Expression Engine 2 # Secret Labs' Regular Expression Engine
3 # 3 #
4 # convert re-style regular expression to sre pattern 4 # convert re-style regular expression to sre pattern
5 # 5 #
6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. 6 # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
7 # 7 #
8 # See the sre.py file for information on usage and redistribution. 8 # See the sre.py file for information on usage and redistribution.
9 # 9 #
10 10
(...skipping 174 matching lines...) Expand 10 before | Expand all | Expand 10 after
185 lo = lo + 1 185 lo = lo + 1
186 hi = hi + 1 186 hi = hi + 1
187 elif op == SUCCESS: 187 elif op == SUCCESS:
188 break 188 break
189 self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) 189 self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
190 return self.width 190 return self.width
191 191
192 class Tokenizer: 192 class Tokenizer:
193 def __init__(self, string): 193 def __init__(self, string):
194 self.istext = isinstance(string, str) 194 self.istext = isinstance(string, str)
195 self.string = string
195 if not self.istext: 196 if not self.istext:
196 string = str(string, 'latin1') 197 string = str(string, 'latin1')
197 self.string = string 198 self.decoded_string = string
198 self.index = 0 199 self.index = 0
199 self.__next() 200 self.__next()
200 def __next(self): 201 def __next(self):
201 index = self.index 202 index = self.index
202 try: 203 try:
203 char = self.string[index] 204 char = self.decoded_string[index]
204 except IndexError: 205 except IndexError:
205 self.next = None 206 self.next = None
206 return 207 return
207 if char == "\\": 208 if char == "\\":
208 index += 1 209 index += 1
209 try: 210 try:
210 char += self.string[index] 211 char += self.decoded_string[index]
211 except IndexError: 212 except IndexError:
212 raise error("bogus escape (end of line)") 213 raise self.error("bogus escape (end of line)") from None
213 self.index = index + 1 214 self.index = index + 1
214 self.next = char 215 self.next = char
215 def match(self, char): 216 def match(self, char):
216 if char == self.next: 217 if char == self.next:
217 self.__next() 218 self.__next()
218 return True 219 return True
219 return False 220 return False
220 def get(self): 221 def get(self):
221 this = self.next 222 this = self.next
222 self.__next() 223 self.__next()
223 return this 224 return this
224 def getwhile(self, n, charset): 225 def getwhile(self, n, charset):
225 result = '' 226 result = ''
226 for _ in range(n): 227 for _ in range(n):
227 c = self.next 228 c = self.next
228 if c not in charset: 229 if c not in charset:
229 break 230 break
230 result += c 231 result += c
231 self.__next() 232 self.__next()
232 return result 233 return result
233 def getuntil(self, terminator): 234 def getuntil(self, terminator):
234 result = '' 235 result = ''
235 while True: 236 while True:
236 c = self.next 237 c = self.next
237 self.__next() 238 self.__next()
238 if c is None: 239 if c is None:
239 raise error("unterminated name") 240 raise self.error("unterminated name")
240 if c == terminator: 241 if c == terminator:
241 break 242 break
242 result += c 243 result += c
243 return result 244 return result
244 def tell(self): 245 def tell(self):
245 return self.index, self.next 246 return self.index - len(self.next or '')
246 def seek(self, index): 247 def seek(self, index):
247 self.index, self.next = index 248 self.index = index
249 self.__next()
250
251 def error(self, msg, offset=0):
252 return error(msg, self.string, self.tell() - offset)
248 253
249 # The following three functions are not used in this module anymore, but we keep 254 # The following three functions are not used in this module anymore, but we keep
250 # them here (with DeprecationWarnings) for backwards compatibility. 255 # them here (with DeprecationWarnings) for backwards compatibility.
251 256
252 def isident(char): 257 def isident(char):
253 import warnings 258 import warnings
254 warnings.warn('sre_parse.isident() will be removed in 3.5', 259 warnings.warn('sre_parse.isident() will be removed in 3.5',
255 DeprecationWarning, stacklevel=2) 260 DeprecationWarning, stacklevel=2)
256 return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" 261 return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
257 262
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
301 if len(escape) != 10: 306 if len(escape) != 10:
302 raise ValueError 307 raise ValueError
303 c = int(escape[2:], 16) 308 c = int(escape[2:], 16)
304 chr(c) # raise ValueError for invalid code 309 chr(c) # raise ValueError for invalid code
305 return LITERAL, c 310 return LITERAL, c
306 elif c in OCTDIGITS: 311 elif c in OCTDIGITS:
307 # octal escape (up to three digits) 312 # octal escape (up to three digits)
308 escape += source.getwhile(2, OCTDIGITS) 313 escape += source.getwhile(2, OCTDIGITS)
309 c = int(escape[1:], 8) 314 c = int(escape[1:], 8)
310 if c > 0o377: 315 if c > 0o377:
311 raise error('octal escape value %r outside of ' 316 raise source.error('octal escape value %r outside of '
312 'range 0-0o377' % escape) 317 'range 0-0o377' % escape, len(escape))
313 return LITERAL, c 318 return LITERAL, c
314 elif c in DIGITS: 319 elif c in DIGITS:
315 raise ValueError 320 raise ValueError
316 if len(escape) == 2: 321 if len(escape) == 2:
317 return LITERAL, ord(escape[1]) 322 return LITERAL, ord(escape[1])
318 except ValueError: 323 except ValueError:
319 pass 324 pass
320 raise error("bogus escape: %s" % repr(escape)) 325 raise source.error("bogus escape: %s" % repr(escape), len(escape))
321 326
322 def _escape(source, escape, state): 327 def _escape(source, escape, state):
323 # handle escape code in expression 328 # handle escape code in expression
324 code = CATEGORIES.get(escape) 329 code = CATEGORIES.get(escape)
325 if code: 330 if code:
326 return code 331 return code
327 code = ESCAPES.get(escape) 332 code = ESCAPES.get(escape)
328 if code: 333 if code:
329 return code 334 return code
330 try: 335 try:
(...skipping 25 matching lines...) Expand all
356 elif c in DIGITS: 361 elif c in DIGITS:
357 # octal escape *or* decimal group reference (sigh) 362 # octal escape *or* decimal group reference (sigh)
358 if source.next in DIGITS: 363 if source.next in DIGITS:
359 escape += source.get() 364 escape += source.get()
360 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and 365 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and
361 source.next in OCTDIGITS): 366 source.next in OCTDIGITS):
362 # got three octal digits; this is an octal escape 367 # got three octal digits; this is an octal escape
363 escape += source.get() 368 escape += source.get()
364 c = int(escape[1:], 8) 369 c = int(escape[1:], 8)
365 if c > 0o377: 370 if c > 0o377:
366 raise error('octal escape value %r outside of ' 371 raise source.error('octal escape value %r outside of '
367 'range 0-0o377' % escape) 372 'range 0-0o377' % escape,
373 len(escape))
368 return LITERAL, c 374 return LITERAL, c
369 # not an octal escape, so this is a group reference 375 # not an octal escape, so this is a group reference
370 group = int(escape[1:]) 376 group = int(escape[1:])
371 if group < state.groups: 377 if group < state.groups:
372 if not state.checkgroup(group): 378 if not state.checkgroup(group):
373 raise error("cannot refer to open group") 379 raise source.error("cannot refer to open group",
380 len(escape))
374 return GROUPREF, group 381 return GROUPREF, group
375 raise ValueError 382 raise ValueError
376 if len(escape) == 2: 383 if len(escape) == 2:
377 return LITERAL, ord(escape[1]) 384 return LITERAL, ord(escape[1])
378 except ValueError: 385 except ValueError:
379 pass 386 pass
380 raise error("bogus escape: %s" % repr(escape)) 387 raise source.error("bogus escape: %s" % repr(escape), len(escape))
381 388
382 def _parse_sub(source, state, nested=True): 389 def _parse_sub(source, state, nested=True):
383 # parse an alternation: a|b|c 390 # parse an alternation: a|b|c
384 391
385 items = [] 392 items = []
386 itemsappend = items.append 393 itemsappend = items.append
387 sourcematch = source.match 394 sourcematch = source.match
388 while True: 395 while True:
389 itemsappend(_parse(source, state)) 396 itemsappend(_parse(source, state))
390 if not sourcematch("|"): 397 if not sourcematch("|"):
391 break 398 break
392 if nested and source.next is not None and source.next != ")": 399 if nested and source.next is not None and source.next != ")":
393 raise error("pattern not properly closed") 400 raise source.error("pattern not properly closed")
394 401
395 if len(items) == 1: 402 if len(items) == 1:
396 return items[0] 403 return items[0]
397 404
398 subpattern = SubPattern(state) 405 subpattern = SubPattern(state)
399 subpatternappend = subpattern.append 406 subpatternappend = subpattern.append
400 407
401 # check if all items share a common prefix 408 # check if all items share a common prefix
402 while True: 409 while True:
403 prefix = None 410 prefix = None
(...skipping 24 matching lines...) Expand all
428 return subpattern 435 return subpattern
429 436
430 subpattern.append((BRANCH, (None, items))) 437 subpattern.append((BRANCH, (None, items)))
431 return subpattern 438 return subpattern
432 439
433 def _parse_sub_cond(source, state, condgroup): 440 def _parse_sub_cond(source, state, condgroup):
434 item_yes = _parse(source, state) 441 item_yes = _parse(source, state)
435 if source.match("|"): 442 if source.match("|"):
436 item_no = _parse(source, state) 443 item_no = _parse(source, state)
437 if source.next == "|": 444 if source.next == "|":
438 raise error("conditional backref with more than two branches") 445 raise source.error("conditional backref with more than two branches")
439 else: 446 else:
440 item_no = None 447 item_no = None
441 if source.next is not None and source.next != ")": 448 if source.next is not None and source.next != ")":
442 raise error("pattern not properly closed") 449 raise source.error("pattern not properly closed")
443 subpattern = SubPattern(state) 450 subpattern = SubPattern(state)
444 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) 451 subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
445 return subpattern 452 return subpattern
446 453
447 def _parse(source, state): 454 def _parse(source, state):
448 # parse a simple pattern 455 # parse a simple pattern
449 subpattern = SubPattern(state) 456 subpattern = SubPattern(state)
450 457
451 # precompute constants into local variables 458 # precompute constants into local variables
452 subpatternappend = subpattern.append 459 subpatternappend = subpattern.append
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
489 setappend = set.append 496 setappend = set.append
490 ## if sourcematch(":"): 497 ## if sourcematch(":"):
491 ## pass # handle character classes 498 ## pass # handle character classes
492 if sourcematch("^"): 499 if sourcematch("^"):
493 setappend((NEGATE, None)) 500 setappend((NEGATE, None))
494 # check remaining characters 501 # check remaining characters
495 start = set[:] 502 start = set[:]
496 while True: 503 while True:
497 this = sourceget() 504 this = sourceget()
498 if this is None: 505 if this is None:
499 raise error("unexpected end of regular expression") 506 raise source.error("unexpected end of regular expression")
500 if this == "]" and set != start: 507 if this == "]" and set != start:
501 break 508 break
502 elif this[0] == "\\": 509 elif this[0] == "\\":
503 code1 = _class_escape(source, this) 510 code1 = _class_escape(source, this)
504 else: 511 else:
505 code1 = LITERAL, _ord(this) 512 code1 = LITERAL, _ord(this)
506 if sourcematch("-"): 513 if sourcematch("-"):
507 # potential range 514 # potential range
508 this = sourceget() 515 this = sourceget()
509 if this is None: 516 if this is None:
510 raise error("unexpected end of regular expression") 517 raise source.error("unexpected end of regular expression")
511 if this == "]": 518 if this == "]":
512 if code1[0] is IN: 519 if code1[0] is IN:
513 code1 = code1[1][0] 520 code1 = code1[1][0]
514 setappend(code1) 521 setappend(code1)
515 setappend((LITERAL, _ord("-"))) 522 setappend((LITERAL, _ord("-")))
516 break 523 break
517 if this[0] == "\\": 524 if this[0] == "\\":
518 code2 = _class_escape(source, this) 525 code2 = _class_escape(source, this)
519 else: 526 else:
520 code2 = LITERAL, _ord(this) 527 code2 = LITERAL, _ord(this)
521 if code1[0] != LITERAL or code2[0] != LITERAL: 528 if code1[0] != LITERAL or code2[0] != LITERAL:
522 raise error("bad character range") 529 raise source.error("bad character range", len(this))
523 lo = code1[1] 530 lo = code1[1]
524 hi = code2[1] 531 hi = code2[1]
525 if hi < lo: 532 if hi < lo:
526 raise error("bad character range") 533 raise source.error("bad character range", len(this))
527 setappend((RANGE, (lo, hi))) 534 setappend((RANGE, (lo, hi)))
528 else: 535 else:
529 if code1[0] is IN: 536 if code1[0] is IN:
530 code1 = code1[1][0] 537 code1 = code1[1][0]
531 setappend(code1) 538 setappend(code1)
532 539
533 # XXX: <fl> should move set optimization to compiler! 540 # XXX: <fl> should move set optimization to compiler!
534 if _len(set)==1 and set[0][0] is LITERAL: 541 if _len(set)==1 and set[0][0] is LITERAL:
535 subpatternappend(set[0]) # optimization 542 subpatternappend(set[0]) # optimization
536 elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: 543 elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:
537 subpatternappend((NOT_LITERAL, set[1][1])) # optimization 544 subpatternappend((NOT_LITERAL, set[1][1])) # optimization
538 else: 545 else:
539 # XXX: <fl> should add charmap optimization here 546 # XXX: <fl> should add charmap optimization here
540 subpatternappend((IN, set)) 547 subpatternappend((IN, set))
541 548
542 elif this in REPEAT_CHARS: 549 elif this in REPEAT_CHARS:
543 # repeat previous item 550 # repeat previous item
551 here = source.tell()
544 if this == "?": 552 if this == "?":
545 min, max = 0, 1 553 min, max = 0, 1
546 elif this == "*": 554 elif this == "*":
547 min, max = 0, MAXREPEAT 555 min, max = 0, MAXREPEAT
548 556
549 elif this == "+": 557 elif this == "+":
550 min, max = 1, MAXREPEAT 558 min, max = 1, MAXREPEAT
551 elif this == "{": 559 elif this == "{":
552 if source.next == "}": 560 if source.next == "}":
553 subpatternappend((LITERAL, _ord(this))) 561 subpatternappend((LITERAL, _ord(this)))
554 continue 562 continue
555 here = source.tell()
556 min, max = 0, MAXREPEAT 563 min, max = 0, MAXREPEAT
557 lo = hi = "" 564 lo = hi = ""
558 while source.next in DIGITS: 565 while source.next in DIGITS:
559 lo += sourceget() 566 lo += sourceget()
560 if sourcematch(","): 567 if sourcematch(","):
561 while source.next in DIGITS: 568 while source.next in DIGITS:
562 hi += sourceget() 569 hi += sourceget()
563 else: 570 else:
564 hi = lo 571 hi = lo
565 if not sourcematch("}"): 572 if not sourcematch("}"):
566 subpatternappend((LITERAL, _ord(this))) 573 subpatternappend((LITERAL, _ord(this)))
567 source.seek(here) 574 source.seek(here)
568 continue 575 continue
569 if lo: 576 if lo:
570 min = int(lo) 577 min = int(lo)
571 if min >= MAXREPEAT: 578 if min >= MAXREPEAT:
572 raise OverflowError("the repetition number is too large") 579 raise OverflowError("the repetition number is too large")
573 if hi: 580 if hi:
574 max = int(hi) 581 max = int(hi)
575 if max >= MAXREPEAT: 582 if max >= MAXREPEAT:
576 raise OverflowError("the repetition number is too large") 583 raise OverflowError("the repetition number is too large")
577 if max < min: 584 if max < min:
578 raise error("bad repeat interval") 585 raise source.error("bad repeat interval",
586 source.tell() - here)
579 else: 587 else:
580 raise error("not supported") 588 raise source.error("not supported", len(this))
581 # figure out which item to repeat 589 # figure out which item to repeat
582 if subpattern: 590 if subpattern:
583 item = subpattern[-1:] 591 item = subpattern[-1:]
584 else: 592 else:
585 item = None 593 item = None
586 if not item or (_len(item) == 1 and item[0][0] == AT): 594 if not item or (_len(item) == 1 and item[0][0] == AT):
587 raise error("nothing to repeat") 595 raise source.error("nothing to repeat",
596 source.tell() - here + len(this))
588 if item[0][0] in _REPEATCODES: 597 if item[0][0] in _REPEATCODES:
589 raise error("multiple repeat") 598 raise source.error("multiple repeat",
599 source.tell() - here + len(this))
590 if sourcematch("?"): 600 if sourcematch("?"):
591 subpattern[-1] = (MIN_REPEAT, (min, max, item)) 601 subpattern[-1] = (MIN_REPEAT, (min, max, item))
592 else: 602 else:
593 subpattern[-1] = (MAX_REPEAT, (min, max, item)) 603 subpattern[-1] = (MAX_REPEAT, (min, max, item))
594 604
595 elif this == ".": 605 elif this == ".":
596 subpatternappend((ANY, None)) 606 subpatternappend((ANY, None))
597 607
598 elif this == "(": 608 elif this == "(":
599 group = 1 609 group = 1
600 name = None 610 name = None
601 condgroup = None 611 condgroup = None
602 if sourcematch("?"): 612 if sourcematch("?"):
603 group = 0 613 group = 0
604 # options 614 # options
605 char = sourceget() 615 char = sourceget()
606 if char is None: 616 if char is None:
607 raise error("unexpected end of pattern") 617 raise self.error("unexpected end of pattern")
608 if char == "P": 618 if char == "P":
609 # python extensions 619 # python extensions
610 if sourcematch("<"): 620 if sourcematch("<"):
611 # named group: skip forward to end of name 621 # named group: skip forward to end of name
612 name = source.getuntil(">") 622 name = source.getuntil(">")
613 group = 1 623 group = 1
614 if not name: 624 if not name:
615 raise error("missing group name") 625 raise source.error("missing group name", 1)
616 if not name.isidentifier(): 626 if not name.isidentifier():
617 raise error("bad character in group name %r" % name) 627 raise source.error("bad character in group name "
628 "%r" % name,
629 len(name) + 1)
618 elif sourcematch("="): 630 elif sourcematch("="):
619 # named backreference 631 # named backreference
620 name = source.getuntil(")") 632 name = source.getuntil(")")
621 if not name: 633 if not name:
622 raise error("missing group name") 634 raise source.error("missing group name", 1)
623 if not name.isidentifier(): 635 if not name.isidentifier():
624 raise error("bad character in backref group name " 636 raise source.error("bad character in backref "
625 "%r" % name) 637 "group name %r" % name,
638 len(name) + 1)
626 gid = state.groupdict.get(name) 639 gid = state.groupdict.get(name)
627 if gid is None: 640 if gid is None:
628 msg = "unknown group name: {0!r}".format(name) 641 msg = "unknown group name: {0!r}".format(name)
629 raise error(msg) 642 raise source.error(msg, len(name) + 1)
630 subpatternappend((GROUPREF, gid)) 643 subpatternappend((GROUPREF, gid))
631 continue 644 continue
632 else: 645 else:
633 char = sourceget() 646 char = sourceget()
634 if char is None: 647 if char is None:
635 raise error("unexpected end of pattern") 648 raise source.error("unexpected end of pattern")
636 raise error("unknown specifier: ?P%s" % char) 649 raise source.error("unknown specifier: ?P%s" % char,
650 len(char))
637 elif char == ":": 651 elif char == ":":
638 # non-capturing group 652 # non-capturing group
639 group = 2 653 group = 2
640 elif char == "#": 654 elif char == "#":
641 # comment 655 # comment
642 while True: 656 while True:
643 if source.next is None: 657 if source.next is None:
644 raise error("unbalanced parenthesis") 658 raise source.error("unbalanced parenthesis")
645 if sourceget() == ")": 659 if sourceget() == ")":
646 break 660 break
647 continue 661 continue
648 elif char in "=!<": 662 elif char in "=!<":
649 # lookahead assertions 663 # lookahead assertions
650 dir = 1 664 dir = 1
651 if char == "<": 665 if char == "<":
652 char = sourceget() 666 char = sourceget()
653 if char is None or char not in "=!": 667 if char is None or char not in "=!":
654 raise error("syntax error") 668 raise source.error("syntax error")
655 dir = -1 # lookbehind 669 dir = -1 # lookbehind
656 p = _parse_sub(source, state) 670 p = _parse_sub(source, state)
657 if not sourcematch(")"): 671 if not sourcematch(")"):
658 raise error("unbalanced parenthesis") 672 raise source.error("unbalanced parenthesis")
659 if char == "=": 673 if char == "=":
660 subpatternappend((ASSERT, (dir, p))) 674 subpatternappend((ASSERT, (dir, p)))
661 else: 675 else:
662 subpatternappend((ASSERT_NOT, (dir, p))) 676 subpatternappend((ASSERT_NOT, (dir, p)))
663 continue 677 continue
664 elif char == "(": 678 elif char == "(":
665 # conditional backreference group 679 # conditional backreference group
666 condname = source.getuntil(")") 680 condname = source.getuntil(")")
667 group = 2 681 group = 2
668 if not condname: 682 if not condname:
669 raise error("missing group name") 683 raise source.error("missing group name", 1)
670 if condname.isidentifier(): 684 if condname.isidentifier():
671 condgroup = state.groupdict.get(condname) 685 condgroup = state.groupdict.get(condname)
672 if condgroup is None: 686 if condgroup is None:
673 msg = "unknown group name: {0!r}".format(condname) 687 msg = "unknown group name: {0!r}".format(condname)
674 raise error(msg) 688 raise source.error(msg, len(condname) + 1)
675 else: 689 else:
676 try: 690 try:
677 condgroup = int(condname) 691 condgroup = int(condname)
678 if condgroup < 0: 692 if condgroup < 0:
679 raise ValueError 693 raise ValueError
680 except ValueError: 694 except ValueError:
681 raise error("bad character in group name") 695 raise source.error("bad character in group name",
696 len(condname) + 1)
682 if not condgroup: 697 if not condgroup:
683 raise error("bad group number") 698 raise source.error("bad group number",
699 len(condname) + 1)
684 if condgroup >= MAXGROUPS: 700 if condgroup >= MAXGROUPS:
685 raise error("the group number is too large") 701 raise source.error("the group number is too large",
702 len(condname) + 1)
686 elif char in FLAGS: 703 elif char in FLAGS:
687 # flags 704 # flags
688 state.flags |= FLAGS[char] 705 state.flags |= FLAGS[char]
689 while source.next in FLAGS: 706 while source.next in FLAGS:
690 state.flags |= FLAGS[sourceget()] 707 state.flags |= FLAGS[sourceget()]
691 verbose = state.flags & SRE_FLAG_VERBOSE 708 verbose = state.flags & SRE_FLAG_VERBOSE
692 else: 709 else:
693 raise error("unexpected end of pattern " + char) 710 raise source.error("unexpected end of pattern")
694 if group: 711 if group:
695 # parse group contents 712 # parse group contents
696 if group == 2: 713 if group == 2:
697 # anonymous group 714 # anonymous group
698 group = None 715 group = None
699 else: 716 else:
700 group = state.opengroup(name) 717 try:
718 group = state.opengroup(name)
719 except error as err:
720 raise source.error(err.msg, len(name) + 1)
701 if condgroup: 721 if condgroup:
702 p = _parse_sub_cond(source, state, condgroup) 722 p = _parse_sub_cond(source, state, condgroup)
703 else: 723 else:
704 p = _parse_sub(source, state) 724 p = _parse_sub(source, state)
705 if not sourcematch(")"): 725 if not sourcematch(")"):
706 raise error("unbalanced parenthesis") 726 raise source.error("unbalanced parenthesis")
707 if group is not None: 727 if group is not None:
708 state.closegroup(group) 728 state.closegroup(group)
709 subpatternappend((SUBPATTERN, (group, p))) 729 subpatternappend((SUBPATTERN, (group, p)))
710 else: 730 else:
711 while True: 731 while True:
712 char = sourceget() 732 char = sourceget()
713 if char is None: 733 if char is None:
714 raise error("unexpected end of pattern") 734 raise source.error("unexpected end of pattern")
715 if char == ")": 735 if char == ")":
716 break 736 break
717 raise error("unknown extension") 737 raise source.error("unknown extension", len(char))
718 738
719 elif this == "^": 739 elif this == "^":
720 subpatternappend((AT, AT_BEGINNING)) 740 subpatternappend((AT, AT_BEGINNING))
721 741
722 elif this == "$": 742 elif this == "$":
723 subpattern.append((AT, AT_END)) 743 subpattern.append((AT, AT_END))
724 744
725 else: 745 else:
726 raise error("parser error") 746 raise source.error("parser error", len(this))
727 747
728 return subpattern 748 return subpattern
729 749
730 def fix_flags(src, flags): 750 def fix_flags(src, flags):
731 # Check and fix flags according to the type of pattern (str or bytes) 751 # Check and fix flags according to the type of pattern (str or bytes)
732 if isinstance(src, str): 752 if isinstance(src, str):
733 if not flags & SRE_FLAG_ASCII: 753 if not flags & SRE_FLAG_ASCII:
734 flags |= SRE_FLAG_UNICODE 754 flags |= SRE_FLAG_UNICODE
735 elif flags & SRE_FLAG_UNICODE: 755 elif flags & SRE_FLAG_UNICODE:
736 raise ValueError("ASCII and UNICODE flags are incompatible") 756 raise ValueError("ASCII and UNICODE flags are incompatible")
(...skipping 10 matching lines...) Expand all
747 if pattern is None: 767 if pattern is None:
748 pattern = Pattern() 768 pattern = Pattern()
749 pattern.flags = flags 769 pattern.flags = flags
750 pattern.str = str 770 pattern.str = str
751 771
752 p = _parse_sub(source, pattern, 0) 772 p = _parse_sub(source, pattern, 0)
753 p.pattern.flags = fix_flags(str, p.pattern.flags) 773 p.pattern.flags = fix_flags(str, p.pattern.flags)
754 774
755 if source.next is not None: 775 if source.next is not None:
756 if source.next == ")": 776 if source.next == ")":
757 raise error("unbalanced parenthesis") 777 raise source.error("unbalanced parenthesis")
758 else: 778 else:
759 raise error("bogus characters at end of regular expression") 779 raise source.error("bogus characters at end of regular expression",
780 len(tail))
760 781
761 if flags & SRE_FLAG_DEBUG: 782 if flags & SRE_FLAG_DEBUG:
762 p.dump() 783 p.dump()
763 784
764 if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: 785 if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
765 # the VERBOSE flag was switched on inside the pattern. to be 786 # the VERBOSE flag was switched on inside the pattern. to be
766 # on the safe side, we'll parse the whole thing again... 787 # on the safe side, we'll parse the whole thing again...
767 return parse(str, p.pattern.flags) 788 return parse(str, p.pattern.flags)
768 789
769 return p 790 return p
(...skipping 18 matching lines...) Expand all
788 if this is None: 809 if this is None:
789 break # end of replacement string 810 break # end of replacement string
790 if this[0] == "\\": 811 if this[0] == "\\":
791 # group 812 # group
792 c = this[1] 813 c = this[1]
793 if c == "g": 814 if c == "g":
794 name = "" 815 name = ""
795 if s.match("<"): 816 if s.match("<"):
796 name = s.getuntil(">") 817 name = s.getuntil(">")
797 if not name: 818 if not name:
798 raise error("missing group name") 819 raise s.error("missing group name", 1)
799 try: 820 try:
800 index = int(name) 821 index = int(name)
801 if index < 0: 822 if index < 0:
802 raise error("negative group number") 823 raise s.error("negative group number", len(name) + 1)
803 if index >= MAXGROUPS: 824 if index >= MAXGROUPS:
804 raise error("the group number is too large") 825 raise s.error("the group number is too large",
826 len(name) + 1)
805 except ValueError: 827 except ValueError:
806 if not name.isidentifier(): 828 if not name.isidentifier():
807 raise error("bad character in group name") 829 raise s.error("bad character in group name",
830 len(name) + 1)
808 try: 831 try:
809 index = pattern.groupindex[name] 832 index = pattern.groupindex[name]
810 except KeyError: 833 except KeyError:
811 msg = "unknown group name: {0!r}".format(name) 834 msg = "unknown group name: {0!r}".format(name)
812 raise IndexError(msg) 835 raise IndexError(msg)
813 addgroup(index) 836 addgroup(index)
814 elif c == "0": 837 elif c == "0":
815 if s.next in OCTDIGITS: 838 if s.next in OCTDIGITS:
816 this += sget() 839 this += sget()
817 if s.next in OCTDIGITS: 840 if s.next in OCTDIGITS:
818 this += sget() 841 this += sget()
819 lappend(chr(int(this[1:], 8) & 0xff)) 842 lappend(chr(int(this[1:], 8) & 0xff))
820 elif c in DIGITS: 843 elif c in DIGITS:
821 isoctal = False 844 isoctal = False
822 if s.next in DIGITS: 845 if s.next in DIGITS:
823 this += sget() 846 this += sget()
824 if (c in OCTDIGITS and this[2] in OCTDIGITS and 847 if (c in OCTDIGITS and this[2] in OCTDIGITS and
825 s.next in OCTDIGITS): 848 s.next in OCTDIGITS):
826 this += sget() 849 this += sget()
827 isoctal = True 850 isoctal = True
828 c = int(this[1:], 8) 851 c = int(this[1:], 8)
829 if c > 0o377: 852 if c > 0o377:
830 raise error('octal escape value %r outside of ' 853 raise s.error('octal escape value %r outside of '
831 'range 0-0o377' % this) 854 'range 0-0o377' % this, len(this))
832 lappend(chr(c)) 855 lappend(chr(c))
833 if not isoctal: 856 if not isoctal:
834 addgroup(int(this[1:])) 857 addgroup(int(this[1:]))
835 else: 858 else:
836 try: 859 try:
837 this = chr(ESCAPES[this][1]) 860 this = chr(ESCAPES[this][1])
838 except KeyError: 861 except KeyError:
839 pass 862 pass
840 lappend(this) 863 lappend(this)
841 else: 864 else:
(...skipping 10 matching lines...) Expand all
852 g = match.group 875 g = match.group
853 empty = match.string[:0] 876 empty = match.string[:0]
854 groups, literals = template 877 groups, literals = template
855 literals = literals[:] 878 literals = literals[:]
856 try: 879 try:
857 for index, group in groups: 880 for index, group in groups:
858 literals[index] = g(group) or empty 881 literals[index] = g(group) or empty
859 except IndexError: 882 except IndexError:
860 raise error("invalid group reference") 883 raise error("invalid group reference")
861 return empty.join(literals) 884 return empty.join(literals)
OLDNEW
« no previous file with comments | « Lib/sre_constants.py ('k') | Lib/test/test_re.py » ('j') | no next file with comments »

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+