|
1 """Iterator based sre token scanner |
|
2 |
|
3 """ |
|
4 |
|
5 import re |
|
6 import sre_parse |
|
7 import sre_compile |
|
8 import sre_constants |
|
9 |
|
10 from re import VERBOSE, MULTILINE, DOTALL |
|
11 from sre_constants import BRANCH, SUBPATTERN |
|
12 |
|
# Public names exported by a star-import of this module.
__all__ = [
    'Scanner',
    'pattern',
]

# Default regular-expression flags used by both Scanner and pattern()
# whenever the caller does not supply its own flags.
FLAGS = VERBOSE | MULTILINE | DOTALL
|
16 |
|
class Scanner(object):
    """Token scanner that tries every lexicon pattern through one compound regex.

    Each lexicon entry becomes one numbered group of a single alternation;
    the number of the group that matched selects the entry to call.
    """

    def __init__(self, lexicon, flags=FLAGS):
        """Compile *lexicon* into a single alternation pattern.

        lexicon -- iterable of callables.  Each must expose a ``pattern``
            attribute holding its regex source and is later invoked by
            ``iterscan`` as ``token(match, context)``.
        flags -- regex flags used when parsing every phrase.

        Any error raised while parsing a phrase propagates to the caller
        unchanged.
        """
        # Slot 0 is a placeholder so that group number N (groups start at 1)
        # indexes directly into self.actions.
        self.actions = [None]
        # Combine phrases into a compound pattern.
        # NOTE(review): this depends on private sre_parse internals (the
        # Pattern class, two-element SUBPATTERN arguments, an assignable
        # ``groups`` attribute) -- confirm they exist on the targeted
        # Python version.
        state = sre_parse.Pattern()
        state.flags = flags
        branches = []
        for idx, token in enumerate(lexicon):
            parsed = sre_parse.parse(token.pattern, flags)
            branches.append(
                sre_parse.SubPattern(state, [(SUBPATTERN, (idx + 1, parsed))]))
            self.actions.append(token)

        # NOTE(guido): Added to make SRE validation work
        state.groups = len(branches) + 1
        compound = sre_parse.SubPattern(state, [(BRANCH, (None, branches))])
        self.scanner = sre_compile.compile(compound)

    def iterscan(self, string, idx=0, context=None):
        """Yield ``(value, end_idx)`` for each token found in *string*.

        string -- the text to scan.
        idx -- position at which scanning starts.
        context -- opaque object handed to every action as second argument.

        ``value`` is the first element of the pair returned by the matching
        action.  ``end_idx`` is the end of the match, or the second element
        of that pair when the action supplies a different, non-None
        position; scanning then resumes from that position.

        Iteration ends when the underlying scanner reports no match, or when
        a match ends exactly where the previous one ended.
        """
        match = self.scanner.scanner(string, idx).match
        actions = self.actions
        lastend = idx
        while True:
            m = match()
            if m is None:
                break
            matchend = m.end()
            if lastend == matchend:
                # The end position did not move past the previous one.
                break
            action = actions[m.lastindex]
            if action is not None:
                rval, next_pos = action(m, context)
                if next_pos is not None and next_pos != matchend:
                    # "fast forward" the scanner: the action consumed input
                    # up to a different position, so restart matching there.
                    matchend = next_pos
                    match = self.scanner.scanner(string, matchend).match
                yield rval, matchend
            lastend = matchend
|
62 |
|
63 |
|
def pattern(pattern, flags=FLAGS):
    """Return a decorator that tags a function with a regex.

    The decorated function receives two attributes: ``pattern``, the regex
    source exactly as given, and ``regex``, that source compiled with
    *flags*.  The function itself is returned, not a wrapper.
    """
    def attach(fn):
        # Record the raw source first, then the compiled form.
        fn.pattern = pattern
        compiled = re.compile(pattern, flags)
        fn.regex = compiled
        return fn

    return attach