# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

# Pgen imports
from . import grammar, token, tokenize

class PgenGrammar(grammar.Grammar):
    pass

class ParserGenerator(object):

    def __init__(self, filename, stream=None):
        close_stream = None
        if stream is None:
            stream = open(filename)
            close_stream = stream.close
        self.filename = filename
        self.stream = stream
        self.generator = tokenize.generate_tokens(stream.readline)
        self.gettoken()  # Initialize lookahead
        self.dfas, self.startsymbol = self.parse()
        if close_stream is not None:
            close_stream()
        self.first = {}  # map from symbol name to set of tokens
        self.addfirstsets()

    def make_grammar(self):
        c = PgenGrammar()
        names = sorted(self.dfas.keys())
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[name] = i
            c.number2symbol[i] = name
        for name in names:
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in state.arcs.items():
                    arcs.append((self.make_label(c, label), dfa.index(next)))
                if state.isfinal:
                    arcs.append((0, dfa.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
        c.start = c.symbol2number[self.startsymbol]
        return c
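
    # A sketch of the tables built above, for a hypothetical two-rule
    # grammar whose start symbol is "expr":
    #     c.symbol2number == {"expr": 256, "term": 257}
    #     c.number2symbol == {256: "expr", 257: "term"}
    #     c.dfas[256]     == (encoded_states, first_set)  # for "expr"
    #     c.start         == 256
    # Non-terminals are numbered from 256 up so they can never collide
    # with the (small) token numbers used for terminals.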

    def make_first(self, c, name):
        rawfirst = self.first[name]
        first = {}
        for label in rawfirst:
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first[ilabel] = 1
        return first

    def make_label(self, c, label):
        # XXX Maybe this should be a method on a subclass of converter?
        ilabel = len(c.labels)
        if label[0].isalpha():
            # Either a symbol name or a named token
            if label in c.symbol2number:
                # A symbol name (a non-terminal)
                if label in c.symbol2label:
                    return c.symbol2label[label]
                else:
                    c.labels.append((c.symbol2number[label], None))
                    c.symbol2label[label] = ilabel
                    return ilabel
            else:
                # A named token (NAME, NUMBER, STRING)
                itoken = getattr(token, label, None)
                assert isinstance(itoken, int), label
                assert itoken in token.tok_name, label
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
        else:
            # Either a keyword or an operator
            assert label[0] in ('"', "'"), label
            value = eval(label)
            if value[0].isalpha():
                # A keyword
                if value in c.keywords:
                    return c.keywords[value]
                else:
                    c.labels.append((token.NAME, value))
                    c.keywords[value] = ilabel
                    return ilabel
            else:
                # An operator (any non-numeric token)
                itoken = grammar.opmap[value]  # Fails if unknown token
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
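
    # The four label shapes handled above, with hypothetical examples of
    # how each would appear in a grammar file:
    #     expr     -> a non-terminal (it names another rule)
    #     NAME     -> a named token type from the token module
    #     'if'     -> a quoted alphabetic string, i.e. a keyword
    #     '+'      -> a quoted operator, looked up in grammar.opmap
    # In every case the label is interned: the first occurrence appends
    # an entry to c.labels and later occurrences reuse the cached index.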

    def addfirstsets(self):
        names = sorted(self.dfas.keys())
        for name in names:
            if name not in self.first:
                self.calcfirst(name)
            #print(name, self.first[name].keys())

    def calcfirst(self, name):
        dfa = self.dfas[name]
        self.first[name] = None  # dummy to detect left recursion
        state = dfa[0]
        totalset = {}
        overlapcheck = {}
        for label, next in state.arcs.items():
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calcfirst(label)
                    fset = self.first[label]
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset[label] = 1
                overlapcheck[label] = {label: 1}
        inverse = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError("rule %s is ambiguous; %s is in the"
                                     " first sets of %s as well as %s" %
                                     (name, symbol, label, inverse[symbol]))
                inverse[symbol] = label
        self.first[name] = totalset
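
    # A small worked example, assuming the hypothetical rules
    #     stmt: expr_stmt | 'pass'
    #     expr_stmt: NAME '=' NAME
    # calcfirst("stmt") recurses into "expr_stmt" (whose first set is
    # {"NAME": 1}) and adds the terminal "'pass'" directly, giving
    #     self.first["stmt"] == {"NAME": 1, "'pass'": 1}
    # The overlapcheck/inverse bookkeeping rejects a rule if two of its
    # alternatives could begin with the same token, since the generated
    # parser chooses an alternative with one token of lookahead.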

    def parse(self):
        dfas = {}
        startsymbol = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != token.ENDMARKER:
            while self.type == token.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(token.NAME)
            self.expect(token.OP, ":")
            a, z = self.parse_rhs()
            self.expect(token.NEWLINE)
            #self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
            #self.dump_dfa(name, dfa)
            oldlen = len(dfa)
            self.simplify_dfa(dfa)
            newlen = len(dfa)
            dfas[name] = dfa
            #print(name, oldlen, newlen)
            if startsymbol is None:
                startsymbol = name
        return dfas, startsymbol
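
    # Input format note: each grammar rule occupies one logical line,
    # e.g. the hypothetical rule
    #     atom: NAME | NUMBER | '(' expr ')'
    # parse() reads NAME ':' RHS NEWLINE per rule, compiles the RHS to
    # an NFA via the recursive-descent parse_* methods below, and then
    # converts that NFA to a simplified DFA before recording it.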

    def make_dfa(self, start, finish):
        # To turn an NFA into a DFA, we define the states of the DFA
        # to correspond to *sets* of states of the NFA.  Then do some
        # state reduction.  Let's represent sets as dicts with 1 for
        # values.
        assert isinstance(start, NFAState)
        assert isinstance(finish, NFAState)
        def closure(state):
            base = {}
            addclosure(state, base)
            return base
        def addclosure(state, base):
            assert isinstance(state, NFAState)
            if state in base:
                return
            base[state] = 1
            for label, next in state.arcs:
                if label is None:
                    addclosure(next, base)
        states = [DFAState(closure(start), finish)]
        for state in states:  # NB states grows while we're iterating
            arcs = {}
            for nfastate in state.nfaset:
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, {}))
            for label, nfaset in arcs.items():
                for st in states:
                    if st.nfaset == nfaset:
                        break
                else:
                    st = DFAState(nfaset, finish)
                    states.append(st)
                state.addarc(st, label)
        return states  # List of DFAState instances; first one is start
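
    # A minimal trace of the subset construction, for the hypothetical
    # rule "a: 'x' | 'y'".  parse_rhs() builds an NFA whose start state
    # reaches the two alternatives through epsilon (label None) arcs, so
    #     closure(start) == {start, branch_x, branch_y}
    # and the initial DFAState covers all three NFA states.  The labels
    # 'x' and 'y' then lead to two distinct final DFAStates (their NFA
    # sets differ even though both contain finish); simplify_dfa() below
    # is what folds such behaviorally identical states back together.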

    def dump_nfa(self, name, start, finish):
        print("Dump of NFA for", name)
        todo = [start]
        for i, state in enumerate(todo):
            print("  State", i, state is finish and "(final)" or "")
            for label, next in state.arcs:
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    print("    -> %d" % j)
                else:
                    print("    %s -> %d" % (label, j))

    def dump_dfa(self, name, dfa):
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print("  State", i, state.isfinal and "(final)" or "")
            for label, next in state.arcs.items():
                print("    %s -> %d" % (label, dfa.index(next)))

    def simplify_dfa(self, dfa):
        # This is not theoretically optimal, but works well enough.
        # Algorithm: repeatedly look for two states that have the same
        # set of arcs (same labels pointing to the same nodes) and
        # unify them, until things stop changing.

        # dfa is a list of DFAState instances
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(dfa):
                for j in range(i+1, len(dfa)):
                    state_j = dfa[j]
                    if state_i == state_j:
                        #print("  unify", i, j)
                        del dfa[j]
                        for state in dfa:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break

    def parse_rhs(self):
        # RHS: ALT ('|' ALT)*
        a, z = self.parse_alt()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState()
            zz = NFAState()
            aa.addarc(a)
            z.addarc(zz)
            while self.value == "|":
                self.gettoken()
                a, z = self.parse_alt()
                aa.addarc(a)
                z.addarc(zz)
            return aa, zz
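
    # Shape of the NFA built for an alternation, e.g. the hypothetical
    # RHS "'x' | 'y'" (unlabeled arcs are epsilon arcs):
    #
    #          +--> a1 --'x'--> z1 --+
    #     aa --+                     +--> zz
    #          +--> a2 --'y'--> z2 --+
    #
    # Every alternative is spliced between a shared start state aa and a
    # shared end state zz with epsilon arcs, so the alternatives never
    # need to agree on their internal shape.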

    def parse_alt(self):
        # ALT: ITEM+
        a, b = self.parse_item()
        while (self.value in ("(", "[") or
               self.type in (token.NAME, token.STRING)):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
        return a, b

    def parse_item(self):
        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, "]")
            a.addarc(z)
            return a, z
        else:
            a, z = self.parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self.gettoken()
            z.addarc(a)
            if value == "+":
                return a, z
            else:
                return a, a
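
    # How the repetition operators are encoded, for a hypothetical atom
    # with states a --ATOM--> z:
    #     [ATOM]  adds a -> z   (epsilon arc: the whole item is skippable)
    #     ATOM+   adds z -> a   (epsilon arc back: one or more passes)
    #     ATOM*   adds z -> a and returns (a, a), so continuing from the
    #             item's end never requires taking the ATOM arc at all
    #             (zero or more passes).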

    def parse_atom(self):
        # ATOM: '(' RHS ')' | NAME | STRING
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, ")")
            return a, z
        elif self.type in (token.NAME, token.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)
            self.gettoken()
            return a, z
        else:
            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
                             self.type, self.value)

    def expect(self, type, value=None):
        if self.type != type or (value is not None and self.value != value):
            self.raise_error("expected %s/%s, got %s/%s",
                             type, value, self.type, self.value)
        value = self.value
        self.gettoken()
        return value

    def gettoken(self):
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup
        #print(token.tok_name[self.type], repr(self.value))

    def raise_error(self, msg, *args):
        if args:
            try:
                msg = msg % args
            except:
                msg = " ".join([msg] + list(map(str, args)))
        raise SyntaxError(msg, (self.filename, self.end[0],
                                self.end[1], self.line))

class NFAState(object):

    def __init__(self):
        self.arcs = []  # list of (label, NFAState) pairs

    def addarc(self, next, label=None):
        assert label is None or isinstance(label, str)
        assert isinstance(next, NFAState)
        self.arcs.append((label, next))

class DFAState(object):

    def __init__(self, nfaset, final):
        assert isinstance(nfaset, dict)
        assert isinstance(next(iter(nfaset)), NFAState)
        assert isinstance(final, NFAState)
        self.nfaset = nfaset
        self.isfinal = final in nfaset
        self.arcs = {}  # map from label to DFAState

    def addarc(self, next, label):
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next, DFAState)
        self.arcs[label] = next

    def unifystate(self, old, new):
        for label, next in self.arcs.items():
            if next is old:
                self.arcs[label] = new

    def __eq__(self, other):
        # Equality test -- ignore the nfaset instance variable
        assert isinstance(other, DFAState)
        if self.isfinal != other.isfinal:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next in self.arcs.items():
            if next is not other.arcs.get(label):
                return False
        return True

def generate_grammar(filename="Grammar.txt"):
    p = ParserGenerator(filename)
    return p.make_grammar()
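
if __name__ == "__main__":
    # Minimal usage sketch: compile a grammar file and show a few of the
    # generated tables.  Assumes a pgen-format grammar file exists at the
    # hypothetical path "Grammar.txt" (one "name: rhs" rule per line, as
    # parsed above); adjust the path for your setup.
    g = generate_grammar("Grammar.txt")
    print("start symbol:", g.number2symbol[g.start])
    print("non-terminals:", len(g.dfas))
    print("keywords:", sorted(g.keywords))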