|
1 |
|
2 /* Parser implementation */ |
|
3 |
|
4 /* For a description, see the comments at end of this file */ |
|
5 |
|
6 /* XXX To do: error recovery */ |
|
7 |
|
8 #include "Python.h" |
|
9 #include "pgenheaders.h" |
|
10 #include "token.h" |
|
11 #include "grammar.h" |
|
12 #include "node.h" |
|
13 #include "parser.h" |
|
14 #include "errcode.h" |
|
15 |
|
16 |
|
17 #ifdef Py_DEBUG |
|
18 extern int Py_DebugFlag; |
|
19 #define D(x) if (!Py_DebugFlag); else x |
|
20 #else |
|
21 #define D(x) |
|
22 #endif |
|
23 |
|
24 |
|
25 /* STACK DATA TYPE */ |
|
26 |
|
27 static void s_reset(stack *); |
|
28 |
|
29 static void |
|
30 s_reset(stack *s) |
|
31 { |
|
32 s->s_top = &s->s_base[MAXSTACK]; |
|
33 } |
|
34 |
|
35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK]) |
|
36 |
|
37 static int |
|
38 s_push(register stack *s, dfa *d, node *parent) |
|
39 { |
|
40 register stackentry *top; |
|
41 if (s->s_top == s->s_base) { |
|
42 fprintf(stderr, "s_push: parser stack overflow\n"); |
|
43 return E_NOMEM; |
|
44 } |
|
45 top = --s->s_top; |
|
46 top->s_dfa = d; |
|
47 top->s_parent = parent; |
|
48 top->s_state = 0; |
|
49 return 0; |
|
50 } |
|
51 |
|
52 #ifdef Py_DEBUG |
|
53 |
|
54 static void |
|
55 s_pop(register stack *s) |
|
56 { |
|
57 if (s_empty(s)) |
|
58 Py_FatalError("s_pop: parser stack underflow -- FATAL"); |
|
59 s->s_top++; |
|
60 } |
|
61 |
|
62 #else /* !Py_DEBUG */ |
|
63 |
|
64 #define s_pop(s) (s)->s_top++ |
|
65 |
|
66 #endif |
|
67 |
|
68 |
|
69 /* PARSER CREATION */ |
|
70 |
|
71 parser_state * |
|
72 PyParser_New(grammar *g, int start) |
|
73 { |
|
74 parser_state *ps; |
|
75 |
|
76 if (!g->g_accel) |
|
77 PyGrammar_AddAccelerators(g); |
|
78 ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state)); |
|
79 if (ps == NULL) |
|
80 return NULL; |
|
81 ps->p_grammar = g; |
|
82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD |
|
83 ps->p_flags = 0; |
|
84 #endif |
|
85 ps->p_tree = PyNode_New(start); |
|
86 if (ps->p_tree == NULL) { |
|
87 PyMem_FREE(ps); |
|
88 return NULL; |
|
89 } |
|
90 s_reset(&ps->p_stack); |
|
91 (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree); |
|
92 return ps; |
|
93 } |
|
94 |
|
95 void |
|
96 PyParser_Delete(parser_state *ps) |
|
97 { |
|
98 /* NB If you want to save the parse tree, |
|
99 you must set p_tree to NULL before calling delparser! */ |
|
100 PyNode_Free(ps->p_tree); |
|
101 PyMem_FREE(ps); |
|
102 } |
|
103 |
|
104 |
|
105 /* PARSER STACK OPERATIONS */ |
|
106 |
|
107 static int |
|
108 shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset) |
|
109 { |
|
110 int err; |
|
111 assert(!s_empty(s)); |
|
112 err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset); |
|
113 if (err) |
|
114 return err; |
|
115 s->s_top->s_state = newstate; |
|
116 return 0; |
|
117 } |
|
118 |
|
119 static int |
|
120 push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset) |
|
121 { |
|
122 int err; |
|
123 register node *n; |
|
124 n = s->s_top->s_parent; |
|
125 assert(!s_empty(s)); |
|
126 err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset); |
|
127 if (err) |
|
128 return err; |
|
129 s->s_top->s_state = newstate; |
|
130 return s_push(s, d, CHILD(n, NCH(n)-1)); |
|
131 } |
|
132 |
|
133 |
|
134 /* PARSER PROPER */ |
|
135 |
|
136 static int |
|
137 classify(parser_state *ps, int type, char *str) |
|
138 { |
|
139 grammar *g = ps->p_grammar; |
|
140 register int n = g->g_ll.ll_nlabels; |
|
141 |
|
142 if (type == NAME) { |
|
143 register char *s = str; |
|
144 register label *l = g->g_ll.ll_label; |
|
145 register int i; |
|
146 for (i = n; i > 0; i--, l++) { |
|
147 if (l->lb_type != NAME || l->lb_str == NULL || |
|
148 l->lb_str[0] != s[0] || |
|
149 strcmp(l->lb_str, s) != 0) |
|
150 continue; |
|
151 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD |
|
152 if (ps->p_flags & CO_FUTURE_PRINT_FUNCTION && |
|
153 s[0] == 'p' && strcmp(s, "print") == 0) { |
|
154 break; /* no longer a keyword */ |
|
155 } |
|
156 #endif |
|
157 D(printf("It's a keyword\n")); |
|
158 return n - i; |
|
159 } |
|
160 } |
|
161 |
|
162 { |
|
163 register label *l = g->g_ll.ll_label; |
|
164 register int i; |
|
165 for (i = n; i > 0; i--, l++) { |
|
166 if (l->lb_type == type && l->lb_str == NULL) { |
|
167 D(printf("It's a token we know\n")); |
|
168 return n - i; |
|
169 } |
|
170 } |
|
171 } |
|
172 |
|
173 D(printf("Illegal token\n")); |
|
174 return -1; |
|
175 } |
|
176 |
|
177 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD |
|
178 static void |
|
179 future_hack(parser_state *ps) |
|
180 { |
|
181 node *n = ps->p_stack.s_top->s_parent; |
|
182 node *ch, *cch; |
|
183 int i; |
|
184 |
|
185 /* from __future__ import ..., must have at least 4 children */ |
|
186 n = CHILD(n, 0); |
|
187 if (NCH(n) < 4) |
|
188 return; |
|
189 ch = CHILD(n, 0); |
|
190 if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0) |
|
191 return; |
|
192 ch = CHILD(n, 1); |
|
193 if (NCH(ch) == 1 && STR(CHILD(ch, 0)) && |
|
194 strcmp(STR(CHILD(ch, 0)), "__future__") != 0) |
|
195 return; |
|
196 ch = CHILD(n, 3); |
|
197 /* ch can be a star, a parenthesis or import_as_names */ |
|
198 if (TYPE(ch) == STAR) |
|
199 return; |
|
200 if (TYPE(ch) == LPAR) |
|
201 ch = CHILD(n, 4); |
|
202 |
|
203 for (i = 0; i < NCH(ch); i += 2) { |
|
204 cch = CHILD(ch, i); |
|
205 if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) { |
|
206 char *str_ch = STR(CHILD(cch, 0)); |
|
207 if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) { |
|
208 ps->p_flags |= CO_FUTURE_WITH_STATEMENT; |
|
209 } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) { |
|
210 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION; |
|
211 } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) { |
|
212 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS; |
|
213 } |
|
214 } |
|
215 } |
|
216 } |
|
217 #endif /* future keyword */ |
|
218 |
|
219 int |
|
220 PyParser_AddToken(register parser_state *ps, register int type, char *str, |
|
221 int lineno, int col_offset, int *expected_ret) |
|
222 { |
|
223 register int ilabel; |
|
224 int err; |
|
225 |
|
226 D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str)); |
|
227 |
|
228 /* Find out which label this token is */ |
|
229 ilabel = classify(ps, type, str); |
|
230 if (ilabel < 0) |
|
231 return E_SYNTAX; |
|
232 |
|
233 /* Loop until the token is shifted or an error occurred */ |
|
234 for (;;) { |
|
235 /* Fetch the current dfa and state */ |
|
236 register dfa *d = ps->p_stack.s_top->s_dfa; |
|
237 register state *s = &d->d_state[ps->p_stack.s_top->s_state]; |
|
238 |
|
239 D(printf(" DFA '%s', state %d:", |
|
240 d->d_name, ps->p_stack.s_top->s_state)); |
|
241 |
|
242 /* Check accelerator */ |
|
243 if (s->s_lower <= ilabel && ilabel < s->s_upper) { |
|
244 register int x = s->s_accel[ilabel - s->s_lower]; |
|
245 if (x != -1) { |
|
246 if (x & (1<<7)) { |
|
247 /* Push non-terminal */ |
|
248 int nt = (x >> 8) + NT_OFFSET; |
|
249 int arrow = x & ((1<<7)-1); |
|
250 dfa *d1 = PyGrammar_FindDFA( |
|
251 ps->p_grammar, nt); |
|
252 if ((err = push(&ps->p_stack, nt, d1, |
|
253 arrow, lineno, col_offset)) > 0) { |
|
254 D(printf(" MemError: push\n")); |
|
255 return err; |
|
256 } |
|
257 D(printf(" Push ...\n")); |
|
258 continue; |
|
259 } |
|
260 |
|
261 /* Shift the token */ |
|
262 if ((err = shift(&ps->p_stack, type, str, |
|
263 x, lineno, col_offset)) > 0) { |
|
264 D(printf(" MemError: shift.\n")); |
|
265 return err; |
|
266 } |
|
267 D(printf(" Shift.\n")); |
|
268 /* Pop while we are in an accept-only state */ |
|
269 while (s = &d->d_state |
|
270 [ps->p_stack.s_top->s_state], |
|
271 s->s_accept && s->s_narcs == 1) { |
|
272 D(printf(" DFA '%s', state %d: " |
|
273 "Direct pop.\n", |
|
274 d->d_name, |
|
275 ps->p_stack.s_top->s_state)); |
|
276 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD |
|
277 if (d->d_name[0] == 'i' && |
|
278 strcmp(d->d_name, |
|
279 "import_stmt") == 0) |
|
280 future_hack(ps); |
|
281 #endif |
|
282 s_pop(&ps->p_stack); |
|
283 if (s_empty(&ps->p_stack)) { |
|
284 D(printf(" ACCEPT.\n")); |
|
285 return E_DONE; |
|
286 } |
|
287 d = ps->p_stack.s_top->s_dfa; |
|
288 } |
|
289 return E_OK; |
|
290 } |
|
291 } |
|
292 |
|
293 if (s->s_accept) { |
|
294 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD |
|
295 if (d->d_name[0] == 'i' && |
|
296 strcmp(d->d_name, "import_stmt") == 0) |
|
297 future_hack(ps); |
|
298 #endif |
|
299 /* Pop this dfa and try again */ |
|
300 s_pop(&ps->p_stack); |
|
301 D(printf(" Pop ...\n")); |
|
302 if (s_empty(&ps->p_stack)) { |
|
303 D(printf(" Error: bottom of stack.\n")); |
|
304 return E_SYNTAX; |
|
305 } |
|
306 continue; |
|
307 } |
|
308 |
|
309 /* Stuck, report syntax error */ |
|
310 D(printf(" Error.\n")); |
|
311 if (expected_ret) { |
|
312 if (s->s_lower == s->s_upper - 1) { |
|
313 /* Only one possible expected token */ |
|
314 *expected_ret = ps->p_grammar-> |
|
315 g_ll.ll_label[s->s_lower].lb_type; |
|
316 } |
|
317 else |
|
318 *expected_ret = -1; |
|
319 } |
|
320 return E_SYNTAX; |
|
321 } |
|
322 } |
|
323 |
|
324 |
|
325 #ifdef Py_DEBUG |
|
326 |
|
327 /* DEBUG OUTPUT */ |
|
328 |
|
329 void |
|
330 dumptree(grammar *g, node *n) |
|
331 { |
|
332 int i; |
|
333 |
|
334 if (n == NULL) |
|
335 printf("NIL"); |
|
336 else { |
|
337 label l; |
|
338 l.lb_type = TYPE(n); |
|
339 l.lb_str = STR(n); |
|
340 printf("%s", PyGrammar_LabelRepr(&l)); |
|
341 if (ISNONTERMINAL(TYPE(n))) { |
|
342 printf("("); |
|
343 for (i = 0; i < NCH(n); i++) { |
|
344 if (i > 0) |
|
345 printf(","); |
|
346 dumptree(g, CHILD(n, i)); |
|
347 } |
|
348 printf(")"); |
|
349 } |
|
350 } |
|
351 } |
|
352 |
|
353 void |
|
354 showtree(grammar *g, node *n) |
|
355 { |
|
356 int i; |
|
357 |
|
358 if (n == NULL) |
|
359 return; |
|
360 if (ISNONTERMINAL(TYPE(n))) { |
|
361 for (i = 0; i < NCH(n); i++) |
|
362 showtree(g, CHILD(n, i)); |
|
363 } |
|
364 else if (ISTERMINAL(TYPE(n))) { |
|
365 printf("%s", _PyParser_TokenNames[TYPE(n)]); |
|
366 if (TYPE(n) == NUMBER || TYPE(n) == NAME) |
|
367 printf("(%s)", STR(n)); |
|
368 printf(" "); |
|
369 } |
|
370 else |
|
371 printf("? "); |
|
372 } |
|
373 |
|
374 void |
|
375 printtree(parser_state *ps) |
|
376 { |
|
377 if (Py_DebugFlag) { |
|
378 printf("Parse tree:\n"); |
|
379 dumptree(ps->p_grammar, ps->p_tree); |
|
380 printf("\n"); |
|
381 printf("Tokens:\n"); |
|
382 showtree(ps->p_grammar, ps->p_tree); |
|
383 printf("\n"); |
|
384 } |
|
385 printf("Listing:\n"); |
|
386 PyNode_ListTree(ps->p_tree); |
|
387 printf("\n"); |
|
388 } |
|
389 |
|
390 #endif /* Py_DEBUG */ |
|
391 |
|
392 /* |
|
393 |
|
394 Description |
|
395 ----------- |
|
396 |
|
397 The parser's interface is different than usual: the function addtoken() |
|
398 must be called for each token in the input. This makes it possible to |
|
399 turn it into an incremental parsing system later. The parsing system |
|
400 constructs a parse tree as it goes. |
|
401 |
|
402 A parsing rule is represented as a Deterministic Finite-state Automaton |
|
403 (DFA). A node in a DFA represents a state of the parser; an arc represents |
|
404 a transition. Transitions are either labeled with terminal symbols or |
|
405 with non-terminals. When the parser decides to follow an arc labeled |
|
406 with a non-terminal, it is invoked recursively with the DFA representing |
|
407 the parsing rule for that as its initial state; when that DFA accepts, |
|
408 the parser that invoked it continues. The parse tree constructed by the |
|
409 recursively called parser is inserted as a child in the current parse tree. |
|
410 |
|
411 The DFA's can be constructed automatically from a more conventional |
|
412 language description. An extended LL(1) grammar (ELL(1)) is suitable. |
|
413 Certain restrictions make the parser's life easier: rules that can produce |
|
414 the empty string should be outlawed (there are other ways to put loops |
|
415 or optional parts in the language). To avoid the need to construct |
|
416 FIRST sets, we can require that all but the last alternative of a rule |
|
417 (really: arc going out of a DFA's state) must begin with a terminal |
|
418 symbol. |
|
419 |
|
420 As an example, consider this grammar: |
|
421 |
|
422 expr: term (OP term)* |
|
423 term: CONSTANT | '(' expr ')' |
|
424 |
|
425 The DFA corresponding to the rule for expr is: |
|
426 |
|
427 ------->.---term-->.-------> |
|
428 ^ | |
|
429 | | |
|
430 \----OP----/ |
|
431 |
|
432 The parse tree generated for the input a+b is: |
|
433 |
|
434 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b))) |
|
435 |
|
436 */ |