|
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. |
|
2 # Licensed to PSF under a Contributor Agreement. |
|
3 |
|
4 """Parser engine for the grammar tables generated by pgen. |
|
5 |
|
6 The grammar table must be loaded first. |
|
7 |
|
8 See Parser/parser.c in the Python distribution for additional info on |
|
9 how this parsing engine works. |
|
10 |
|
11 """ |
|
12 |
|
13 # Local imports |
|
14 from . import token |
|
15 |
|
16 class ParseError(Exception): |
|
17 """Exception to signal the parser is stuck.""" |
|
18 |
|
19 def __init__(self, msg, type, value, context): |
|
20 Exception.__init__(self, "%s: type=%r, value=%r, context=%r" % |
|
21 (msg, type, value, context)) |
|
22 self.msg = msg |
|
23 self.type = type |
|
24 self.value = value |
|
25 self.context = context |
|
26 |
|
27 class Parser(object): |
|
28 """Parser engine. |
|
29 |
|
30 The proper usage sequence is: |
|
31 |
|
32 p = Parser(grammar, [converter]) # create instance |
|
33 p.setup([start]) # prepare for parsing |
|
34 <for each input token>: |
|
35 if p.addtoken(...): # parse a token; may raise ParseError |
|
36 break |
|
37 root = p.rootnode # root of abstract syntax tree |
|
38 |
|
39 A Parser instance may be reused by calling setup() repeatedly. |
|
40 |
|
41 A Parser instance contains state pertaining to the current token |
|
42 sequence, and should not be used concurrently by different threads |
|
43 to parse separate token sequences. |
|
44 |
|
45 See driver.py for how to get input tokens by tokenizing a file or |
|
46 string. |
|
47 |
|
48 Parsing is complete when addtoken() returns True; the root of the |
|
49 abstract syntax tree can then be retrieved from the rootnode |
|
50 instance variable. When a syntax error occurs, addtoken() raises |
|
51 the ParseError exception. There is no error recovery; the parser |
|
52 cannot be used after a syntax error was reported (but it can be |
|
53 reinitialized by calling setup()). |
|
54 |
|
55 """ |
|
56 |
|
57 def __init__(self, grammar, convert=None): |
|
58 """Constructor. |
|
59 |
|
60 The grammar argument is a grammar.Grammar instance; see the |
|
61 grammar module for more information. |
|
62 |
|
63 The parser is not ready yet for parsing; you must call the |
|
64 setup() method to get it started. |
|
65 |
|
66 The optional convert argument is a function mapping concrete |
|
67 syntax tree nodes to abstract syntax tree nodes. If not |
|
68 given, no conversion is done and the syntax tree produced is |
|
69 the concrete syntax tree. If given, it must be a function of |
|
70 two arguments, the first being the grammar (a grammar.Grammar |
|
71 instance), and the second being the concrete syntax tree node |
|
72 to be converted. The syntax tree is converted from the bottom |
|
73 up. |
|
74 |
|
75 A concrete syntax tree node is a (type, value, context, nodes) |
|
76 tuple, where type is the node type (a token or symbol number), |
|
77 value is None for symbols and a string for tokens, context is |
|
78 None or an opaque value used for error reporting (typically a |
|
79 (lineno, offset) pair), and nodes is a list of children for |
|
80 symbols, and None for tokens. |
|
81 |
|
82 An abstract syntax tree node may be anything; this is entirely |
|
83 up to the converter function. |
|
84 |
|
85 """ |
|
86 self.grammar = grammar |
|
87 self.convert = convert or (lambda grammar, node: node) |
|
88 |
|
89 def setup(self, start=None): |
|
90 """Prepare for parsing. |
|
91 |
|
92 This *must* be called before starting to parse. |
|
93 |
|
94 The optional argument is an alternative start symbol; it |
|
95 defaults to the grammar's start symbol. |
|
96 |
|
97 You can use a Parser instance to parse any number of programs; |
|
98 each time you call setup() the parser is reset to an initial |
|
99 state determined by the (implicit or explicit) start symbol. |
|
100 |
|
101 """ |
|
102 if start is None: |
|
103 start = self.grammar.start |
|
104 # Each stack entry is a tuple: (dfa, state, node). |
|
105 # A node is a tuple: (type, value, context, children), |
|
106 # where children is a list of nodes or None, and context may be None. |
|
107 newnode = (start, None, None, []) |
|
108 stackentry = (self.grammar.dfas[start], 0, newnode) |
|
109 self.stack = [stackentry] |
|
110 self.rootnode = None |
|
111 self.used_names = set() # Aliased to self.rootnode.used_names in pop() |
|
112 |
|
113 def addtoken(self, type, value, context): |
|
114 """Add a token; return True iff this is the end of the program.""" |
|
115 # Map from token to label |
|
116 ilabel = self.classify(type, value, context) |
|
117 # Loop until the token is shifted; may raise exceptions |
|
118 while True: |
|
119 dfa, state, node = self.stack[-1] |
|
120 states, first = dfa |
|
121 arcs = states[state] |
|
122 # Look for a state with this label |
|
123 for i, newstate in arcs: |
|
124 t, v = self.grammar.labels[i] |
|
125 if ilabel == i: |
|
126 # Look it up in the list of labels |
|
127 assert t < 256 |
|
128 # Shift a token; we're done with it |
|
129 self.shift(type, value, newstate, context) |
|
130 # Pop while we are in an accept-only state |
|
131 state = newstate |
|
132 while states[state] == [(0, state)]: |
|
133 self.pop() |
|
134 if not self.stack: |
|
135 # Done parsing! |
|
136 return True |
|
137 dfa, state, node = self.stack[-1] |
|
138 states, first = dfa |
|
139 # Done with this token |
|
140 return False |
|
141 elif t >= 256: |
|
142 # See if it's a symbol and if we're in its first set |
|
143 itsdfa = self.grammar.dfas[t] |
|
144 itsstates, itsfirst = itsdfa |
|
145 if ilabel in itsfirst: |
|
146 # Push a symbol |
|
147 self.push(t, self.grammar.dfas[t], newstate, context) |
|
148 break # To continue the outer while loop |
|
149 else: |
|
150 if (0, state) in arcs: |
|
151 # An accepting state, pop it and try something else |
|
152 self.pop() |
|
153 if not self.stack: |
|
154 # Done parsing, but another token is input |
|
155 raise ParseError("too much input", |
|
156 type, value, context) |
|
157 else: |
|
158 # No success finding a transition |
|
159 raise ParseError("bad input", type, value, context) |
|
160 |
|
161 def classify(self, type, value, context): |
|
162 """Turn a token into a label. (Internal)""" |
|
163 if type == token.NAME: |
|
164 # Keep a listing of all used names |
|
165 self.used_names.add(value) |
|
166 # Check for reserved words |
|
167 ilabel = self.grammar.keywords.get(value) |
|
168 if ilabel is not None: |
|
169 return ilabel |
|
170 ilabel = self.grammar.tokens.get(type) |
|
171 if ilabel is None: |
|
172 raise ParseError("bad token", type, value, context) |
|
173 return ilabel |
|
174 |
|
175 def shift(self, type, value, newstate, context): |
|
176 """Shift a token. (Internal)""" |
|
177 dfa, state, node = self.stack[-1] |
|
178 newnode = (type, value, context, None) |
|
179 newnode = self.convert(self.grammar, newnode) |
|
180 if newnode is not None: |
|
181 node[-1].append(newnode) |
|
182 self.stack[-1] = (dfa, newstate, node) |
|
183 |
|
184 def push(self, type, newdfa, newstate, context): |
|
185 """Push a nonterminal. (Internal)""" |
|
186 dfa, state, node = self.stack[-1] |
|
187 newnode = (type, None, context, []) |
|
188 self.stack[-1] = (dfa, newstate, node) |
|
189 self.stack.append((newdfa, 0, newnode)) |
|
190 |
|
191 def pop(self): |
|
192 """Pop a nonterminal. (Internal)""" |
|
193 popdfa, popstate, popnode = self.stack.pop() |
|
194 newnode = self.convert(self.grammar, popnode) |
|
195 if newnode is not None: |
|
196 if self.stack: |
|
197 dfa, state, node = self.stack[-1] |
|
198 node[-1].append(newnode) |
|
199 else: |
|
200 self.rootnode = newnode |
|
201 self.rootnode.used_names = self.used_names |