|
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. |
|
2 # Licensed to PSF under a Contributor Agreement. |
|
3 |
|
4 """Convert graminit.[ch] spit out by pgen to Python code. |
|
5 |
|
6 Pgen is the Python parser generator. It is useful to quickly create a |
|
7 parser from a grammar file in Python's grammar notation. But I don't |
|
8 want my parsers to be written in C (yet), so I'm translating the |
|
9 parsing tables to Python data structures and writing a Python parse |
|
10 engine. |
|
11 |
|
12 Note that the token numbers are constants determined by the standard |
|
13 Python tokenizer. The standard token module defines these numbers and |
|
14 their names (the names are not used much). The token numbers are |
|
15 hardcoded into the Python tokenizer and into pgen. A Python |
|
16 implementation of the Python tokenizer is also available, in the |
|
17 standard tokenize module. |
|
18 |
|
19 On the other hand, symbol numbers (representing the grammar's |
|
20 non-terminals) are assigned by pgen based on the actual grammar |
|
21 input. |
|
22 |
|
23 Note: this module is pretty much obsolete; the pgen module generates |
|
24 equivalent grammar tables directly from the Grammar.txt input file |
|
25 without having to invoke the Python pgen C program. |
|
26 |
|
27 """ |
|
28 |
|
29 # Python imports |
|
30 import re |
|
31 |
|
32 # Local imports |
|
33 from pgen2 import grammar, token |
|
34 |
|
35 |
|
36 class Converter(grammar.Grammar): |
|
37 """Grammar subclass that reads classic pgen output files. |
|
38 |
|
39 The run() method reads the tables as produced by the pgen parser |
|
40 generator, typically contained in two C files, graminit.h and |
|
41 graminit.c. The other methods are for internal use only. |
|
42 |
|
43 See the base class for more documentation. |
|
44 |
|
45 """ |
|
46 |
|
47 def run(self, graminit_h, graminit_c): |
|
48 """Load the grammar tables from the text files written by pgen.""" |
|
49 self.parse_graminit_h(graminit_h) |
|
50 self.parse_graminit_c(graminit_c) |
|
51 self.finish_off() |
|
52 |
|
53 def parse_graminit_h(self, filename): |
|
54 """Parse the .h file writen by pgen. (Internal) |
|
55 |
|
56 This file is a sequence of #define statements defining the |
|
57 nonterminals of the grammar as numbers. We build two tables |
|
58 mapping the numbers to names and back. |
|
59 |
|
60 """ |
|
61 try: |
|
62 f = open(filename) |
|
63 except IOError, err: |
|
64 print "Can't open %s: %s" % (filename, err) |
|
65 return False |
|
66 self.symbol2number = {} |
|
67 self.number2symbol = {} |
|
68 lineno = 0 |
|
69 for line in f: |
|
70 lineno += 1 |
|
71 mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line) |
|
72 if not mo and line.strip(): |
|
73 print "%s(%s): can't parse %s" % (filename, lineno, |
|
74 line.strip()) |
|
75 else: |
|
76 symbol, number = mo.groups() |
|
77 number = int(number) |
|
78 assert symbol not in self.symbol2number |
|
79 assert number not in self.number2symbol |
|
80 self.symbol2number[symbol] = number |
|
81 self.number2symbol[number] = symbol |
|
82 return True |
|
83 |
|
84 def parse_graminit_c(self, filename): |
|
85 """Parse the .c file writen by pgen. (Internal) |
|
86 |
|
87 The file looks as follows. The first two lines are always this: |
|
88 |
|
89 #include "pgenheaders.h" |
|
90 #include "grammar.h" |
|
91 |
|
92 After that come four blocks: |
|
93 |
|
94 1) one or more state definitions |
|
95 2) a table defining dfas |
|
96 3) a table defining labels |
|
97 4) a struct defining the grammar |
|
98 |
|
99 A state definition has the following form: |
|
100 - one or more arc arrays, each of the form: |
|
101 static arc arcs_<n>_<m>[<k>] = { |
|
102 {<i>, <j>}, |
|
103 ... |
|
104 }; |
|
105 - followed by a state array, of the form: |
|
106 static state states_<s>[<t>] = { |
|
107 {<k>, arcs_<n>_<m>}, |
|
108 ... |
|
109 }; |
|
110 |
|
111 """ |
|
112 try: |
|
113 f = open(filename) |
|
114 except IOError, err: |
|
115 print "Can't open %s: %s" % (filename, err) |
|
116 return False |
|
117 # The code below essentially uses f's iterator-ness! |
|
118 lineno = 0 |
|
119 |
|
120 # Expect the two #include lines |
|
121 lineno, line = lineno+1, f.next() |
|
122 assert line == '#include "pgenheaders.h"\n', (lineno, line) |
|
123 lineno, line = lineno+1, f.next() |
|
124 assert line == '#include "grammar.h"\n', (lineno, line) |
|
125 |
|
126 # Parse the state definitions |
|
127 lineno, line = lineno+1, f.next() |
|
128 allarcs = {} |
|
129 states = [] |
|
130 while line.startswith("static arc "): |
|
131 while line.startswith("static arc "): |
|
132 mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$", |
|
133 line) |
|
134 assert mo, (lineno, line) |
|
135 n, m, k = map(int, mo.groups()) |
|
136 arcs = [] |
|
137 for _ in range(k): |
|
138 lineno, line = lineno+1, f.next() |
|
139 mo = re.match(r"\s+{(\d+), (\d+)},$", line) |
|
140 assert mo, (lineno, line) |
|
141 i, j = map(int, mo.groups()) |
|
142 arcs.append((i, j)) |
|
143 lineno, line = lineno+1, f.next() |
|
144 assert line == "};\n", (lineno, line) |
|
145 allarcs[(n, m)] = arcs |
|
146 lineno, line = lineno+1, f.next() |
|
147 mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line) |
|
148 assert mo, (lineno, line) |
|
149 s, t = map(int, mo.groups()) |
|
150 assert s == len(states), (lineno, line) |
|
151 state = [] |
|
152 for _ in range(t): |
|
153 lineno, line = lineno+1, f.next() |
|
154 mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line) |
|
155 assert mo, (lineno, line) |
|
156 k, n, m = map(int, mo.groups()) |
|
157 arcs = allarcs[n, m] |
|
158 assert k == len(arcs), (lineno, line) |
|
159 state.append(arcs) |
|
160 states.append(state) |
|
161 lineno, line = lineno+1, f.next() |
|
162 assert line == "};\n", (lineno, line) |
|
163 lineno, line = lineno+1, f.next() |
|
164 self.states = states |
|
165 |
|
166 # Parse the dfas |
|
167 dfas = {} |
|
168 mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line) |
|
169 assert mo, (lineno, line) |
|
170 ndfas = int(mo.group(1)) |
|
171 for i in range(ndfas): |
|
172 lineno, line = lineno+1, f.next() |
|
173 mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$', |
|
174 line) |
|
175 assert mo, (lineno, line) |
|
176 symbol = mo.group(2) |
|
177 number, x, y, z = map(int, mo.group(1, 3, 4, 5)) |
|
178 assert self.symbol2number[symbol] == number, (lineno, line) |
|
179 assert self.number2symbol[number] == symbol, (lineno, line) |
|
180 assert x == 0, (lineno, line) |
|
181 state = states[z] |
|
182 assert y == len(state), (lineno, line) |
|
183 lineno, line = lineno+1, f.next() |
|
184 mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line) |
|
185 assert mo, (lineno, line) |
|
186 first = {} |
|
187 rawbitset = eval(mo.group(1)) |
|
188 for i, c in enumerate(rawbitset): |
|
189 byte = ord(c) |
|
190 for j in range(8): |
|
191 if byte & (1<<j): |
|
192 first[i*8 + j] = 1 |
|
193 dfas[number] = (state, first) |
|
194 lineno, line = lineno+1, f.next() |
|
195 assert line == "};\n", (lineno, line) |
|
196 self.dfas = dfas |
|
197 |
|
198 # Parse the labels |
|
199 labels = [] |
|
200 lineno, line = lineno+1, f.next() |
|
201 mo = re.match(r"static label labels\[(\d+)\] = {$", line) |
|
202 assert mo, (lineno, line) |
|
203 nlabels = int(mo.group(1)) |
|
204 for i in range(nlabels): |
|
205 lineno, line = lineno+1, f.next() |
|
206 mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line) |
|
207 assert mo, (lineno, line) |
|
208 x, y = mo.groups() |
|
209 x = int(x) |
|
210 if y == "0": |
|
211 y = None |
|
212 else: |
|
213 y = eval(y) |
|
214 labels.append((x, y)) |
|
215 lineno, line = lineno+1, f.next() |
|
216 assert line == "};\n", (lineno, line) |
|
217 self.labels = labels |
|
218 |
|
219 # Parse the grammar struct |
|
220 lineno, line = lineno+1, f.next() |
|
221 assert line == "grammar _PyParser_Grammar = {\n", (lineno, line) |
|
222 lineno, line = lineno+1, f.next() |
|
223 mo = re.match(r"\s+(\d+),$", line) |
|
224 assert mo, (lineno, line) |
|
225 ndfas = int(mo.group(1)) |
|
226 assert ndfas == len(self.dfas) |
|
227 lineno, line = lineno+1, f.next() |
|
228 assert line == "\tdfas,\n", (lineno, line) |
|
229 lineno, line = lineno+1, f.next() |
|
230 mo = re.match(r"\s+{(\d+), labels},$", line) |
|
231 assert mo, (lineno, line) |
|
232 nlabels = int(mo.group(1)) |
|
233 assert nlabels == len(self.labels), (lineno, line) |
|
234 lineno, line = lineno+1, f.next() |
|
235 mo = re.match(r"\s+(\d+)$", line) |
|
236 assert mo, (lineno, line) |
|
237 start = int(mo.group(1)) |
|
238 assert start in self.number2symbol, (lineno, line) |
|
239 self.start = start |
|
240 lineno, line = lineno+1, f.next() |
|
241 assert line == "};\n", (lineno, line) |
|
242 try: |
|
243 lineno, line = lineno+1, f.next() |
|
244 except StopIteration: |
|
245 pass |
|
246 else: |
|
247 assert 0, (lineno, line) |
|
248 |
|
249 def finish_off(self): |
|
250 """Create additional useful structures. (Internal).""" |
|
251 self.keywords = {} # map from keyword strings to arc labels |
|
252 self.tokens = {} # map from numeric token values to arc labels |
|
253 for ilabel, (type, value) in enumerate(self.labels): |
|
254 if type == token.NAME and value is not None: |
|
255 self.keywords[value] = ilabel |
|
256 elif value is None: |
|
257 self.tokens[type] = ilabel |