|
1 # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. |
|
2 # Licensed to PSF under a Contributor Agreement. |
|
3 |
|
4 """This module defines the data structures used to represent a grammar. |
|
5 |
|
6 These are a bit arcane because they are derived from the data |
|
7 structures used by Python's 'pgen' parser generator. |
|
8 |
|
9 There's also a table here mapping operators to their names in the |
|
10 token module; the Python tokenize module reports all operators as the |
|
11 fallback token code OP, but the parser needs the actual token code. |
|
12 |
|
13 """ |
|
14 |
|
# Future imports (keeps print() behavior identical on Python 2.6+)
from __future__ import print_function

# Python imports
import pickle

# Local imports
from . import token, tokenize
|
20 |
|
21 |
|
class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses.  The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """

    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        # Label 0 is reserved for the special EMPTY label.
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        # 256 is the conventional number of the first (start) symbol.
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file.

        The file is closed even if pickling raises.
        """
        # Protocol 2 is the newest protocol readable by both
        # Python 2 and Python 3 unpicklers.
        with open(filename, "wb") as f:
            pickle.dump(self.__dict__, f, 2)

    def load(self, filename):
        """Load the grammar tables from a pickle file.

        The file is closed even if unpickling raises.
        """
        with open(filename, "rb") as f:
            d = pickle.load(f)
        self.__dict__.update(d)

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        print("s2n")
        pprint(self.symbol2number)
        print("n2s")
        pprint(self.number2symbol)
        print("states")
        pprint(self.states)
        print("dfas")
        pprint(self.dfas)
        print("labels")
        pprint(self.labels)
        print("start", self.start)
|
114 |
|
115 |
|
# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

# Parse the two-column table above into a dict mapping each operator
# string to its token code, looked up by name on the token module.
opmap = {}
for entry in opmap_raw.split("\n"):
    if not entry:
        continue
    op_text, token_name = entry.split()
    opmap[op_text] = getattr(token, token_name)