# Regex test suite and benchmark suite v1.5a2
# Due to the use of r"aw" strings, this file will
# only work with Python 1.5 or higher.

# The 3 possible outcomes for each pattern.  Judging from the entries in
# the test list below:
#   SUCCEED      - the pattern is expected to match the test string
#   FAIL         - the pattern is expected not to match
#   SYNTAX_ERROR - the pattern itself is expected to be rejected
SUCCEED, FAIL, SYNTAX_ERROR = range(3)

# Benchmark suite (needs expansion)
#
# The benchmark suite does not test correctness, just speed.  The
# first element of each tuple is the regex pattern; the second is a
# string to match it against.  The benchmarking code will embed the
# second string inside several sizes of padding, to test how regex
# matching performs on large strings.

# Speed-only cases: each entry is (pattern, string to match against).
# Patterns use the Emacs-style syntax seen throughout this file, where
# grouping and alternation are written \( \) and \| and a bare "(" is an
# ordinary character.
benchmarks = [
    ('Python', 'Python'),                     # Simple text literal
    ('.*Python', 'Python'),                   # Bad text literal
    ('.*Python.*', 'Python'),                 # Worse text literal
    ('.*\\(Python\\)', 'Python'),             # Bad text literal with grouping

    # Ungrouped alternation.  The previous pattern began with a stray
    # literal "(", so its first alternative could only match "(Python".
    ('Python\\|Perl\\|Tcl', 'Perl'),          # Alternation
    ('\\(Python\\|Perl\\|Tcl\\)', 'Perl'),    # Grouped alternation
    ('\\(Python\\)\\1', 'PythonPython'),      # Backreference
    # ('\\([0a-z][a-z]*,\\)+', 'a5,b7,c9,'),  # Disable the fastmap optimization
    ('\\([a-z][a-z0-9]*,\\)+', 'a5,b7,c9,'),  # A few sets
]

# Test suite (for verifying correctness)
#
# The test suite is a list of 5- or 3-tuples.  The 5 parts of a
# complete tuple are:
# element 0: a string containing the pattern
#         1: the string to match against the pattern
#         2: the expected result (SUCCEED, FAIL, SYNTAX_ERROR)
#         3: a string that will be eval()'ed to produce a test string.
#            This is an arbitrary Python expression; the available
#            variables are "found" (the whole match) and "g1", "g2", ...
#            up to "g10", which contain the contents of each group, or
#            the string 'None' if the group wasn't given a value.
#         4: The expected result of evaluating the expression.
#            If the two don't match, an error is reported.
#
# If the regex isn't expected to work, the latter two elements can be omitted.

# Correctness cases.  Each entry is
#   (pattern, subject, expected outcome[, eval expression, expected value])
# where the last two elements are normally present only for SUCCEED cases.
# Every backslash in a pattern is doubled so the string literal carries a
# single backslash through to the regex engine.
tests = [
    ('abc', 'abc', SUCCEED,
     'found', 'abc'),
    ('abc', 'xbc', FAIL),
    ('abc', 'axc', FAIL),
    ('abc', 'abx', FAIL),
    ('abc', 'xabcy', SUCCEED,
     'found', 'abc'),
    ('abc', 'ababc', SUCCEED,
     'found', 'abc'),
    ('ab*c', 'abc', SUCCEED,
     'found', 'abc'),
    ('ab*bc', 'abc', SUCCEED,
     'found', 'abc'),
    ('ab*bc', 'abbc', SUCCEED,
     'found', 'abbc'),
    ('ab*bc', 'abbbbc', SUCCEED,
     'found', 'abbbbc'),
    ('ab+bc', 'abbc', SUCCEED,
     'found', 'abbc'),
    ('ab+bc', 'abc', FAIL),
    ('ab+bc', 'abq', FAIL),
    ('ab+bc', 'abbbbc', SUCCEED,
     'found', 'abbbbc'),
    ('ab?bc', 'abbc', SUCCEED,
     'found', 'abbc'),
    ('ab?bc', 'abc', SUCCEED,
     'found', 'abc'),
    ('ab?bc', 'abbbbc', FAIL),
    ('ab?c', 'abc', SUCCEED,
     'found', 'abc'),
    ('^abc$', 'abc', SUCCEED,
     'found', 'abc'),
    ('^abc$', 'abcc', FAIL),
    ('^abc', 'abcc', SUCCEED,
     'found', 'abc'),
    ('^abc$', 'aabc', FAIL),
    ('abc$', 'aabc', SUCCEED,
     'found', 'abc'),
    ('^', 'abc', SUCCEED,
     'found+"-"', '-'),
    ('$', 'abc', SUCCEED,
     'found+"-"', '-'),
    ('a.c', 'abc', SUCCEED,
     'found', 'abc'),
    ('a.c', 'axc', SUCCEED,
     'found', 'axc'),
    ('a.*c', 'axyzc', SUCCEED,
     'found', 'axyzc'),
    ('a.*c', 'axyzd', FAIL),
    ('a[bc]d', 'abc', FAIL),
    ('a[bc]d', 'abd', SUCCEED,
     'found', 'abd'),
    ('a[b-d]e', 'abd', FAIL),
    ('a[b-d]e', 'ace', SUCCEED,
     'found', 'ace'),
    ('a[b-d]', 'aac', SUCCEED,
     'found', 'ac'),
    ('a[-b]', 'a-', SUCCEED,
     'found', 'a-'),
    ('a[b-]', 'a-', SUCCEED,
     'found', 'a-'),
    ('a[]b', '-', SYNTAX_ERROR),
    ('a[', '-', SYNTAX_ERROR),
    ('a\\', '-', SYNTAX_ERROR),
    ('abc\\)', '-', SYNTAX_ERROR),
    ('\\(abc', '-', SYNTAX_ERROR),
    ('a]', 'a]', SUCCEED,
     'found', 'a]'),
    ('a[]]b', 'a]b', SUCCEED,
     'found', 'a]b'),
    ('a[^bc]d', 'aed', SUCCEED,
     'found', 'aed'),
    ('a[^bc]d', 'abd', FAIL),
    ('a[^-b]c', 'adc', SUCCEED,
     'found', 'adc'),
    ('a[^-b]c', 'a-c', FAIL),
    ('a[^]b]c', 'a]c', FAIL),
    ('a[^]b]c', 'adc', SUCCEED,
     'found', 'adc'),
    ('\\ba\\b', 'a-', SUCCEED,
     '"-"', '-'),
    ('\\ba\\b', '-a', SUCCEED,
     '"-"', '-'),
    ('\\ba\\b', '-a-', SUCCEED,
     '"-"', '-'),
    ('\\by\\b', 'xy', FAIL),
    ('\\by\\b', 'yz', FAIL),
    ('\\by\\b', 'xyz', FAIL),
    ('ab\\|cd', 'abc', SUCCEED,
     'found', 'ab'),
    ('ab\\|cd', 'abcd', SUCCEED,
     'found', 'ab'),
    ('\\(\\)ef', 'def', SUCCEED,
     'found+"-"+g1', 'ef-'),
    ('$b', 'b', FAIL),
    # A bare "(" is an ordinary character in this syntax, so no group is set.
    ('a(b', 'a(b', SUCCEED,
     'found+"-"+g1', 'a(b-None'),
    ('a(*b', 'ab', SUCCEED,
     'found', 'ab'),
    ('a(*b', 'a((b', SUCCEED,
     'found', 'a((b'),
    ('a\\\\b', 'a\\b', SUCCEED,
     'found', 'a\\b'),
    ('\\(\\(a\\)\\)', 'abc', SUCCEED,
     'found+"-"+g1+"-"+g2', 'a-a-a'),
    ('\\(a\\)b\\(c\\)', 'abc', SUCCEED,
     'found+"-"+g1+"-"+g2', 'abc-a-c'),
    ('a+b+c', 'aabbabc', SUCCEED,
     'found', 'abc'),
    ('\\(a+\\|b\\)*', 'ab', SUCCEED,
     'found+"-"+g1', 'ab-b'),
    ('\\(a+\\|b\\)+', 'ab', SUCCEED,
     'found+"-"+g1', 'ab-b'),
    ('\\(a+\\|b\\)?', 'ab', SUCCEED,
     'found+"-"+g1', 'a-a'),
    ('\\)\\(', '-', SYNTAX_ERROR),
    ('[^ab]*', 'cde', SUCCEED,
     'found', 'cde'),
    ('abc', '', FAIL),
    ('a*', '', SUCCEED,
     'found', ''),
    ('a\\|b\\|c\\|d\\|e', 'e', SUCCEED,
     'found', 'e'),
    ('\\(a\\|b\\|c\\|d\\|e\\)f', 'ef', SUCCEED,
     'found+"-"+g1', 'ef-e'),
    ('abcd*efg', 'abcdefg', SUCCEED,
     'found', 'abcdefg'),
    ('ab*', 'xabyabbbz', SUCCEED,
     'found', 'ab'),
    ('ab*', 'xayabbbz', SUCCEED,
     'found', 'a'),
    ('\\(ab\\|cd\\)e', 'abcde', SUCCEED,
     'found+"-"+g1', 'cde-cd'),
    ('[abhgefdc]ij', 'hij', SUCCEED,
     'found', 'hij'),
    # NOTE(review): this FAIL entry still carries the optional 4th/5th
    # elements; presumably the driver ignores them -- confirm before trimming.
    ('^\\(ab\\|cd\\)e', 'abcde', FAIL,
     'xg1y', 'xy'),
    ('\\(abc\\|\\)ef', 'abcdef', SUCCEED,
     'found+"-"+g1', 'ef-'),
    ('\\(a\\|b\\)c*d', 'abcd', SUCCEED,
     'found+"-"+g1', 'bcd-b'),
    ('\\(ab\\|ab*\\)bc', 'abc', SUCCEED,
     'found+"-"+g1', 'abc-a'),
    ('a\\([bc]*\\)c*', 'abc', SUCCEED,
     'found+"-"+g1', 'abc-bc'),
    ('a\\([bc]*\\)\\(c*d\\)', 'abcd', SUCCEED,
     'found+"-"+g1+"-"+g2', 'abcd-bc-d'),
    ('a\\([bc]+\\)\\(c*d\\)', 'abcd', SUCCEED,
     'found+"-"+g1+"-"+g2', 'abcd-bc-d'),
    ('a\\([bc]*\\)\\(c+d\\)', 'abcd', SUCCEED,
     'found+"-"+g1+"-"+g2', 'abcd-b-cd'),
    ('a[bcd]*dcdcde', 'adcdcde', SUCCEED,
     'found', 'adcdcde'),
    ('a[bcd]+dcdcde', 'adcdcde', FAIL),
    ('\\(ab\\|a\\)b*c', 'abc', SUCCEED,
     'found+"-"+g1', 'abc-ab'),
    ('\\(\\(a\\)\\(b\\)c\\)\\(d\\)', 'abcd', SUCCEED,
     'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'),
    ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED,
     'found', 'alpha'),
    ('^a\\(bc+\\|b[eh]\\)g\\|.h$', 'abh', SUCCEED,
     'found+"-"+g1', 'bh-None'),
    ('\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'effgz', SUCCEED,
     'found+"-"+g1+"-"+g2', 'effgz-effgz-None'),
    ('\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'ij', SUCCEED,
     'found+"-"+g1+"-"+g2', 'ij-ij-j'),
    ('\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'effg', FAIL),
    ('\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'bcdd', FAIL),
    ('\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'reffgz', SUCCEED,
     'found+"-"+g1+"-"+g2', 'effgz-effgz-None'),
    ('\\(\\(\\(\\(\\(\\(\\(\\(\\(a\\)\\)\\)\\)\\)\\)\\)\\)\\)', 'a', SUCCEED,
     'found', 'a'),
    ('multiple words of text', 'uh-uh', FAIL),
    ('multiple words', 'multiple words, yeah', SUCCEED,
     'found', 'multiple words'),
    ('\\(.*\\)c\\(.*\\)', 'abcde', SUCCEED,
     'found+"-"+g1+"-"+g2', 'abcde-ab-de'),
    ('(\\(.*\\), \\(.*\\))', '(a, b)', SUCCEED,
     'g2+"-"+g1', 'b-a'),
    ('[k]', 'ab', FAIL),
    ('a[-]?c', 'ac', SUCCEED,
     'found', 'ac'),
    ('\\(abc\\)\\1', 'abcabc', SUCCEED,
     'g1', 'abc'),
    ('\\([a-c]*\\)\\1', 'abcabc', SUCCEED,
     'g1', 'abc'),
    ('^\\(.+\\)?B', 'AB', SUCCEED,
     'g1', 'A'),
    ('\\(a+\\).\\1$', 'aaaaa', SUCCEED,
     'found+"-"+g1', 'aaaaa-aa'),
    ('^\\(a+\\).\\1$', 'aaaa', FAIL),
    ('\\(abc\\)\\1', 'abcabc', SUCCEED,
     'found+"-"+g1', 'abcabc-abc'),
    ('\\([a-c]+\\)\\1', 'abcabc', SUCCEED,
     'found+"-"+g1', 'abcabc-abc'),
    ('\\(a\\)\\1', 'aa', SUCCEED,
     'found+"-"+g1', 'aa-a'),
    ('\\(a+\\)\\1', 'aa', SUCCEED,
     'found+"-"+g1', 'aa-a'),
    ('\\(a+\\)+\\1', 'aa', SUCCEED,
     'found+"-"+g1', 'aa-a'),
    ('\\(a\\).+\\1', 'aba', SUCCEED,
     'found+"-"+g1', 'aba-a'),
    ('\\(a\\)ba*\\1', 'aba', SUCCEED,
     'found+"-"+g1', 'aba-a'),
    ('\\(aa\\|a\\)a\\1$', 'aaa', SUCCEED,
     'found+"-"+g1', 'aaa-a'),
    ('\\(a\\|aa\\)a\\1$', 'aaa', SUCCEED,
     'found+"-"+g1', 'aaa-a'),
    ('\\(a+\\)a\\1$', 'aaa', SUCCEED,
     'found+"-"+g1', 'aaa-a'),
    ('\\([abc]*\\)\\1', 'abcabc', SUCCEED,
     'found+"-"+g1', 'abcabc-abc'),
    ('\\(a\\)\\(b\\)c\\|ab', 'ab', SUCCEED,
     'found+"-"+g1+"-"+g2', 'ab-None-None'),
    ('\\(a\\)+x', 'aaax', SUCCEED,
     'found+"-"+g1', 'aaax-a'),
    ('\\([ac]\\)+x', 'aacx', SUCCEED,
     'found+"-"+g1', 'aacx-c'),
    ('\\([^/]*/\\)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', SUCCEED,
     'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/'),
    ('\\([^.]*\\)\\.\\([^:]*\\):[T ]+\\(.*\\)', 'track1.title:TBlah blah blah', SUCCEED,
     'found+"-"+g1+"-"+g2+"-"+g3', 'track1.title:TBlah blah blah-track1-title-Blah blah blah'),
    ('\\([^N]*N\\)+', 'abNNxyzN', SUCCEED,
     'found+"-"+g1', 'abNNxyzN-xyzN'),
    ('\\([^N]*N\\)+', 'abNNxyz', SUCCEED,
     'found+"-"+g1', 'abNN-N'),
    ('\\([abc]*\\)x', 'abcx', SUCCEED,
     'found+"-"+g1', 'abcx-abc'),
    ('\\([abc]*\\)x', 'abc', FAIL),
    ('\\([xyz]*\\)x', 'abcx', SUCCEED,
     'found+"-"+g1', 'x-'),
    ('\\(a\\)+b\\|aac', 'aac', SUCCEED,
     'found+"-"+g1', 'aac-None'),
    # \< and \> anchors (presumably Emacs-style word start / word end).
    # The backslash is doubled like everywhere else in this list: the
    # resulting string is unchanged, but '\<' and '\>' are not valid
    # Python escape sequences and trigger warnings on modern interpreters.
    ('\\<a', 'a', SUCCEED, 'found', 'a'),
    ('\\<a', '!', FAIL),
    ('a\\<b', 'ab', FAIL),
    ('a\\>', 'ab', FAIL),
    ('a\\>', 'a!', SUCCEED, 'found', 'a'),
    ('a\\>', 'a', SUCCEED, 'found', 'a'),
]