Commit | Line | Data |
---|---|---|
920dae64 AT |
# Regex test suite and benchmark suite v1.5a2
# Due to the use of r"aw" strings, this file will
# only work with Python 1.5 or higher.
#
# Pattern syntax used throughout this file: grouping and alternation are
# written with backslash-escaped operators (\( ... \) and \|), while a bare
# "(" is an ordinary literal character (see the 'a(b' test below).  All
# backslashes are doubled inside the ordinary (non-raw) string literals so
# that no literal relies on an unrecognized escape sequence.

# The 3 possible outcomes for each pattern
# (bound to the integers 0, 1 and 2 respectively)
SUCCEED, FAIL, SYNTAX_ERROR = range(3)

# Benchmark suite (needs expansion)
#
# The benchmark suite does not test correctness, just speed.  The
# first element of each tuple is the regex pattern; the second is a
# string to match it against.  The benchmarking code will embed the
# second string inside several sizes of padding, to test how regex
# matching performs on large strings.

benchmarks = [
    ('Python', 'Python'),                     # Simple text literal
    ('.*Python', 'Python'),                   # Bad text literal
    ('.*Python.*', 'Python'),                 # Worse text literal
    ('.*\\(Python\\)', 'Python'),             # Bad text literal with grouping

    ('Python\\|Perl\\|Tcl', 'Perl'),          # Alternation
    ('\\(Python\\|Perl\\|Tcl\\)', 'Perl'),    # Grouped alternation
    ('\\(Python\\)\\1', 'PythonPython'),      # Backreference
#   ('\\([0a-z][a-z]*,\\)+', 'a5,b7,c9,'),    # Disable the fastmap optimization
    ('\\([a-z][a-z0-9]*,\\)+', 'a5,b7,c9,')   # A few sets
]

# Test suite (for verifying correctness)
#
# The test suite is a list of 5- or 3-tuples.  The 5 parts of a
# complete tuple are:
# element 0: a string containing the pattern
#         1: the string to match against the pattern
#         2: the expected result (SUCCEED, FAIL, SYNTAX_ERROR)
#         3: a string that will be eval()'ed to produce a test string.
#            This is an arbitrary Python expression; the available
#            variables are "found" (the whole match), and "g1", "g2", ...
#            up to "g10" contain the contents of each group, or the
#            string 'None' if the group wasn't given a value.
#         4: The expected result of evaluating the expression.
#            If the two don't match, an error is reported.
#
# If the regex isn't expected to work, the latter two elements can be omitted.

tests = [
    ('abc', 'abc', SUCCEED,
     'found', 'abc'),
    ('abc', 'xbc', FAIL),
    ('abc', 'axc', FAIL),
    ('abc', 'abx', FAIL),
    ('abc', 'xabcy', SUCCEED,
     'found', 'abc'),
    ('abc', 'ababc', SUCCEED,
     'found', 'abc'),
    ('ab*c', 'abc', SUCCEED,
     'found', 'abc'),
    ('ab*bc', 'abc', SUCCEED,
     'found', 'abc'),
    ('ab*bc', 'abbc', SUCCEED,
     'found', 'abbc'),
    ('ab*bc', 'abbbbc', SUCCEED,
     'found', 'abbbbc'),
    ('ab+bc', 'abbc', SUCCEED,
     'found', 'abbc'),
    ('ab+bc', 'abc', FAIL),
    ('ab+bc', 'abq', FAIL),
    ('ab+bc', 'abbbbc', SUCCEED,
     'found', 'abbbbc'),
    ('ab?bc', 'abbc', SUCCEED,
     'found', 'abbc'),
    ('ab?bc', 'abc', SUCCEED,
     'found', 'abc'),
    ('ab?bc', 'abbbbc', FAIL),
    ('ab?c', 'abc', SUCCEED,
     'found', 'abc'),
    ('^abc$', 'abc', SUCCEED,
     'found', 'abc'),
    ('^abc$', 'abcc', FAIL),
    ('^abc', 'abcc', SUCCEED,
     'found', 'abc'),
    ('^abc$', 'aabc', FAIL),
    ('abc$', 'aabc', SUCCEED,
     'found', 'abc'),
    ('^', 'abc', SUCCEED,
     'found+"-"', '-'),
    ('$', 'abc', SUCCEED,
     'found+"-"', '-'),
    ('a.c', 'abc', SUCCEED,
     'found', 'abc'),
    ('a.c', 'axc', SUCCEED,
     'found', 'axc'),
    ('a.*c', 'axyzc', SUCCEED,
     'found', 'axyzc'),
    ('a.*c', 'axyzd', FAIL),
    ('a[bc]d', 'abc', FAIL),
    ('a[bc]d', 'abd', SUCCEED,
     'found', 'abd'),
    ('a[b-d]e', 'abd', FAIL),
    ('a[b-d]e', 'ace', SUCCEED,
     'found', 'ace'),
    ('a[b-d]', 'aac', SUCCEED,
     'found', 'ac'),
    ('a[-b]', 'a-', SUCCEED,
     'found', 'a-'),
    ('a[b-]', 'a-', SUCCEED,
     'found', 'a-'),
    ('a[]b', '-', SYNTAX_ERROR),
    ('a[', '-', SYNTAX_ERROR),
    ('a\\', '-', SYNTAX_ERROR),
    ('abc\\)', '-', SYNTAX_ERROR),
    ('\\(abc', '-', SYNTAX_ERROR),
    ('a]', 'a]', SUCCEED,
     'found', 'a]'),
    ('a[]]b', 'a]b', SUCCEED,
     'found', 'a]b'),
    ('a[^bc]d', 'aed', SUCCEED,
     'found', 'aed'),
    ('a[^bc]d', 'abd', FAIL),
    ('a[^-b]c', 'adc', SUCCEED,
     'found', 'adc'),
    ('a[^-b]c', 'a-c', FAIL),
    ('a[^]b]c', 'a]c', FAIL),
    ('a[^]b]c', 'adc', SUCCEED,
     'found', 'adc'),
    ('\\ba\\b', 'a-', SUCCEED,
     '"-"', '-'),
    ('\\ba\\b', '-a', SUCCEED,
     '"-"', '-'),
    ('\\ba\\b', '-a-', SUCCEED,
     '"-"', '-'),
    ('\\by\\b', 'xy', FAIL),
    ('\\by\\b', 'yz', FAIL),
    ('\\by\\b', 'xyz', FAIL),
    ('ab\\|cd', 'abc', SUCCEED,
     'found', 'ab'),
    ('ab\\|cd', 'abcd', SUCCEED,
     'found', 'ab'),
    ('\\(\\)ef', 'def', SUCCEED,
     'found+"-"+g1', 'ef-'),
    ('$b', 'b', FAIL),
    ('a(b', 'a(b', SUCCEED,
     'found+"-"+g1', 'a(b-None'),
    ('a(*b', 'ab', SUCCEED,
     'found', 'ab'),
    ('a(*b', 'a((b', SUCCEED,
     'found', 'a((b'),
    ('a\\\\b', 'a\\b', SUCCEED,
     'found', 'a\\b'),
    ('\\(\\(a\\)\\)', 'abc', SUCCEED,
     'found+"-"+g1+"-"+g2', 'a-a-a'),
    ('\\(a\\)b\\(c\\)', 'abc', SUCCEED,
     'found+"-"+g1+"-"+g2', 'abc-a-c'),
    ('a+b+c', 'aabbabc', SUCCEED,
     'found', 'abc'),
    ('\\(a+\\|b\\)*', 'ab', SUCCEED,
     'found+"-"+g1', 'ab-b'),
    ('\\(a+\\|b\\)+', 'ab', SUCCEED,
     'found+"-"+g1', 'ab-b'),
    ('\\(a+\\|b\\)?', 'ab', SUCCEED,
     'found+"-"+g1', 'a-a'),
    ('\\)\\(', '-', SYNTAX_ERROR),
    ('[^ab]*', 'cde', SUCCEED,
     'found', 'cde'),
    ('abc', '', FAIL),
    ('a*', '', SUCCEED,
     'found', ''),
    ('a\\|b\\|c\\|d\\|e', 'e', SUCCEED,
     'found', 'e'),
    ('\\(a\\|b\\|c\\|d\\|e\\)f', 'ef', SUCCEED,
     'found+"-"+g1', 'ef-e'),
    ('abcd*efg', 'abcdefg', SUCCEED,
     'found', 'abcdefg'),
    ('ab*', 'xabyabbbz', SUCCEED,
     'found', 'ab'),
    ('ab*', 'xayabbbz', SUCCEED,
     'found', 'a'),
    ('\\(ab\\|cd\\)e', 'abcde', SUCCEED,
     'found+"-"+g1', 'cde-cd'),
    ('[abhgefdc]ij', 'hij', SUCCEED,
     'found', 'hij'),
    ('^\\(ab\\|cd\\)e', 'abcde', FAIL,
     'xg1y', 'xy'),
    ('\\(abc\\|\\)ef', 'abcdef', SUCCEED,
     'found+"-"+g1', 'ef-'),
    ('\\(a\\|b\\)c*d', 'abcd', SUCCEED,
     'found+"-"+g1', 'bcd-b'),
    ('\\(ab\\|ab*\\)bc', 'abc', SUCCEED,
     'found+"-"+g1', 'abc-a'),
    ('a\\([bc]*\\)c*', 'abc', SUCCEED,
     'found+"-"+g1', 'abc-bc'),
    ('a\\([bc]*\\)\\(c*d\\)', 'abcd', SUCCEED,
     'found+"-"+g1+"-"+g2', 'abcd-bc-d'),
    ('a\\([bc]+\\)\\(c*d\\)', 'abcd', SUCCEED,
     'found+"-"+g1+"-"+g2', 'abcd-bc-d'),
    ('a\\([bc]*\\)\\(c+d\\)', 'abcd', SUCCEED,
     'found+"-"+g1+"-"+g2', 'abcd-b-cd'),
    ('a[bcd]*dcdcde', 'adcdcde', SUCCEED,
     'found', 'adcdcde'),
    ('a[bcd]+dcdcde', 'adcdcde', FAIL),
    ('\\(ab\\|a\\)b*c', 'abc', SUCCEED,
     'found+"-"+g1', 'abc-ab'),
    ('\\(\\(a\\)\\(b\\)c\\)\\(d\\)', 'abcd', SUCCEED,
     'g1+"-"+g2+"-"+g3+"-"+g4', 'abc-a-b-d'),
    ('[a-zA-Z_][a-zA-Z0-9_]*', 'alpha', SUCCEED,
     'found', 'alpha'),
    ('^a\\(bc+\\|b[eh]\\)g\\|.h$', 'abh', SUCCEED,
     'found+"-"+g1', 'bh-None'),
    ('\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'effgz', SUCCEED,
     'found+"-"+g1+"-"+g2', 'effgz-effgz-None'),
    ('\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'ij', SUCCEED,
     'found+"-"+g1+"-"+g2', 'ij-ij-j'),
    ('\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'effg', FAIL),
    ('\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'bcdd', FAIL),
    ('\\(bc+d$\\|ef*g.\\|h?i\\(j\\|k\\)\\)', 'reffgz', SUCCEED,
     'found+"-"+g1+"-"+g2', 'effgz-effgz-None'),
    ('\\(\\(\\(\\(\\(\\(\\(\\(\\(a\\)\\)\\)\\)\\)\\)\\)\\)\\)', 'a', SUCCEED,
     'found', 'a'),
    ('multiple words of text', 'uh-uh', FAIL),
    ('multiple words', 'multiple words, yeah', SUCCEED,
     'found', 'multiple words'),
    ('\\(.*\\)c\\(.*\\)', 'abcde', SUCCEED,
     'found+"-"+g1+"-"+g2', 'abcde-ab-de'),
    ('(\\(.*\\), \\(.*\\))', '(a, b)', SUCCEED,
     'g2+"-"+g1', 'b-a'),
    ('[k]', 'ab', FAIL),
    ('a[-]?c', 'ac', SUCCEED,
     'found', 'ac'),
    ('\\(abc\\)\\1', 'abcabc', SUCCEED,
     'g1', 'abc'),
    ('\\([a-c]*\\)\\1', 'abcabc', SUCCEED,
     'g1', 'abc'),
    ('^\\(.+\\)?B', 'AB', SUCCEED,
     'g1', 'A'),
    ('\\(a+\\).\\1$', 'aaaaa', SUCCEED,
     'found+"-"+g1', 'aaaaa-aa'),
    ('^\\(a+\\).\\1$', 'aaaa', FAIL),
    ('\\(abc\\)\\1', 'abcabc', SUCCEED,
     'found+"-"+g1', 'abcabc-abc'),
    ('\\([a-c]+\\)\\1', 'abcabc', SUCCEED,
     'found+"-"+g1', 'abcabc-abc'),
    ('\\(a\\)\\1', 'aa', SUCCEED,
     'found+"-"+g1', 'aa-a'),
    ('\\(a+\\)\\1', 'aa', SUCCEED,
     'found+"-"+g1', 'aa-a'),
    ('\\(a+\\)+\\1', 'aa', SUCCEED,
     'found+"-"+g1', 'aa-a'),
    ('\\(a\\).+\\1', 'aba', SUCCEED,
     'found+"-"+g1', 'aba-a'),
    ('\\(a\\)ba*\\1', 'aba', SUCCEED,
     'found+"-"+g1', 'aba-a'),
    ('\\(aa\\|a\\)a\\1$', 'aaa', SUCCEED,
     'found+"-"+g1', 'aaa-a'),
    ('\\(a\\|aa\\)a\\1$', 'aaa', SUCCEED,
     'found+"-"+g1', 'aaa-a'),
    ('\\(a+\\)a\\1$', 'aaa', SUCCEED,
     'found+"-"+g1', 'aaa-a'),
    ('\\([abc]*\\)\\1', 'abcabc', SUCCEED,
     'found+"-"+g1', 'abcabc-abc'),
    ('\\(a\\)\\(b\\)c\\|ab', 'ab', SUCCEED,
     'found+"-"+g1+"-"+g2', 'ab-None-None'),
    ('\\(a\\)+x', 'aaax', SUCCEED,
     'found+"-"+g1', 'aaax-a'),
    ('\\([ac]\\)+x', 'aacx', SUCCEED,
     'found+"-"+g1', 'aacx-c'),
    ('\\([^/]*/\\)*sub1/', 'd:msgs/tdir/sub1/trial/away.cpp', SUCCEED,
     'found+"-"+g1', 'd:msgs/tdir/sub1/-tdir/'),
    ('\\([^.]*\\)\\.\\([^:]*\\):[T ]+\\(.*\\)', 'track1.title:TBlah blah blah', SUCCEED,
     'found+"-"+g1+"-"+g2+"-"+g3', 'track1.title:TBlah blah blah-track1-title-Blah blah blah'),
    ('\\([^N]*N\\)+', 'abNNxyzN', SUCCEED,
     'found+"-"+g1', 'abNNxyzN-xyzN'),
    ('\\([^N]*N\\)+', 'abNNxyz', SUCCEED,
     'found+"-"+g1', 'abNN-N'),
    ('\\([abc]*\\)x', 'abcx', SUCCEED,
     'found+"-"+g1', 'abcx-abc'),
    ('\\([abc]*\\)x', 'abc', FAIL),
    ('\\([xyz]*\\)x', 'abcx', SUCCEED,
     'found+"-"+g1', 'x-'),
    ('\\(a\\)+b\\|aac', 'aac', SUCCEED,
     'found+"-"+g1', 'aac-None'),
    # The backslash in \< and \> is doubled like every other backslash in
    # this file; the resulting pattern text is still backslash + '<' / '>'.
    ('\\<a', 'a', SUCCEED, 'found', 'a'),
    ('\\<a', '!', FAIL),
    ('a\\<b', 'ab', FAIL),
    ('a\\>', 'ab', FAIL),
    ('a\\>', 'a!', SUCCEED, 'found', 'a'),
    ('a\\>', 'a', SUCCEED, 'found', 'a'),
]