Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | # |
2 | # Secret Labs' Regular Expression Engine | |
3 | # | |
4 | # convert template to internal format | |
5 | # | |
6 | # Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. | |
7 | # | |
8 | # See the sre.py file for information on usage and redistribution. | |
9 | # | |
10 | ||
11 | """Internal support module for sre""" | |
12 | ||
13 | import _sre, sys | |
14 | ||
15 | from sre_constants import * | |
16 | ||
# The C engine (_sre) and the Python-level opcode tables (sre_constants)
# must come from the same engine version, or the codes emitted below
# would be meaningless to the matcher.
assert _sre.MAGIC == MAGIC, "SRE module mismatch"

# Largest value that fits in one code word of the compiled pattern;
# depends on whether _sre was built with 2-byte or 4-byte code words.
if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFFL  # Python 2 long literal: 32-bit code words
23 | ||
24 | def _identityfunction(x): | |
25 | return x | |
26 | ||
def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern.
    # 'pattern' is a sequence of (op, av) tuples produced by sre_parse;
    # opcodes and operands are appended in place to the flat list 'code',
    # one word at a time.
    emit = code.append
    _len = len
    # membership tables used as fast "op in ..." tests
    LITERAL_CODES = {LITERAL:1, NOT_LITERAL:1}
    REPEATING_CODES = {REPEAT:1, MIN_REPEAT:1, MAX_REPEAT:1}
    SUCCESS_CODES = {SUCCESS:1, FAILURE:1}
    ASSERT_CODES = {ASSERT:1, ASSERT_NOT:1}
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                # emit the case-insensitive variant with a lowercased operand
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                # bind the current flags so the charset compiler can fold case
                def fixup(literal, flags=flags):
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            # reserve a skip slot, compile the set, then backpatch the width
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                # NOTE(review): the emits below are unreachable dead code
                # (kept from an earlier template implementation)
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            elif _simple(av) and op is not REPEAT:
                # single-width repeat target: use the cheaper *_ONE opcodes
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])  # min count
                emit(av[1])  # max count
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                # general repeat: REPEAT ... MAX_UNTIL / MIN_UNTIL
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])  # min count
                emit(av[1])  # max count
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av[0] is the group number (or None/0 for non-capturing);
            # a capturing group is bracketed by a pair of MARK opcodes
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                # look-behind needs a fixed width so the engine can back up
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            # anchors: remap by active flags (multiline / locale / unicode)
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            # each alternative: <skip> <body> JUMP <tail-slot>
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            # backpatch every alternative's JUMP to land here
            for tail in tail:
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)  # group numbers are 1-based in the parse tree
        elif op is GROUPREF_EXISTS:
            # conditional: (?(group)yes|no)
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError, ("unsupported operand type", op)
172 | ||
def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram: optimize the (op, av) items first,
    # then emit one opcode + operand(s) per item, terminated by FAILURE.
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass  # NEGATE takes no operand
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)  # av is a pre-built bitmap word list
        elif op is BIGCHARSET:
            code.extend(av)  # av is header + chunk mapping + chunk bitmaps
        elif op is CATEGORY:
            # remap category code by active flags (locale / unicode)
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])
201 | ||
def _optimize_charset(charset, fixup):
    # internal: optimize a character set.
    # Collapse LITERAL/RANGE items into at most two literal/range items,
    # or into a single CHARSET bitmap; fall back to the input when the
    # set cannot be compressed.
    result = []
    bitmap = [0] * 256
    try:
        for op, av in charset:
            if op is NEGATE:
                result.append((op, av))
            elif op is LITERAL:
                bitmap[fixup(av)] = 1
            elif op is RANGE:
                for ch in range(fixup(av[0]), fixup(av[1]) + 1):
                    bitmap[ch] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset  # cannot compress
    except IndexError:
        # a character code above 255: hand off to the unicode optimizer
        return _optimize_unicode(charset, fixup)
    # collect runs of consecutive set bits as (start, length) pairs
    runs = []
    start = count = 0
    for pos in range(256):
        if bitmap[pos]:
            if not count:
                start = pos
            count = count + 1
        elif count:
            runs.append((start, count))
            count = 0
    if count:
        runs.append((start, count))
    if len(runs) <= 2:
        # few runs: represent them as literal/range items
        for start, count in runs:
            if count == 1:
                result.append((LITERAL, start))
            else:
                result.append((RANGE, (start, start + count - 1)))
        if len(result) < len(charset):
            return result
    else:
        # many runs: a packed bitmap is more compact
        result.append((CHARSET, _mk_bitmap(bitmap)))
        return result
    return charset  # optimization did not pay off
252 | ||
def _mk_bitmap(bits):
    # Pack a sequence of 0/1 flags into a list of fixed-size code words,
    # least-significant bit first.  Word size follows _sre.CODESIZE.
    # NOTE(review): trailing bits that do not fill a whole word are
    # dropped — callers are expected to pass a multiple of the word size.
    data = []
    dataappend = data.append
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1L, 0L)  # Python 2 longs: accumulate 32-bit words
    m, v = start  # m = current bit mask, v = word accumulated so far
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            # word is full: flush it and start the next one
            dataappend(v)
            m, v = start
    return data
269 | ||
270 | # To represent a big charset, first a bitmap of all characters in the | |
271 | # set is constructed. Then, this bitmap is sliced into chunks of 256 | |
272 | # characters, duplicate chunks are eliminated, and each chunk is | |
273 | # given a number. In the compiled expression, the charset is | |
274 | # represented by a 16-bit word sequence, consisting of one word for | |
275 | # the number of different chunks, a sequence of 256 bytes (128 words) | |
276 | # of chunk numbers indexed by their original chunk position, and a | |
277 | # sequence of chunks (16 words each). | |
278 | ||
279 | # Compression is normally good: in a typical charset, large ranges of | |
280 | # Unicode will be either completely excluded (e.g. if only cyrillic | |
281 | # letters are to be matched), or completely included (e.g. if large | |
282 | # subranges of Kanji match). These ranges will be represented by | |
283 | # chunks of all one-bits or all zero-bits. | |
284 | ||
285 | # Matching can be also done efficiently: the more significant byte of | |
286 | # the Unicode character is an index into the chunk number, and the | |
287 | # less significant byte is a bit index in the chunk (just like the | |
288 | # CHARSET matching). | |
289 | ||
290 | # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets | |
291 | # of the basic multilingual plane; an efficient representation | |
292 | # for all of UTF-16 has not yet been developed. This means, | |
293 | # in particular, that negated charsets cannot be represented as | |
294 | # bigcharsets. | |
295 | ||
def _optimize_unicode(charset, fixup):
    # internal: build a BIGCHARSET item for a set containing characters
    # above 255.  Returns the original charset unchanged whenever the
    # set cannot be represented (categories, non-BMP characters, or
    # negation on wide builds).
    try:
        import array
    except ImportError:
        return charset
    charmap = [0]*65536  # one flag per BMP code point
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        # invert the whole map; the BIGCHARSET item itself is positive
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    # slice the map into 256-entry chunks, de-duplicating identical
    # chunks; 'mapping' maps original chunk position -> chunk number
    comps = {}
    mapping = [0]*256
    block = 0
    data = []
    for i in xrange(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            # first occurrence of this chunk: store its bitmap
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    header = header + mapping.tolist()
    data[0:0] = header  # prepend chunk count + mapping words
    return [(BIGCHARSET, data)]
348 | ||
def _simple(av):
    # check if av is a "simple" operator: the repeat target has a fixed
    # width of exactly one character and is not a capturing subpattern,
    # so the cheaper REPEAT_ONE / MIN_REPEAT_ONE opcodes can be used.
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        # e.g. (a*)* — would loop forever in the engine
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
355 | ||
def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0  # length of the prefix that can be skipped on match
    charset = [] # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                # a one-item group: its literal extends the prefix, but
                # cannot extend prefix_skip (the group must still match)
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    # all alternatives must start with a literal
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            # the whole pattern is a literal string
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)  # 0 means "unbounded / unknown" maximum width
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table (KMP-style failure function over prefix)
        table = [-1] + ([0]*len(prefix))
        for i in xrange(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
461 | ||
# Determine which types count as "strings" on this interpreter:
# plain str always; unicode as well when the build provides it.
try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))

def isstring(obj):
    # Return 1 if obj is a supported string type, else 0.
    # isinstance accepts a tuple of types, so a single call suffices.
    if isinstance(obj, STRING_TYPES):
        return 1
    return 0
474 | ||
def _code(p, flags):
    # Compile the parsed pattern object 'p' into a flat code list.
    # Flags from the pattern source are merged with the caller's flags.
    flags = p.pattern.flags | flags
    out = []

    # optional fast-path hints (width, literal prefix or charset)
    _compile_info(out, p, flags)

    # the pattern body itself, terminated by SUCCESS
    _compile(out, p.data, flags)
    out.append(OPCODES[SUCCESS])

    return out
489 | ||
def compile(p, flags=0):
    # internal: convert a pattern (source string or parsed pattern
    # object) to the internal _sre format and hand it to the engine.
    if isstring(p):
        import sre_parse
        pattern = p  # keep the source text for the pattern object
        p = sre_parse.parse(p, flags)
    else:
        pattern = None  # already parsed; no source text available

    code = _code(p, flags)

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # build both directions of the name<->index mapping
    groupindex = p.pattern.groupdict
    indexgroup = [None] * p.pattern.groups
    for name, index in groupindex.items():
        indexgroup[index] = name

    return _sre.compile(
        pattern, flags, code,
        p.pattern.groups-1,
        groupindex, indexgroup
        )