Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / sam-t2 / devtools / v8plus / lib / python2.4 / sre_compile.py
CommitLineData
920dae64
AT
1#
2# Secret Labs' Regular Expression Engine
3#
4# convert template to internal format
5#
6# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
7#
8# See the sre.py file for information on usage and redistribution.
9#
10
11"""Internal support module for sre"""
12
13import _sre, sys
14
15from sre_constants import *
16
17assert _sre.MAGIC == MAGIC, "SRE module mismatch"
18
# Largest value that fits in one code word; the _sre engine is built
# with either 2-byte or 4-byte code words.
if _sre.CODESIZE == 2:
    MAXCODE = 65535
else:
    MAXCODE = 0xFFFFFFFFL
23
24def _identityfunction(x):
25 return x
26
def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern
    #
    # Walks the parsed (op, av) pairs in `pattern` and appends the
    # corresponding SRE code words to `code` (a flat list).  `flags`
    # is the bitwise OR of the SRE_FLAG_* constants.  Forward jump
    # offsets are handled by emitting a 0 placeholder ("skip" slot)
    # and backpatching it once the target position is known.
    emit = code.append
    _len = len  # hoisted builtin lookup (hot loop)
    # dicts used as membership sets for fast "op in ..." tests
    LITERAL_CODES = {LITERAL:1, NOT_LITERAL:1}
    REPEATING_CODES = {REPEAT:1, MIN_REPEAT:1, MAX_REPEAT:1}
    SUCCESS_CODES = {SUCCESS:1, FAILURE:1}
    ASSERT_CODES = {ASSERT:1, ASSERT_NOT:1}
    for op, av in pattern:
        if op in LITERAL_CODES:
            # single (possibly negated) literal character
            if flags & SRE_FLAG_IGNORECASE:
                # fold the literal to lower case at compile time
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            # character set: opcode, skip slot, then the set items
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    # case-fold each set member (flags bound at def time)
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code); emit(0)  # placeholder, patched below
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])  # '.' matches newline too
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                raise error, "internal: unsupported template operator"
                # NOTE: the following emits are unreachable (after raise)
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])
                emit(av[1])
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            elif _simple(av) and op is not REPEAT:
                # fixed-width single-item target: use the cheaper
                # REPEAT_ONE / MIN_REPEAT_ONE forms
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])  # min count
                emit(av[1])  # max count
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                # general repeat: body followed by MAX/MIN_UNTIL
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])  # min count
                emit(av[1])  # max count
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            # av[0] is the group number (or None/0 for non-capturing);
            # MARK slots 2n and 2n+1 record the group's start and end
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0]-1)*2+1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            # lookahead/lookbehind assertion
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0) # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error, "look-behind requires fixed-width pattern"
                emit(lo) # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            # positional assertion (^, $, \b, ...); pick the variant
            # that matches the active flags
            emit(OPCODES[op])
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            # alternation: each branch gets a skip slot and ends with a
            # JUMP past the whole construct; jump targets are patched
            # after all branches are emitted
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0) # end of branch
            for tail in tail:
                # patch each branch's trailing JUMP to land here
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            # \d, \w, \s etc.; pick locale/unicode variant per flags
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            # backreference; group numbers are emitted zero-based
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av-1)
        elif op is GROUPREF_EXISTS:
            # conditional (?(group)yes|no)
            emit(OPCODES[op])
            emit(av[0]-1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                # "no" branch present: yes-branch skip jumps past the
                # JUMP operand as well (hence the +1)
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError, ("unsupported operand type", op)
172
def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram
    #
    # Emits the (optimized) set items onto `code`, terminated by a
    # FAILURE sentinel.  `fixup` maps literal character codes (used
    # for compile-time case folding); None means identity.
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass  # the opcode alone carries the meaning
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)  # av is a pre-built bitmap word list
        elif op is BIGCHARSET:
            code.extend(av)  # av is header + chunk bitmaps (see below)
        elif op is CATEGORY:
            # pick the locale/unicode variant of the category code
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error, "internal: unsupported set operator"
    emit(OPCODES[FAILURE])  # marks the end of the set subprogram
201
def _optimize_charset(charset, fixup):
    # internal: optimize character set
    #
    # Returns an equivalent item list: plain LITERAL/RANGE items when
    # the set collapses to at most two runs, a 256-bit CHARSET bitmap
    # when it has more, or the input unchanged when no representation
    # is smaller.  Sets with characters above 255 are delegated to
    # _optimize_unicode via the IndexError below.
    out = []
    outappend = out.append
    charmap = [0]*256  # one flag per 8-bit code point
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset # cannot compress
    except IndexError:
        # character set contains unicode characters (code > 255)
        return _optimize_unicode(charset, fixup)
    # compress character map: collect runs of consecutive set bits
    i = p = n = 0  # i: position, p: start of current run, n: run length
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        runsappend((p, n))  # flush a run that reaches position 255
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p+n-1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset  # optimization did not shrink the set
252
def _mk_bitmap(bits):
    # internal: pack a sequence of 0/1 flags into a list of code words,
    # least significant bit first (16 bits per word when CODESIZE is 2,
    # 32 bits otherwise).  Only whole words are emitted, so len(bits)
    # is expected to be a multiple of the word size.
    data = []
    dataappend = data.append
    if _sre.CODESIZE == 2:
        start = (1, 0)
    else:
        start = (1L, 0L)  # longs so the 32-bit mask never overflows
    m, v = start  # m: mask for the next bit, v: word being assembled
    for c in bits:
        if c:
            v = v + m
        m = m + m
        if m > MAXCODE:
            # current word is full: flush it and start the next one
            dataappend(v)
            m, v = start
    return data
269
270# To represent a big charset, first a bitmap of all characters in the
271# set is constructed. Then, this bitmap is sliced into chunks of 256
272# characters, duplicate chunks are eliminitated, and each chunk is
273# given a number. In the compiled expression, the charset is
274# represented by a 16-bit word sequence, consisting of one word for
275# the number of different chunks, a sequence of 256 bytes (128 words)
276# of chunk numbers indexed by their original chunk position, and a
277# sequence of chunks (16 words each).
278
279# Compression is normally good: in a typical charset, large ranges of
280# Unicode will be either completely excluded (e.g. if only cyrillic
281# letters are to be matched), or completely included (e.g. if large
282# subranges of Kanji match). These ranges will be represented by
283# chunks of all one-bits or all zero-bits.
284
285# Matching can be also done efficiently: the more significant byte of
286# the Unicode character is an index into the chunk number, and the
287# less significant byte is a bit index in the chunk (just like the
288# CHARSET matching).
289
290# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
291# of the basic multilingual plane; an efficient representation
292# for all of UTF-16 has not yet been developed. This means,
293# in particular, that negated charsets cannot be represented as
294# bigcharsets.
295
def _optimize_unicode(charset, fixup):
    # internal: compress a charset containing characters above 255 into
    # a single BIGCHARSET item (a chunked 65536-bit bitmap; the layout
    # is described in the comment block below this function).
    try:
        import array
    except ImportError:
        return charset  # no array module: leave the set uncompressed
    charmap = [0]*65536  # one flag per BMP code point
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1])+1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset # cannot compress
    except IndexError:
        # non-BMP characters
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    # slice the bitmap into 256 chunks of 256 bits; identical chunks
    # are shared via the `comps` cache and stored only once
    comps = {}
    mapping = [0]*256  # chunk number for each high byte
    block = 0          # number of distinct chunks seen so far
    data = []
    for i in xrange(256):
        chunk = tuple(charmap[i*256:(i+1)*256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            # first occurrence of this chunk: append its packed bitmap
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    header = header + mapping.tolist()
    data[0:0] = header  # chunk count + index table precede the chunks
    return [(BIGCHARSET, data)]
348
def _simple(av):
    # check if av is a "simple" operator: the repeat target has fixed
    # width 1 and is not a capturing subpattern, so the engine can use
    # the cheaper REPEAT_ONE form.  av is (min, max, subpattern).
    lo, hi = av[2].getwidth()
    if lo == 0 and hi == MAXREPEAT:
        # target can match empty an unbounded number of times
        raise error, "nothing to repeat"
    return lo == hi == 1 and av[2][0][0] != SUBPATTERN
355
def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    #
    # The info block lets the engine reject impossible match positions
    # quickly.  Nothing is emitted when the pattern can match the empty
    # string (lo == 0), since no position can be ruled out then.
    lo, hi = pattern.getwidth()
    if lo == 0:
        return # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0  # number of leading prefix chars outside any group
    charset = [] # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                # a group wrapping a single literal still extends the
                # prefix, but not prefix_skip (the group must be marked)
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                # look inside a leading group
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    # usable only if every branch starts with a literal
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                # same check for a top-level alternation
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     if prefix:
##         print "*** PREFIX", prefix, prefix_skip
##     if charset:
##         print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)  # block length, backpatched at the end
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            # the whole pattern is one literal string
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length
    if lo < MAXCODE:
        emit(lo)
    else:
        # clamp widths that do not fit in a code word
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)  # 0 here means "unbounded/unknown maximum"
    # add literal prefix
    if prefix:
        emit(len(prefix)) # length
        emit(prefix_skip) # skip
        code.extend(prefix)
        # generate overlap table (prefix failure/shift table)
        table = [-1] + ([0]*len(prefix))
        for i in xrange(len(prefix)):
            table[i+1] = table[i]+1
            while table[i+1] > 0 and prefix[i] != prefix[table[i+1]-1]:
                table[i+1] = table[table[i+1]-1]+1
        code.extend(table[1:]) # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
461
# Determine the supported string types: if this interpreter has no
# separate `unicode` builtin (NameError below), only plain strings.
try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))
468
def isstring(obj):
    # internal: true (1) if obj is one of the supported string types
    result = 0
    for string_type in STRING_TYPES:
        if isinstance(obj, string_type):
            result = 1
            break
    return result
474
def _code(p, flags):
    # internal: assemble the complete code word list for the parsed
    # pattern `p`, combining its own flags with the caller's flags.
    combined = p.pattern.flags | flags
    out = []

    # optional info block (min/max width, literal prefix or charset)
    _compile_info(out, p, combined)

    # the pattern body, terminated by SUCCESS
    _compile(out, p.data, combined)
    out.append(OPCODES[SUCCESS])

    return out
489
def compile(p, flags=0):
    # internal: convert pattern list to internal format
    #
    # Accepts either a pattern source string or an already-parsed
    # sre_parse pattern object; returns the compiled _sre pattern.
    if isstring(p):
        import sre_parse
        source = p
        p = sre_parse.parse(p, flags)
    else:
        source = None

    codewords = _code(p, flags)

    # print code

    # XXX: <fl> get rid of this limitation!
    if p.pattern.groups > 100:
        raise AssertionError(
            "sorry, but this version only supports 100 named groups"
            )

    # map in either direction (name -> number and number -> name)
    name_to_number = p.pattern.groupdict
    number_to_name = [None] * p.pattern.groups
    for name, number in name_to_number.items():
        number_to_name[number] = name

    return _sre.compile(
        source, flags, codewords,
        p.pattern.groups-1,
        name_to_number, number_to_name
        )