# Secret Labs' Regular Expression Engine
# convert template to internal format
# Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
# See the sre.py file for information on usage and redistribution.
"""Internal support module for sre"""
import _sre
import sys

import sre_parse
from sre_constants import *
assert _sre
.MAGIC
== MAGIC
, "SRE module mismatch"
def _identityfunction(x
):
def _compile(code, pattern, flags):
    # internal: compile a (sub)pattern.
    # `code` is the output list of opcodes/operands (mutated in place),
    # `pattern` is a sequence of (op, av) pairs from sre_parse, and
    # `flags` are the SRE_FLAG_* bits.  Raises `error` for patterns the
    # engine cannot represent, ValueError for an unknown operator.
    # NOTE(review): reconstructed from a garbled source; mirrors the
    # classic CPython 2.x implementation that the fragments match.
    emit = code.append
    _len = len
    # membership dicts: {op: 1} is the file's pre-set idiom
    LITERAL_CODES = {LITERAL: 1, NOT_LITERAL: 1}
    REPEATING_CODES = {REPEAT: 1, MIN_REPEAT: 1, MAX_REPEAT: 1}
    SUCCESS_CODES = {SUCCESS: 1, FAILURE: 1}
    ASSERT_CODES = {ASSERT: 1, ASSERT_NOT: 1}
    for op, av in pattern:
        if op in LITERAL_CODES:
            if flags & SRE_FLAG_IGNORECASE:
                # case-insensitive: store the lowercased code point
                emit(OPCODES[OP_IGNORE[op]])
                emit(_sre.getlower(av, flags))
            else:
                emit(OPCODES[op])
                emit(av)
        elif op is IN:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
                def fixup(literal, flags=flags):
                    # bind flags now; applied to every set member
                    return _sre.getlower(literal, flags)
            else:
                emit(OPCODES[op])
                fixup = _identityfunction
            skip = _len(code); emit(0)
            _compile_charset(av, flags, code, fixup)
            code[skip] = _len(code) - skip  # backpatch skip offset
        elif op is ANY:
            if flags & SRE_FLAG_DOTALL:
                emit(OPCODES[ANY_ALL])
            else:
                emit(OPCODES[ANY])
        elif op in REPEATING_CODES:
            if flags & SRE_FLAG_TEMPLATE:
                # templates cannot contain repeat operators
                raise error("internal: unsupported template operator")
            elif _simple(av) and op is not REPEAT:
                # single-width, single-item body: use the fast
                # REPEAT_ONE / MIN_REPEAT_ONE opcodes
                if op is MAX_REPEAT:
                    emit(OPCODES[REPEAT_ONE])
                else:
                    emit(OPCODES[MIN_REPEAT_ONE])
                skip = _len(code); emit(0)
                emit(av[0])  # min count
                emit(av[1])  # max count
                _compile(code, av[2], flags)
                emit(OPCODES[SUCCESS])
                code[skip] = _len(code) - skip
            else:
                # general repeat, closed by MAX_UNTIL / MIN_UNTIL
                emit(OPCODES[REPEAT])
                skip = _len(code); emit(0)
                emit(av[0])  # min count
                emit(av[1])  # max count
                _compile(code, av[2], flags)
                code[skip] = _len(code) - skip
                if op is MAX_REPEAT:
                    emit(OPCODES[MAX_UNTIL])
                else:
                    emit(OPCODES[MIN_UNTIL])
        elif op is SUBPATTERN:
            if av[0]:
                # numbered group: bracket the body with MARK opcodes
                emit(OPCODES[MARK])
                emit((av[0] - 1) * 2)
            # _compile_info(code, av[1], flags)
            _compile(code, av[1], flags)
            if av[0]:
                emit(OPCODES[MARK])
                emit((av[0] - 1) * 2 + 1)
        elif op in SUCCESS_CODES:
            emit(OPCODES[op])
        elif op in ASSERT_CODES:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            if av[0] >= 0:
                emit(0)  # look ahead
            else:
                lo, hi = av[1].getwidth()
                if lo != hi:
                    raise error("look-behind requires fixed-width pattern")
                emit(lo)  # look behind
            _compile(code, av[1], flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is CALL:
            emit(OPCODES[op])
            skip = _len(code); emit(0)
            _compile(code, av, flags)
            emit(OPCODES[SUCCESS])
            code[skip] = _len(code) - skip
        elif op is AT:
            emit(OPCODES[op])
            # anchors change meaning under MULTILINE / LOCALE / UNICODE
            if flags & SRE_FLAG_MULTILINE:
                av = AT_MULTILINE.get(av, av)
            if flags & SRE_FLAG_LOCALE:
                av = AT_LOCALE.get(av, av)
            elif flags & SRE_FLAG_UNICODE:
                av = AT_UNICODE.get(av, av)
            emit(ATCODES[av])
        elif op is BRANCH:
            emit(OPCODES[op])
            tail = []
            tailappend = tail.append
            for av in av[1]:
                skip = _len(code); emit(0)
                # _compile_info(code, av, flags)
                _compile(code, av, flags)
                emit(OPCODES[JUMP])
                tailappend(_len(code)); emit(0)
                code[skip] = _len(code) - skip
            emit(0)  # end of branch
            for tail in tail:
                # backpatch every alternative's JUMP to the common exit
                code[tail] = _len(code) - tail
        elif op is CATEGORY:
            emit(OPCODES[op])
            if flags & SRE_FLAG_LOCALE:
                av = CH_LOCALE[av]
            elif flags & SRE_FLAG_UNICODE:
                av = CH_UNICODE[av]
            emit(CHCODES[av])
        elif op is GROUPREF:
            if flags & SRE_FLAG_IGNORECASE:
                emit(OPCODES[OP_IGNORE[op]])
            else:
                emit(OPCODES[op])
            emit(av - 1)  # group numbers are 1-based in the parse tree
        elif op is GROUPREF_EXISTS:
            emit(OPCODES[op])
            emit(av[0] - 1)
            skipyes = _len(code); emit(0)
            _compile(code, av[1], flags)
            if av[2]:
                # yes-branch jumps over the no-branch
                emit(OPCODES[JUMP])
                skipno = _len(code); emit(0)
                code[skipyes] = _len(code) - skipyes + 1
                _compile(code, av[2], flags)
                code[skipno] = _len(code) - skipno
            else:
                code[skipyes] = _len(code) - skipyes + 1
        else:
            raise ValueError(("unsupported operand type", op))
def _compile_charset(charset, flags, code, fixup=None):
    # compile charset subprogram: emit the optimized (op, av) items for a
    # character set into `code`, terminated by a FAILURE opcode.
    # `fixup` maps literals (e.g. lowercasing for IGNORECASE); defaults
    # to the identity function.
    # NOTE(review): reconstructed from a garbled source; mirrors the
    # classic CPython 2.x implementation that the fragments match.
    emit = code.append
    if fixup is None:
        fixup = _identityfunction
    for op, av in _optimize_charset(charset, fixup):
        emit(OPCODES[op])
        if op is NEGATE:
            pass  # NEGATE carries no operand
        elif op is LITERAL:
            emit(fixup(av))
        elif op is RANGE:
            emit(fixup(av[0]))
            emit(fixup(av[1]))
        elif op is CHARSET:
            code.extend(av)  # 256-bit bitmap words
        elif op is BIGCHARSET:
            code.extend(av)  # header + chunk table + chunk bitmaps
        elif op is CATEGORY:
            if flags & SRE_FLAG_LOCALE:
                emit(CHCODES[CH_LOCALE[av]])
            elif flags & SRE_FLAG_UNICODE:
                emit(CHCODES[CH_UNICODE[av]])
            else:
                emit(CHCODES[av])
        else:
            raise error("internal: unsupported set operator")
    emit(OPCODES[FAILURE])
def _optimize_charset(charset, fixup):
    # internal: optimize character set.  Builds a 256-slot occupancy map
    # and, when profitable, collapses the set into LITERAL/RANGE items or
    # a single CHARSET bitmap.  Returns the original `charset` unchanged
    # when it cannot be compressed (e.g. it contains categories), or
    # delegates to _optimize_unicode for non-Latin-1 members.
    # NOTE(review): reconstructed from a garbled source; mirrors the
    # classic CPython 2.x implementation that the fragments match.
    out = []
    outappend = out.append
    charmap = [0] * 256
    try:
        for op, av in charset:
            if op is NEGATE:
                outappend((op, av))
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in range(fixup(av[0]), fixup(av[1]) + 1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could append to charmap tail
                return charset  # cannot compress
    except IndexError:
        # character set contains unicode characters
        return _optimize_unicode(charset, fixup)
    # compress character map: collect runs of consecutive set bits
    i = p = n = 0
    runs = []
    runsappend = runs.append
    for c in charmap:
        if c:
            if n == 0:
                p = i
            n = n + 1
        elif n:
            runsappend((p, n))
            n = 0
        i = i + 1
    if n:
        runsappend((p, n))
    if len(runs) <= 2:
        # use literal/range
        for p, n in runs:
            if n == 1:
                outappend((LITERAL, p))
            else:
                outappend((RANGE, (p, p + n - 1)))
        if len(out) < len(charset):
            return out
    else:
        # use bitmap
        data = _mk_bitmap(charmap)
        outappend((CHARSET, data))
        return out
    return charset
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is
# represented by a 16-bit word sequence, consisting of one word for
# the number of different chunks, a sequence of 256 bytes (128 words)
# of chunk numbers indexed by their original chunk position, and a
# sequence of chunks (16 words each).
# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.
# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of UTF-16 has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
def _optimize_unicode(charset, fixup):
    # internal: build a BIGCHARSET representation for a charset that
    # contains non-Latin-1 members (see the comment block above for the
    # layout).  Returns the original charset unchanged when it cannot be
    # compressed (categories, non-BMP characters, or negation on a
    # wide-unicode build).
    # NOTE(review): reconstructed from a garbled source; mirrors the
    # classic CPython 2.x implementation that the fragments match.
    try:
        import array
    except ImportError:
        return charset
    charmap = [0] * 65536
    negate = 0
    try:
        for op, av in charset:
            if op is NEGATE:
                negate = 1
            elif op is LITERAL:
                charmap[fixup(av)] = 1
            elif op is RANGE:
                for i in xrange(fixup(av[0]), fixup(av[1]) + 1):
                    charmap[i] = 1
            elif op is CATEGORY:
                # XXX: could expand category
                return charset  # cannot compress
    except IndexError:
        # non-BMP characters cannot be represented in the bitmap
        return charset
    if negate:
        if sys.maxunicode != 65535:
            # XXX: negation does not work with big charsets
            return charset
        for i in xrange(65536):
            charmap[i] = not charmap[i]
    # slice the bitmap into 256-character chunks, de-duplicating chunks
    comps = {}
    mapping = [0] * 256
    block = 0
    data = []
    for i in xrange(256):
        chunk = tuple(charmap[i * 256:(i + 1) * 256])
        new = comps.setdefault(chunk, block)
        mapping[i] = new
        if new == block:
            # first occurrence of this chunk: emit its bitmap
            block = block + 1
            data = data + _mk_bitmap(chunk)
    header = [block]
    if _sre.CODESIZE == 2:
        code = 'H'
    else:
        code = 'I'
    # Convert block indices to byte array of 256 bytes
    mapping = array.array('b', mapping).tostring()
    # Convert byte array to word array
    mapping = array.array(code, mapping)
    assert mapping.itemsize == _sre.CODESIZE
    header = header + mapping.tolist()
    data[0:0] = header
    return [(BIGCHARSET, data)]
# check if av is a "simple" operator
lo
, hi
= av
[2].getwidth()
if lo
== 0 and hi
== MAXREPEAT
:
raise error
, "nothing to repeat"
return lo
== hi
== 1 and av
[2][0][0] != SUBPATTERN
def _compile_info(code, pattern, flags):
    # internal: compile an info block. in the current version,
    # this contains min/max pattern width, and an optional literal
    # prefix or a character map
    # NOTE(review): reconstructed from a garbled source; mirrors the
    # classic CPython 2.x implementation that the fragments match.
    lo, hi = pattern.getwidth()
    if lo == 0:
        return  # not worth it
    # look for a literal prefix
    prefix = []
    prefixappend = prefix.append
    prefix_skip = 0
    charset = []  # not used
    charsetappend = charset.append
    if not (flags & SRE_FLAG_IGNORECASE):
        # look for literal prefix
        for op, av in pattern.data:
            if op is LITERAL:
                if len(prefix) == prefix_skip:
                    prefix_skip = prefix_skip + 1
                prefixappend(av)
            elif op is SUBPATTERN and len(av[1]) == 1:
                # a one-item group: peek inside for a literal
                op, av = av[1][0]
                if op is LITERAL:
                    prefixappend(av)
                else:
                    break
            else:
                break
        # if no prefix, look for charset prefix
        if not prefix and pattern.data:
            op, av = pattern.data[0]
            if op is SUBPATTERN and av[1]:
                op, av = av[1][0]
                if op is LITERAL:
                    charsetappend((op, av))
                elif op is BRANCH:
                    c = []
                    cappend = c.append
                    for p in av[1]:
                        if not p:
                            break
                        op, av = p[0]
                        if op is LITERAL:
                            cappend((op, av))
                        else:
                            break
                    else:
                        charset = c
            elif op is BRANCH:
                c = []
                cappend = c.append
                for p in av[1]:
                    if not p:
                        break
                    op, av = p[0]
                    if op is LITERAL:
                        cappend((op, av))
                    else:
                        break
                else:
                    charset = c
            elif op is IN:
                charset = av
##     print "*** PREFIX", prefix, prefix_skip
##     print "*** CHARSET", charset
    # add an info block
    emit = code.append
    emit(OPCODES[INFO])
    skip = len(code); emit(0)
    # literal flag
    mask = 0
    if prefix:
        mask = SRE_INFO_PREFIX
        if len(prefix) == prefix_skip == len(pattern.data):
            mask = mask + SRE_INFO_LITERAL
    elif charset:
        mask = mask + SRE_INFO_CHARSET
    emit(mask)
    # pattern length (clamped to what one code word can hold)
    if lo < MAXCODE:
        emit(lo)
    else:
        emit(MAXCODE)
        prefix = prefix[:MAXCODE]
    if hi < MAXCODE:
        emit(hi)
    else:
        emit(0)
    # add literal prefix
    if prefix:
        emit(len(prefix))  # length
        emit(prefix_skip)  # skip
        code.extend(prefix)
        # generate overlap table (KMP-style failure function)
        table = [-1] + ([0] * len(prefix))
        for i in xrange(len(prefix)):
            table[i + 1] = table[i] + 1
            while table[i + 1] > 0 and prefix[i] != prefix[table[i + 1] - 1]:
                table[i + 1] = table[table[i + 1] - 1] + 1
        code.extend(table[1:])  # don't store first entry
    elif charset:
        _compile_charset(charset, flags, code)
    code[skip] = len(code) - skip
# string types accepted by compile(); include `unicode` only on
# interpreters that have it
try:
    unicode
except NameError:
    STRING_TYPES = (type(""),)
else:
    STRING_TYPES = (type(""), type(unicode("")))
def _code(p, flags):
    # internal: assemble the complete code vector for a parsed pattern
    # `p` (a sre_parse pattern object), combining the pattern's own
    # flags with the caller-supplied `flags`.
    flags = p.pattern.flags | flags
    code = []

    # compile info block
    _compile_info(code, p, flags)

    # compile the pattern
    _compile(code, p.data, flags)

    code.append(OPCODES[SUCCESS])

    return code
# internal: convert pattern list to internal format
p
= sre_parse
.parse(p
, flags
)
# XXX: <fl> get rid of this limitation!
if p
.pattern
.groups
> 100:
"sorry, but this version only supports 100 named groups"
# map in either direction
groupindex
= p
.pattern
.groupdict
indexgroup
= [None] * p
.pattern
.groups
for k
, i
in groupindex
.items():