# Secret Labs' Regular Expression Engine
# convert re-style regular expression to sre pattern
# Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved.
# See the sre.py file for information on usage and redistribution.
"""Internal support module for sre"""
# XXX: show string offset and offending character for all errors
from sre_constants
import *
# Pattern characters that are not plain literals: _parse appends a
# character as a LITERAL only when it is not found in this string.
SPECIAL_CHARS = ".\\[{()*+?^$|"

# Character groups as tuples of one-character strings.  The parser tests
# ``source.next`` against these, and ``source.next`` can be None, so a
# container that tolerates non-string members is used.
OCTDIGITS = ("0", "1", "2", "3", "4", "5", "6", "7")
DIGITS = OCTDIGITS + ("8", "9")
HEXDIGITS = DIGITS + tuple("abcdefABCDEF")

# NOTE(review): presumably the characters skipped in verbose mode -- the
# use site is not visible in this part of the file; confirm.
WHITESPACE = (" ", "\t", "\n", "\r", "\v", "\f")
r
"\a": (LITERAL
, ord("\a")),
r
"\b": (LITERAL
, ord("\b")),
r
"\f": (LITERAL
, ord("\f")),
r
"\n": (LITERAL
, ord("\n")),
r
"\r": (LITERAL
, ord("\r")),
r
"\t": (LITERAL
, ord("\t")),
r
"\v": (LITERAL
, ord("\v")),
r
"\\": (LITERAL
, ord("\\"))
r
"\A": (AT
, AT_BEGINNING_STRING
), # start of string
r
"\b": (AT
, AT_BOUNDARY
),
r
"\B": (AT
, AT_NON_BOUNDARY
),
r
"\d": (IN
, [(CATEGORY
, CATEGORY_DIGIT
)]),
r
"\D": (IN
, [(CATEGORY
, CATEGORY_NOT_DIGIT
)]),
r
"\s": (IN
, [(CATEGORY
, CATEGORY_SPACE
)]),
r
"\S": (IN
, [(CATEGORY
, CATEGORY_NOT_SPACE
)]),
r
"\w": (IN
, [(CATEGORY
, CATEGORY_WORD
)]),
r
"\W": (IN
, [(CATEGORY
, CATEGORY_NOT_WORD
)]),
r
"\Z": (AT
, AT_END_STRING
), # end of string
"i": SRE_FLAG_IGNORECASE
,
# master pattern object. keeps track of global attributes
def opengroup(self
, name
=None):
ogid
= self
.groupdict
.get(name
, None)
raise error
, ("redefinition of group name %s as group %d; "
"was group %d" % (repr(name
), gid
, ogid
))
self
.groupdict
[name
] = gid
def closegroup(self
, gid
):
def checkgroup(self, gid):
    """Tell whether group number *gid* may be referenced right now.

    Returns True only when *gid* is below ``self.groups`` and is not
    listed in ``self.open``; callers report a false result as
    "cannot refer to open group".
    """
    # Group numbers at or beyond the current count are rejected first.
    if not gid < self.groups:
        return False
    # A group still recorded in self.open cannot be referenced.
    return gid not in self.open
# a subpattern, in intermediate form
def __init__(self
, pattern
, data
=None):
seqtypes
= type(()), type([])
print level
*" " + op
,; nl
= 0
print (level
+1)*" " + op
, a
elif type(av
) in seqtypes
:
if isinstance(a
, SubPattern
):
def __delitem__(self
, index
):
def __getitem__(self
, index
):
def __setitem__(self
, index
, code
):
def __getslice__(self, start, stop):
    """Return a new SubPattern holding ``self.data[start:stop]``.

    The new object is constructed with this subpattern's ``pattern``
    attribute and the sliced item list.
    """
    owner = self.pattern
    piece = self.data[start:stop]
    return SubPattern(owner, piece)
def insert(self, index, code):
    """Insert *code* into this subpattern's item list before *index*.

    Delegates to ``self.data.insert`` and returns None.
    """
    items = self.data
    items.insert(index, code)
# determine the width (min, max) for this subpattern
UNITCODES
= (ANY
, RANGE
, IN
, LITERAL
, NOT_LITERAL
, CATEGORY
)
REPEATCODES
= (MIN_REPEAT
, MAX_REPEAT
)
lo
= lo
+ long(i
) * av
[0]
hi
= hi
+ long(j
) * av
[1]
self
.width
= int(min(lo
, sys
.maxint
)), int(min(hi
, sys
.maxint
))
def __init__(self
, string
):
if self
.index
>= len(self
.string
):
char
= self
.string
[self
.index
]
c
= self
.string
[self
.index
+ 1]
raise error
, "bogus escape (end of line)"
self
.index
= self
.index
+ len(char
)
def match(self
, char
, skip
=1):
return self
.index
, self
.next
self
.index
, self
.next
= index
return "a" <= char
<= "z" or "A" <= char
<= "Z" or char
== "_"
return "0" <= char
<= "9"
# check that group name is a valid string
if not isident(char
) and not isdigit(char
):
def _class_escape(source
, escape
):
# handle escape code inside character class
code
= ESCAPES
.get(escape
)
code
= CATEGORIES
.get(escape
)
# hexadecimal escape (exactly two digits)
while source
.next
in HEXDIGITS
and len(escape
) < 4:
escape
= escape
+ source
.get()
raise error
, "bogus escape: %s" % repr("\\" + escape
)
return LITERAL
, int(escape
, 16) & 0xff
# octal escape (up to three digits)
while source
.next
in OCTDIGITS
and len(escape
) < 4:
escape
= escape
+ source
.get()
return LITERAL
, int(escape
, 8) & 0xff
raise error
, "bogus escape: %s" % repr(escape
)
return LITERAL
, ord(escape
[1])
raise error
, "bogus escape: %s" % repr(escape
)
def _escape(source
, escape
, state
):
# handle escape code in expression
code
= CATEGORIES
.get(escape
)
code
= ESCAPES
.get(escape
)
while source
.next
in HEXDIGITS
and len(escape
) < 4:
escape
= escape
+ source
.get()
return LITERAL
, int(escape
[2:], 16) & 0xff
while source
.next
in OCTDIGITS
and len(escape
) < 4:
escape
= escape
+ source
.get()
return LITERAL
, int(escape
[1:], 8) & 0xff
# octal escape *or* decimal group reference (sigh)
if source
.next
in DIGITS
:
escape
= escape
+ source
.get()
if (escape
[1] in OCTDIGITS
and escape
[2] in OCTDIGITS
and
source
.next
in OCTDIGITS
):
# got three octal digits; this is an octal escape
escape
= escape
+ source
.get()
return LITERAL
, int(escape
[1:], 8) & 0xff
# not an octal escape, so this is a group reference
if not state
.checkgroup(group
):
raise error
, "cannot refer to open group"
return LITERAL
, ord(escape
[1])
raise error
, "bogus escape: %s" % repr(escape
)
def _parse_sub(source
, state
, nested
=1):
# parse an alternation: a|b|c
itemsappend
= items
.append
sourcematch
= source
.match
itemsappend(_parse(source
, state
))
if not source
.next
or sourcematch(")", 0):
raise error
, "pattern not properly closed"
subpattern
= SubPattern(state
)
subpatternappend
= subpattern
.append
# check if all items share a common prefix
# all subitems start with a common "prefix".
# move it out of the branch
continue # check next one
# check if the branch can be replaced by a character set
if len(item
) != 1 or item
[0][0] != LITERAL
:
# we can store this as a character set instead of a
# branch (the compiler may optimize this even more)
subpatternappend((IN
, set))
subpattern
.append((BRANCH
, (None, items
)))
def _parse_sub_cond(source
, state
, condgroup
):
item_yes
= _parse(source
, state
)
item_no
= _parse(source
, state
)
raise error
, "conditional backref with more than two branches"
if source
.next
and not source
.match(")", 0):
raise error
, "pattern not properly closed"
subpattern
= SubPattern(state
)
subpattern
.append((GROUPREF_EXISTS
, (condgroup
, item_yes
, item_no
)))
def _parse(source
, state
):
subpattern
= SubPattern(state
)
# precompute constants into local variables
subpatternappend
= subpattern
.append
sourcematch
= source
.match
PATTERNENDERS
= ("|", ")")
ASSERTCHARS
= ("=", "!", "<")
LOOKBEHINDASSERTCHARS
= ("=", "!")
REPEATCODES
= (MIN_REPEAT
, MAX_REPEAT
)
if source
.next
in PATTERNENDERS
:
break # end of subpattern
if state
.flags
& SRE_FLAG_VERBOSE
:
# skip whitespace and comments
if this
and this
[0] not in SPECIAL_CHARS
:
subpatternappend((LITERAL
, ord(this
)))
## pass # handle character classes
setappend((NEGATE
, None))
# check remaining characters
if this
== "]" and set != start
:
elif this
and this
[0] == "\\":
code1
= _class_escape(source
, this
)
code1
= LITERAL
, ord(this
)
raise error
, "unexpected end of regular expression"
setappend((LITERAL
, ord("-")))
code2
= _class_escape(source
, this
)
code2
= LITERAL
, ord(this
)
if code1
[0] != LITERAL
or code2
[0] != LITERAL
:
raise error
, "bad character range"
raise error
, "bad character range"
setappend((RANGE
, (lo
, hi
)))
raise error
, "unexpected end of regular expression"
# XXX: <fl> should move set optimization to compiler!
if _len(set)==1 and set[0][0] is LITERAL
:
subpatternappend(set[0]) # optimization
elif _len(set)==2 and set[0][0] is NEGATE
and set[1][0] is LITERAL
:
subpatternappend((NOT_LITERAL
, set[1][1])) # optimization
# XXX: <fl> should add charmap optimization here
subpatternappend((IN
, set))
elif this
and this
[0] in REPEAT_CHARS
:
subpatternappend((LITERAL
, ord(this
)))
while source
.next
in DIGITS
:
while source
.next
in DIGITS
:
subpatternappend((LITERAL
, ord(this
)))
raise error
, "bad repeat interval"
raise error
, "not supported"
# figure out which item to repeat
if not item
or (_len(item
) == 1 and item
[0][0] == AT
):
raise error
, "nothing to repeat"
if item
[0][0] in REPEATCODES
:
raise error
, "multiple repeat"
subpattern
[-1] = (MIN_REPEAT
, (min, max, item
))
subpattern
[-1] = (MAX_REPEAT
, (min, max, item
))
subpatternappend((ANY
, None))
# named group: skip forward to end of name
raise error
, "unterminated name"
raise error
, "bad character in group name"
raise error
, "unterminated name"
raise error
, "bad character in group name"
gid
= state
.groupdict
.get(name
)
raise error
, "unknown group name"
subpatternappend((GROUPREF
, gid
))
raise error
, "unexpected end of pattern"
raise error
, "unknown specifier: ?P%s" % char
if source
.next
is None or source
.next
== ")":
raise error
, "unbalanced parenthesis"
elif source
.next
in ASSERTCHARS
:
if source
.next
not in LOOKBEHINDASSERTCHARS
:
raise error
, "syntax error"
p
= _parse_sub(source
, state
)
raise error
, "unbalanced parenthesis"
subpatternappend((ASSERT
, (dir, p
)))
subpatternappend((ASSERT_NOT
, (dir, p
)))
# conditional backreference group
raise error
, "unterminated name"
condname
= condname
+ char
condgroup
= state
.groupdict
.get(condname
)
raise error
, "unknown group name"
condgroup
= int(condname
)
raise error
, "bad character in group name"
if not source
.next
in FLAGS
:
raise error
, "unexpected end of pattern"
while source
.next
in FLAGS
:
state
.flags
= state
.flags | FLAGS
[sourceget()]
group
= state
.opengroup(name
)
p
= _parse_sub_cond(source
, state
, condgroup
)
p
= _parse_sub(source
, state
)
raise error
, "unbalanced parenthesis"
subpatternappend((SUBPATTERN
, (group
, p
)))
raise error
, "unexpected end of pattern"
raise error
, "unknown extension"
subpatternappend((AT
, AT_BEGINNING
))
subpattern
.append((AT
, AT_END
))
elif this
and this
[0] == "\\":
code
= _escape(source
, this
, state
)
raise error
, "parser error"
def parse(str, flags
=0, pattern
=None):
# parse 're' pattern into list of (opcode, argument) tuples
p
= _parse_sub(source
, pattern
, 0)
raise error
, "unbalanced parenthesis"
raise error
, "bogus characters at end of regular expression"
if flags
& SRE_FLAG_DEBUG
:
if not (flags
& SRE_FLAG_VERBOSE
) and p
.pattern
.flags
& SRE_FLAG_VERBOSE
:
# the VERBOSE flag was switched on inside the pattern. to be
# on the safe side, we'll parse the whole thing again...
return parse(str, p
.pattern
.flags
)
def parse_template(source
, pattern
):
# parse 're' replacement string into list of literals and group references
def literal(literal
, p
=p
, pappend
=a
):
if p
and p
[-1][0] is LITERAL
:
p
[-1] = LITERAL
, p
[-1][1] + literal
pappend((LITERAL
, literal
))
if type(sep
) is type(""):
break # end of replacement string
if this
and this
[0] == "\\":
raise error
, "unterminated group name"
raise error
, "bad group name"
raise error
, "negative group number"
raise error
, "bad character in group name"
index
= pattern
.groupindex
[name
]
raise IndexError, "unknown group name"
literal(makechar(int(this
[1:], 8) & 0xff))
if (c
in OCTDIGITS
and this
[2] in OCTDIGITS
and
literal(makechar(int(this
[1:], 8) & 0xff))
this
= makechar(ESCAPES
[this
][1])
# convert template to groups and literals lists
groupsappend
= groups
.append
literals
= [None] * len(p
)
# literal[i] is already None
def expand_template(template
, match
):
groups
, literals
= template
for index
, group
in groups
:
literals
[index
] = s
= g(group
)
raise error
, "unmatched group"
raise error
, "invalid group reference"
return sep
.join(literals
)