Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | # |
2 | # Secret Labs' Regular Expression Engine | |
3 | # | |
4 | # convert re-style regular expression to sre pattern | |
5 | # | |
6 | # Copyright (c) 1998-2001 by Secret Labs AB. All rights reserved. | |
7 | # | |
8 | # See the sre.py file for information on usage and redistribution. | |
9 | # | |
10 | ||
11 | """Internal support module for sre""" | |
12 | ||
13 | # XXX: show string offset and offending character for all errors | |
14 | ||
15 | import sys | |
16 | ||
17 | from sre_constants import * | |
18 | ||
19 | SPECIAL_CHARS = ".\\[{()*+?^$|" | |
20 | REPEAT_CHARS = "*+?{" | |
21 | ||
22 | DIGITS = tuple("0123456789") | |
23 | ||
24 | OCTDIGITS = tuple("01234567") | |
25 | HEXDIGITS = tuple("0123456789abcdefABCDEF") | |
26 | ||
27 | WHITESPACE = tuple(" \t\n\r\v\f") | |
28 | ||
29 | ESCAPES = { | |
30 | r"\a": (LITERAL, ord("\a")), | |
31 | r"\b": (LITERAL, ord("\b")), | |
32 | r"\f": (LITERAL, ord("\f")), | |
33 | r"\n": (LITERAL, ord("\n")), | |
34 | r"\r": (LITERAL, ord("\r")), | |
35 | r"\t": (LITERAL, ord("\t")), | |
36 | r"\v": (LITERAL, ord("\v")), | |
37 | r"\\": (LITERAL, ord("\\")) | |
38 | } | |
39 | ||
40 | CATEGORIES = { | |
41 | r"\A": (AT, AT_BEGINNING_STRING), # start of string | |
42 | r"\b": (AT, AT_BOUNDARY), | |
43 | r"\B": (AT, AT_NON_BOUNDARY), | |
44 | r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]), | |
45 | r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]), | |
46 | r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]), | |
47 | r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]), | |
48 | r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]), | |
49 | r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]), | |
50 | r"\Z": (AT, AT_END_STRING), # end of string | |
51 | } | |
52 | ||
53 | FLAGS = { | |
54 | # standard flags | |
55 | "i": SRE_FLAG_IGNORECASE, | |
56 | "L": SRE_FLAG_LOCALE, | |
57 | "m": SRE_FLAG_MULTILINE, | |
58 | "s": SRE_FLAG_DOTALL, | |
59 | "x": SRE_FLAG_VERBOSE, | |
60 | # extensions | |
61 | "t": SRE_FLAG_TEMPLATE, | |
62 | "u": SRE_FLAG_UNICODE, | |
63 | } | |
64 | ||
65 | class Pattern: | |
66 | # master pattern object. keeps track of global attributes | |
67 | def __init__(self): | |
68 | self.flags = 0 | |
69 | self.open = [] | |
70 | self.groups = 1 | |
71 | self.groupdict = {} | |
72 | def opengroup(self, name=None): | |
73 | gid = self.groups | |
74 | self.groups = gid + 1 | |
75 | if name is not None: | |
76 | ogid = self.groupdict.get(name, None) | |
77 | if ogid is not None: | |
78 | raise error, ("redefinition of group name %s as group %d; " | |
79 | "was group %d" % (repr(name), gid, ogid)) | |
80 | self.groupdict[name] = gid | |
81 | self.open.append(gid) | |
82 | return gid | |
83 | def closegroup(self, gid): | |
84 | self.open.remove(gid) | |
85 | def checkgroup(self, gid): | |
86 | return gid < self.groups and gid not in self.open | |
87 | ||
88 | class SubPattern: | |
89 | # a subpattern, in intermediate form | |
90 | def __init__(self, pattern, data=None): | |
91 | self.pattern = pattern | |
92 | if data is None: | |
93 | data = [] | |
94 | self.data = data | |
95 | self.width = None | |
96 | def dump(self, level=0): | |
97 | nl = 1 | |
98 | seqtypes = type(()), type([]) | |
99 | for op, av in self.data: | |
100 | print level*" " + op,; nl = 0 | |
101 | if op == "in": | |
102 | # member sublanguage | |
103 | print; nl = 1 | |
104 | for op, a in av: | |
105 | print (level+1)*" " + op, a | |
106 | elif op == "branch": | |
107 | print; nl = 1 | |
108 | i = 0 | |
109 | for a in av[1]: | |
110 | if i > 0: | |
111 | print level*" " + "or" | |
112 | a.dump(level+1); nl = 1 | |
113 | i = i + 1 | |
114 | elif type(av) in seqtypes: | |
115 | for a in av: | |
116 | if isinstance(a, SubPattern): | |
117 | if not nl: print | |
118 | a.dump(level+1); nl = 1 | |
119 | else: | |
120 | print a, ; nl = 0 | |
121 | else: | |
122 | print av, ; nl = 0 | |
123 | if not nl: print | |
124 | def __repr__(self): | |
125 | return repr(self.data) | |
126 | def __len__(self): | |
127 | return len(self.data) | |
128 | def __delitem__(self, index): | |
129 | del self.data[index] | |
130 | def __getitem__(self, index): | |
131 | return self.data[index] | |
132 | def __setitem__(self, index, code): | |
133 | self.data[index] = code | |
134 | def __getslice__(self, start, stop): | |
135 | return SubPattern(self.pattern, self.data[start:stop]) | |
136 | def insert(self, index, code): | |
137 | self.data.insert(index, code) | |
138 | def append(self, code): | |
139 | self.data.append(code) | |
140 | def getwidth(self): | |
141 | # determine the width (min, max) for this subpattern | |
142 | if self.width: | |
143 | return self.width | |
144 | lo = hi = 0L | |
145 | UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY) | |
146 | REPEATCODES = (MIN_REPEAT, MAX_REPEAT) | |
147 | for op, av in self.data: | |
148 | if op is BRANCH: | |
149 | i = sys.maxint | |
150 | j = 0 | |
151 | for av in av[1]: | |
152 | l, h = av.getwidth() | |
153 | i = min(i, l) | |
154 | j = max(j, h) | |
155 | lo = lo + i | |
156 | hi = hi + j | |
157 | elif op is CALL: | |
158 | i, j = av.getwidth() | |
159 | lo = lo + i | |
160 | hi = hi + j | |
161 | elif op is SUBPATTERN: | |
162 | i, j = av[1].getwidth() | |
163 | lo = lo + i | |
164 | hi = hi + j | |
165 | elif op in REPEATCODES: | |
166 | i, j = av[2].getwidth() | |
167 | lo = lo + long(i) * av[0] | |
168 | hi = hi + long(j) * av[1] | |
169 | elif op in UNITCODES: | |
170 | lo = lo + 1 | |
171 | hi = hi + 1 | |
172 | elif op == SUCCESS: | |
173 | break | |
174 | self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint)) | |
175 | return self.width | |
176 | ||
177 | class Tokenizer: | |
178 | def __init__(self, string): | |
179 | self.string = string | |
180 | self.index = 0 | |
181 | self.__next() | |
182 | def __next(self): | |
183 | if self.index >= len(self.string): | |
184 | self.next = None | |
185 | return | |
186 | char = self.string[self.index] | |
187 | if char[0] == "\\": | |
188 | try: | |
189 | c = self.string[self.index + 1] | |
190 | except IndexError: | |
191 | raise error, "bogus escape (end of line)" | |
192 | char = char + c | |
193 | self.index = self.index + len(char) | |
194 | self.next = char | |
195 | def match(self, char, skip=1): | |
196 | if char == self.next: | |
197 | if skip: | |
198 | self.__next() | |
199 | return 1 | |
200 | return 0 | |
201 | def get(self): | |
202 | this = self.next | |
203 | self.__next() | |
204 | return this | |
205 | def tell(self): | |
206 | return self.index, self.next | |
207 | def seek(self, index): | |
208 | self.index, self.next = index | |
209 | ||
210 | def isident(char): | |
211 | return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" | |
212 | ||
213 | def isdigit(char): | |
214 | return "0" <= char <= "9" | |
215 | ||
216 | def isname(name): | |
217 | # check that group name is a valid string | |
218 | if not isident(name[0]): | |
219 | return False | |
220 | for char in name[1:]: | |
221 | if not isident(char) and not isdigit(char): | |
222 | return False | |
223 | return True | |
224 | ||
225 | def _class_escape(source, escape): | |
226 | # handle escape code inside character class | |
227 | code = ESCAPES.get(escape) | |
228 | if code: | |
229 | return code | |
230 | code = CATEGORIES.get(escape) | |
231 | if code: | |
232 | return code | |
233 | try: | |
234 | c = escape[1:2] | |
235 | if c == "x": | |
236 | # hexadecimal escape (exactly two digits) | |
237 | while source.next in HEXDIGITS and len(escape) < 4: | |
238 | escape = escape + source.get() | |
239 | escape = escape[2:] | |
240 | if len(escape) != 2: | |
241 | raise error, "bogus escape: %s" % repr("\\" + escape) | |
242 | return LITERAL, int(escape, 16) & 0xff | |
243 | elif c in OCTDIGITS: | |
244 | # octal escape (up to three digits) | |
245 | while source.next in OCTDIGITS and len(escape) < 4: | |
246 | escape = escape + source.get() | |
247 | escape = escape[1:] | |
248 | return LITERAL, int(escape, 8) & 0xff | |
249 | elif c in DIGITS: | |
250 | raise error, "bogus escape: %s" % repr(escape) | |
251 | if len(escape) == 2: | |
252 | return LITERAL, ord(escape[1]) | |
253 | except ValueError: | |
254 | pass | |
255 | raise error, "bogus escape: %s" % repr(escape) | |
256 | ||
257 | def _escape(source, escape, state): | |
258 | # handle escape code in expression | |
259 | code = CATEGORIES.get(escape) | |
260 | if code: | |
261 | return code | |
262 | code = ESCAPES.get(escape) | |
263 | if code: | |
264 | return code | |
265 | try: | |
266 | c = escape[1:2] | |
267 | if c == "x": | |
268 | # hexadecimal escape | |
269 | while source.next in HEXDIGITS and len(escape) < 4: | |
270 | escape = escape + source.get() | |
271 | if len(escape) != 4: | |
272 | raise ValueError | |
273 | return LITERAL, int(escape[2:], 16) & 0xff | |
274 | elif c == "0": | |
275 | # octal escape | |
276 | while source.next in OCTDIGITS and len(escape) < 4: | |
277 | escape = escape + source.get() | |
278 | return LITERAL, int(escape[1:], 8) & 0xff | |
279 | elif c in DIGITS: | |
280 | # octal escape *or* decimal group reference (sigh) | |
281 | if source.next in DIGITS: | |
282 | escape = escape + source.get() | |
283 | if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and | |
284 | source.next in OCTDIGITS): | |
285 | # got three octal digits; this is an octal escape | |
286 | escape = escape + source.get() | |
287 | return LITERAL, int(escape[1:], 8) & 0xff | |
288 | # not an octal escape, so this is a group reference | |
289 | group = int(escape[1:]) | |
290 | if group < state.groups: | |
291 | if not state.checkgroup(group): | |
292 | raise error, "cannot refer to open group" | |
293 | return GROUPREF, group | |
294 | raise ValueError | |
295 | if len(escape) == 2: | |
296 | return LITERAL, ord(escape[1]) | |
297 | except ValueError: | |
298 | pass | |
299 | raise error, "bogus escape: %s" % repr(escape) | |
300 | ||
301 | def _parse_sub(source, state, nested=1): | |
302 | # parse an alternation: a|b|c | |
303 | ||
304 | items = [] | |
305 | itemsappend = items.append | |
306 | sourcematch = source.match | |
307 | while 1: | |
308 | itemsappend(_parse(source, state)) | |
309 | if sourcematch("|"): | |
310 | continue | |
311 | if not nested: | |
312 | break | |
313 | if not source.next or sourcematch(")", 0): | |
314 | break | |
315 | else: | |
316 | raise error, "pattern not properly closed" | |
317 | ||
318 | if len(items) == 1: | |
319 | return items[0] | |
320 | ||
321 | subpattern = SubPattern(state) | |
322 | subpatternappend = subpattern.append | |
323 | ||
324 | # check if all items share a common prefix | |
325 | while 1: | |
326 | prefix = None | |
327 | for item in items: | |
328 | if not item: | |
329 | break | |
330 | if prefix is None: | |
331 | prefix = item[0] | |
332 | elif item[0] != prefix: | |
333 | break | |
334 | else: | |
335 | # all subitems start with a common "prefix". | |
336 | # move it out of the branch | |
337 | for item in items: | |
338 | del item[0] | |
339 | subpatternappend(prefix) | |
340 | continue # check next one | |
341 | break | |
342 | ||
343 | # check if the branch can be replaced by a character set | |
344 | for item in items: | |
345 | if len(item) != 1 or item[0][0] != LITERAL: | |
346 | break | |
347 | else: | |
348 | # we can store this as a character set instead of a | |
349 | # branch (the compiler may optimize this even more) | |
350 | set = [] | |
351 | setappend = set.append | |
352 | for item in items: | |
353 | setappend(item[0]) | |
354 | subpatternappend((IN, set)) | |
355 | return subpattern | |
356 | ||
357 | subpattern.append((BRANCH, (None, items))) | |
358 | return subpattern | |
359 | ||
360 | def _parse_sub_cond(source, state, condgroup): | |
361 | item_yes = _parse(source, state) | |
362 | if source.match("|"): | |
363 | item_no = _parse(source, state) | |
364 | if source.match("|"): | |
365 | raise error, "conditional backref with more than two branches" | |
366 | else: | |
367 | item_no = None | |
368 | if source.next and not source.match(")", 0): | |
369 | raise error, "pattern not properly closed" | |
370 | subpattern = SubPattern(state) | |
371 | subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) | |
372 | return subpattern | |
373 | ||
374 | def _parse(source, state): | |
375 | # parse a simple pattern | |
376 | subpattern = SubPattern(state) | |
377 | ||
378 | # precompute constants into local variables | |
379 | subpatternappend = subpattern.append | |
380 | sourceget = source.get | |
381 | sourcematch = source.match | |
382 | _len = len | |
383 | PATTERNENDERS = ("|", ")") | |
384 | ASSERTCHARS = ("=", "!", "<") | |
385 | LOOKBEHINDASSERTCHARS = ("=", "!") | |
386 | REPEATCODES = (MIN_REPEAT, MAX_REPEAT) | |
387 | ||
388 | while 1: | |
389 | ||
390 | if source.next in PATTERNENDERS: | |
391 | break # end of subpattern | |
392 | this = sourceget() | |
393 | if this is None: | |
394 | break # end of pattern | |
395 | ||
396 | if state.flags & SRE_FLAG_VERBOSE: | |
397 | # skip whitespace and comments | |
398 | if this in WHITESPACE: | |
399 | continue | |
400 | if this == "#": | |
401 | while 1: | |
402 | this = sourceget() | |
403 | if this in (None, "\n"): | |
404 | break | |
405 | continue | |
406 | ||
407 | if this and this[0] not in SPECIAL_CHARS: | |
408 | subpatternappend((LITERAL, ord(this))) | |
409 | ||
410 | elif this == "[": | |
411 | # character set | |
412 | set = [] | |
413 | setappend = set.append | |
414 | ## if sourcematch(":"): | |
415 | ## pass # handle character classes | |
416 | if sourcematch("^"): | |
417 | setappend((NEGATE, None)) | |
418 | # check remaining characters | |
419 | start = set[:] | |
420 | while 1: | |
421 | this = sourceget() | |
422 | if this == "]" and set != start: | |
423 | break | |
424 | elif this and this[0] == "\\": | |
425 | code1 = _class_escape(source, this) | |
426 | elif this: | |
427 | code1 = LITERAL, ord(this) | |
428 | else: | |
429 | raise error, "unexpected end of regular expression" | |
430 | if sourcematch("-"): | |
431 | # potential range | |
432 | this = sourceget() | |
433 | if this == "]": | |
434 | if code1[0] is IN: | |
435 | code1 = code1[1][0] | |
436 | setappend(code1) | |
437 | setappend((LITERAL, ord("-"))) | |
438 | break | |
439 | elif this: | |
440 | if this[0] == "\\": | |
441 | code2 = _class_escape(source, this) | |
442 | else: | |
443 | code2 = LITERAL, ord(this) | |
444 | if code1[0] != LITERAL or code2[0] != LITERAL: | |
445 | raise error, "bad character range" | |
446 | lo = code1[1] | |
447 | hi = code2[1] | |
448 | if hi < lo: | |
449 | raise error, "bad character range" | |
450 | setappend((RANGE, (lo, hi))) | |
451 | else: | |
452 | raise error, "unexpected end of regular expression" | |
453 | else: | |
454 | if code1[0] is IN: | |
455 | code1 = code1[1][0] | |
456 | setappend(code1) | |
457 | ||
458 | # XXX: <fl> should move set optimization to compiler! | |
459 | if _len(set)==1 and set[0][0] is LITERAL: | |
460 | subpatternappend(set[0]) # optimization | |
461 | elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL: | |
462 | subpatternappend((NOT_LITERAL, set[1][1])) # optimization | |
463 | else: | |
464 | # XXX: <fl> should add charmap optimization here | |
465 | subpatternappend((IN, set)) | |
466 | ||
467 | elif this and this[0] in REPEAT_CHARS: | |
468 | # repeat previous item | |
469 | if this == "?": | |
470 | min, max = 0, 1 | |
471 | elif this == "*": | |
472 | min, max = 0, MAXREPEAT | |
473 | ||
474 | elif this == "+": | |
475 | min, max = 1, MAXREPEAT | |
476 | elif this == "{": | |
477 | if source.next == "}": | |
478 | subpatternappend((LITERAL, ord(this))) | |
479 | continue | |
480 | here = source.tell() | |
481 | min, max = 0, MAXREPEAT | |
482 | lo = hi = "" | |
483 | while source.next in DIGITS: | |
484 | lo = lo + source.get() | |
485 | if sourcematch(","): | |
486 | while source.next in DIGITS: | |
487 | hi = hi + sourceget() | |
488 | else: | |
489 | hi = lo | |
490 | if not sourcematch("}"): | |
491 | subpatternappend((LITERAL, ord(this))) | |
492 | source.seek(here) | |
493 | continue | |
494 | if lo: | |
495 | min = int(lo) | |
496 | if hi: | |
497 | max = int(hi) | |
498 | if max < min: | |
499 | raise error, "bad repeat interval" | |
500 | else: | |
501 | raise error, "not supported" | |
502 | # figure out which item to repeat | |
503 | if subpattern: | |
504 | item = subpattern[-1:] | |
505 | else: | |
506 | item = None | |
507 | if not item or (_len(item) == 1 and item[0][0] == AT): | |
508 | raise error, "nothing to repeat" | |
509 | if item[0][0] in REPEATCODES: | |
510 | raise error, "multiple repeat" | |
511 | if sourcematch("?"): | |
512 | subpattern[-1] = (MIN_REPEAT, (min, max, item)) | |
513 | else: | |
514 | subpattern[-1] = (MAX_REPEAT, (min, max, item)) | |
515 | ||
516 | elif this == ".": | |
517 | subpatternappend((ANY, None)) | |
518 | ||
519 | elif this == "(": | |
520 | group = 1 | |
521 | name = None | |
522 | condgroup = None | |
523 | if sourcematch("?"): | |
524 | group = 0 | |
525 | # options | |
526 | if sourcematch("P"): | |
527 | # python extensions | |
528 | if sourcematch("<"): | |
529 | # named group: skip forward to end of name | |
530 | name = "" | |
531 | while 1: | |
532 | char = sourceget() | |
533 | if char is None: | |
534 | raise error, "unterminated name" | |
535 | if char == ">": | |
536 | break | |
537 | name = name + char | |
538 | group = 1 | |
539 | if not isname(name): | |
540 | raise error, "bad character in group name" | |
541 | elif sourcematch("="): | |
542 | # named backreference | |
543 | name = "" | |
544 | while 1: | |
545 | char = sourceget() | |
546 | if char is None: | |
547 | raise error, "unterminated name" | |
548 | if char == ")": | |
549 | break | |
550 | name = name + char | |
551 | if not isname(name): | |
552 | raise error, "bad character in group name" | |
553 | gid = state.groupdict.get(name) | |
554 | if gid is None: | |
555 | raise error, "unknown group name" | |
556 | subpatternappend((GROUPREF, gid)) | |
557 | continue | |
558 | else: | |
559 | char = sourceget() | |
560 | if char is None: | |
561 | raise error, "unexpected end of pattern" | |
562 | raise error, "unknown specifier: ?P%s" % char | |
563 | elif sourcematch(":"): | |
564 | # non-capturing group | |
565 | group = 2 | |
566 | elif sourcematch("#"): | |
567 | # comment | |
568 | while 1: | |
569 | if source.next is None or source.next == ")": | |
570 | break | |
571 | sourceget() | |
572 | if not sourcematch(")"): | |
573 | raise error, "unbalanced parenthesis" | |
574 | continue | |
575 | elif source.next in ASSERTCHARS: | |
576 | # lookahead assertions | |
577 | char = sourceget() | |
578 | dir = 1 | |
579 | if char == "<": | |
580 | if source.next not in LOOKBEHINDASSERTCHARS: | |
581 | raise error, "syntax error" | |
582 | dir = -1 # lookbehind | |
583 | char = sourceget() | |
584 | p = _parse_sub(source, state) | |
585 | if not sourcematch(")"): | |
586 | raise error, "unbalanced parenthesis" | |
587 | if char == "=": | |
588 | subpatternappend((ASSERT, (dir, p))) | |
589 | else: | |
590 | subpatternappend((ASSERT_NOT, (dir, p))) | |
591 | continue | |
592 | elif sourcematch("("): | |
593 | # conditional backreference group | |
594 | condname = "" | |
595 | while 1: | |
596 | char = sourceget() | |
597 | if char is None: | |
598 | raise error, "unterminated name" | |
599 | if char == ")": | |
600 | break | |
601 | condname = condname + char | |
602 | group = 2 | |
603 | if isname(condname): | |
604 | condgroup = state.groupdict.get(condname) | |
605 | if condgroup is None: | |
606 | raise error, "unknown group name" | |
607 | else: | |
608 | try: | |
609 | condgroup = int(condname) | |
610 | except ValueError: | |
611 | raise error, "bad character in group name" | |
612 | else: | |
613 | # flags | |
614 | if not source.next in FLAGS: | |
615 | raise error, "unexpected end of pattern" | |
616 | while source.next in FLAGS: | |
617 | state.flags = state.flags | FLAGS[sourceget()] | |
618 | if group: | |
619 | # parse group contents | |
620 | if group == 2: | |
621 | # anonymous group | |
622 | group = None | |
623 | else: | |
624 | group = state.opengroup(name) | |
625 | if condgroup: | |
626 | p = _parse_sub_cond(source, state, condgroup) | |
627 | else: | |
628 | p = _parse_sub(source, state) | |
629 | if not sourcematch(")"): | |
630 | raise error, "unbalanced parenthesis" | |
631 | if group is not None: | |
632 | state.closegroup(group) | |
633 | subpatternappend((SUBPATTERN, (group, p))) | |
634 | else: | |
635 | while 1: | |
636 | char = sourceget() | |
637 | if char is None: | |
638 | raise error, "unexpected end of pattern" | |
639 | if char == ")": | |
640 | break | |
641 | raise error, "unknown extension" | |
642 | ||
643 | elif this == "^": | |
644 | subpatternappend((AT, AT_BEGINNING)) | |
645 | ||
646 | elif this == "$": | |
647 | subpattern.append((AT, AT_END)) | |
648 | ||
649 | elif this and this[0] == "\\": | |
650 | code = _escape(source, this, state) | |
651 | subpatternappend(code) | |
652 | ||
653 | else: | |
654 | raise error, "parser error" | |
655 | ||
656 | return subpattern | |
657 | ||
658 | def parse(str, flags=0, pattern=None): | |
659 | # parse 're' pattern into list of (opcode, argument) tuples | |
660 | ||
661 | source = Tokenizer(str) | |
662 | ||
663 | if pattern is None: | |
664 | pattern = Pattern() | |
665 | pattern.flags = flags | |
666 | pattern.str = str | |
667 | ||
668 | p = _parse_sub(source, pattern, 0) | |
669 | ||
670 | tail = source.get() | |
671 | if tail == ")": | |
672 | raise error, "unbalanced parenthesis" | |
673 | elif tail: | |
674 | raise error, "bogus characters at end of regular expression" | |
675 | ||
676 | if flags & SRE_FLAG_DEBUG: | |
677 | p.dump() | |
678 | ||
679 | if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE: | |
680 | # the VERBOSE flag was switched on inside the pattern. to be | |
681 | # on the safe side, we'll parse the whole thing again... | |
682 | return parse(str, p.pattern.flags) | |
683 | ||
684 | return p | |
685 | ||
686 | def parse_template(source, pattern): | |
687 | # parse 're' replacement string into list of literals and | |
688 | # group references | |
689 | s = Tokenizer(source) | |
690 | sget = s.get | |
691 | p = [] | |
692 | a = p.append | |
693 | def literal(literal, p=p, pappend=a): | |
694 | if p and p[-1][0] is LITERAL: | |
695 | p[-1] = LITERAL, p[-1][1] + literal | |
696 | else: | |
697 | pappend((LITERAL, literal)) | |
698 | sep = source[:0] | |
699 | if type(sep) is type(""): | |
700 | makechar = chr | |
701 | else: | |
702 | makechar = unichr | |
703 | while 1: | |
704 | this = sget() | |
705 | if this is None: | |
706 | break # end of replacement string | |
707 | if this and this[0] == "\\": | |
708 | # group | |
709 | c = this[1:2] | |
710 | if c == "g": | |
711 | name = "" | |
712 | if s.match("<"): | |
713 | while 1: | |
714 | char = sget() | |
715 | if char is None: | |
716 | raise error, "unterminated group name" | |
717 | if char == ">": | |
718 | break | |
719 | name = name + char | |
720 | if not name: | |
721 | raise error, "bad group name" | |
722 | try: | |
723 | index = int(name) | |
724 | if index < 0: | |
725 | raise error, "negative group number" | |
726 | except ValueError: | |
727 | if not isname(name): | |
728 | raise error, "bad character in group name" | |
729 | try: | |
730 | index = pattern.groupindex[name] | |
731 | except KeyError: | |
732 | raise IndexError, "unknown group name" | |
733 | a((MARK, index)) | |
734 | elif c == "0": | |
735 | if s.next in OCTDIGITS: | |
736 | this = this + sget() | |
737 | if s.next in OCTDIGITS: | |
738 | this = this + sget() | |
739 | literal(makechar(int(this[1:], 8) & 0xff)) | |
740 | elif c in DIGITS: | |
741 | isoctal = False | |
742 | if s.next in DIGITS: | |
743 | this = this + sget() | |
744 | if (c in OCTDIGITS and this[2] in OCTDIGITS and | |
745 | s.next in OCTDIGITS): | |
746 | this = this + sget() | |
747 | isoctal = True | |
748 | literal(makechar(int(this[1:], 8) & 0xff)) | |
749 | if not isoctal: | |
750 | a((MARK, int(this[1:]))) | |
751 | else: | |
752 | try: | |
753 | this = makechar(ESCAPES[this][1]) | |
754 | except KeyError: | |
755 | pass | |
756 | literal(this) | |
757 | else: | |
758 | literal(this) | |
759 | # convert template to groups and literals lists | |
760 | i = 0 | |
761 | groups = [] | |
762 | groupsappend = groups.append | |
763 | literals = [None] * len(p) | |
764 | for c, s in p: | |
765 | if c is MARK: | |
766 | groupsappend((i, s)) | |
767 | # literal[i] is already None | |
768 | else: | |
769 | literals[i] = s | |
770 | i = i + 1 | |
771 | return groups, literals | |
772 | ||
773 | def expand_template(template, match): | |
774 | g = match.group | |
775 | sep = match.string[:0] | |
776 | groups, literals = template | |
777 | literals = literals[:] | |
778 | try: | |
779 | for index, group in groups: | |
780 | literals[index] = s = g(group) | |
781 | if s is None: | |
782 | raise error, "unmatched group" | |
783 | except IndexError: | |
784 | raise error, "invalid group reference" | |
785 | return sep.join(literals) |