Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | import re |
2 | import sys | |
3 | ||
4 | # Reason last stmt is continued (or C_NONE if it's not). | |
5 | C_NONE, C_BACKSLASH, C_STRING, C_BRACKET = range(4) | |
6 | ||
7 | if 0: # for throwaway debugging output | |
8 | def dump(*stuff): | |
9 | sys.__stdout__.write(" ".join(map(str, stuff)) + "\n") | |
10 | ||
11 | # Find what looks like the start of a popular stmt. | |
12 | ||
13 | _synchre = re.compile(r""" | |
14 | ^ | |
15 | [ \t]* | |
16 | (?: if | |
17 | | for | |
18 | | while | |
19 | | else | |
20 | | def | |
21 | | return | |
22 | | assert | |
23 | | break | |
24 | | class | |
25 | | continue | |
26 | | elif | |
27 | | try | |
28 | | except | |
29 | | raise | |
30 | | import | |
31 | | yield | |
32 | ) | |
33 | \b | |
34 | """, re.VERBOSE | re.MULTILINE).search | |
35 | ||
36 | # Match blank line or non-indenting comment line. | |
37 | ||
38 | _junkre = re.compile(r""" | |
39 | [ \t]* | |
40 | (?: \# \S .* )? | |
41 | \n | |
42 | """, re.VERBOSE).match | |
43 | ||
44 | # Match any flavor of string; the terminating quote is optional | |
45 | # so that we're robust in the face of incomplete program text. | |
46 | ||
47 | _match_stringre = re.compile(r""" | |
48 | \""" [^"\\]* (?: | |
49 | (?: \\. | "(?!"") ) | |
50 | [^"\\]* | |
51 | )* | |
52 | (?: \""" )? | |
53 | ||
54 | | " [^"\\\n]* (?: \\. [^"\\\n]* )* "? | |
55 | ||
56 | | ''' [^'\\]* (?: | |
57 | (?: \\. | '(?!'') ) | |
58 | [^'\\]* | |
59 | )* | |
60 | (?: ''' )? | |
61 | ||
62 | | ' [^'\\\n]* (?: \\. [^'\\\n]* )* '? | |
63 | """, re.VERBOSE | re.DOTALL).match | |
64 | ||
65 | # Match a line that starts with something interesting; | |
66 | # used to find the first item of a bracket structure. | |
67 | ||
68 | _itemre = re.compile(r""" | |
69 | [ \t]* | |
70 | [^\s#\\] # if we match, m.end()-1 is the interesting char | |
71 | """, re.VERBOSE).match | |
72 | ||
73 | # Match start of stmts that should be followed by a dedent. | |
74 | ||
75 | _closere = re.compile(r""" | |
76 | \s* | |
77 | (?: return | |
78 | | break | |
79 | | continue | |
80 | | raise | |
81 | | pass | |
82 | ) | |
83 | \b | |
84 | """, re.VERBOSE).match | |
85 | ||
86 | # Chew up non-special chars as quickly as possible. If match is | |
87 | # successful, m.end() less 1 is the index of the last boring char | |
88 | # matched. If match is unsuccessful, the string starts with an | |
89 | # interesting char. | |
90 | ||
91 | _chew_ordinaryre = re.compile(r""" | |
92 | [^[\](){}#'"\\]+ | |
93 | """, re.VERBOSE).match | |
94 | ||
95 | # Build translation table to map uninteresting chars to "x", open | |
96 | # brackets to "(", and close brackets to ")". | |
97 | ||
98 | _tran = ['x'] * 256 | |
99 | for ch in "({[": | |
100 | _tran[ord(ch)] = '(' | |
101 | for ch in ")}]": | |
102 | _tran[ord(ch)] = ')' | |
103 | for ch in "\"'\\\n#": | |
104 | _tran[ord(ch)] = ch | |
105 | _tran = ''.join(_tran) | |
106 | del ch | |
107 | ||
108 | try: | |
109 | UnicodeType = type(unicode("")) | |
110 | except NameError: | |
111 | UnicodeType = None | |
112 | ||
113 | class Parser: | |
114 | ||
115 | def __init__(self, indentwidth, tabwidth): | |
116 | self.indentwidth = indentwidth | |
117 | self.tabwidth = tabwidth | |
118 | ||
119 | def set_str(self, str): | |
120 | assert len(str) == 0 or str[-1] == '\n' | |
121 | if type(str) is UnicodeType: | |
122 | # The parse functions have no idea what to do with Unicode, so | |
123 | # replace all Unicode characters with "x". This is "safe" | |
124 | # so long as the only characters germane to parsing the structure | |
125 | # of Python are 7-bit ASCII. It's *necessary* because Unicode | |
126 | # strings don't have a .translate() method that supports | |
127 | # deletechars. | |
128 | uniphooey = str | |
129 | str = [] | |
130 | push = str.append | |
131 | for raw in map(ord, uniphooey): | |
132 | push(raw < 127 and chr(raw) or "x") | |
133 | str = "".join(str) | |
134 | self.str = str | |
135 | self.study_level = 0 | |
136 | ||
137 | # Return index of a good place to begin parsing, as close to the | |
138 | # end of the string as possible. This will be the start of some | |
139 | # popular stmt like "if" or "def". Return None if none found: | |
140 | # the caller should pass more prior context then, if possible, or | |
141 | # if not (the entire program text up until the point of interest | |
142 | # has already been tried) pass 0 to set_lo. | |
143 | # | |
144 | # This will be reliable iff given a reliable is_char_in_string | |
145 | # function, meaning that when it says "no", it's absolutely | |
146 | # guaranteed that the char is not in a string. | |
147 | # | |
148 | # Ack, hack: in the shell window this kills us, because there's | |
149 | # no way to tell the differences between output, >>> etc and | |
150 | # user input. Indeed, IDLE's first output line makes the rest | |
151 | # look like it's in an unclosed paren!: | |
152 | # Python 1.5.2 (#0, Apr 13 1999, ... | |
153 | ||
154 | def find_good_parse_start(self, use_ps1, is_char_in_string=None, | |
155 | _synchre=_synchre): | |
156 | str, pos = self.str, None | |
157 | if use_ps1: | |
158 | # shell window | |
159 | ps1 = '\n' + sys.ps1 | |
160 | i = str.rfind(ps1) | |
161 | if i >= 0: | |
162 | pos = i + len(ps1) | |
163 | # make it look like there's a newline instead | |
164 | # of ps1 at the start -- hacking here once avoids | |
165 | # repeated hackery later | |
166 | self.str = str[:pos-1] + '\n' + str[pos:] | |
167 | return pos | |
168 | ||
169 | # File window -- real work. | |
170 | if not is_char_in_string: | |
171 | # no clue -- make the caller pass everything | |
172 | return None | |
173 | ||
174 | # Peek back from the end for a good place to start, | |
175 | # but don't try too often; pos will be left None, or | |
176 | # bumped to a legitimate synch point. | |
177 | limit = len(str) | |
178 | for tries in range(5): | |
179 | i = str.rfind(":\n", 0, limit) | |
180 | if i < 0: | |
181 | break | |
182 | i = str.rfind('\n', 0, i) + 1 # start of colon line | |
183 | m = _synchre(str, i, limit) | |
184 | if m and not is_char_in_string(m.start()): | |
185 | pos = m.start() | |
186 | break | |
187 | limit = i | |
188 | if pos is None: | |
189 | # Nothing looks like a block-opener, or stuff does | |
190 | # but is_char_in_string keeps returning true; most likely | |
191 | # we're in or near a giant string, the colorizer hasn't | |
192 | # caught up enough to be helpful, or there simply *aren't* | |
193 | # any interesting stmts. In any of these cases we're | |
194 | # going to have to parse the whole thing to be sure, so | |
195 | # give it one last try from the start, but stop wasting | |
196 | # time here regardless of the outcome. | |
197 | m = _synchre(str) | |
198 | if m and not is_char_in_string(m.start()): | |
199 | pos = m.start() | |
200 | return pos | |
201 | ||
202 | # Peeking back worked; look forward until _synchre no longer | |
203 | # matches. | |
204 | i = pos + 1 | |
205 | while 1: | |
206 | m = _synchre(str, i) | |
207 | if m: | |
208 | s, i = m.span() | |
209 | if not is_char_in_string(s): | |
210 | pos = s | |
211 | else: | |
212 | break | |
213 | return pos | |
214 | ||
215 | # Throw away the start of the string. Intended to be called with | |
216 | # find_good_parse_start's result. | |
217 | ||
218 | def set_lo(self, lo): | |
219 | assert lo == 0 or self.str[lo-1] == '\n' | |
220 | if lo > 0: | |
221 | self.str = self.str[lo:] | |
222 | ||
223 | # As quickly as humanly possible <wink>, find the line numbers (0- | |
224 | # based) of the non-continuation lines. | |
225 | # Creates self.{goodlines, continuation}. | |
226 | ||
227 | def _study1(self): | |
228 | if self.study_level >= 1: | |
229 | return | |
230 | self.study_level = 1 | |
231 | ||
232 | # Map all uninteresting characters to "x", all open brackets | |
233 | # to "(", all close brackets to ")", then collapse runs of | |
234 | # uninteresting characters. This can cut the number of chars | |
235 | # by a factor of 10-40, and so greatly speed the following loop. | |
236 | str = self.str | |
237 | str = str.translate(_tran) | |
238 | str = str.replace('xxxxxxxx', 'x') | |
239 | str = str.replace('xxxx', 'x') | |
240 | str = str.replace('xx', 'x') | |
241 | str = str.replace('xx', 'x') | |
242 | str = str.replace('\nx', '\n') | |
243 | # note that replacing x\n with \n would be incorrect, because | |
244 | # x may be preceded by a backslash | |
245 | ||
246 | # March over the squashed version of the program, accumulating | |
247 | # the line numbers of non-continued stmts, and determining | |
248 | # whether & why the last stmt is a continuation. | |
249 | continuation = C_NONE | |
250 | level = lno = 0 # level is nesting level; lno is line number | |
251 | self.goodlines = goodlines = [0] | |
252 | push_good = goodlines.append | |
253 | i, n = 0, len(str) | |
254 | while i < n: | |
255 | ch = str[i] | |
256 | i = i+1 | |
257 | ||
258 | # cases are checked in decreasing order of frequency | |
259 | if ch == 'x': | |
260 | continue | |
261 | ||
262 | if ch == '\n': | |
263 | lno = lno + 1 | |
264 | if level == 0: | |
265 | push_good(lno) | |
266 | # else we're in an unclosed bracket structure | |
267 | continue | |
268 | ||
269 | if ch == '(': | |
270 | level = level + 1 | |
271 | continue | |
272 | ||
273 | if ch == ')': | |
274 | if level: | |
275 | level = level - 1 | |
276 | # else the program is invalid, but we can't complain | |
277 | continue | |
278 | ||
279 | if ch == '"' or ch == "'": | |
280 | # consume the string | |
281 | quote = ch | |
282 | if str[i-1:i+2] == quote * 3: | |
283 | quote = quote * 3 | |
284 | w = len(quote) - 1 | |
285 | i = i+w | |
286 | while i < n: | |
287 | ch = str[i] | |
288 | i = i+1 | |
289 | ||
290 | if ch == 'x': | |
291 | continue | |
292 | ||
293 | if str[i-1:i+w] == quote: | |
294 | i = i+w | |
295 | break | |
296 | ||
297 | if ch == '\n': | |
298 | lno = lno + 1 | |
299 | if w == 0: | |
300 | # unterminated single-quoted string | |
301 | if level == 0: | |
302 | push_good(lno) | |
303 | break | |
304 | continue | |
305 | ||
306 | if ch == '\\': | |
307 | assert i < n | |
308 | if str[i] == '\n': | |
309 | lno = lno + 1 | |
310 | i = i+1 | |
311 | continue | |
312 | ||
313 | # else comment char or paren inside string | |
314 | ||
315 | else: | |
316 | # didn't break out of the loop, so we're still | |
317 | # inside a string | |
318 | continuation = C_STRING | |
319 | continue # with outer loop | |
320 | ||
321 | if ch == '#': | |
322 | # consume the comment | |
323 | i = str.find('\n', i) | |
324 | assert i >= 0 | |
325 | continue | |
326 | ||
327 | assert ch == '\\' | |
328 | assert i < n | |
329 | if str[i] == '\n': | |
330 | lno = lno + 1 | |
331 | if i+1 == n: | |
332 | continuation = C_BACKSLASH | |
333 | i = i+1 | |
334 | ||
335 | # The last stmt may be continued for all 3 reasons. | |
336 | # String continuation takes precedence over bracket | |
337 | # continuation, which beats backslash continuation. | |
338 | if continuation != C_STRING and level > 0: | |
339 | continuation = C_BRACKET | |
340 | self.continuation = continuation | |
341 | ||
342 | # Push the final line number as a sentinel value, regardless of | |
343 | # whether it's continued. | |
344 | assert (continuation == C_NONE) == (goodlines[-1] == lno) | |
345 | if goodlines[-1] != lno: | |
346 | push_good(lno) | |
347 | ||
348 | def get_continuation_type(self): | |
349 | self._study1() | |
350 | return self.continuation | |
351 | ||
352 | # study1 was sufficient to determine the continuation status, | |
353 | # but doing more requires looking at every character. study2 | |
354 | # does this for the last interesting statement in the block. | |
355 | # Creates: | |
356 | # self.stmt_start, stmt_end | |
357 | # slice indices of last interesting stmt | |
358 | # self.lastch | |
359 | # last non-whitespace character before optional trailing | |
360 | # comment | |
361 | # self.lastopenbracketpos | |
362 | # if continuation is C_BRACKET, index of last open bracket | |
363 | ||
364 | def _study2(self): | |
365 | if self.study_level >= 2: | |
366 | return | |
367 | self._study1() | |
368 | self.study_level = 2 | |
369 | ||
370 | # Set p and q to slice indices of last interesting stmt. | |
371 | str, goodlines = self.str, self.goodlines | |
372 | i = len(goodlines) - 1 | |
373 | p = len(str) # index of newest line | |
374 | while i: | |
375 | assert p | |
376 | # p is the index of the stmt at line number goodlines[i]. | |
377 | # Move p back to the stmt at line number goodlines[i-1]. | |
378 | q = p | |
379 | for nothing in range(goodlines[i-1], goodlines[i]): | |
380 | # tricky: sets p to 0 if no preceding newline | |
381 | p = str.rfind('\n', 0, p-1) + 1 | |
382 | # The stmt str[p:q] isn't a continuation, but may be blank | |
383 | # or a non-indenting comment line. | |
384 | if _junkre(str, p): | |
385 | i = i-1 | |
386 | else: | |
387 | break | |
388 | if i == 0: | |
389 | # nothing but junk! | |
390 | assert p == 0 | |
391 | q = p | |
392 | self.stmt_start, self.stmt_end = p, q | |
393 | ||
394 | # Analyze this stmt, to find the last open bracket (if any) | |
395 | # and last interesting character (if any). | |
396 | lastch = "" | |
397 | stack = [] # stack of open bracket indices | |
398 | push_stack = stack.append | |
399 | while p < q: | |
400 | # suck up all except ()[]{}'"#\\ | |
401 | m = _chew_ordinaryre(str, p, q) | |
402 | if m: | |
403 | # we skipped at least one boring char | |
404 | newp = m.end() | |
405 | # back up over totally boring whitespace | |
406 | i = newp - 1 # index of last boring char | |
407 | while i >= p and str[i] in " \t\n": | |
408 | i = i-1 | |
409 | if i >= p: | |
410 | lastch = str[i] | |
411 | p = newp | |
412 | if p >= q: | |
413 | break | |
414 | ||
415 | ch = str[p] | |
416 | ||
417 | if ch in "([{": | |
418 | push_stack(p) | |
419 | lastch = ch | |
420 | p = p+1 | |
421 | continue | |
422 | ||
423 | if ch in ")]}": | |
424 | if stack: | |
425 | del stack[-1] | |
426 | lastch = ch | |
427 | p = p+1 | |
428 | continue | |
429 | ||
430 | if ch == '"' or ch == "'": | |
431 | # consume string | |
432 | # Note that study1 did this with a Python loop, but | |
433 | # we use a regexp here; the reason is speed in both | |
434 | # cases; the string may be huge, but study1 pre-squashed | |
435 | # strings to a couple of characters per line. study1 | |
436 | # also needed to keep track of newlines, and we don't | |
437 | # have to. | |
438 | lastch = ch | |
439 | p = _match_stringre(str, p, q).end() | |
440 | continue | |
441 | ||
442 | if ch == '#': | |
443 | # consume comment and trailing newline | |
444 | p = str.find('\n', p, q) + 1 | |
445 | assert p > 0 | |
446 | continue | |
447 | ||
448 | assert ch == '\\' | |
449 | p = p+1 # beyond backslash | |
450 | assert p < q | |
451 | if str[p] != '\n': | |
452 | # the program is invalid, but can't complain | |
453 | lastch = ch + str[p] | |
454 | p = p+1 # beyond escaped char | |
455 | ||
456 | # end while p < q: | |
457 | ||
458 | self.lastch = lastch | |
459 | if stack: | |
460 | self.lastopenbracketpos = stack[-1] | |
461 | ||
462 | # Assuming continuation is C_BRACKET, return the number | |
463 | # of spaces the next line should be indented. | |
464 | ||
465 | def compute_bracket_indent(self): | |
466 | self._study2() | |
467 | assert self.continuation == C_BRACKET | |
468 | j = self.lastopenbracketpos | |
469 | str = self.str | |
470 | n = len(str) | |
471 | origi = i = str.rfind('\n', 0, j) + 1 | |
472 | j = j+1 # one beyond open bracket | |
473 | # find first list item; set i to start of its line | |
474 | while j < n: | |
475 | m = _itemre(str, j) | |
476 | if m: | |
477 | j = m.end() - 1 # index of first interesting char | |
478 | extra = 0 | |
479 | break | |
480 | else: | |
481 | # this line is junk; advance to next line | |
482 | i = j = str.find('\n', j) + 1 | |
483 | else: | |
484 | # nothing interesting follows the bracket; | |
485 | # reproduce the bracket line's indentation + a level | |
486 | j = i = origi | |
487 | while str[j] in " \t": | |
488 | j = j+1 | |
489 | extra = self.indentwidth | |
490 | return len(str[i:j].expandtabs(self.tabwidth)) + extra | |
491 | ||
492 | # Return number of physical lines in last stmt (whether or not | |
493 | # it's an interesting stmt! this is intended to be called when | |
494 | # continuation is C_BACKSLASH). | |
495 | ||
496 | def get_num_lines_in_stmt(self): | |
497 | self._study1() | |
498 | goodlines = self.goodlines | |
499 | return goodlines[-1] - goodlines[-2] | |
500 | ||
501 | # Assuming continuation is C_BACKSLASH, return the number of spaces | |
502 | # the next line should be indented. Also assuming the new line is | |
503 | # the first one following the initial line of the stmt. | |
504 | ||
505 | def compute_backslash_indent(self): | |
506 | self._study2() | |
507 | assert self.continuation == C_BACKSLASH | |
508 | str = self.str | |
509 | i = self.stmt_start | |
510 | while str[i] in " \t": | |
511 | i = i+1 | |
512 | startpos = i | |
513 | ||
514 | # See whether the initial line starts an assignment stmt; i.e., | |
515 | # look for an = operator | |
516 | endpos = str.find('\n', startpos) + 1 | |
517 | found = level = 0 | |
518 | while i < endpos: | |
519 | ch = str[i] | |
520 | if ch in "([{": | |
521 | level = level + 1 | |
522 | i = i+1 | |
523 | elif ch in ")]}": | |
524 | if level: | |
525 | level = level - 1 | |
526 | i = i+1 | |
527 | elif ch == '"' or ch == "'": | |
528 | i = _match_stringre(str, i, endpos).end() | |
529 | elif ch == '#': | |
530 | break | |
531 | elif level == 0 and ch == '=' and \ | |
532 | (i == 0 or str[i-1] not in "=<>!") and \ | |
533 | str[i+1] != '=': | |
534 | found = 1 | |
535 | break | |
536 | else: | |
537 | i = i+1 | |
538 | ||
539 | if found: | |
540 | # found a legit =, but it may be the last interesting | |
541 | # thing on the line | |
542 | i = i+1 # move beyond the = | |
543 | found = re.match(r"\s*\\", str[i:endpos]) is None | |
544 | ||
545 | if not found: | |
546 | # oh well ... settle for moving beyond the first chunk | |
547 | # of non-whitespace chars | |
548 | i = startpos | |
549 | while str[i] not in " \t\n": | |
550 | i = i+1 | |
551 | ||
552 | return len(str[self.stmt_start:i].expandtabs(\ | |
553 | self.tabwidth)) + 1 | |
554 | ||
555 | # Return the leading whitespace on the initial line of the last | |
556 | # interesting stmt. | |
557 | ||
558 | def get_base_indent_string(self): | |
559 | self._study2() | |
560 | i, n = self.stmt_start, self.stmt_end | |
561 | j = i | |
562 | str = self.str | |
563 | while j < n and str[j] in " \t": | |
564 | j = j + 1 | |
565 | return str[i:j] | |
566 | ||
567 | # Did the last interesting stmt open a block? | |
568 | ||
569 | def is_block_opener(self): | |
570 | self._study2() | |
571 | return self.lastch == ':' | |
572 | ||
573 | # Did the last interesting stmt close a block? | |
574 | ||
575 | def is_block_closer(self): | |
576 | self._study2() | |
577 | return _closere(self.str, self.stmt_start) is not None | |
578 | ||
579 | # index of last open bracket ({[, or None if none | |
580 | lastopenbracketpos = None | |
581 | ||
582 | def get_last_open_bracket_pos(self): | |
583 | self._study2() | |
584 | return self.lastopenbracketpos |