Initial commit of OpenSPARC T2 architecture model.
[OpenSPARC-T2-SAM] / sam-t2 / devtools / v8plus / lib / python2.4 / tokenize.py
"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
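
# Illustrative usage sketch (not part of the original module), kept as a
# comment so importing this file is unchanged; it assumes the source text
# is wrapped in a StringIO so its readline method can be handed to
# generate_tokens():
#
#     from StringIO import StringIO
#     for tok_type, tok_str, start, end, line in \
#             generate_tokens(StringIO("x = 1\n").readline):
#         print tok_name[tok_type], repr(tok_str), start, end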

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
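# e.g. group('a', 'b') == '(a|b)', any('a', 'b') == '(a|b)*', and
# maybe('a', 'b') == '(a|b)?'; the token classes below are composed from
# these helpers.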

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
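# e.g. Number matches '0xffL' via Hexnumber, '017' via Octnumber,
# '3.14e-2' via Pointfloat plus Exponent, and '2j' via Imagnumber;
# Imagnumber is listed first so a trailing 'j' is consumed with the
# literal rather than split off as a name.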

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
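# PseudoToken, not Token, is what generate_tokens() actually scans with:
# unlike Token, it also matches the first line of a string that continues
# onto the next physical line (ContStr ending in a backslash-newline) and a
# bare triple-quote opener (via PseudoExtras), leaving the remainder to the
# tail-end patterns compiled into endprogs below.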

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t
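# Both tables are dicts used as sets: the scanner below only performs
# 'token in triple_quoted' / 'token[:n] in single_quoted' membership tests.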

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
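
# Illustrative sketch (not in the original module): a hypothetical
# tokeneater callback for tokenize(); it receives the same five fields
# that generate_tokens() yields, once per token. The helper name
# _example_count_tokens is an assumption; it is defined but never called.
def _example_count_tokens(readline):
    counts = {}
    def eater(type, token, start, end, line):
        # Tally tokens by their symbolic type name.
        counts[tok_name[type]] = counts.get(tok_name[type], 0) + 1
    tokenize(readline, eater)
    return counts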

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]
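    # Cross-line state: contstr/contline accumulate the text and logical
    # line of a string literal spanning physical lines, needcont marks a
    # single-quoted string that must continue via a trailing backslash,
    # parenlev tracks bracket nesting (newlines inside brackets are NL,
    # not NEWLINE), and indents is the stack of active indentation columns.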

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level")
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
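
# Illustrative sketch (not in the original module): tokenizing a small
# indented snippet demonstrates the trailing DEDENT/ENDMARKER tokens
# emitted above. The helper name _example_token_types is an assumption;
# it is defined but never called.
def _example_token_types(source="if x:\n    y = 1\n"):
    from StringIO import StringIO              # Python 2 in-memory file
    return [tok_name[t] for t, s, start, end, line
            in generate_tokens(StringIO(source).readline)]
# For the default source this returns ['NAME', 'NAME', 'OP', 'NEWLINE',
# 'INDENT', 'NAME', 'OP', 'NUMBER', 'NEWLINE', 'DEDENT', 'ENDMARKER'].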

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)