[OpenSPARC-T2-SAM] / sam-t2 / devtools / v8plus / lib / python2.4 / tokenize.py

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
# Export every public (non-underscore) name found in the token module, plus
# the names this module adds.
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL"]
# Under Python 2 the list comprehension above leaves its loop variable x
# bound at module level; remove it, and drop the module reference that was
# only needed for dir().
del x
del token

# Two extra token types defined by this module.  They take the values
# N_TOKENS and N_TOKENS + 1 (N_TOKENS comes from "from token import *"),
# get printable names in tok_name, and N_TOKENS is then advanced past them.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices):
    """Return the given regex fragments joined with '|' inside parentheses."""
    return '(%s)' % '|'.join(choices)

def any(*choices):
    """Return group(*choices) followed by '*' (zero or more repetitions)."""
    return '%s*' % group(*choices)

def maybe(*choices):
    """Return group(*choices) followed by '?' (optional)."""
    return '%s?' % group(*choices)

# Optional run of horizontal whitespace: space, form feed or tab.
Whitespace = r'[ \f\t]*'
# A comment: '#' and everything up to, but not including, the line ending.
Comment = r'#[^\r\n]*'
# What may precede a token: whitespace, any number of backslash-newline
# continuations (each followed by more whitespace), then an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
# An identifier: a letter or underscore followed by word characters.
Name = r'[a-zA-Z_]\w*'

# Integer literals.  Every form accepts an optional long suffix (l or L).
# Note that Hexnumber accepts zero or more hex digits after the 0x/0X
# prefix, and that Octnumber also matches a lone '0'.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Floating point literals: the exponent is optional after a form containing
# a decimal point (Pointfloat) and required after bare digits (Expfloat).
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: plain digits or a float, followed by j or J.
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# The order matters: re alternation takes the first alternative that
# matches, so the imaginary and float forms are tried before the integers.
Number = group(Imagnumber, Floatnumber, Intnumber)

# The four "tail end" patterns below match the remainder of a string literal
# whose opening quote(s) have already been consumed, up to and including the
# closing quote(s).  In each, r'\\.' consumes a backslash together with the
# character that follows it, so an escaped quote does not end the string.

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string, with optional u/U and r/R prefixes.
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

# Any single bracket character: ] [ ( ) { }.
Bracket = '[][(){}]'
# A line ending (with optional carriage return) or one punctuation
# character from the set : ; . , ` @
Special = group(r'\r?\n', r'[:;.,`@]')
# Everything that is neither a number, a string nor a name.
Funny = group(Operator, Bracket, Special)

# A complete token, and a complete token preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
# Each alternative ends either with the closing quote or with a
# backslash-newline, in which case the string continues on the next line.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Items that are not complete tokens by themselves: a backslash-newline
# continuation, a comment, or the opening of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Leading whitespace followed by one outer capturing group (group 1) that
# holds the token text; generate_tokens() reads it with span(1).
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Compile the patterns once at import time.  tokenprog is compiled here but
# is not referenced anywhere else in the visible source.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Maps the opening of a string literal to the compiled pattern that matches
# the rest of that string.  The single-letter prefix keys map to None so
# that generate_tokens() can chain lookups with "or" over the first
# characters of a token until it reaches the quote character itself.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

# Lookup tables of every spelling that can open a string literal: optional
# u/U and r/R prefixes followed by the quote character(s).  Every key maps
# to itself; generate_tokens() only tests membership.
# List comprehensions (not generator expressions) are used so that, under
# Python 2, the loop variable t remains bound at module level just as it
# would after a plain for loop.
triple_quoted = dict([(t, t) for t in (
    "'''", '"""',
    "r'''", 'r"""', "R'''", 'R"""',
    "u'''", 'u"""', "U'''", 'U"""',
    "ur'''", 'ur"""', "Ur'''", 'Ur"""',
    "uR'''", 'uR"""', "UR'''", 'UR"""')])
single_quoted = dict([(t, t) for t in (
    "'", '"',
    "r'", 'r"', "R'", 'R"',
    "u'", 'u"', "U'", 'U"',
    "ur'", 'ur"', "Ur'", 'Ur"',
    "uR'", 'uR"', "UR'", 'UR"')])

# Tab stop width used by generate_tokens() when measuring indentation.
tabsize = 8

class TokenError(Exception):
    """Raised by generate_tokens() when the input ends inside a multi-line
    string or a multi-line statement."""

class StopTokenizing(Exception):
    """Signal that tokenization should stop; tokenize() catches this
    exception and returns normally."""

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    Tokenize an input stream, handing every token to a callback.

    readline must be a callable with the same interface as the readline()
    method of built-in file objects: each call returns one line of input
    as a string.

    tokeneater must be a callable as well.  It is invoked once per token
    with five arguments, matching the 5-tuples produced by
    generate_tokens().  If StopTokenizing is raised while tokenizing, it is
    absorbed here and tokenize() simply returns.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # Deliberate early exit requested by raising StopTokenizing.
        return

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater with the five fields of each token read via readline.

    Unlike tokenize(), nothing is caught here: any exception raised while
    tokenizing or by the callback, StopTokenizing included, propagates to
    the caller.
    """
    for fields in generate_tokens(readline):
        tokeneater(*fields)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError if the input ends inside a multi-line string or a
    multi-line statement, and IndentationError if a line is dedented to a
    column that matches no enclosing indentation level.
    """
    # lnum:      number of the line most recently read (first line is 1)
    # parenlev:  current nesting depth of (, [ and { brackets
    # continued: set to 1 when the previous line ended in a backslash
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr:  text collected so far of a string literal spanning lines
    # needcont: 1 when that string is a ' or " string continued by a
    #           backslash-newline (as opposed to a triple-quoted string)
    # contline: the source lines collected so far for that string
    contstr, needcont = '', 0
    contline = None
    # Stack of indentation columns currently open; 0 is the outermost level.
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            # endprog and strstart were set when the string was opened.
            endmatch = endprog.match(line)
            if endmatch:
                # The string closes on this line.  Emit it, then fall
                # through to the token loop below to scan the rest of the
                # line starting at pos.
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                           strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A backslash-continued ' or " string whose next line
                # neither closes it nor ends in another backslash-newline:
                # report everything collected as an error token.
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                # Still inside the string; keep accumulating.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                # A tab advances to the next multiple of tabsize.
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                # A form feed resets the column count.
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            # The line held nothing but whitespace and no line terminator:
            # stop reading input.
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                # One token (COMMENT or NL) covers the rest of the line,
                # line ending included; indentation is not examined.
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            # Emit one DEDENT for every indentation level being closed.
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level")
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 of PseudoToken is the token without the leading
                # whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                # A token starting with '.' is a number unless it is the
                # lone '.' operator.
                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # and/or conditional idiom: NL inside brackets,
                    # NEWLINE otherwise.
                    yield (parenlev > 0 and NL or NEWLINE,
                               token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    # token is only the opening of the string; look for the
                    # closing quotes in the rest of this line.
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # endprogs maps the prefix letters to None, so this
                        # chain skips up to two prefix characters and picks
                        # the pattern registered for the quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # No token is emitted for the backslash-newline itself.
                    continued = 1
                else:
                    # Everything else is an operator; brackets also adjust
                    # the nesting depth.
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched at pos: emit that single character as an
                # error token and move past it.
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    # End of input: close every indentation level that is still open, then
    # emit the end marker.
    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    # Tokenize the file named by the first command-line argument, or
    # standard input when no argument is given, printing each token with
    # the default tokeneater.
    import sys
    if len(sys.argv) > 1:
        source = open(sys.argv[1])
        # try/finally makes sure the file is closed even if tokenizing
        # raises (the original left the file object open).
        try:
            tokenize(source.readline)
        finally:
            source.close()
    else:
        tokenize(sys.stdin.readline)
Commit	Line	Data
920dae64 AT	1	"""Tokenization help for Python programs.
	2
	3	generate_tokens(readline) is a generator that breaks a stream of
	4	text into Python tokens. It accepts a readline-like method which is called
	5	repeatedly to get the next line of input (or "" for EOF). It generates
	6	5-tuples with these members:
	7
	8	the token type (see token.py)
	9	the token (a string)
	10	the starting (row, column) indices of the token (a 2-tuple of ints)
	11	the ending (row, column) indices of the token (a 2-tuple of ints)
	12	the original line (string)
	13
	14	It is designed to match the working of the Python tokenizer exactly, except
	15	that it produces COMMENT tokens for comments and gives type OP for all
	16	operators
	17
	18	Older entry points
	19	tokenize_loop(readline, tokeneater)
	20	tokenize(readline, tokeneater=printtoken)
	21	are the same, except instead of generating tokens, tokeneater is a callback
	22	function to which the 5 fields described above are passed as 5 arguments,
	23	each time a new token is found."""
	24
	25	__author__ = 'Ka-Ping Yee <ping@lfw.org>'
	26	__credits__ = \
	27	'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
	28
	29	import string, re
	30	from token import *
	31
	32	import token
	33	__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
	34	"generate_tokens", "NL"]
	35	del x
	36	del token
	37
	38	COMMENT = N_TOKENS
	39	tok_name[COMMENT] = 'COMMENT'
	40	NL = N_TOKENS + 1
	41	tok_name[NL] = 'NL'
	42	N_TOKENS += 2
	43
	44	def group(*choices): return '(' + '\|'.join(choices) + ')'
	45	def any(choices): return group(choices) + '*'
	46	def maybe(choices): return group(choices) + '?'
	47
	48	Whitespace = r'[ \f\t]*'
	49	Comment = r'#[^\r\n]*'
	50	Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
	51	Name = r'[a-zA-Z_]\w*'
	52
	53	Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
	54	Octnumber = r'0[0-7]*[lL]?'
	55	Decnumber = r'[1-9]\d*[lL]?'
	56	Intnumber = group(Hexnumber, Octnumber, Decnumber)
	57	Exponent = r'[eE][-+]?\d+'
	58	Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
	59	Expfloat = r'\d+' + Exponent
	60	Floatnumber = group(Pointfloat, Expfloat)
	61	Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
	62	Number = group(Imagnumber, Floatnumber, Intnumber)
	63
	64	# Tail end of ' string.
65	Single = r"[^'\\](?:\\.[^'\\])*'"
66	# Tail end of " string.
67	Double = r'[^"\\](?:\\.[^"\\])*"'
68	# Tail end of ''' string.
69	Single3 = r"[^'\\](?:(?:\\.\|'(?!''))[^'\\])*'''"
70	# Tail end of """ string.
71	Double3 = r'[^"\\](?:(?:\\.\|"(?!""))[^"\\])*"""'
72	Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
73	# Single-line ' or " string.
74	String = group(r"[uU]?[rR]?'[^\n'\\](?:\\.[^\n'\\])*'",
75	r'[uU]?[rR]?"[^\n"\\](?:\\.[^\n"\\])*"')
76
77	# Because of leftmost-then-longest match semantics, be sure to put the
78	# longest operators first (e.g., if = came before ==, == would get
79	# recognized as two instances of =).
80	Operator = group(r"\\=?", r">>=?", r"<<=?", r"<>", r"!=",
81	r"//=?",
82	r"[+\-*/%&\|^=<>]=?",
83	r"~")
84
85	Bracket = '[][(){}]'
86	Special = group(r'\r?\n', r'[:;.,`@]')
87	Funny = group(Operator, Bracket, Special)
88
89	PlainToken = group(Number, Funny, String, Name)
90	Token = Ignore + PlainToken
91
92	# First (or only) line of ' or " string.
93	ContStr = group(r"[uU]?[rR]?'[^\n'\\](?:\\.[^\n'\\])*" +
94	group("'", r'\\\r?\n'),
95	r'[uU]?[rR]?"[^\n"\\](?:\\.[^\n"\\])*' +
96	group('"', r'\\\r?\n'))
97	PseudoExtras = group(r'\\\r?\n', Comment, Triple)
98	PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
99
100	tokenprog, pseudoprog, single3prog, double3prog = map(
101	re.compile, (Token, PseudoToken, Single3, Double3))
102	endprogs = {"'": re.compile(Single), '"': re.compile(Double),
103	"'''": single3prog, '"""': double3prog,
104	"r'''": single3prog, 'r"""': double3prog,
105	"u'''": single3prog, 'u"""': double3prog,
106	"ur'''": single3prog, 'ur"""': double3prog,
107	"R'''": single3prog, 'R"""': double3prog,
108	"U'''": single3prog, 'U"""': double3prog,
109	"uR'''": single3prog, 'uR"""': double3prog,
110	"Ur'''": single3prog, 'Ur"""': double3prog,
111	"UR'''": single3prog, 'UR"""': double3prog,
112	'r': None, 'R': None, 'u': None, 'U': None}
113
114	triple_quoted = {}
115	for t in ("'''", '"""',
116	"r'''", 'r"""', "R'''", 'R"""',
117	"u'''", 'u"""', "U'''", 'U"""',
118	"ur'''", 'ur"""', "Ur'''", 'Ur"""',
119	"uR'''", 'uR"""', "UR'''", 'UR"""'):
120	triple_quoted[t] = t
121	single_quoted = {}
122	for t in ("'", '"',
123	"r'", 'r"', "R'", 'R"',
124	"u'", 'u"', "U'", 'U"',
125	"ur'", 'ur"', "Ur'", 'Ur"',
126	"uR'", 'uR"', "UR'", 'UR"' ):
127	single_quoted[t] = t
128
129	tabsize = 8
130
131	class TokenError(Exception): pass
132
133	class StopTokenizing(Exception): pass
134
135	def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
136	print "%d,%d-%d,%d:\t%s\t%s" % \
137	(srow, scol, erow, ecol, tok_name[type], repr(token))
138
139	def tokenize(readline, tokeneater=printtoken):
140	"""
141	The tokenize() function accepts two parameters: one representing the
142	input stream, and one providing an output mechanism for tokenize().
143
144	The first parameter, readline, must be a callable object which provides
145	the same interface as the readline() method of built-in file objects.
146	Each call to the function should return one line of input as a string.
147
148	The second parameter, tokeneater, must also be a callable object. It is
149	called once for each token, with five arguments, corresponding to the
150	tuples generated by generate_tokens().
151	"""
152	try:
153	tokenize_loop(readline, tokeneater)
154	except StopTokenizing:
155	pass
156
157	# backwards compatible interface
158	def tokenize_loop(readline, tokeneater):
159	for token_info in generate_tokens(readline):
160	tokeneater(*token_info)
161
162	def generate_tokens(readline):
163	"""
164	The generate_tokens() generator requires one argment, readline, which
165	must be a callable object which provides the same interface as the
166	readline() method of built-in file objects. Each call to the function
167	should return one line of input as a string.
168
169	The generator produces 5-tuples with these members: the token type; the
170	token string; a 2-tuple (srow, scol) of ints specifying the row and
171	column where the token begins in the source; a 2-tuple (erow, ecol) of
172	ints specifying the row and column where the token ends in the source;
173	and the line on which the token was found. The line passed is the
174	logical line; continuation lines are included.
175	"""
176	lnum = parenlev = continued = 0
177	namechars, numchars = string.ascii_letters + '_', '0123456789'
178	contstr, needcont = '', 0
179	contline = None
180	indents = [0]
181
182	while 1: # loop over lines in stream
183	line = readline()
184	lnum = lnum + 1
185	pos, max = 0, len(line)
186
187	if contstr: # continued string
188	if not line:
189	raise TokenError, ("EOF in multi-line string", strstart)
190	endmatch = endprog.match(line)
191	if endmatch:
192	pos = end = endmatch.end(0)
193	yield (STRING, contstr + line[:end],
194	strstart, (lnum, end), contline + line)
195	contstr, needcont = '', 0
196	contline = None
197	elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
198	yield (ERRORTOKEN, contstr + line,
199	strstart, (lnum, len(line)), contline)
200	contstr = ''
201	contline = None
202	continue
203	else:
204	contstr = contstr + line
205	contline = contline + line
206	continue
207
208	elif parenlev == 0 and not continued: # new statement
209	if not line: break
210	column = 0
211	while pos < max: # measure leading whitespace
212	if line[pos] == ' ': column = column + 1
213	elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
214	elif line[pos] == '\f': column = 0
215	else: break
216	pos = pos + 1
217	if pos == max: break
218
219	if line[pos] in '#\r\n': # skip comments or blank lines
220	yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
221	(lnum, pos), (lnum, len(line)), line)
222	continue
223
224	if column > indents[-1]: # count indents or dedents
225	indents.append(column)
226	yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
227	while column < indents[-1]:
228	if column not in indents:
229	raise IndentationError(
230	"unindent does not match any outer indentation level")
231	indents = indents[:-1]
232	yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
233
234	else: # continued statement
235	if not line:
236	raise TokenError, ("EOF in multi-line statement", (lnum, 0))
237	continued = 0
238
239	while pos < max:
240	pseudomatch = pseudoprog.match(line, pos)
241	if pseudomatch: # scan for tokens
242	start, end = pseudomatch.span(1)
243	spos, epos, pos = (lnum, start), (lnum, end), end
244	token, initial = line[start:end], line[start]
245
246	if initial in numchars or \
247	(initial == '.' and token != '.'): # ordinary number
248	yield (NUMBER, token, spos, epos, line)
249	elif initial in '\r\n':
250	yield (parenlev > 0 and NL or NEWLINE,
251	token, spos, epos, line)
252	elif initial == '#':
253	yield (COMMENT, token, spos, epos, line)
254	elif token in triple_quoted:
255	endprog = endprogs[token]
256	endmatch = endprog.match(line, pos)
257	if endmatch: # all on one line
258	pos = endmatch.end(0)
259	token = line[start:pos]
260	yield (STRING, token, spos, (lnum, pos), line)
261	else:
262	strstart = (lnum, start) # multiple lines
263	contstr = line[start:]
264	contline = line
265	break
266	elif initial in single_quoted or \
267	token[:2] in single_quoted or \
268	token[:3] in single_quoted:
269	if token[-1] == '\n': # continued string
270	strstart = (lnum, start)
271	endprog = (endprogs[initial] or endprogs[token[1]] or
272	endprogs[token[2]])
273	contstr, needcont = line[start:], 1
274	contline = line
275	break
276	else: # ordinary string
277	yield (STRING, token, spos, epos, line)
278	elif initial in namechars: # ordinary name
279	yield (NAME, token, spos, epos, line)
280	elif initial == '\\': # continued stmt
281	continued = 1
282	else:
283	if initial in '([{': parenlev = parenlev + 1
284	elif initial in ')]}': parenlev = parenlev - 1
285	yield (OP, token, spos, epos, line)
286	else:
287	yield (ERRORTOKEN, line[pos],
288	(lnum, pos), (lnum, pos+1), line)
289	pos = pos + 1
290
291	for indent in indents[1:]: # pop remaining indent levels
292	yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
293	yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
294
295	if __name__ == '__main__': # testing
296	import sys
297	if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
298	else: tokenize(sys.stdin.readline)