git.subgeniuskitty.com - OpenSPARC-T2-SAM/.git/blame_incremental - sam-t2/devtools/amd64/lib/python2.4/tokenize.py

... / ...

Commit	Line	Data
	1	"""Tokenization help for Python programs.
	2
	3	generate_tokens(readline) is a generator that breaks a stream of
	4	text into Python tokens. It accepts a readline-like method which is called
	5	repeatedly to get the next line of input (or "" for EOF). It generates
	6	5-tuples with these members:
	7
	8	the token type (see token.py)
	9	the token (a string)
	10	the starting (row, column) indices of the token (a 2-tuple of ints)
	11	the ending (row, column) indices of the token (a 2-tuple of ints)
	12	the original line (string)
	13
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments, NL tokens for newlines that do
not end a logical line, and gives type OP for all operators.
	17
	18	Older entry points
	19	tokenize_loop(readline, tokeneater)
	20	tokenize(readline, tokeneater=printtoken)
	21	are the same, except instead of generating tokens, tokeneater is a callback
	22	function to which the 5 fields described above are passed as 5 arguments,
	23	each time a new token is found."""
	24
	25	__author__ = 'Ka-Ping Yee <ping@lfw.org>'
	26	__credits__ = \
	27	'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
	28
	29	import string, re
	30	from token import *
	31
import token
# Public API: every non-underscore name found in the token module, plus the
# names this module defines on top of it.
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL"]
# Python 2 list comprehensions leave their loop variable bound, so remove it
# (and the module reference) to keep them out of this module's namespace.
del x
del token
	37
	38	COMMENT = N_TOKENS
	39	tok_name[COMMENT] = 'COMMENT'
	40	NL = N_TOKENS + 1
	41	tok_name[NL] = 'NL'
	42	N_TOKENS += 2
	43
	44	def group(*choices): return '(' + '\|'.join(choices) + ')'
	45	def any(choices): return group(choices) + '*'
	46	def maybe(choices): return group(choices) + '?'
	47
# Text that may precede a token.
# Whitespace: any run (possibly empty) of spaces, form feeds and tabs.
Whitespace = r'[ \f\t]*'
# Comment: '#' followed by everything up to, but not including, the line end.
Comment = r'#[^\r\n]*'
# Ignore: whitespace, then any number of backslash-newline continuations
# (each followed by more whitespace), then an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
# Name: a letter or underscore followed by word characters.
Name = r'[a-zA-Z_]\w*'

# Integer literals; every form accepts an optional l/L suffix.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
# Octnumber also matches a lone 0, since the digit run may be empty.
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Floating-point literals: either they contain a decimal point (exponent
# optional) or they are plain digits with a mandatory exponent.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: digits or a float literal followed by j/J.
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried in the order listed, so the more specific forms
# (imaginary, then float) are placed before plain integers.
Number = group(Imagnumber, Floatnumber, Intnumber)
	63
	64	# Tail end of ' string.
	65	Single = r"[^'\\](?:\\.[^'\\])*'"
	66	# Tail end of " string.
	67	Double = r'[^"\\](?:\\.[^"\\])*"'
	68	# Tail end of ''' string.
	69	Single3 = r"[^'\\](?:(?:\\.\|'(?!''))[^'\\])*'''"
	70	# Tail end of """ string.
	71	Double3 = r'[^"\\](?:(?:\\.\|"(?!""))[^"\\])*"""'
	72	Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
	73	# Single-line ' or " string.
	74	String = group(r"[uU]?[rR]?'[^\n'\\](?:\\.[^\n'\\])*'",
	75	r'[uU]?[rR]?"[^\n"\\](?:\\.[^\n"\\])*"')
	76
	77	# Because of leftmost-then-longest match semantics, be sure to put the
	78	# longest operators first (e.g., if = came before ==, == would get
	79	# recognized as two instances of =).
	80	Operator = group(r"\\=?", r">>=?", r"<<=?", r"<>", r"!=",
	81	r"//=?",
	82	r"[+\-*/%&\|^=<>]=?",
	83	r"~")
	84
	85	Bracket = '[][(){}]'
	86	Special = group(r'\r?\n', r'[:;.,`@]')
	87	Funny = group(Operator, Bracket, Special)
	88
	89	PlainToken = group(Number, Funny, String, Name)
	90	Token = Ignore + PlainToken
	91
	92	# First (or only) line of ' or " string.
	93	ContStr = group(r"[uU]?[rR]?'[^\n'\\](?:\\.[^\n'\\])*" +
	94	group("'", r'\\\r?\n'),
	95	r'[uU]?[rR]?"[^\n"\\](?:\\.[^\n"\\])*' +
	96	group('"', r'\\\r?\n'))
	97	PseudoExtras = group(r'\\\r?\n', Comment, Triple)
	98	PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
	99
	100	tokenprog, pseudoprog, single3prog, double3prog = map(
	101	re.compile, (Token, PseudoToken, Single3, Double3))
	102	endprogs = {"'": re.compile(Single), '"': re.compile(Double),
	103	"'''": single3prog, '"""': double3prog,
	104	"r'''": single3prog, 'r"""': double3prog,
	105	"u'''": single3prog, 'u"""': double3prog,
	106	"ur'''": single3prog, 'ur"""': double3prog,
	107	"R'''": single3prog, 'R"""': double3prog,
	108	"U'''": single3prog, 'U"""': double3prog,
	109	"uR'''": single3prog, 'uR"""': double3prog,
	110	"Ur'''": single3prog, 'Ur"""': double3prog,
	111	"UR'''": single3prog, 'UR"""': double3prog,
	112	'r': None, 'R': None, 'u': None, 'U': None}
	113
	114	triple_quoted = {}
	115	for t in ("'''", '"""',
	116	"r'''", 'r"""', "R'''", 'R"""',
	117	"u'''", 'u"""', "U'''", 'U"""',
	118	"ur'''", 'ur"""', "Ur'''", 'Ur"""',
	119	"uR'''", 'uR"""', "UR'''", 'UR"""'):
	120	triple_quoted[t] = t
	121	single_quoted = {}
	122	for t in ("'", '"',
	123	"r'", 'r"', "R'", 'R"',
	124	"u'", 'u"', "U'", 'U"',
	125	"ur'", 'ur"', "Ur'", 'Ur"',
	126	"uR'", 'uR"', "UR'", 'UR"' ):
	127	single_quoted[t] = t
	128
	129	tabsize = 8
	130
	131	class TokenError(Exception): pass
	132
	133	class StopTokenizing(Exception): pass
	134
	135	def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
	136	print "%d,%d-%d,%d:\t%s\t%s" % \
	137	(srow, scol, erow, ecol, tok_name[type], repr(token))
	138
	139	def tokenize(readline, tokeneater=printtoken):
	140	"""
	141	The tokenize() function accepts two parameters: one representing the
	142	input stream, and one providing an output mechanism for tokenize().
	143
	144	The first parameter, readline, must be a callable object which provides
	145	the same interface as the readline() method of built-in file objects.
	146	Each call to the function should return one line of input as a string.
	147
	148	The second parameter, tokeneater, must also be a callable object. It is
	149	called once for each token, with five arguments, corresponding to the
	150	tuples generated by generate_tokens().
	151	"""
	152	try:
	153	tokenize_loop(readline, tokeneater)
	154	except StopTokenizing:
	155	pass
	156
	157	# backwards compatible interface
	158	def tokenize_loop(readline, tokeneater):
	159	for token_info in generate_tokens(readline):
	160	tokeneater(*token_info)
	161
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.

    Raises TokenError if the input ends inside a multi-line string or a
    multi-line statement, and IndentationError if a line dedents to a
    column that matches no enclosing indentation level.
    """
    # lnum: number of the line being processed (incremented before use).
    # parenlev: current nesting depth of (), [] and {}.
    # continued: set when a backslash continuation token was just seen.
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    # contstr: text collected so far for a string that spans lines.
    # needcont: set when that string must be continued with backslash-newline.
    contstr, needcont = '', 0
    # contline: the source lines collected so far for that open string.
    contline = None
    # Stack of indentation columns currently in effect.
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        # strstart and endprog used below are assigned by the token loop at
        # the point where it makes contstr non-empty.
        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                # The string ends on this line; keep scanning after it.
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                           strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # The string required a backslash continuation but this line
                # neither closes it nor ends with one: report it as an error.
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                # Still inside the string: accumulate and read the next line.
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                # A tab advances to the next multiple of tabsize.
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                # A form feed resets the column count.
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            # The line held nothing but whitespace and no line end: stop
            # reading input.
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                # COMMENT when the line starts with '#', NL otherwise; the
                # token text is the rest of the line.
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level")
                # Pop one level and emit one DEDENT per level closed.
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                # Group 1 is the token without its leading whitespace.
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # A line end inside brackets is NL, otherwise NEWLINE.
                    yield (parenlev > 0 and NL or NEWLINE,
                               token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    # Only the opening quotes were matched; look for the
                    # closing ones on the rest of this line.
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Prefix letters map to None in endprogs, so this
                        # picks the entry for the first quote character.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    # Everything else is an operator; track bracket depth.
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # Nothing matched here: emit the single character as an
                # error token and move past it.
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
	294
	295	if __name__ == '__main__': # testing
	296	import sys
	297	if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
	298	else: tokenize(sys.stdin.readline)