Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | |
2 | """ | |
3 | csv.py - read/write/investigate CSV files | |
4 | """ | |
5 | ||
6 | import re | |
7 | from _csv import Error, __version__, writer, reader, register_dialect, \ | |
8 | unregister_dialect, get_dialect, list_dialects, \ | |
9 | QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ | |
10 | __doc__ | |
11 | ||
12 | try: | |
13 | from cStringIO import StringIO | |
14 | except ImportError: | |
15 | from StringIO import StringIO | |
16 | ||
17 | __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", | |
18 | "Error", "Dialect", "excel", "excel_tab", "reader", "writer", | |
19 | "register_dialect", "get_dialect", "list_dialects", "Sniffer", | |
20 | "unregister_dialect", "__version__", "DictReader", "DictWriter" ] | |
21 | ||
class Dialect:
    """Describe the properties of a CSV dialect.

    This class must be subclassed (see csv.excel for an example); a
    direct instantiation fails validation.  Subclasses supply values
    for: delimiter, quotechar, escapechar, doublequote,
    skipinitialspace, lineterminator and quoting.

    Raises Error from __init__ when the attribute values do not
    validate.
    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True
        errors = self._validate()
        if errors:
            # call form works under both Python 2 and Python 3,
            # unlike the old `raise Error, msg` statement
            raise Error("Dialect did not validate: %s" % ", ".join(errors))

    def _validate(self):
        """Return a list of error strings; empty when the dialect is valid."""
        errors = []
        if not self._valid:
            errors.append("can't directly instantiate Dialect class")

        if self.delimiter is None:
            errors.append("delimiter character not set")
        elif (not isinstance(self.delimiter, str) or
              len(self.delimiter) != 1):
            # was `> 1`, which wrongly accepted the empty string even
            # though the message demands a one-character string
            errors.append("delimiter must be one-character string")

        if self.quotechar is None:
            # quotechar may legitimately be unset when quoting is disabled
            if self.quoting != QUOTE_NONE:
                errors.append("quotechar not set")
        elif (not isinstance(self.quotechar, str) or
              len(self.quotechar) != 1):
            # was `> 1`; same empty-string hole as delimiter
            errors.append("quotechar must be one-character string")

        if self.lineterminator is None:
            errors.append("lineterminator not set")
        elif not isinstance(self.lineterminator, str):
            errors.append("lineterminator must be a string")

        if self.doublequote not in (True, False):
            errors.append("doublequote parameter must be True or False")

        if self.skipinitialspace not in (True, False):
            errors.append("skipinitialspace parameter must be True or False")

        if self.quoting is None:
            errors.append("quoting parameter not set")

        if self.quoting is QUOTE_NONE:
            # with quoting disabled an escape character is mandatory
            if (not isinstance(self.escapechar, (unicode, str)) or
                len(self.escapechar) != 1):
                errors.append("escapechar must be a one-character string or unicode object")

        return errors
79 | ||
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)
88 | ||
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
92 | ||
93 | ||
class DictReader:
    """Read CSV rows as dictionaries keyed by field names.

    If *fieldnames* is omitted, the first row read supplies the keys.
    Surplus values in over-long rows are collected in a list stored
    under *restkey*; columns missing from short rows are filled with
    *restval*.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)

    def __iter__(self):
        return self

    def next(self):
        """Return the next non-blank row as a dict (Py2 iterator protocol)."""
        fields = self.reader.next()
        if self.fieldnames is None:
            # first row becomes the header; fetch a data row after it
            self.fieldnames = fields
            fields = self.reader.next()

        # Unlike the basic reader, we skip blank rows entirely: a blank
        # row would otherwise produce a dict full of None values.
        while fields == []:
            fields = self.reader.next()

        result = dict(zip(self.fieldnames, fields))
        n_keys = len(self.fieldnames)
        n_vals = len(fields)
        if n_keys < n_vals:
            # long row: stash the overflow under restkey
            result[self.restkey] = fields[n_keys:]
        elif n_keys > n_vals:
            # short row: pad the missing columns with restval
            for missing in self.fieldnames[n_vals:]:
                result[missing] = self.restval
        return result
125 | ||
126 | ||
class DictWriter:
    """Write dictionaries out as CSV rows in a fixed field order.

    *fieldnames* gives the column order.  Keys missing from a row dict
    are written as *restval*.  *extrasaction* controls what happens
    when a dict carries keys not listed in *fieldnames*: "raise"
    (the default) raises ValueError, "ignore" silently drops them.
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # call form works under both Python 2 and Python 3,
            # unlike the old `raise ValueError, msg` statement
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        """Map *rowdict* onto the declared field order, padding with restval."""
        if self.extrasaction == "raise":
            for k in rowdict.keys():
                if k not in self.fieldnames:
                    raise ValueError("dict contains fields not in fieldnames")
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        """Write a single dict as one CSV row."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write a sequence of dicts as CSV rows."""
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)
154 | ||
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # fall back to float so has_header()'s numeric type probing still works
    complex = float
160 | ||
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample.

        If *delimiters* is given, only those characters may be
        proposed as the delimiter.
        """

        quotechar, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if delimiter is None:
            # no quoting found; fall back to character-frequency analysis
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        # The four patterns cover a quoted field: delimited on both
        # sides, at the start of a line, at the end of a line, and a
        # lone quoted field on its own line.
        # BUGFIX: the third pattern used to read (?P<delim>>[^\w\n"\'])
        # -- the stray ">" made the delim group two characters long and
        # required a literal ">" in the data, so line-final quoted
        # fields (e.g. `,"x"\n`) were never recognized.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # pattern without a delim group (lone quoted field)
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # the most frequently seen quote character wins
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            # if every delimiter occurrence was followed by a space,
            # assume skipinitialspace
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of freqencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.strip().count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    # character never appears; not a candidate
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            # relax the consistency requirement until a candidate appears
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # finally, just return the first damn character in the list
        delim = delims.keys()[0]
        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0