Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | |
2 | """ | |
3 | csv.py - read/write/investigate CSV files | |
4 | """ | |
5 | ||
6 | import re | |
7 | from _csv import Error, __version__, writer, reader, register_dialect, \ | |
8 | unregister_dialect, get_dialect, list_dialects, \ | |
9 | QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ | |
10 | __doc__ | |
11 | ||
12 | try: | |
13 | from cStringIO import StringIO | |
14 | except ImportError: | |
15 | from StringIO import StringIO | |
16 | ||
17 | __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", | |
18 | "Error", "Dialect", "excel", "excel_tab", "reader", "writer", | |
19 | "register_dialect", "get_dialect", "list_dialects", "Sniffer", | |
20 | "unregister_dialect", "__version__", "DictReader", "DictWriter" ] | |
21 | ||
class Dialect:
    """Describe the properties of a CSV dialect.

    This class must be subclassed (see csv.excel for an example); a
    direct instantiation fails validation.  Subclasses supply values
    for: delimiter, quotechar, escapechar, doublequote,
    skipinitialspace, lineterminator and quoting.

    Raises Error from __init__ when the attribute values do not
    validate.
    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        if self.__class__ != Dialect:
            self._valid = True
        errors = self._validate()
        if errors:
            # call form works under both Python 2 and Python 3,
            # unlike the old `raise Error, msg` statement
            raise Error("Dialect did not validate: %s" % ", ".join(errors))

    def _validate(self):
        """Return a list of error strings; empty when the dialect is valid."""
        errors = []
        if not self._valid:
            errors.append("can't directly instantiate Dialect class")

        if self.delimiter is None:
            errors.append("delimiter character not set")
        elif (not isinstance(self.delimiter, str) or
              len(self.delimiter) != 1):
            # was `> 1`, which wrongly accepted the empty string even
            # though the message demands a one-character string
            errors.append("delimiter must be one-character string")

        if self.quotechar is None:
            # quotechar may legitimately be unset when quoting is disabled
            if self.quoting != QUOTE_NONE:
                errors.append("quotechar not set")
        elif (not isinstance(self.quotechar, str) or
              len(self.quotechar) != 1):
            # was `> 1`; same empty-string hole as delimiter
            errors.append("quotechar must be one-character string")

        if self.lineterminator is None:
            errors.append("lineterminator not set")
        elif not isinstance(self.lineterminator, str):
            errors.append("lineterminator must be a string")

        if self.doublequote not in (True, False):
            errors.append("doublequote parameter must be True or False")

        if self.skipinitialspace not in (True, False):
            errors.append("skipinitialspace parameter must be True or False")

        if self.quoting is None:
            errors.append("quoting parameter not set")

        if self.quoting is QUOTE_NONE:
            # with quoting disabled an escape character is mandatory
            if (not isinstance(self.escapechar, (unicode, str)) or
                len(self.escapechar) != 1):
                errors.append("escapechar must be a one-character string or unicode object")

        return errors
79 | ||
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
register_dialect("excel", excel)
88 | ||
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
92 | ||
93 | ||
class DictReader:
    """Read CSV rows as dictionaries keyed by field names.

    If *fieldnames* is omitted, the first row read supplies the keys.
    Surplus values in over-long rows are collected in a list stored
    under *restkey*; columns missing from short rows are filled with
    *restval*.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)

    def __iter__(self):
        return self

    def next(self):
        """Return the next non-blank row as a dict (Py2 iterator protocol)."""
        fields = self.reader.next()
        if self.fieldnames is None:
            # first row becomes the header; fetch a data row after it
            self.fieldnames = fields
            fields = self.reader.next()

        # Unlike the basic reader, we skip blank rows entirely: a blank
        # row would otherwise produce a dict full of None values.
        while fields == []:
            fields = self.reader.next()

        result = dict(zip(self.fieldnames, fields))
        n_keys = len(self.fieldnames)
        n_vals = len(fields)
        if n_keys < n_vals:
            # long row: stash the overflow under restkey
            result[self.restkey] = fields[n_keys:]
        elif n_keys > n_vals:
            # short row: pad the missing columns with restval
            for missing in self.fieldnames[n_vals:]:
                result[missing] = self.restval
        return result
125 | ||
126 | ||
class DictWriter:
    """Write dictionaries out as CSV rows in a fixed field order.

    *fieldnames* gives the column order.  Keys missing from a row dict
    are written as *restval*.  *extrasaction* controls what happens
    when a dict carries keys not listed in *fieldnames*: "raise"
    (the default) raises ValueError, "ignore" silently drops them.
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # call form works under both Python 2 and Python 3,
            # unlike the old `raise ValueError, msg` statement
            raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                             % extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        """Map *rowdict* onto the declared field order, padding with restval."""
        if self.extrasaction == "raise":
            for k in rowdict.keys():
                if k not in self.fieldnames:
                    raise ValueError("dict contains fields not in fieldnames")
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        """Write a single dict as one CSV row."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write a sequence of dicts as CSV rows."""
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)
154 | ||
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # fall back to float so has_header()'s numeric type probing still works
    complex = float
160 | ||
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample.

        If *delimiters* is given, only those characters may be
        proposed as the delimiter.
        """

        quotechar, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if delimiter is None:
            # no quoting found; fall back to character-frequency analysis
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        matches = []
        # The four patterns cover a quoted field: delimited on both
        # sides, at the start of a line, at the end of a line, and a
        # lone quoted field on its own line.
        # BUGFIX: the third pattern used to read (?P<delim>>[^\w\n"\'])
        # -- the stray ">" made the delim group two characters long and
        # required a literal ">" in the data, so line-final quoted
        # fields (e.g. `,"x"\n`) were never recognized.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # pattern without a delim group (lone quoted field)
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # the most frequently seen quote character wins
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            # if every delimiter occurrence was followed by a space,
            # assume skipinitialspace
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of freqencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.strip().count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    # character never appears; not a candidate
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            # relax the consistency requirement until a candidate appears
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # finally, just return the first damn character in the list
        delim = delims.keys()[0]
        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0