csv.py - read/write/investigate CSV files
from _csv
import Error
, __version__
, writer
, reader
, register_dialect
, \
unregister_dialect
, get_dialect
, list_dialects
, \
QUOTE_MINIMAL
, QUOTE_ALL
, QUOTE_NONNUMERIC
, QUOTE_NONE
, \
from cStringIO
import StringIO
from StringIO
import StringIO
# Public names exported by ``from csv import *``.
__all__ = [
    "QUOTE_MINIMAL",
    "QUOTE_ALL",
    "QUOTE_NONNUMERIC",
    "QUOTE_NONE",
    "Error",
    "Dialect",
    "excel",
    "excel_tab",
    "reader",
    "writer",
    "register_dialect",
    "get_dialect",
    "list_dialects",
    "Sniffer",
    "unregister_dialect",
    "__version__",
    "DictReader",
    "DictWriter",
]
if self
.__class
__ != Dialect
:
errors
= self
._validate
()
raise Error
, "Dialect did not validate: %s" % ", ".join(errors
)
errors
.append("can't directly instantiate Dialect class")
if self
.delimiter
is None:
errors
.append("delimiter character not set")
elif (not isinstance(self
.delimiter
, str) or
len(self
.delimiter
) > 1):
errors
.append("delimiter must be one-character string")
if self
.quotechar
is None:
if self
.quoting
!= QUOTE_NONE
:
errors
.append("quotechar not set")
elif (not isinstance(self
.quotechar
, str) or
len(self
.quotechar
) > 1):
errors
.append("quotechar must be one-character string")
if self
.lineterminator
is None:
errors
.append("lineterminator not set")
elif not isinstance(self
.lineterminator
, str):
errors
.append("lineterminator must be a string")
if self
.doublequote
not in (True, False):
errors
.append("doublequote parameter must be True or False")
if self
.skipinitialspace
not in (True, False):
errors
.append("skipinitialspace parameter must be True or False")
errors
.append("quoting parameter not set")
if self
.quoting
is QUOTE_NONE
:
if (not isinstance(self
.escapechar
, (unicode, str)) or
len(self
.escapechar
) > 1):
errors
.append("escapechar must be a one-character string or unicode object")
# Register the two dialect classes under the names callers pass as
# ``dialect=...``.
register_dialect("excel", excel)
register_dialect("excel-tab", excel_tab)
def __init__(self, f, fieldnames=None, restkey=None, restval=None,
             dialect="excel", *args, **kwds):
    """Store the dict-building options and wrap *f* in a reader.

    fieldnames -- list of keys for the dict
    restkey    -- key to catch long rows
    restval    -- default value for short rows

    *dialect*, *args* and *kwds* are handed unchanged to reader().
    """
    self.fieldnames, self.restkey, self.restval = fieldnames, restkey, restval
    self.reader = reader(f, dialect, *args, **kwds)
if self
.fieldnames
is None:
# unlike the basic reader, we prefer not to return blanks,
# because we will typically wind up with a dict full of None
d
= dict(zip(self
.fieldnames
, row
))
lf
= len(self
.fieldnames
)
d
[self
.restkey
] = row
[lf
:]
for key
in self
.fieldnames
[lr
:]:
def __init__(self, f, fieldnames, restval="", extrasaction="raise",
             dialect="excel", *args, **kwds):
    """Store the dict-writing options and wrap *f* in a writer.

    fieldnames   -- list of keys for the dict
    restval      -- value written for keys missing from a short dict
    extrasaction -- "raise" or "ignore" (any letter case); selects what
                    _dict_to_list() does with keys not in fieldnames

    *dialect*, *args* and *kwds* are handed unchanged to writer().

    Raises ValueError if *extrasaction* is neither "raise" nor "ignore".
    """
    self.fieldnames = fieldnames    # list of keys for the dict
    self.restval = restval          # for writing short dicts
    # The check is case-insensitive, so keep the normalised spelling:
    # _dict_to_list() compares against the lower-case literal "raise" and
    # would otherwise silently treat e.g. "RAISE" as "ignore".
    action = extrasaction.lower()
    if action not in ("raise", "ignore"):
        raise ValueError("extrasaction (%s) must be 'raise' or 'ignore'"
                         % extrasaction)
    self.extrasaction = action
    self.writer = writer(f, dialect, *args, **kwds)
def _dict_to_list(self, rowdict):
    """Return the values of *rowdict* ordered like self.fieldnames.

    Keys listed in self.fieldnames but absent from *rowdict* are filled
    with self.restval.

    Raises ValueError when self.extrasaction is "raise" and *rowdict*
    holds a key that is not in self.fieldnames.
    """
    if self.extrasaction == "raise":
        for k in rowdict:
            if k not in self.fieldnames:
                # Call-form raise is valid on both Python 2 and Python 3.
                raise ValueError("dict contains fields not in fieldnames")
    return [rowdict.get(key, self.restval) for key in self.fieldnames]
def writerow(self, rowdict):
    """Write one dict as a row.

    The dict is flattened with self._dict_to_list() and the resulting
    list is passed to self.writer.writerow(); that call's return value
    is returned unchanged.
    """
    values = self._dict_to_list(rowdict)
    return self.writer.writerow(values)
def writerows(self, rowdicts):
    """Write every dict in *rowdicts* as a row.

    Each dict is flattened with self._dict_to_list(). All rows are
    converted before anything is written, so a conversion error is raised
    before self.writer.writerows() is called. Returns whatever
    self.writer.writerows() returns.
    """
    rows = [self._dict_to_list(rowdict) for rowdict in rowdicts]
    return self.writer.writerows(rows)
# Guard Sniffer's type checking against builds that exclude complex()
"Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
Returns a Dialect object.
# in case there is more than one possible delimiter
self
.preferred
= [',', '\t', ';', ' ', ':']
# sniff(sample, delimiters=None): work out the CSV format of `sample`.
#
# The visible code asks _guess_quote_and_delimiter() for
# (quotechar, delimiter, skipinitialspace), also calls _guess_delimiter()
# for (delimiter, skipinitialspace), and copies the results onto a
# `dialect` object.
#
# NOTE(review): several statements of this method are not visible in this
# view (whatever guards the second guess, the definition of `dialect`, and
# the return statement) -- confirm against the full file before relying on
# this summary.
def sniff(self
, sample
, delimiters
=None):
Returns a dialect (or None) corresponding to the sample
# First attempt: infer the quote character and delimiter together.
quotechar
, delimiter
, skipinitialspace
= \
self
._guess
_quote
_and
_delimiter
(sample
, delimiters
)
# Second attempt: delimiter only, from character frequencies.
# NOTE(review): presumably only reached when the first attempt found no
# delimiter -- the condition is not visible here.
delimiter
, skipinitialspace
= self
._guess
_delimiter
(sample
,
dialect
.delimiter
= delimiter
# _csv.reader won't accept a quotechar of ''
# ... so an empty/false guess falls back to the double quote.
dialect
.quotechar
= quotechar
or '"'
dialect
.skipinitialspace
= skipinitialspace
# _guess_quote_and_delimiter(data, delimiters): regex-based guess.
#
# Tries a fixed series of patterns against `data`, counts how often each
# captured quote character and delimiter occurs, and returns a
# (quotechar, delimiter, skipinitialspace) tuple; ('', None, 0) is the
# value returned on the early-exit path visible below.
# `delimiters`, when not None, restricts which captured delimiters are
# counted.
#
# NOTE(review): several statements of this method are not visible in this
# view (initialisation of `quotes`/`delims`/`spaces`, the loop over
# `matches`, the assignments to `key`, the guards around the early return
# and the final fallbacks) -- confirm against the full file.
def _guess_quote_and_delimiter(self
, data
, delimiters
):
Looks for text enclosed between two identical quotes
(the probable quotechar) which are preceded and followed
by the same character (the probable delimiter).
The quote with the most wins, same with the delimiter.
If there is no quotechar the delimiter can't be determined
# Patterns are tried in the order listed; the trailing comment on each
# line sketches the text shape that pattern targets.
for restr
in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?",
# NOTE(review): the next pattern opens with "(?P<delim>>" -- the doubled
# '>' makes the delim group require a literal '>' before the delimiter
# character, unlike the two patterns above. Looks like a typo for
# "(?P<delim>[^\w\n"\'])"; confirm and fix in code.
'(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?"
'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space)
regexp
= re
.compile(restr
, re
.DOTALL | re
.MULTILINE
)
matches
= regexp
.findall(data
)
return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
# groupindex is 1-based while the tuples produced by findall() are
# 0-based, hence the "- 1" on each lookup below.
# NOTE(review): the last pattern defines no 'delim' or 'space' group, so
# those lookups can raise KeyError for it; the handling is not visible here.
n
= regexp
.groupindex
['quote'] - 1
quotes
[key
] = quotes
.get(key
, 0) + 1
n
= regexp
.groupindex
['delim'] - 1
if key
and (delimiters
is None or key
in delimiters
):
delims
[key
] = delims
.get(key
, 0) + 1
n
= regexp
.groupindex
['space'] - 1
# Pick the most frequent candidate. The "cond and a or b" form keeps `a`
# only when its count is strictly greater than b's.
quotechar
= reduce(lambda a
, b
, quotes
= quotes
:
(quotes
[a
] > quotes
[b
]) and a
or b
, quotes
.keys())
delim
= reduce(lambda a
, b
, delims
= delims
:
(delims
[a
] > delims
[b
]) and a
or b
, delims
.keys())
# True when the winning delimiter's count equals the `spaces` counter.
skipinitialspace
= delims
[delim
] == spaces
if delim
== '\n': # most likely a file with a single column
# there is *no* delimiter, it's a single column of quoted data
return (quotechar
, delim
, skipinitialspace
)
# _guess_delimiter(data, delimiters): frequency-based delimiter guess.
#
# Splits `data` into non-empty lines, tabulates per-character counts per
# line for a chunk of lines, derives each character's most common count
# (its "mode") and returns a (delimiter, skipinitialspace) tuple.
# `delimiters`, when not None, restricts the candidates.
#
# NOTE(review): many statements of this method are not visible in this
# view (initialisation of `charFrequency`/`modes`/`delims`, `iteration`,
# `consistency`, `threshold`, several loop headers and the arguments of
# the reduce() calls) -- confirm against the full file.
# NOTE(review): the code relies on filter() returning a list (len(data),
# data[start:end], data[0]) and on reduce() being a builtin, i.e. it is
# written for Python 2.
def _guess_delimiter(self
, data
, delimiters
):
The delimiter /should/ occur the same number of times on
each row. However, due to malformed data, it may not. We don't want
an all or nothing approach, so we allow for small variations in this
1) build a table of the frequency of each character on every line.
2) build a table of freqencies of this frequency (meta-frequency?),
e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows,
3) use the mode of the meta-frequency to determine the /expected/
frequency for that character
4) find out how often the character actually meets that goal
5) the character that best meets its goal is the delimiter
For performance reasons, the data is evaluated in chunks, so it can
try and evaluate the smallest portion of the data possible, evaluating
additional chunks as necessary.
# Drop empty lines: filter(None, ...) keeps only truthy strings.
data
= filter(None, data
.split('\n'))
ascii
= [chr(c
) for c
in range(127)] # 7-bit ASCII
# Work on at most 10 lines at a time.
chunkLength
= min(10, len(data
))
start
, end
= 0, min(chunkLength
, len(data
))
for line
in data
[start
:end
]:
# charFrequency maps char -> {count on a line: number of lines with
# that count}.
metaFrequency
= charFrequency
.get(char
, {})
# must count even if frequency is 0
freq
= line
.strip().count(char
)
metaFrequency
[freq
] = metaFrequency
.get(freq
, 0) + 1
charFrequency
[char
] = metaFrequency
for char
in charFrequency
.keys():
items
= charFrequency
[char
].items()
# A single (0, n) entry means the character never occurred.
if len(items
) == 1 and items
[0][0] == 0:
# get the mode of the frequencies
# (the (count, lines) pair with the larger `lines` value wins)
modes
[char
] = reduce(lambda a
, b
: a
[1] > b
[1] and a
or b
,
# adjust the mode - subtract the sum of all
items
.remove(modes
[char
])
modes
[char
] = (modes
[char
][0], modes
[char
][1]
- reduce(lambda a
, b
: (0, a
[1] + b
[1]),
# build a list of possible delimiters
total
= float(chunkLength
* iteration
)
# (rows of consistent data) / (number of rows) = 100%
# minimum consistency threshold
while len(delims
) == 0 and consistency
>= threshold
:
# v is a (frequency, adjusted line count) pair; both must be positive.
if v
[0] > 0 and v
[1] > 0:
if ((v
[1]/total
) >= consistency
and
(delimiters
is None or k
in delimiters
)):
# skipinitialspace is True when, on the first line, every occurrence of
# the delimiter is followed by a space.
skipinitialspace
= (data
[0].count(delim
) ==
data
[0].count("%c " % delim
))
return (delim
, skipinitialspace
)
# analyze another chunkLength lines
# if there's more than one, fall back to a 'preferred' list
skipinitialspace
= (data
[0].count(d
) ==
data
[0].count("%c " % d
))
return (d
, skipinitialspace
)
# finally, just return the first damn character in the list
skipinitialspace
= (data
[0].count(delim
) ==
data
[0].count("%c " % delim
))
return (delim
, skipinitialspace
)
def has_header(self
, sample
):
# Creates a dictionary of types of data in each column. If any
# column is of a single type (say, integers), *except* for the first
# row, then the first row is presumed to be labels. If the type
# can't be determined, it is assumed to be a string in which case
# the length of the string is the determining factor: if all of the
# rows except for the first are the same length, it's a header.
# Finally, a 'vote' is taken at the end for each column, adding or
# subtracting from the likelihood of the first row being a header.
rdr
= reader(StringIO(sample
), self
.sniff(sample
))
header
= rdr
.next() # assume first row is header
for i
in range(columns
): columnTypes
[i
] = None
# arbitrary number of rows to check, to keep it sane
continue # skip rows that have irregular number of columns
for col
in columnTypes
.keys():
for thisType
in [int, long, float, complex]:
except (ValueError, OverflowError):
# fallback to length of string
if thisType
!= columnTypes
[col
]:
if columnTypes
[col
] is None: # add new column type
columnTypes
[col
] = thisType
# type is inconsistent, remove column from
# finally, compare results against first row and "vote"
# on whether it's a header
for col
, colType
in columnTypes
.items():
if type(colType
) == type(0): # it's a length
if len(header
[col
]) != colType
:
except (ValueError, TypeError):