"""Functions that read and write gzipped files.
The user of the file doesn't have to worry about the compression,
but random access is not allowed."""
# based on Andrew Kuchling's minigzip.py distributed with the zlib module
# Public names exported by "from <this module> import *": the GzipFile class
# and the open() convenience function.
__all__
= ["GzipFile","open"]
# Bit values for the flag byte of a gzip header.  Each name is one bit, so
# the flags can be combined with "|" and tested with "&".
FTEXT = 1 << 0
FHCRC = 1 << 1
FEXTRA = 1 << 2
FNAME = 1 << 3
FCOMMENT = 1 << 4
"""Return i as an unsigned integer, assuming it fits in 32 bits.
If it's >= 2GB when viewed as a 32-bit unsigned int, return a long.
"""Return the low-order 32 bits of an int, as a non-negative int."""
def write32(output, value):
    """Write the low-order 32 bits of value to output, little-endian.

    output -- any object with a write() method that accepts the packed
              four-byte string.
    value  -- an integer.  It may be negative (a signed 32-bit quantity
              such as a signed CRC) or an unsigned value up to 2**32 - 1.

    The value is reduced modulo 2**32 and packed with the unsigned "<L"
    format.  For every value in the signed 32-bit range this yields exactly
    the bytes the signed "<l" format would produce, but values >= 2**31
    (for example an unsigned CRC-32) no longer fall outside the range that
    struct accepts.
    """
    output.write(struct.pack("<L", value & 0xFFFFFFFF))
def write32u(output, value):
    """Write value to output as an unsigned 32-bit little-endian integer.

    output -- any object with a write() method that accepts the packed
              four-byte string.
    value  -- an integer; only its low-order 32 bits are written.
    """
    # Masking to 32 bits means the same bit pattern is written whether the
    # caller supplies a signed or an unsigned value; struct's "<L" format
    # on its own rejects negative numbers.
    output.write(struct.pack("<L", value & 0xFFFFFFFF))
return struct
.unpack("<l", input.read(4))[0]
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip-compressed file and return the resulting GzipFile.

    Convenience wrapper equivalent to GzipFile(filename, mode,
    compresslevel).  Only filename must be supplied: mode falls back to
    'rb' and compresslevel falls back to 9.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
"""The GzipFile class simulates most of the methods of a file object with
the exception of the readinto() and truncate() methods.
max_read_chunk
= 10 * 1024 * 1024
def __init__(self
, filename
=None, mode
=None,
compresslevel
=9, fileobj
=None):
"""Constructor for the GzipFile class.
At least one of fileobj and filename must be given a non-trivial value.
The new class instance is based on fileobj, which can be a regular
file, a StringIO object, or any other object which simulates a file.
It defaults to None, in which case filename is opened to provide a file object.
When fileobj is not None, the filename argument is only used to be
included in the gzip file header, which may include the original
filename of the uncompressed file. It defaults to the filename of
fileobj, if discernible; otherwise, it defaults to the empty string,
and in this case the original filename is not included in the header.
The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
depending on whether the file will be read or written. The default
is the mode of fileobj if discernible; otherwise, the default is 'rb'.
Be aware that only the 'rb', 'ab', and 'wb' values should be used
for cross-platform portability.
The compresslevel argument is an integer from 1 to 9 controlling the
level of compression; 1 is fastest and produces the least compression,
and 9 is slowest and produces the most compression. The default is 9.
# guarantee the file is opened in binary mode on platforms
# that care about that sort of thing
if mode
and 'b' not in mode
:
fileobj
= self
.myfileobj
= __builtin__
.open(filename
, mode
or 'rb')
if hasattr(fileobj
, 'name'): filename
= fileobj
.name
if hasattr(fileobj
, 'mode'): mode
= fileobj
.mode
# Set flag indicating start of a new member
elif mode
[0:1] == 'w' or mode
[0:1] == 'a':
self
._init
_write
(filename
)
self
.compress
= zlib
.compressobj(compresslevel
,
raise IOError, "Mode " + mode
+ " not supported"
self
._write
_gzip
_header
()
return '<gzip ' + s
[1:-1] + ' ' + hex(id(self
)) + '>'
# Write-mode initialisation helper; the constructor calls it with the file
# name when the mode starts with 'w' or 'a'.
# Visible behaviour: if the last three characters of `filename` are not
# '.gz', that suffix is appended to the local name, and self.crc is then
# reset to zlib.crc32("") -- the CRC-32 of the empty string.
# NOTE(review): the adjusted filename is never stored on self in the lines
# shown here, and the statements are split across physical lines.  This block
# looks truncated by extraction -- confirm against the complete file before
# relying on or rewriting it.
def _init_write(self
, filename
):
if filename
[-3:] != '.gz':
filename
= filename
+ '.gz'
self
.crc
= zlib
.crc32("")
def _write_gzip_header(self
):
self
.fileobj
.write('\037\213') # magic header
self
.fileobj
.write('\010') # compression method
fname
= self
.filename
[:-3]
self
.fileobj
.write(chr(flags
))
write32u(self
.fileobj
, long(time
.time()))
self
.fileobj
.write('\002')
self
.fileobj
.write('\377')
self
.fileobj
.write(fname
+ '\000')
self
.crc
= zlib
.crc32("")
def _read_gzip_header(self
):
magic
= self
.fileobj
.read(2)
raise IOError, 'Not a gzipped file'
method
= ord( self
.fileobj
.read(1) )
raise IOError, 'Unknown compression method'
flag
= ord( self
.fileobj
.read(1) )
# modtime = self.fileobj.read(4)
# extraflag = self.fileobj.read(1)
# os = self.fileobj.read(1)
# Read & discard the extra field, if present
xlen
= ord(self
.fileobj
.read(1))
xlen
= xlen
+ 256*ord(self
.fileobj
.read(1))
# Read and discard a null-terminated string containing the filename
# Read and discard a null-terminated string containing a comment
self
.fileobj
.read(2) # Read & discard the 16-bit header CRC
raise IOError(errno
.EBADF
, "write() on read-only GzipFile object")
raise ValueError, "write() on closed GzipFile object"
self
.size
= self
.size
+ len(data
)
self
.crc
= zlib
.crc32(data
, self
.crc
)
self
.fileobj
.write( self
.compress
.compress(data
) )
raise IOError(errno
.EBADF
, "read() on write-only GzipFile object")
if self
.extrasize
<= 0 and self
.fileobj
is None:
if size
< 0: # get the whole thing
readsize
= min(self
.max_read_chunk
, readsize
* 2)
else: # just get some more of it
while size
> self
.extrasize
:
readsize
= min(self
.max_read_chunk
, readsize
* 2)
if size
> self
.extrasize
:
chunk
= self
.extrabuf
[:size
]
self
.extrabuf
= self
.extrabuf
[size
:]
self
.extrasize
= self
.extrasize
- size
self
.extrabuf
= buf
+ self
.extrabuf
self
.extrasize
= len(buf
) + self
.extrasize
def _read(self
, size
=1024):
raise EOFError, "Reached EOF"
# If the _new_member flag is set, we have to
# jump to the next member, if there is one.
# First, check if we're at the end of the file;
# if so, it's time to stop; no more members to read.
pos
= self
.fileobj
.tell() # Save current position
self
.fileobj
.seek(0, 2) # Seek to end of file
if pos
== self
.fileobj
.tell():
raise EOFError, "Reached EOF"
self
.fileobj
.seek( pos
) # Return to original position
self
.decompress
= zlib
.decompressobj(-zlib
.MAX_WBITS
)
# Read a chunk of data from the file
buf
= self
.fileobj
.read(size
)
# If the EOF has been reached, flush the decompression object
# and mark this object as finished.
uncompress
= self
.decompress
.flush()
self
._add
_read
_data
( uncompress
)
raise EOFError, 'Reached EOF'
uncompress
= self
.decompress
.decompress(buf
)
self
._add
_read
_data
( uncompress
)
if self
.decompress
.unused_data
!= "":
# Ending case: we've come to the end of a member in the file,
# so seek back to the start of the unused data, finish up
# this member, and read a new gzip header.
# (The number of bytes to seek back is the length of the unused
# data, minus 8 because _read_eof() will rewind a further 8 bytes)
self
.fileobj
.seek( -len(self
.decompress
.unused_data
)+8, 1)
# Check the CRC and file size, and set the flag so we read
# a new member on the next call
def _add_read_data(self
, data
):
self
.crc
= zlib
.crc32(data
, self
.crc
)
self
.extrabuf
= self
.extrabuf
+ data
self
.extrasize
= self
.extrasize
+ len(data
)
self
.size
= self
.size
+ len(data
)
# We've read to the end of the file, so we have to rewind in order
# to reread the 8 bytes containing the CRC and the file size.
# We check that the computed CRC and size of the
# uncompressed data matches the stored values. Note that the size
# stored is the true file size mod 2**32.
crc32
= read32(self
.fileobj
)
isize
= U32(read32(self
.fileobj
)) # may exceed 2GB
if U32(crc32
) != U32(self
.crc
):
raise IOError, "CRC check failed"
elif isize
!= LOWU32(self
.size
):
raise IOError, "Incorrect length of data produced"
self
.fileobj
.write(self
.compress
.flush())
write32(self
.fileobj
, self
.crc
)
# self.size may exceed 2GB, or even 4GB
write32u(self
.fileobj
, LOWU32(self
.size
))
if (self
.myfileobj
is None and
"""Invoke the underlying file object's fileno() method.
This will raise AttributeError if the underlying file object
doesn't support fileno().
return self
.fileobj
.fileno()
'''Return the uncompressed stream file position indicator to the
raise IOError("Can't rewind in write mode")
raise IOError('Negative seek in write mode')
count
= offset
- self
.offset
for i
in range(count
// 1024):
self
.write((count
% 1024) * '\0')
# for negative seek, rewind and do positive seek
count
= offset
- self
.offset
for i
in range(count
// 1024):
def readline(self
, size
=-1):
if size
< 0: size
= sys
.maxint
readsize
= min(100, size
) # Read from the file in small chunks
return "".join(bufs
) # Return resulting line
# We set i=size to break out of the loop under two
# conditions: 1) there's no newline, and the chunk is
# larger than size, or 2) there is a newline, but the
# resulting line would be longer than 'size'.
if i
==-1 and len(c
) > size
: i
=size
-1
elif size
<= i
: i
= size
-1
bufs
.append(c
[:i
+1]) # Add portion of last chunk
self
._unread
(c
[i
+1:]) # Push back rest of chunk
return ''.join(bufs
) # Return resulting line
# Append chunk to list, decrease 'size',
readsize
= min(size
, readsize
* 2)
def readlines(self
, sizehint
=0):
# Negative numbers result in reading all the lines
sizehint
= sizehint
- len(line
)
# Act like gzip; with -d, act like gunzip.
# The input file is not deleted, however, nor are any other gzip
# options or features supported.
decompress
= args
and args
[0] == "-d"
f
= GzipFile(filename
="", mode
="rb", fileobj
=sys
.stdin
)
print "filename doesn't end in .gz:", repr(arg
)
g
= __builtin__
.open(arg
[:-3], "wb")
g
= GzipFile(filename
="", mode
="wb", fileobj
=sys
.stdout
)
f
= __builtin__
.open(arg
, "rb")
g
= open(arg
+ ".gz", "wb")
if __name__
== '__main__':