Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | """Functions that read and write gzipped files. |
2 | ||
3 | The user of the file doesn't have to worry about the compression, | |
4 | but random access is not allowed.""" | |
5 | ||
6 | # based on Andrew Kuchling's minigzip.py distributed with the zlib module | |
7 | ||
8 | import struct, sys, time | |
9 | import zlib | |
10 | import __builtin__ | |
11 | ||
# Names exported by "from gzip import *".
__all__ = ["GzipFile","open"]

# Bit masks tested against the flag byte of a gzip member header
# (see GzipFile._read_gzip_header and GzipFile._write_gzip_header).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode: the object was opened for reading or
# for writing/appending.
READ, WRITE = 1, 2
17 | ||
def U32(i):
    """Reinterpret a signed 32-bit quantity as its unsigned counterpart.

    Values that are already non-negative are handed back unchanged.  A
    negative value is shifted up by 2**32, so the result may be too large
    for a plain 32-bit int (i.e. >= 2GB) and come back as a long.
    """
    if i >= 0:
        return i
    return i + 2 ** 32
26 | ||
def LOWU32(i):
    """Keep only the 32 least significant bits of i.

    The result is always non-negative, whatever the sign or size of i.
    """
    low_32_bits = 2 ** 32 - 1
    return i & low_32_bits
30 | ||
def write32(output, value):
    """Write value to output as a 4-byte little-endian signed integer."""
    packed = struct.pack("<l", value)
    output.write(packed)
33 | ||
def write32u(output, value):
    """Write value to output as a 4-byte little-endian unsigned integer.

    value may be given either as an unsigned number (0 .. 2**32 - 1) or
    as the equivalent signed 32-bit number (-2**31 .. -1); both spellings
    produce the same four bytes.  Anything outside those ranges is left
    to struct.pack() to reject.
    """
    if -(2 ** 31) <= value < 0:
        # Do not rely on struct accepting a negative number for the
        # unsigned "L" code (range-checking implementations refuse it);
        # convert to the unsigned value with the same bit pattern.
        value += 2 ** 32
    output.write(struct.pack("<L", value))
38 | ||
def read32(input):
    """Read 4 bytes from input and decode them as a little-endian signed int."""
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
41 | ||
def open(filename, mode="rb", compresslevel=9):
    """Convenience wrapper: build a GzipFile for the named file.

    filename must be supplied.  mode falls back to 'rb' and compresslevel
    to 9 when they are omitted; all three are handed to GzipFile as-is.

    """
    return GzipFile(filename=filename, mode=mode,
                    compresslevel=compresslevel)
50 | ||
class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    An instance is either a reader or a writer, never both.  Seeking is
    emulated on the uncompressed stream: forward by reading (or by writing
    zero bytes), backward -- in read mode only -- by starting over.

    """

    # File object opened by __init__ itself (None when the caller supplied
    # fileobj); it is the only underlying file that close() closes.
    myfileobj = None
    # Upper bound for the growing chunk size that read() passes to _read().
    max_read_chunk = 10 * 1024 * 1024

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        Raises IOError if mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'):
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'):
                mode = fileobj.mode
            else:
                mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Decompressed data not yet handed out by read(), and its length
            self.extrabuf = ""
            self.extrasize = 0
            self.filename = filename

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip header and
            # trailer are produced by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Current position in the uncompressed stream, as reported by tell()
        self.offset = 0

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        """Return '<gzip ...>' built around the repr of the underlying file."""
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the per-member write state (name, CRC, size, buffers)."""
        # self.filename always ends in '.gz'; _write_gzip_header() strips
        # those three characters again to get the name stored in the header.
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the header of a new gzip member to the underlying file."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(time.time()))  # modification time
        self.fileobj.write('\002')                 # extra flags byte
        self.fileobj.write('\377')                 # OS byte
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_gzip_header(self):
        """Consume a gzip member header from the underlying file.

        Raises IOError if the magic number or the compression method is
        not the expected one.  Optional header fields are skipped.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        # Skip modtime (4 bytes), extraflag (1 byte) and os (1 byte)
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        """Compress data and write it to the underlying file.

        Raises IOError(EBADF) on an object opened for reading and
        ValueError on a closed object.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to size bytes of uncompressed data.

        With a negative size (the default) everything up to the end of
        the file is returned.  Fewer than size bytes, possibly the empty
        string, are returned once the end of the file is reached.
        Raises IOError(EBADF) on an object opened for writing.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back in front of the pending data and rewind offset."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read and decompress up to size compressed bytes into extrabuf.

        Raises EOFError when the underlying file has no more data (or the
        object is closed).  The underlying file must support tell() and
        seek(), which are used to detect the end of the file and to step
        back to the member trailer.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Append decompressed data to extrabuf and update CRC and size."""
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the trailer of a member.

        Raises IOError if either value disagrees with what was computed
        while decompressing.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = U32(read32(self.fileobj))   # may exceed 2GB
        if U32(crc32) != U32(self.crc):
            raise IOError("CRC check failed")
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def close(self):
        """Finish the gzip member (write mode) and release the files.

        A caller-supplied fileobj is left open; only a file opened by the
        constructor is closed.  Calling close() again is a no-op.
        """
        fileobj = self.fileobj
        try:
            if fileobj is not None and self.mode == WRITE:
                fileobj.write(self.compress.flush())
                # Both trailer fields are written unsigned, so the result
                # does not depend on whether zlib.crc32() reports the CRC
                # as a signed or an unsigned number.
                write32u(fileobj, LOWU32(self.crc))
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, LOWU32(self.size))
        finally:
            # Mark the object closed and release our own file even if
            # writing the trailer failed.
            self.fileobj = None
            myfileobj = self.myfileobj
            if myfileobj:
                self.myfileobj = None
                myfileobj.close()

    def __del__(self):
        """Close the object on destruction unless it is already closed."""
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            # __init__ did not get far enough to set fileobj
            return
        self.close()

    def flush(self):
        """Flush the underlying file object."""
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        """Always return False."""
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset, whence=0):
        """Move to the given position in the uncompressed stream.

        whence is 0 (the default) for an absolute offset or 1 for an
        offset relative to the current position; seeking relative to the
        end is not supported and raises ValueError.

        In write mode only forward seeks are possible (the gap is filled
        with zero bytes); a backward seek raises IOError.  In read mode a
        backward seek rewinds and re-reads the file from the beginning.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
        """Return the next line, including its trailing newline if any.

        At most size bytes are returned when size is non-negative.  The
        empty string is returned at the end of the file.
        """
        if size < 0:
            size = sys.maxint
        bufs = []
        readsize = min(100, size)    # Read from the file in small chunks
        while True:
            if size == 0:
                return "".join(bufs) # Return resulting line

            c = self.read(readsize)
            i = c.find('\n')
            # We set i=size-1 to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if i == -1 and len(c) > size:
                i = size - 1
            elif size <= i:
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i+1])    # Add portion of last chunk
                self._unread(c[i+1:])   # Push back rest of chunk
                return ''.join(bufs)    # Return resulting line

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)

    def readlines(self, sizehint=0):
        """Return a list of the remaining lines.

        Reading stops once the lines returned add up to at least sizehint
        bytes; a sizehint of zero or less means read everything.
        """
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            L.append(line)
            sizehint = sizehint - len(line)

        return L

    def writelines(self, L):
        """Write every string of the iterable L, in order."""
        for line in L:
            self.write(line)

    def __iter__(self):
        """Return self; iteration yields the file line by line."""
        return self

    def next(self):
        """Return the next line, raising StopIteration at end of file."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration
435 | ||
436 | ||
def _test():
    # Minimal command-line driver: compress each argument like gzip, or,
    # when the first argument is -d, decompress like gunzip.  "-" (also the
    # default when no file is named) stands for stdin/stdout.
    # The input file is never deleted, and no other gzip option or
    # feature is supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        # Pick the source (f) and the destination (g) for this argument.
        if decompress and arg == "-":
            f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
            g = sys.stdout
        elif decompress:
            if arg[-3:] != ".gz":
                print("%s %s" % ("filename doesn't end in .gz:", repr(arg)))
                continue
            f = open(arg, "rb")
            g = __builtin__.open(arg[:-3], "wb")
        elif arg == "-":
            f = sys.stdin
            g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
        else:
            f = __builtin__.open(arg, "rb")
            g = open(arg + ".gz", "wb")

        # Copy everything across in 1 KB pieces.
        chunk = f.read(1024)
        while chunk:
            g.write(chunk)
            chunk = f.read(1024)

        # The standard streams are left open for the next argument.
        if g is not sys.stdout:
            g.close()
        if f is not sys.stdin:
            f.close()

if __name__ == '__main__':
    _test()