Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | """Guess the MIME type of a file. |
2 | ||
3 | This module defines two useful functions: | |
4 | ||
5 | guess_type(url, strict=1) -- guess the MIME type and encoding of a URL. | |
6 | ||
7 | guess_extension(type, strict=1) -- guess the extension for a given MIME type. | |
8 | ||
9 | It also contains the following, for tuning the behavior: | |
10 | ||
11 | Data: | |
12 | ||
13 | knownfiles -- list of files to parse | |
14 | inited -- flag set when init() has been called | |
15 | suffix_map -- dictionary mapping suffixes to suffixes | |
16 | encodings_map -- dictionary mapping suffixes to encodings | |
17 | types_map -- dictionary mapping suffixes to types | |
18 | ||
19 | Functions: | |
20 | ||
21 | init([files]) -- parse a list of files, default knownfiles | |
22 | read_mime_types(file) -- parse one file, return a dictionary or None | |
23 | """ | |
24 | ||
25 | import os | |
26 | import posixpath | |
27 | import urllib | |
28 | ||
# Names exported by ``from mimetypes import *``.
__all__ = [
    "guess_type",
    "guess_extension",
    "guess_all_extensions",
    "add_type",
    "read_mime_types",
    "init",
]
33 | ||
# Candidate locations of mime.types files.  init() parses, in this order,
# every entry that exists on disk when it is called without an explicit
# file list.
# NOTE(review): the httpd/conf path appears twice; the second occurrence is
# kept on purpose because dropping either one would change the order in
# which extensions get registered.
knownfiles = [
    "/etc/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",
    "/usr/local/lib/netscape/mime.types",
    "/usr/local/etc/httpd/conf/mime.types",     # Apache 1.2
    "/usr/local/etc/mime.types",                # Apache 1.3
]

# Flag set to True by init() as soon as it starts running.
inited = False
43 | ||
44 | ||
class MimeTypes:
    """MIME-types datastore.

    This datastore can handle information from mime.types-style files
    and supports basic determination of MIME type from a filename or
    URL, and can guess a reasonable extension given a MIME type.

    Instance attributes:

    encodings_map -- private copy of the module-level encodings_map
    suffix_map    -- private copy of the module-level suffix_map
    types_map     -- pair of dicts (non-strict, strict) mapping an
                     extension to a type; indexed with the `strict' flag
    types_map_inv -- pair of dicts (non-strict, strict) mapping a type to
                     the list of extensions registered for it
    """

    def __init__(self, filenames=(), strict=True):
        """Create a datastore seeded from the module-level tables.

        Each file named in `filenames' is then read with read(); `strict'
        selects whether their entries are registered as standard or
        non-standard types.
        """
        # Make sure the module-level tables have been set up once.
        if not inited:
            init()
        self.encodings_map = encodings_map.copy()
        self.suffix_map = suffix_map.copy()
        self.types_map = ({}, {})  # dict for (non-strict, strict)
        self.types_map_inv = ({}, {})
        for (ext, type) in types_map.items():
            self.add_type(type, ext, True)
        for (ext, type) in common_types.items():
            self.add_type(type, ext, False)
        for name in filenames:
            self.read(name, strict)

    def add_type(self, type, ext, strict=True):
        """Add a mapping between a type and an extension.

        When the extension is already known, the new
        type will replace the old one. When the type
        is already known the extension will be added
        to the list of known extensions.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        # `strict' doubles as the tuple index: False -> 0, True -> 1.
        self.types_map[strict][ext] = type
        exts = self.types_map_inv[strict].setdefault(type, [])
        if ext not in exts:
            exts.append(ext)

    def guess_type(self, url, strict=True):
        """Guess the type of a file based on its URL.

        Return value is a tuple (type, encoding) where type is None if
        the type can't be guessed (no or unknown suffix) or a string
        of the form type/subtype, usable for a MIME Content-type
        header; and encoding is None for no encoding or the name of
        the program used to encode (e.g. compress or gzip).  The
        mappings are table driven.  Encoding suffixes are case
        sensitive; type suffixes are first tried case sensitive, then
        case insensitive.

        The suffixes .tgz, .taz and .tz (case sensitive!) are all
        mapped to '.tar.gz'.  (This is table-driven too, using the
        dictionary suffix_map.)

        Optional `strict' argument when False adds a bunch of commonly found,
        but non-standard types.
        """
        scheme, url = urllib.splittype(url)
        if scheme == 'data':
            # syntax of data URLs:
            # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
            # mediatype := [ type "/" subtype ] *( ";" parameter )
            # data      := *urlchar
            # parameter := attribute "=" value
            # type/subtype defaults to "text/plain"
            comma = url.find(',')
            if comma < 0:
                # bad data URL
                return None, None
            semi = url.find(';', 0, comma)
            if semi >= 0:
                mimetype = url[:semi]
            else:
                mimetype = url[:comma]
            if '=' in mimetype or '/' not in mimetype:
                mimetype = 'text/plain'
            return mimetype, None  # never compressed, so encoding is None
        base, ext = posixpath.splitext(url)
        # Expand shorthand suffixes (e.g. '.tgz' -> '.tar.gz') first.
        while ext in self.suffix_map:
            base, ext = posixpath.splitext(base + self.suffix_map[ext])
        # Peel off an encoding suffix, then look at the suffix beneath it.
        if ext in self.encodings_map:
            encoding = self.encodings_map[ext]
            base, ext = posixpath.splitext(base)
        else:
            encoding = None
        standard = self.types_map[True]
        if ext in standard:
            return standard[ext], encoding
        elif ext.lower() in standard:
            return standard[ext.lower()], encoding
        elif strict:
            return None, encoding
        nonstandard = self.types_map[False]
        if ext in nonstandard:
            return nonstandard[ext], encoding
        elif ext.lower() in nonstandard:
            return nonstandard[ext.lower()], encoding
        else:
            return None, encoding

    def guess_all_extensions(self, type, strict=True):
        """Guess the extensions for a file based on its MIME type.

        Return value is a list of strings giving the possible filename
        extensions, including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data stream,
        but would be mapped to the MIME type `type' by guess_type().
        The list is empty when nothing is known about `type'.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        type = type.lower()
        # Work on a copy: appending to the stored list would leak the
        # non-standard extensions into every later strict lookup.
        extensions = list(self.types_map_inv[True].get(type, []))
        if not strict:
            for ext in self.types_map_inv[False].get(type, []):
                if ext not in extensions:
                    extensions.append(ext)
        return extensions

    def guess_extension(self, type, strict=True):
        """Guess the extension for a file based on its MIME type.

        Return value is a string giving a filename extension,
        including the leading dot ('.').  The extension is not
        guaranteed to have been associated with any particular data
        stream, but would be mapped to the MIME type `type' by
        guess_type().  If no extension can be guessed for `type', None
        is returned.

        Optional `strict' argument when false adds a bunch of commonly found,
        but non-standard types.
        """
        extensions = self.guess_all_extensions(type, strict)
        if not extensions:
            return None
        return extensions[0]

    def read(self, filename, strict=True):
        """
        Read a single mime.types-format file, specified by pathname.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        fp = open(filename)
        try:
            self.readfp(fp, strict)
        finally:
            # Close the file even when parsing fails.
            fp.close()

    def readfp(self, fp, strict=True):
        """
        Read a single mime.types-format file.

        `fp' is an open file-like object providing readline().  Each
        line holds a type followed by zero or more suffixes (without the
        leading dot); a word starting with '#' begins a comment that
        runs to the end of the line.

        If strict is true, information will be added to
        list of standard types, else to the list of non-standard
        types.
        """
        while True:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            # Drop a trailing comment, if any.
            for i, word in enumerate(words):
                if word[0] == '#':
                    del words[i:]
                    break
            if not words:
                continue
            type, suffixes = words[0], words[1:]
            for suff in suffixes:
                self.add_type(type, '.' + suff, strict)
217 | ||
# MimeTypes instance backing the module-level functions below.  It stays
# None until init() has built a complete database.
_db = None


def guess_type(url, strict=True):
    """Guess the type of a file based on its URL.

    Return value is a tuple (type, encoding) where type is None if the
    type can't be guessed (no or unknown suffix) or a string of the
    form type/subtype, usable for a MIME Content-type header; and
    encoding is None for no encoding or the name of the program used
    to encode (e.g. compress or gzip).  The mappings are table
    driven.  Encoding suffixes are case sensitive; type suffixes are
    first tried case sensitive, then case insensitive.

    The suffixes .tgz, .taz and .tz (case sensitive!) are all mapped
    to ".tar.gz".  (This is table-driven too, using the dictionary
    suffix_map).

    Optional `strict' argument when false adds a bunch of commonly found, but
    non-standard types.
    """
    # Build the database on first use only; re-running init() on every
    # call would re-parse all the known files each time.
    if _db is None:
        init()
    return _db.guess_type(url, strict)


def guess_all_extensions(type, strict=True):
    """Guess the extensions for a file based on its MIME type.

    Return value is a list of strings giving the possible filename
    extensions, including the leading dot ('.').  The extension is not
    guaranteed to have been associated with any particular data
    stream, but would be mapped to the MIME type `type' by
    guess_type().  If no extension can be guessed for `type', the
    list is empty.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_all_extensions(type, strict)


def guess_extension(type, strict=True):
    """Guess the extension for a file based on its MIME type.

    Return value is a string giving a filename extension, including the
    leading dot ('.').  The extension is not guaranteed to have been
    associated with any particular data stream, but would be mapped to the
    MIME type `type' by guess_type().  If no extension can be guessed for
    `type', None is returned.

    Optional `strict' argument when false adds a bunch of commonly found,
    but non-standard types.
    """
    if _db is None:
        init()
    return _db.guess_extension(type, strict)


def add_type(type, ext, strict=True):
    """Add a mapping between a type and an extension.

    When the extension is already known, the new
    type will replace the old one. When the type
    is already known the extension will be added
    to the list of known extensions.

    If strict is true, information will be added to
    list of standard types, else to the list of non-standard
    types.
    """
    if _db is None:
        init()
    return _db.add_type(type, ext, strict)


def init(files=None):
    """(Re)build the module-level database.

    A fresh MimeTypes instance is created and every path in `files'
    (default: knownfiles) that names an existing regular file is parsed
    into it.  The module-level encodings_map, suffix_map, types_map and
    common_types are then rebound to the tables of that instance, which
    also becomes the database used by the module-level functions.
    """
    global suffix_map, types_map, encodings_map, common_types
    global inited, _db
    inited = True    # so that MimeTypes.__init__() doesn't call us again
    db = MimeTypes()
    if files is None:
        files = knownfiles
    for file in files:
        if os.path.isfile(file):
            fp = open(file)
            try:
                db.readfp(fp)
            finally:
                # Do not leave the handle open once the file is parsed.
                fp.close()
    encodings_map = db.encodings_map
    suffix_map = db.suffix_map
    types_map = db.types_map[True]
    common_types = db.types_map[False]
    # Publish the database only once it is fully populated.
    _db = db
306 | ||
307 | ||
def read_mime_types(file):
    """Parse one mime.types-format file, given by pathname.

    Return the extension -> type dictionary of standard types held by a
    fresh MimeTypes instance after reading `file' into it, or None when
    the file cannot be opened.
    """
    try:
        f = open(file)
    except IOError:
        return None
    try:
        db = MimeTypes()
        db.readfp(f, True)
    finally:
        # The file is closed whether or not parsing succeeds.
        f.close()
    return db.types_map[True]
316 | ||
317 | ||
# Shorthand suffixes and the suffix chain each one stands for; guess_type()
# substitutes these before looking for an encoding or a type.
suffix_map = {
    '.tgz': '.tar.gz',
    '.taz': '.tar.gz',
    '.tz': '.tar.gz',
}

# Suffixes that name an encoding program rather than a content type.
# Lookups are case sensitive ('.Z', not '.z').
encodings_map = {
    '.gz': 'gzip',
    '.Z': 'compress',
}
328 | ||
# Before adding new types, make sure they are either registered with IANA, at
# http://www.isi.edu/in-notes/iana/assignments/media-types
# or extensions, i.e. using the x- prefix

# If you add to these, please keep them sorted!
#
# Each extension must appear only once: in a dict literal a repeated key
# silently keeps just the last value.  Two alternatives that used to be
# listed but were always overridden are recorded in comments below.
types_map = {
    '.a'      : 'application/octet-stream',
    '.ai'     : 'application/postscript',
    '.aif'    : 'audio/x-aiff',
    '.aifc'   : 'audio/x-aiff',
    '.aiff'   : 'audio/x-aiff',
    '.au'     : 'audio/basic',
    '.avi'    : 'video/x-msvideo',
    '.bat'    : 'text/plain',
    '.bcpio'  : 'application/x-bcpio',
    '.bin'    : 'application/octet-stream',
    '.bmp'    : 'image/x-ms-bmp',
    '.c'      : 'text/plain',
    # '.cdf' is also seen as 'application/x-cdf'; the netCDF type wins.
    '.cdf'    : 'application/x-netcdf',
    '.cpio'   : 'application/x-cpio',
    '.csh'    : 'application/x-csh',
    '.css'    : 'text/css',
    '.dll'    : 'application/octet-stream',
    '.doc'    : 'application/msword',
    '.dot'    : 'application/msword',
    '.dvi'    : 'application/x-dvi',
    '.eml'    : 'message/rfc822',
    '.eps'    : 'application/postscript',
    '.etx'    : 'text/x-setext',
    '.exe'    : 'application/octet-stream',
    '.gif'    : 'image/gif',
    '.gtar'   : 'application/x-gtar',
    '.h'      : 'text/plain',
    '.hdf'    : 'application/x-hdf',
    '.htm'    : 'text/html',
    '.html'   : 'text/html',
    '.ief'    : 'image/ief',
    '.jpe'    : 'image/jpeg',
    '.jpeg'   : 'image/jpeg',
    '.jpg'    : 'image/jpeg',
    '.js'     : 'application/x-javascript',
    '.ksh'    : 'text/plain',
    '.latex'  : 'application/x-latex',
    '.m1v'    : 'video/mpeg',
    '.man'    : 'application/x-troff-man',
    '.me'     : 'application/x-troff-me',
    '.mht'    : 'message/rfc822',
    '.mhtml'  : 'message/rfc822',
    '.mif'    : 'application/x-mif',
    '.mov'    : 'video/quicktime',
    '.movie'  : 'video/x-sgi-movie',
    '.mp2'    : 'audio/mpeg',
    '.mp3'    : 'audio/mpeg',
    '.mpa'    : 'video/mpeg',
    '.mpe'    : 'video/mpeg',
    '.mpeg'   : 'video/mpeg',
    '.mpg'    : 'video/mpeg',
    '.ms'     : 'application/x-troff-ms',
    '.nc'     : 'application/x-netcdf',
    '.nws'    : 'message/rfc822',
    '.o'      : 'application/octet-stream',
    '.obj'    : 'application/octet-stream',
    '.oda'    : 'application/oda',
    '.p12'    : 'application/x-pkcs12',
    '.p7c'    : 'application/pkcs7-mime',
    '.pbm'    : 'image/x-portable-bitmap',
    '.pdf'    : 'application/pdf',
    '.pfx'    : 'application/x-pkcs12',
    '.pgm'    : 'image/x-portable-graymap',
    '.pl'     : 'text/plain',
    '.png'    : 'image/png',
    '.pnm'    : 'image/x-portable-anymap',
    '.pot'    : 'application/vnd.ms-powerpoint',
    '.ppa'    : 'application/vnd.ms-powerpoint',
    '.ppm'    : 'image/x-portable-pixmap',
    '.pps'    : 'application/vnd.ms-powerpoint',
    '.ppt'    : 'application/vnd.ms-powerpoint',
    '.ps'     : 'application/postscript',
    '.pwz'    : 'application/vnd.ms-powerpoint',
    '.py'     : 'text/x-python',
    '.pyc'    : 'application/x-python-code',
    '.pyo'    : 'application/x-python-code',
    '.qt'     : 'video/quicktime',
    '.ra'     : 'audio/x-pn-realaudio',
    '.ram'    : 'application/x-pn-realaudio',
    '.ras'    : 'image/x-cmu-raster',
    '.rdf'    : 'application/xml',
    '.rgb'    : 'image/x-rgb',
    '.roff'   : 'application/x-troff',
    '.rtx'    : 'text/richtext',
    '.sgm'    : 'text/x-sgml',
    '.sgml'   : 'text/x-sgml',
    '.sh'     : 'application/x-sh',
    '.shar'   : 'application/x-shar',
    '.snd'    : 'audio/basic',
    '.so'     : 'application/octet-stream',
    '.src'    : 'application/x-wais-source',
    '.sv4cpio': 'application/x-sv4cpio',
    '.sv4crc' : 'application/x-sv4crc',
    '.swf'    : 'application/x-shockwave-flash',
    '.t'      : 'application/x-troff',
    '.tar'    : 'application/x-tar',
    '.tcl'    : 'application/x-tcl',
    '.tex'    : 'application/x-tex',
    '.texi'   : 'application/x-texinfo',
    '.texinfo': 'application/x-texinfo',
    '.tif'    : 'image/tiff',
    '.tiff'   : 'image/tiff',
    '.tr'     : 'application/x-troff',
    '.tsv'    : 'text/tab-separated-values',
    '.txt'    : 'text/plain',
    '.ustar'  : 'application/x-ustar',
    '.vcf'    : 'text/x-vcard',
    '.wav'    : 'audio/x-wav',
    '.wiz'    : 'application/msword',
    '.xbm'    : 'image/x-xbitmap',
    '.xlb'    : 'application/vnd.ms-excel',
    # '.xls' is also seen as 'application/excel'; the vnd.ms-excel type wins.
    '.xls'    : 'application/vnd.ms-excel',
    '.xml'    : 'text/xml',
    '.xpm'    : 'image/x-xpixmap',
    '.xsl'    : 'application/xml',
    '.xwd'    : 'image/x-xwindowdump',
    '.zip'    : 'application/zip',
    }
457 | ||
# Non-standard types that are nevertheless common in the wild.  The API
# functions consult them only when called with a false `strict' argument.

# Keep this table sorted as well.
common_types = {
    '.jpg': 'image/jpg',
    '.mid': 'audio/midi',
    '.midi': 'audio/midi',
    '.pct': 'image/pict',
    '.pic': 'image/pict',
    '.pict': 'image/pict',
    '.rtf': 'application/rtf',
    '.xul': 'text/xul',
}
472 | ||
473 | ||
if __name__ == '__main__':
    # Command-line front end: guess the type (or, with -e, the extension)
    # for every argument given.
    import sys
    import getopt

    USAGE = """\
Usage: mimetypes.py [options] type

Options:
    --help / -h       -- print this message and exit
    --lenient / -l    -- additionally search of some common, but non-standard
                         types.
    --extension / -e  -- guess extension instead of type

More than one type argument may be given.
"""

    def usage(code, msg=''):
        # Show the help text, then the optional message, and exit with `code'.
        print(USAGE)
        if msg:
            print(msg)
        sys.exit(code)

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'hle',
                                   ['help', 'lenient', 'extension'])
    except getopt.error:
        # Fetch the exception this way so the clause parses on any version.
        usage(1, sys.exc_info()[1])

    strict = True
    extension = False
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-l', '--lenient'):
            strict = False
        elif opt in ('-e', '--extension'):
            extension = True

    for gtype in args:
        if extension:
            guess = guess_extension(gtype, strict)
            if guess:
                print(guess)
            else:
                print("I don't know anything about type %s" % gtype)
        else:
            guess, encoding = guess_type(gtype, strict)
            if guess:
                print('type: %s encoding: %s' % (guess, encoding))
            else:
                print("I don't know anything about type %s" % gtype)