Initial commit of OpenSPARC T2 design and verification files.
[OpenSPARC-T2-DV] tools/src/nas,5.n2.os.2/lib/python/lib/python2.4/urllib.py
1"""Open an arbitrary URL.
2
3See the following document for more info on URLs:
4"Names and Addresses, URIs, URLs, URNs, URCs", at
5http://www.w3.org/pub/WWW/Addressing/Overview.html
6
7See also the HTTP spec (from which the error codes are derived):
8"HTTP - Hypertext Transfer Protocol", at
9http://www.w3.org/pub/WWW/Protocols/
10
11Related standards and specs:
12- RFC1808: the "relative URL" spec. (authoritative status)
13- RFC1738 - the "URL standard". (authoritative status)
14- RFC1630 - the "URI spec". (informational status)
15
16The object returned by URLopener().open(file) will differ per
17protocol. All you know is that is has methods read(), readline(),
18readlines(), fileno(), close() and info(). The read*(), fileno()
19and close() methods work like those of open files.
20The info() method returns a mimetools.Message object which can be
21used to query various info about the object, if available.
22(mimetools.Message objects are queried with the getheader() method.)
23"""

import string
import socket
import os
import time
import sys
from urlparse import urljoin as basejoin

__all__ = ["urlopen", "URLopener", "FancyURLopener", "urlretrieve",
           "urlcleanup", "quote", "quote_plus", "unquote", "unquote_plus",
           "urlencode", "url2pathname", "pathname2url", "splittag",
           "localhost", "thishost", "ftperrors", "basejoin", "unwrap",
           "splittype", "splithost", "splituser", "splitpasswd", "splitport",
           "splitnport", "splitquery", "splitattr", "splitvalue",
           "splitgophertype", "getproxies"]

__version__ = '1.16'    # XXX This version is not always updated :-(

MAXFTPCACHE = 10        # Trim the ftp cache beyond this size

# Helper for non-unix systems
if os.name == 'mac':
    from macurl2path import url2pathname, pathname2url
elif os.name == 'nt':
    from nturl2path import url2pathname, pathname2url
elif os.name == 'riscos':
    from rourl2path import url2pathname, pathname2url
else:
    def url2pathname(pathname):
        return unquote(pathname)
    def pathname2url(pathname):
        return quote(pathname)

# This really consists of two pieces:
# (1) a class which handles opening of all sorts of URLs
#     (plus assorted utilities etc.)
# (2) a set of functions for parsing URLs
# XXX Should these be separated out into different modules?


# Shortcut for basic usage
_urlopener = None
def urlopen(url, data=None, proxies=None):
    """urlopen(url [, data]) -> open file-like object"""
    global _urlopener
    if proxies is not None:
        opener = FancyURLopener(proxies=proxies)
    elif not _urlopener:
        opener = FancyURLopener()
        _urlopener = opener
    else:
        opener = _urlopener
    if data is None:
        return opener.open(url)
    else:
        return opener.open(url, data)

def urlretrieve(url, filename=None, reporthook=None, data=None):
    global _urlopener
    if not _urlopener:
        _urlopener = FancyURLopener()
    return _urlopener.retrieve(url, filename, reporthook, data)

def urlcleanup():
    if _urlopener:
        _urlopener.cleanup()

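# Example (editorial addition, not part of the original module; the URL is
# hypothetical): retrieving a document to a temporary file with a progress
# callback, then cleaning up.
#
#   >>> def hook(blocknum, blocksize, totalsize):
#   ...     print blocknum * blocksize, 'of', totalsize, 'bytes'
#   >>> filename, headers = urlretrieve('http://www.example.com/file.txt',
#   ...                                 reporthook=hook)
#   >>> print filename, headers.gettype()
#   >>> urlcleanup()        # removes the temporary file again
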
# exception raised when downloaded size does not match content-length
class ContentTooShortError(IOError):
    def __init__(self, message, content):
        IOError.__init__(self, message)
        self.content = content

ftpcache = {}
class URLopener:
    """Class to open URLs.
    This is a class rather than just a subroutine because we may need
    more than one set of global protocol-specific options.
    Note -- this is a base class for those who don't want the
    automatic handling of error types 302 (relocated) and 401
    (authorization needed)."""

    __tempfiles = None

    version = "Python-urllib/%s" % __version__

    # Constructor
    def __init__(self, proxies=None, **x509):
        if proxies is None:
            proxies = getproxies()
        assert hasattr(proxies, 'has_key'), "proxies must be a mapping"
        self.proxies = proxies
        self.key_file = x509.get('key_file')
        self.cert_file = x509.get('cert_file')
        self.addheaders = [('User-agent', self.version)]
        self.__tempfiles = []
        self.__unlink = os.unlink # See cleanup()
        self.tempcache = None
        # Undocumented feature: if you assign {} to tempcache,
        # it is used to cache files retrieved with
        # self.retrieve().  This is not enabled by default
        # since it does not work for changing documents (and I
        # haven't got the logic to check expiration headers
        # yet).
        self.ftpcache = ftpcache
        # Undocumented feature: you can use a different
        # ftp cache by assigning to the .ftpcache member;
        # in case you want logically independent URL openers
        # XXX This is not threadsafe.  Bah.

    def __del__(self):
        self.close()

    def close(self):
        self.cleanup()

    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        if self.__tempfiles:
            for file in self.__tempfiles:
                try:
                    self.__unlink(file)
                except OSError:
                    pass
            del self.__tempfiles[:]
        if self.tempcache:
            self.tempcache.clear()

    def addheader(self, *args):
        """Add a header to be used by the HTTP interface only
        e.g. u.addheader('Accept', 'sound/basic')"""
        self.addheaders.append(args)
    # External interface
    def open(self, fullurl, data=None):
        """Use URLopener().open(file) instead of open(file, 'r')."""
        fullurl = unwrap(toBytes(fullurl))
        if self.tempcache and fullurl in self.tempcache:
            filename, headers = self.tempcache[fullurl]
            fp = open(filename, 'rb')
            return addinfourl(fp, headers, fullurl)
        urltype, url = splittype(fullurl)
        if not urltype:
            urltype = 'file'
        if urltype in self.proxies:
            proxy = self.proxies[urltype]
            urltype, proxyhost = splittype(proxy)
            host, selector = splithost(proxyhost)
            url = (host, fullurl) # Signal special case to open_*()
        else:
            proxy = None
        name = 'open_' + urltype
        self.type = urltype
        name = name.replace('-', '_')
        if not hasattr(self, name):
            if proxy:
                return self.open_unknown_proxy(proxy, fullurl, data)
            else:
                return self.open_unknown(fullurl, data)
        try:
            if data is None:
                return getattr(self, name)(url)
            else:
                return getattr(self, name)(url, data)
        except socket.error, msg:
            raise IOError, ('socket error', msg), sys.exc_info()[2]

    def open_unknown(self, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'unknown url type', type)

    def open_unknown_proxy(self, proxy, fullurl, data=None):
        """Overridable interface to open unknown URL type."""
        type, url = splittype(fullurl)
        raise IOError, ('url error', 'invalid proxy for %s' % type, proxy)

    # External interface
    def retrieve(self, url, filename=None, reporthook=None, data=None):
        """retrieve(url) returns (filename, headers) for a local object
        or (tempfilename, headers) for a remote object."""
        url = unwrap(toBytes(url))
        if self.tempcache and url in self.tempcache:
            return self.tempcache[url]
        type, url1 = splittype(url)
        if filename is None and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url, data)
        headers = fp.info()
        if filename:
            tfp = open(filename, 'wb')
        else:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            (fd, filename) = tempfile.mkstemp(suffix)
            self.__tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        result = filename, headers
        if self.tempcache is not None:
            self.tempcache[url] = result
        bs = 1024*8
        size = -1
        read = 0
        blocknum = 0
        if reporthook:
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            reporthook(blocknum, bs, size)
        while 1:
            block = fp.read(bs)
            if block == "":
                break
            read += len(block)
            tfp.write(block)
            blocknum += 1
            if reporthook:
                reporthook(blocknum, bs, size)
        fp.close()
        tfp.close()
        del fp
        del tfp

        # raise exception if actual size does not match content-length header
        if size >= 0 and read < size:
            raise ContentTooShortError("retrieval incomplete: got only %i out "
                                       "of %i bytes" % (read, size), result)

        return result

    # Each method named open_<type> knows how to open that type of URL

    def open_http(self, url, data=None):
        """Use HTTP protocol."""
        import httplib
        user_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            urltype, rest = splittype(selector)
            url = rest
            user_passwd = None
            if urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    selector = "%s://%s%s" % (urltype, realhost, rest)
                if proxy_bypass(realhost):
                    host = realhost

            #print "proxy via http:", host, selector
        if not host: raise IOError, ('http error', 'no host given')
        if user_passwd:
            import base64
            auth = base64.encodestring(user_passwd).strip()
        else:
            auth = None
        h = httplib.HTTP(host)
        if data is not None:
            h.putrequest('POST', selector)
            h.putheader('Content-type', 'application/x-www-form-urlencoded')
            h.putheader('Content-length', '%d' % len(data))
        else:
            h.putrequest('GET', selector)
        if auth: h.putheader('Authorization', 'Basic %s' % auth)
        if realhost: h.putheader('Host', realhost)
        for args in self.addheaders: h.putheader(*args)
        h.endheaders()
        if data is not None:
            h.send(data)
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            return addinfourl(fp, headers, "http:" + url)
        else:
            if data is None:
                return self.http_error(url, fp, errcode, errmsg, headers)
            else:
                return self.http_error(url, fp, errcode, errmsg, headers, data)

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Handle http errors.
        Derived class can override this, or provide specific handlers
        named http_error_DDD where DDD is the 3-digit error code."""
        # First check if there's a specific handler for this error
        name = 'http_error_%d' % errcode
        if hasattr(self, name):
            method = getattr(self, name)
            if data is None:
                result = method(url, fp, errcode, errmsg, headers)
            else:
                result = method(url, fp, errcode, errmsg, headers, data)
            if result: return result
        return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handler: close the connection and raise IOError."""
        void = fp.read()
        fp.close()
        raise IOError, ('http error', errcode, errmsg, headers)

    if hasattr(socket, "ssl"):
        def open_https(self, url, data=None):
            """Use HTTPS protocol."""
            import httplib
            user_passwd = None
            if isinstance(url, str):
                host, selector = splithost(url)
                if host:
                    user_passwd, host = splituser(host)
                    host = unquote(host)
                realhost = host
            else:
                host, selector = url
                urltype, rest = splittype(selector)
                url = rest
                user_passwd = None
                if urltype.lower() != 'https':
                    realhost = None
                else:
                    realhost, rest = splithost(rest)
                    if realhost:
                        user_passwd, realhost = splituser(realhost)
                    if user_passwd:
                        selector = "%s://%s%s" % (urltype, realhost, rest)
                #print "proxy via https:", host, selector
            if not host: raise IOError, ('https error', 'no host given')
            if user_passwd:
                import base64
                auth = base64.encodestring(user_passwd).strip()
            else:
                auth = None
            h = httplib.HTTPS(host, 0,
                              key_file=self.key_file,
                              cert_file=self.cert_file)
            if data is not None:
                h.putrequest('POST', selector)
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
                h.putheader('Content-length', '%d' % len(data))
            else:
                h.putrequest('GET', selector)
            if auth: h.putheader('Authorization', 'Basic %s' % auth)
            if realhost: h.putheader('Host', realhost)
            for args in self.addheaders: h.putheader(*args)
            h.endheaders()
            if data is not None:
                h.send(data)
            errcode, errmsg, headers = h.getreply()
            fp = h.getfile()
            if errcode == 200:
                return addinfourl(fp, headers, "https:" + url)
            else:
                if data is None:
                    return self.http_error(url, fp, errcode, errmsg, headers)
                else:
                    return self.http_error(url, fp, errcode, errmsg, headers,
                                           data)

    def open_gopher(self, url):
        """Use Gopher protocol."""
        import gopherlib
        host, selector = splithost(url)
        if not host: raise IOError, ('gopher error', 'no host given')
        host = unquote(host)
        type, selector = splitgophertype(selector)
        selector, query = splitquery(selector)
        selector = unquote(selector)
        if query:
            query = unquote(query)
            fp = gopherlib.send_query(selector, query, host)
        else:
            fp = gopherlib.send_selector(selector, host)
        return addinfourl(fp, noheaders(), "gopher:" + url)

    def open_file(self, url):
        """Use local file or FTP depending on form of URL."""
        if url[:2] == '//' and url[2:3] != '/' and url[2:12].lower() != 'localhost/':
            return self.open_ftp(url)
        else:
            return self.open_local_file(url)

    def open_local_file(self, url):
        """Use local file."""
        import mimetypes, mimetools, email.Utils
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        host, file = splithost(url)
        localname = url2pathname(file)
        try:
            stats = os.stat(localname)
        except OSError, e:
            raise IOError(e.errno, e.strerror, e.filename)
        size = stats.st_size
        modified = email.Utils.formatdate(stats.st_mtime, usegmt=True)
        mtype = mimetypes.guess_type(url)[0]
        headers = mimetools.Message(StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        if not host:
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        host, port = splitport(host)
        if not port \
           and socket.gethostbyname(host) in (localhost(), thishost()):
            urlfile = file
            if file[:1] == '/':
                urlfile = 'file://' + file
            return addinfourl(open(localname, 'rb'),
                              headers, urlfile)
        raise IOError, ('local file error', 'not on local host')

    def open_ftp(self, url):
        """Use FTP protocol."""
        import mimetypes, mimetools
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        host, path = splithost(url)
        if not host: raise IOError, ('ftp error', 'no host given')
        host, port = splitport(host)
        user, host = splituser(host)
        if user: user, passwd = splitpasswd(user)
        else: passwd = None
        host = unquote(host)
        user = unquote(user or '')
        passwd = unquote(passwd or '')
        host = socket.gethostbyname(host)
        if not port:
            import ftplib
            port = ftplib.FTP_PORT
        else:
            port = int(port)
        path, attrs = splitattr(path)
        path = unquote(path)
        dirs = path.split('/')
        dirs, file = dirs[:-1], dirs[-1]
        if dirs and not dirs[0]: dirs = dirs[1:]
        if dirs and not dirs[0]: dirs[0] = '/'
        key = user, host, port, '/'.join(dirs)
        # XXX thread unsafe!
        if len(self.ftpcache) > MAXFTPCACHE:
            # Prune the cache, rather arbitrarily
            for k in self.ftpcache.keys():
                if k != key:
                    v = self.ftpcache[k]
                    del self.ftpcache[k]
                    v.close()
        try:
            if not key in self.ftpcache:
                self.ftpcache[key] = \
                    ftpwrapper(user, passwd, host, port, dirs)
            if not file: type = 'D'
            else: type = 'I'
            for attr in attrs:
                attr, value = splitvalue(attr)
                if attr.lower() == 'type' and \
                   value in ('a', 'A', 'i', 'I', 'd', 'D'):
                    type = value.upper()
            (fp, retrlen) = self.ftpcache[key].retrfile(file, type)
            mtype = mimetypes.guess_type("ftp:" + url)[0]
            headers = ""
            if mtype:
                headers += "Content-Type: %s\n" % mtype
            if retrlen is not None and retrlen >= 0:
                headers += "Content-Length: %d\n" % retrlen
            headers = mimetools.Message(StringIO(headers))
            return addinfourl(fp, headers, "ftp:" + url)
        except ftperrors(), msg:
            raise IOError, ('ftp error', msg), sys.exc_info()[2]

    def open_data(self, url, data=None):
        """Use "data" URL."""
        # ignore POSTed data
        #
        # syntax of data URLs:
        # dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
        # mediatype := [ type "/" subtype ] *( ";" parameter )
        # data      := *urlchar
        # parameter := attribute "=" value
        import mimetools
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        try:
            [type, data] = url.split(',', 1)
        except ValueError:
            raise IOError, ('data error', 'bad data URL')
        if not type:
            type = 'text/plain;charset=US-ASCII'
        semi = type.rfind(';')
        if semi >= 0 and '=' not in type[semi:]:
            encoding = type[semi+1:]
            type = type[:semi]
        else:
            encoding = ''
        msg = []
        msg.append('Date: %s'%time.strftime('%a, %d %b %Y %T GMT',
                                            time.gmtime(time.time())))
        msg.append('Content-type: %s' % type)
        if encoding == 'base64':
            import base64
            data = base64.decodestring(data)
        else:
            data = unquote(data)
        msg.append('Content-length: %d' % len(data))
        msg.append('')
        msg.append(data)
        msg = '\n'.join(msg)
        f = StringIO(msg)
        headers = mimetools.Message(f, 0)
        f.fileno = None     # needed for addinfourl
        return addinfourl(f, headers, url)
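
    # Example (editorial addition, not part of the original class): fetching a
    # base64-encoded "data" URL through open(), which dispatches to
    # open_data(); the payload decodes to 'hello'.
    #
    #   >>> u = URLopener()
    #   >>> f = u.open('data:text/plain;base64,aGVsbG8=')
    #   >>> f.read()
    #   'hello'
    #   >>> f.info().gettype()
    #   'text/plain'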


class FancyURLopener(URLopener):
    """Derived class with handlers for errors we can handle (perhaps)."""

    def __init__(self, *args, **kwargs):
        URLopener.__init__(self, *args, **kwargs)
        self.auth_cache = {}
        self.tries = 0
        self.maxtries = 10

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Default error handling -- don't raise an exception."""
        return addinfourl(fp, headers, "http:" + url)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 302 -- relocated (temporarily)."""
        self.tries += 1
        if self.maxtries and self.tries >= self.maxtries:
            if hasattr(self, "http_error_500"):
                meth = self.http_error_500
            else:
                meth = self.http_error_default
            self.tries = 0
            return meth(url, fp, 500,
                        "Internal Server Error: Redirect Recursion", headers)
        result = self.redirect_internal(url, fp, errcode, errmsg, headers,
                                        data)
        self.tries = 0
        return result

    def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
        if 'location' in headers:
            newurl = headers['location']
        elif 'uri' in headers:
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        # In case the server sent a relative URL, join with original:
        newurl = basejoin(self.type + ":" + url, newurl)
        return self.open(newurl)

    def http_error_301(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 301 -- also relocated (permanently)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_303(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 303 -- also relocated (essentially identical to 302)."""
        return self.http_error_302(url, fp, errcode, errmsg, headers, data)

    def http_error_307(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 307 -- relocated, but turn POST into error."""
        if data is None:
            return self.http_error_302(url, fp, errcode, errmsg, headers, data)
        else:
            return self.http_error_default(url, fp, errcode, errmsg, headers)

    def http_error_401(self, url, fp, errcode, errmsg, headers, data=None):
        """Error 401 -- authentication required.
        See this URL for a description of the basic authentication scheme:
        http://www.ics.uci.edu/pub/ietf/http/draft-ietf-http-v10-spec-00.txt"""
        if not 'www-authenticate' in headers:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        stuff = headers['www-authenticate']
        import re
        match = re.match('[ \t]*([^ \t]+)[ \t]+realm="([^"]*)"', stuff)
        if not match:
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        scheme, realm = match.groups()
        if scheme.lower() != 'basic':
            URLopener.http_error_default(self, url, fp,
                                         errcode, errmsg, headers)
        name = 'retry_' + self.type + '_basic_auth'
        if data is None:
            return getattr(self,name)(url, realm)
        else:
            return getattr(self,name)(url, realm, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = 'http://' + host + selector
        if data is None:
            return self.open(newurl)
        else:
            return self.open(newurl, data)

    def retry_https_basic_auth(self, url, realm, data=None):
        host, selector = splithost(url)
        i = host.find('@') + 1
        host = host[i:]
        user, passwd = self.get_user_passwd(host, realm, i)
        if not (user or passwd): return None
        host = quote(user, safe='') + ':' + quote(passwd, safe='') + '@' + host
        newurl = '//' + host + selector
        return self.open_https(newurl, data)

    def get_user_passwd(self, host, realm, clear_cache = 0):
        key = realm + '@' + host.lower()
        if key in self.auth_cache:
            if clear_cache:
                del self.auth_cache[key]
            else:
                return self.auth_cache[key]
        user, passwd = self.prompt_user_passwd(host, realm)
        if user or passwd: self.auth_cache[key] = (user, passwd)
        return user, passwd

    def prompt_user_passwd(self, host, realm):
        """Override this in a GUI environment!"""
        import getpass
        try:
            user = raw_input("Enter username for %s at %s: " % (realm,
                                                                host))
            passwd = getpass.getpass("Enter password for %s in %s at %s: " %
                (user, realm, host))
            return user, passwd
        except KeyboardInterrupt:
            print
            return None, None
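
    # Example (editorial addition, not part of the original class; credentials
    # and host are hypothetical): supplying fixed credentials for HTTP basic
    # auth instead of prompting interactively.
    #
    #   >>> class MyOpener(FancyURLopener):
    #   ...     def prompt_user_passwd(self, host, realm):
    #   ...         return 'user', 'secret'
    #   >>> f = MyOpener().open('http://www.example.com/protected/')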


# Utility functions

_localhost = None
def localhost():
    """Return the IP address of the magic hostname 'localhost'."""
    global _localhost
    if _localhost is None:
        _localhost = socket.gethostbyname('localhost')
    return _localhost

_thishost = None
def thishost():
    """Return the IP address of the current host."""
    global _thishost
    if _thishost is None:
        _thishost = socket.gethostbyname(socket.gethostname())
    return _thishost

_ftperrors = None
def ftperrors():
    """Return the set of errors raised by the FTP class."""
    global _ftperrors
    if _ftperrors is None:
        import ftplib
        _ftperrors = ftplib.all_errors
    return _ftperrors

_noheaders = None
def noheaders():
    """Return an empty mimetools.Message object."""
    global _noheaders
    if _noheaders is None:
        import mimetools
        try:
            from cStringIO import StringIO
        except ImportError:
            from StringIO import StringIO
        _noheaders = mimetools.Message(StringIO(), 0)
        _noheaders.fp.close()   # Recycle file descriptor
    return _noheaders


# Utility classes

class ftpwrapper:
    """Class used by open_ftp() for cache of open FTP connections."""

    def __init__(self, user, passwd, host, port, dirs):
        self.user = user
        self.passwd = passwd
        self.host = host
        self.port = port
        self.dirs = dirs
        self.init()

    def init(self):
        import ftplib
        self.busy = 0
        self.ftp = ftplib.FTP()
        self.ftp.connect(self.host, self.port)
        self.ftp.login(self.user, self.passwd)
        for dir in self.dirs:
            self.ftp.cwd(dir)

    def retrfile(self, file, type):
        import ftplib
        self.endtransfer()
        if type in ('d', 'D'): cmd = 'TYPE A'; isdir = 1
        else: cmd = 'TYPE ' + type; isdir = 0
        try:
            self.ftp.voidcmd(cmd)
        except ftplib.all_errors:
            self.init()
            self.ftp.voidcmd(cmd)
        conn = None
        if file and not isdir:
            # Use nlst to see if the file exists at all
            try:
                self.ftp.nlst(file)
            except ftplib.error_perm, reason:
                raise IOError, ('ftp error', reason), sys.exc_info()[2]
            # Restore the transfer mode!
            self.ftp.voidcmd(cmd)
            # Try to retrieve as a file
            try:
                cmd = 'RETR ' + file
                conn = self.ftp.ntransfercmd(cmd)
            except ftplib.error_perm, reason:
                if str(reason)[:3] != '550':
                    raise IOError, ('ftp error', reason), sys.exc_info()[2]
        if not conn:
            # Set transfer mode to ASCII!
            self.ftp.voidcmd('TYPE A')
            # Try a directory listing
            if file: cmd = 'LIST ' + file
            else: cmd = 'LIST'
            conn = self.ftp.ntransfercmd(cmd)
        self.busy = 1
        # Pass back both a suitably decorated object and a retrieval length
        return (addclosehook(conn[0].makefile('rb'),
                             self.endtransfer), conn[1])

    def endtransfer(self):
        if not self.busy:
            return
        self.busy = 0
        try:
            self.ftp.voidresp()
        except ftperrors():
            pass

    def close(self):
        self.endtransfer()
        try:
            self.ftp.close()
        except ftperrors():
            pass

class addbase:
    """Base class for addinfo and addclosehook."""

    def __init__(self, fp):
        self.fp = fp
        self.read = self.fp.read
        self.readline = self.fp.readline
        if hasattr(self.fp, "readlines"): self.readlines = self.fp.readlines
        if hasattr(self.fp, "fileno"): self.fileno = self.fp.fileno
        if hasattr(self.fp, "__iter__"):
            self.__iter__ = self.fp.__iter__
            if hasattr(self.fp, "next"):
                self.next = self.fp.next

    def __repr__(self):
        return '<%s at %r whose fp = %r>' % (self.__class__.__name__,
                                             id(self), self.fp)

    def close(self):
        self.read = None
        self.readline = None
        self.readlines = None
        self.fileno = None
        if self.fp: self.fp.close()
        self.fp = None

class addclosehook(addbase):
    """Class to add a close hook to an open file."""

    def __init__(self, fp, closehook, *hookargs):
        addbase.__init__(self, fp)
        self.closehook = closehook
        self.hookargs = hookargs

    def close(self):
        addbase.close(self)
        if self.closehook:
            self.closehook(*self.hookargs)
            self.closehook = None
            self.hookargs = None

class addinfo(addbase):
    """class to add an info() method to an open file."""

    def __init__(self, fp, headers):
        addbase.__init__(self, fp)
        self.headers = headers

    def info(self):
        return self.headers

class addinfourl(addbase):
    """class to add info() and geturl() methods to an open file."""

    def __init__(self, fp, headers, url):
        addbase.__init__(self, fp)
        self.headers = headers
        self.url = url

    def info(self):
        return self.headers

    def geturl(self):
        return self.url
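
# Example (editorial addition, not part of the original module): wrapping an
# ordinary file-like object so it also answers info() and geturl(), the
# interface promised in the module docstring.
#
#   >>> from StringIO import StringIO
#   >>> f = addinfourl(StringIO('payload'), noheaders(), 'http://www.example.com/')
#   >>> f.read()
#   'payload'
#   >>> f.geturl()
#   'http://www.example.com/'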


# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# splitgophertype('/Xselector') --> 'X', 'selector'
# unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'
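
# Example (editorial addition, not part of the original module): how the
# parsing helpers decompose a URL step by step.
#
#   >>> splittype('http://www.example.com:8080/path?q=1')
#   ('http', '//www.example.com:8080/path?q=1')
#   >>> splithost('//www.example.com:8080/path?q=1')
#   ('www.example.com:8080', '/path?q=1')
#   >>> splitport('www.example.com:8080')
#   ('www.example.com', '8080')
#   >>> splitquery('/path?q=1')
#   ('/path', 'q=1')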

try:
    unicode
except NameError:
    def _is_unicode(x):
        return 0
else:
    def _is_unicode(x):
        return isinstance(x, unicode)

def toBytes(url):
    """toBytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed
    if _is_unicode(url):
        try:
            url = url.encode("ASCII")
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = url.strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/]*)(.*)$')

    match = _hostprog.match(url)
    if match: return match.group(1, 2)
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match: return map(unquote, match.group(1, 2))
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$')

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError, "no digits"
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None

def splitgophertype(selector):
    """splitgophertype('/Xselector') --> 'X', 'selector'."""
    if selector[:1] == '/' and selector[1:2]:
        return selector[1], selector[2:]
    return None, selector

_hextochr = dict(('%02x' % i, chr(i)) for i in range(256))
_hextochr.update(('%02X' % i, chr(i)) for i in range(256))

def unquote(s):
    """unquote('abc%20def') -> 'abc def'."""
    res = s.split('%')
    for i in xrange(1, len(res)):
        item = res[i]
        try:
            res[i] = _hextochr[item[:2]] + item[2:]
        except KeyError:
            res[i] = '%' + item
    return "".join(res)

def unquote_plus(s):
    """unquote_plus('%7e/abc+def') -> '~/abc def'."""
    s = s.replace('+', ' ')
    return unquote(s)
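
# Example (editorial addition, not part of the original module):
# percent-decoding, with and without '+'-to-space translation.
#
#   >>> unquote('abc%20def')
#   'abc def'
#   >>> unquote_plus('%7e/abc+def')
#   '~/abc def'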

always_safe = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
               'abcdefghijklmnopqrstuvwxyz'
               '0123456789' '_.-')
_safemaps = {}

def quote(s, safe = '/'):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.
    """
    cachekey = (safe, always_safe)
    try:
        safe_map = _safemaps[cachekey]
    except KeyError:
        safe += always_safe
        safe_map = {}
        for i in range(256):
            c = chr(i)
            safe_map[c] = (c in safe) and c or ('%%%02X' % i)
        _safemaps[cachekey] = safe_map
    res = map(safe_map.__getitem__, s)
    return ''.join(res)

def quote_plus(s, safe = ''):
    """Quote the query fragment of a URL; replacing ' ' with '+'"""
    if ' ' in s:
        s = quote(s, safe + ' ')
        return s.replace(' ', '+')
    return quote(s, safe)
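
# Example (editorial addition, not part of the original module): quoting a
# path versus quoting a query value.
#
#   >>> quote('/some path/file name.txt')
#   '/some%20path/file%20name.txt'
#   >>> quote('/a/b', safe='')
#   '%2Fa%2Fb'
#   >>> quote_plus('a & b')
#   'a+%26+b'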

def urlencode(query, doseq=0):
    """Encode a sequence of two-element tuples or dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.
    """

    if hasattr(query, "items"):
        # mapping objects
        query = query.items()
    else:
        # it's a bother at times that strings and string-like objects are
        # sequences...
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # zero-length sequences of all types will get here and succeed,
            # but that's a minor nit - since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise TypeError, "not a valid non-string sequence or mapping object", tb

    l = []
    if not doseq:
        # preserve old behavior
        for k, v in query:
            k = quote_plus(str(k))
            v = quote_plus(str(v))
            l.append(k + '=' + v)
    else:
        for k, v in query:
            k = quote_plus(str(k))
            if isinstance(v, str):
                v = quote_plus(v)
                l.append(k + '=' + v)
            elif _is_unicode(v):
                # is there a reasonable way to convert to ASCII?
                # encode generates a string, but "replace" or "ignore"
                # lose information and "strict" can raise UnicodeError
                v = quote_plus(v.encode("ASCII", "replace"))
                l.append(k + '=' + v)
            else:
                try:
                    # is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v))
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        l.append(k + '=' + quote_plus(str(elt)))
    return '&'.join(l)
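
# Example (editorial addition, not part of the original module): encoding a
# query string, with and without sequence expansion.
#
#   >>> urlencode([('q', 'python urllib'), ('page', 2)])
#   'q=python+urllib&page=2'
#   >>> urlencode({'tag': ['a', 'b']}, doseq=1)
#   'tag=a&tag=b'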

# Proxy handling
def getproxies_environment():
    """Return a dictionary of scheme -> proxy server URL mappings.

    Scan the environment for variables named <scheme>_proxy;
    this seems to be the standard convention.  If you need a
    different way, you can pass a proxies dictionary to the
    [Fancy]URLopener constructor.

    """
    proxies = {}
    for name, value in os.environ.items():
        name = name.lower()
        if value and name[-6:] == '_proxy':
            proxies[name[:-6]] = value
    return proxies
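
# Example (editorial addition, not part of the original module; the proxy host
# is hypothetical): proxies are picked up from <scheme>_proxy environment
# variables, or can be passed explicitly to urlopen().
#
#   >>> import os
#   >>> os.environ['http_proxy'] = 'http://proxy.example.com:3128'
#   >>> getproxies_environment()
#   {'http': 'http://proxy.example.com:3128'}
#   >>> f = urlopen('http://www.python.org/',
#   ...             proxies={'http': 'http://proxy.example.com:3128'})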

if sys.platform == 'darwin':
    def getproxies_internetconfig():
        """Return a dictionary of scheme -> proxy server URL mappings.

        By convention the mac uses Internet Config to store
        proxies.  An HTTP proxy, for instance, is stored under
        the HttpProxy key.

        """
        try:
            import ic
        except ImportError:
            return {}

        try:
            config = ic.IC()
        except ic.error:
            return {}
        proxies = {}
        # HTTP:
        if 'UseHTTPProxy' in config and config['UseHTTPProxy']:
            try:
                value = config['HTTPProxyHost']
            except ic.error:
                pass
            else:
                proxies['http'] = 'http://%s' % value
        # FTP: XXXX To be done.
        # Gopher: XXXX To be done.
        return proxies

    def proxy_bypass(x):
        return 0

    def getproxies():
        return getproxies_environment() or getproxies_internetconfig()

elif os.name == 'nt':
    def getproxies_registry():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Win32 uses the registry to store proxies.

        """
        proxies = {}
        try:
            import _winreg
        except ImportError:
            # Std module, so should be around - but you never know!
            return proxies
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            if proxyEnable:
                # Returned as Unicode but problems if not converted to ASCII
                proxyServer = str(_winreg.QueryValueEx(internetSettings,
                                                       'ProxyServer')[0])
                if '=' in proxyServer:
                    # Per-protocol settings
                    for p in proxyServer.split(';'):
                        protocol, address = p.split('=', 1)
                        # See if address has a type:// prefix
                        import re
                        if not re.match('^([^/:]+)://', address):
                            address = '%s://%s' % (protocol, address)
                        proxies[protocol] = address
                else:
                    # Use one setting for all protocols
                    if proxyServer[:5] == 'http:':
                        proxies['http'] = proxyServer
                    else:
                        proxies['http'] = 'http://%s' % proxyServer
                        proxies['ftp'] = 'ftp://%s' % proxyServer
            internetSettings.Close()
        except (WindowsError, ValueError, TypeError):
            # Either registry key not found etc, or the value in an
            # unexpected format.
            # proxies already set up to be empty so nothing to do
            pass
        return proxies

    def getproxies():
        """Return a dictionary of scheme -> proxy server URL mappings.

        Returns settings gathered from the environment, if specified,
        or the registry.

        """
        return getproxies_environment() or getproxies_registry()

    def proxy_bypass(host):
        try:
            import _winreg
            import re
        except ImportError:
            # Std modules, so should be around - but you never know!
            return 0
        try:
            internetSettings = _winreg.OpenKey(_winreg.HKEY_CURRENT_USER,
                r'Software\Microsoft\Windows\CurrentVersion\Internet Settings')
            proxyEnable = _winreg.QueryValueEx(internetSettings,
                                               'ProxyEnable')[0]
            proxyOverride = str(_winreg.QueryValueEx(internetSettings,
                                                     'ProxyOverride')[0])
            # ^^^^ Returned as Unicode but problems if not converted to ASCII
        except WindowsError:
            return 0
        if not proxyEnable or not proxyOverride:
            return 0
        # try to make a host list from name and IP address.
        host = [host]
        try:
            addr = socket.gethostbyname(host[0])
            if addr != host[0]:
                host.append(addr)
        except socket.error:
            pass
        # make a check value list from the registry entry: replace the
        # '<local>' string by the localhost entry and the corresponding
        # canonical entry.
        proxyOverride = proxyOverride.split(';')
        i = 0
        while i < len(proxyOverride):
            if proxyOverride[i] == '<local>':
                proxyOverride[i:i+1] = ['localhost',
                                        '127.0.0.1',
                                        socket.gethostname(),
                                        socket.gethostbyname(
                                            socket.gethostname())]
            i += 1
        # print proxyOverride
        # now check if we match one of the registry values.
        for test in proxyOverride:
            test = test.replace(".", r"\.")     # mask dots
            test = test.replace("*", r".*")     # change glob sequence
            test = test.replace("?", r".")      # change glob char
            for val in host:
                # print "%s <--> %s" %( test, val )
                if re.match(test, val, re.I):
                    return 1
        return 0

else:
    # By default use environment variables
    getproxies = getproxies_environment

    def proxy_bypass(host):
        return 0

# Test and time quote() and unquote()
def test1():
    s = ''
    for i in range(256): s = s + chr(i)
    s = s*4
    t0 = time.time()
    qs = quote(s)
    uqs = unquote(qs)
    t1 = time.time()
    if uqs != s:
        print 'Wrong!'
    print repr(s)
    print repr(qs)
    print repr(uqs)
    print round(t1 - t0, 3), 'sec'


def reporthook(blocknum, blocksize, totalsize):
    # Report during remote transfers
    print "Block number: %d, Block size: %d, Total size: %d" % (
        blocknum, blocksize, totalsize)

# Test program
def test(args=[]):
    if not args:
        args = [
            '/etc/passwd',
            'file:/etc/passwd',
            'file://localhost/etc/passwd',
            'ftp://ftp.python.org/pub/python/README',
##            'gopher://gopher.micro.umn.edu/1/',
            'http://www.python.org/index.html',
            ]
        if hasattr(URLopener, "open_https"):
            args.append('https://synergy.as.cmu.edu/~geek/')
    try:
        for url in args:
            print '-'*10, url, '-'*10
            fn, h = urlretrieve(url, None, reporthook)
            print fn
            if h:
                print '======'
                for k in h.keys(): print k + ':', h[k]
                print '======'
            fp = open(fn, 'rb')
            data = fp.read()
            del fp
            if '\r' in data:
                table = string.maketrans("", "")
                data = data.translate(table, "\r")
            print data
            fn, h = None, None
            print '-'*40
    finally:
        urlcleanup()

def main():
    import getopt, sys
    try:
        opts, args = getopt.getopt(sys.argv[1:], "th")
    except getopt.error, msg:
        print msg
        print "Use -h for help"
        return
    t = 0
    for o, a in opts:
        if o == '-t':
            t = t + 1
        if o == '-h':
            print "Usage: python urllib.py [-t] [url ...]"
            print "-t runs self-test;",
            print "otherwise, contents of urls are printed"
            return
    if t:
        if t > 1:
            test1()
        test(args)
    else:
        if not args:
            print "Use -h for help"
        for url in args:
            print urlopen(url).read(),

# Run test program when run as a script
if __name__ == '__main__':
    main()