Commit | Line | Data |
---|---|---|
920dae64 AT |
1 | """Parse (absolute and relative) URLs. |
2 | ||
3 | See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, | |
4 | UC Irvine, June 1995. | |
5 | """ | |
6 | ||
# Names exported by a star-import of this module.  Helpers such as
# clear_cache() and the scheme classification lists are not listed here.
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit"]
9 | ||
# Scheme classification tables.  The empty string '' stands for "no scheme
# given", i.e. the behaviour applied by default.

# Schemes for which urljoin() will resolve a relative reference.
uses_relative = [
    'ftp', 'http', 'gopher', 'nntp', 'imap',
    'wais', 'file', 'https', 'shttp', 'mms',
    'prospero', 'rtsp', 'rtspu', '',
]

# Schemes whose URLs may carry a '//netloc' part.
uses_netloc = [
    'ftp', 'http', 'gopher', 'nntp', 'telnet',
    'imap', 'wais', 'file', 'mms', 'https', 'shttp',
    'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
    'svn', 'svn+ssh',
]

# Schemes that are not hierarchical (not consulted by the code in this file).
non_hierarchical = [
    'gopher', 'hdl', 'mailto', 'news',
    'telnet', 'wais', 'imap', 'snews', 'sip',
]

# Schemes for which urlparse() splits ';params' off the path.
uses_params = [
    'ftp', 'hdl', 'prospero', 'http', 'imap',
    'https', 'shttp', 'rtsp', 'rtspu', 'sip',
    'mms', '',
]

# Schemes for which urlsplit() splits off a '?query'.
uses_query = [
    'http', 'wais', 'imap', 'https', 'shttp', 'mms',
    'gopher', 'rtsp', 'rtspu', 'sip', '',
]

# Schemes for which urlsplit() splits off a '#fragment'.
uses_fragment = [
    'ftp', 'hdl', 'http', 'gopher', 'news',
    'nntp', 'wais', 'https', 'shttp', 'snews',
    'file', 'prospero', '',
]

# Every character that may appear in a scheme name.
scheme_chars = (
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    '0123456789'
    '+-.'
)

# urlsplit() memoizes its results in _parse_cache and empties the cache
# once it holds MAX_CACHE_SIZE entries.
MAX_CACHE_SIZE = 20
_parse_cache = {}
37 | ||
def clear_cache():
    """Drop every memoized urlsplit() result by installing a fresh cache."""
    global _parse_cache
    _parse_cache = dict()
42 | ||
43 | ||
def urlparse(url, scheme='', allow_fragments=1):
    """Split a URL into six parts.

    The general shape handled is
        <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    and the result is the tuple
        (scheme, netloc, path, params, query, fragment).

    Components are returned as plain strings: netloc is not broken down
    any further and % escapes are left untouched.  `scheme` is the default
    used when the URL names none; `allow_fragments` is passed through to
    urlsplit().
    """
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    params = ''
    # Only schemes listed in uses_params get their ';params' split off.
    if ';' in url and scheme in uses_params:
        url, params = _splitparams(url)
    return scheme, netloc, url, params, query, fragment
57 | ||
58 | def _splitparams(url): | |
59 | if '/' in url: | |
60 | i = url.find(';', url.rfind('/')) | |
61 | if i < 0: | |
62 | return url, '' | |
63 | else: | |
64 | i = url.find(';') | |
65 | return url[:i], url[i+1:] | |
66 | ||
67 | def _splitnetloc(url, start=0): | |
68 | for c in '/?#': # the order is important! | |
69 | delim = url.find(c, start) | |
70 | if delim >= 0: | |
71 | break | |
72 | else: | |
73 | delim = len(url) | |
74 | return url[start:delim], url[delim:] | |
75 | ||
def urlsplit(url, scheme='', allow_fragments=1):
    """Split a URL into five parts.

    The general shape handled is
        <scheme>://<netloc>/<path>?<query>#<fragment>
    and the result is the tuple (scheme, netloc, path, query, fragment).

    Components are returned as plain strings: netloc is not broken down
    any further and % escapes are left untouched.  `scheme` is the default
    used when the URL names none; a false `allow_fragments` keeps any '#'
    inside the preceding component.  Results are memoized in _parse_cache.
    """
    cache_key = url, scheme, allow_fragments
    hit = _parse_cache.get(cache_key)
    if hit:
        return hit
    if len(_parse_cache) >= MAX_CACHE_SIZE:
        # Cheap guard against unbounded growth: start over when full.
        clear_cache()

    netloc = query = fragment = ''
    colon = url.find(':')
    if colon > 0:
        prefix = url[:colon]
        if prefix == 'http':
            # Fast path for the most common scheme: every component
            # applies, so the classification tables are not consulted.
            scheme = prefix.lower()
            url = url[colon+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = scheme, netloc, url, query, fragment
            _parse_cache[cache_key] = result
            return result
        # The text before ':' is a scheme only if every character in it
        # is a legal scheme character.
        illegal = [ch for ch in prefix if ch not in scheme_chars]
        if not illegal:
            scheme = prefix.lower()
            url = url[colon+1:]

    if scheme in uses_netloc and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = scheme, netloc, url, query, fragment
    _parse_cache[cache_key] = result
    return result
117 | ||
def urlunparse(data):
    """Put a parsed URL back together again.

    `data` is a 6-item sequence
    (scheme, netloc, url, params, query, fragment), as returned by
    urlparse().  This may result in a slightly different, but equivalent
    URL, if the URL that was parsed originally had redundant delimiters,
    e.g. a ? with an empty query (the draft states that these are
    equivalent).
    """
    # Unpack in the body rather than in the signature: tuple parameters
    # are not accepted by Python 3 (PEP 3113).
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
126 | ||
def urlunsplit(data):
    """Combine the parts produced by urlsplit() into a URL string.

    `data` is a 5-item sequence (scheme, netloc, url, query, fragment).
    Empty components are omitted together with their delimiters.
    """
    # Unpack in the body rather than in the signature: tuple parameters
    # are not accepted by Python 3 (PEP 3113).
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # A path that follows a netloc must start with '/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
138 | ||
def urljoin(base, url, allow_fragments = 1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    An empty `base` returns `url` unchanged and an empty `url` returns
    `base`.  `url` is also returned unchanged when its scheme differs from
    the base scheme or is not listed in uses_relative.  `allow_fragments`
    is passed through to urlparse() for both arguments.
    """
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            # The reference names its own netloc: only the scheme is shared.
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # Absolute path: nothing to merge with the base path.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not path and not params:
        # Empty path (RFC 1808, section 4, step 5): inherit the base path
        # and params as they are; inherit the base query only when the
        # reference carries none.  Without this a reference such as '?y'
        # would lose the last segment of the base path.
        if not query:
            query = bquery
        return urlunparse((scheme, netloc, bpath,
                           bparams, query, fragment))
    # Replace the last base segment with the segments of the reference.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    # Repeatedly collapse one '<segment>/..' pair until none is left.
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
186 | ||
def urldefrag(url):
    """Strip the fragment, if any, from a URL.

    The result is a pair (url_without_fragment, fragment).  A URL that
    contains no '#' is returned as-is together with an empty string.
    """
    if '#' not in url:
        return url, ''
    scheme, netloc, path, params, query, fragment = urlparse(url)
    stripped = urlunparse((scheme, netloc, path, params, query, ''))
    return stripped, fragment
200 | ||
201 | ||
# Default input for test().  The first non-blank line is the base URL;
# every following line reads "<relative-url> = <URL:expected-absolute-url>"
# and is checked by joining <relative-url> onto the base.
test_input = """
http://a/b/c/d

g:h = <URL:g:h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
g = <URL:http://a/b/c/g>
./g = <URL:http://a/b/c/g>
g/ = <URL:http://a/b/c/g/>
/g = <URL:http://a/g>
//g = <URL:http://g>
?y = <URL:http://a/b/c/d?y>
g?y = <URL:http://a/b/c/g?y>
g?y/./x = <URL:http://a/b/c/g?y/./x>
. = <URL:http://a/b/c/>
./ = <URL:http://a/b/c/>
.. = <URL:http://a/b/>
../ = <URL:http://a/b/>
../g = <URL:http://a/b/g>
../.. = <URL:http://a/>
../../g = <URL:http://a/g>
../../../g = <URL:http://a/../g>
./../g = <URL:http://a/b/g>
./g/. = <URL:http://a/b/c/g/>
/./g = <URL:http://a/./g>
g/./h = <URL:http://a/b/c/g/h>
g/../h = <URL:http://a/b/c/h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
http:?y = <URL:http://a/b/c/d?y>
http:g?y = <URL:http://a/b/c/g?y>
http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
235 | ||
def test():
    """Run the URL joining self-test and print the results.

    Input comes from the file named by the first command line argument
    ('-' means standard input) or, without arguments, from test_input.
    The first URL read becomes the base; each later URL is parsed, joined
    onto the base and printed.  Lines of the form "url = <URL:expected>"
    are also compared with the computed result, and mismatches are
    reported on an 'EXPECTED' line.
    """
    import sys
    base = ''
    opened_here = 0  # set only for a file this function opened itself
    if sys.argv[1:]:
        fn = sys.argv[1]
        if fn == '-':
            fp = sys.stdin
        else:
            fp = open(fn)
            opened_here = 1
    else:
        try:
            from StringIO import StringIO
        except ImportError:
            from io import StringIO
        fp = StringIO(test_input)
    try:
        while 1:
            line = fp.readline()
            if not line:
                break
            words = line.split()
            if not words:
                continue
            url = words[0]
            parts = urlparse(url)
            print('%-10s : %s' % (url, parts))
            absurl = urljoin(base, url)
            if not base:
                # The first URL seen serves as the base for all the others.
                base = absurl
            wrapped = '<URL:%s>' % absurl
            print('%-10s = %s' % (url, wrapped))
            if len(words) == 3 and words[1] == '=':
                if wrapped != words[2]:
                    print(' '.join(['EXPECTED', words[2], '!!!!!!!!!!']))
    finally:
        # Close only what was opened here; never close sys.stdin.
        if opened_here:
            fp.close()
265 | ||
# Run the self-test when this file is executed as a script.
if __name__ == '__main__':
    test()