"""Parse (absolute and relative) URLs.
See RFC 1808: "Relative Uniform Resource Locators", by R. Fielding,
# Public names exported by "from <module> import *".
__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit"]
# A classification of schemes ('' means apply by default)
#
# NOTE(review): the tails of uses_netloc, uses_params, uses_fragment and
# scheme_chars were cut off in the damaged source.  The entries after the
# marked points below were restored from the stock Python 2 urlparse module
# and from the RFC 1808 scheme grammar -- confirm against the real file.

# Schemes whose relative references may be resolved against a base URL.
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '']

# Schemes that carry a "//netloc" part.
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh']  # restored: 'svn', 'svn+ssh'

non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip']

# Schemes whose path may carry ";params".
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip',
               'mms', '']  # restored: 'mms', ''

# Schemes that may carry a "?query" part.
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', '']

# Schemes that may carry a "#fragment" part.
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']  # restored: 'file', 'prospero', ''

# Characters valid in scheme names
# (RFC 1808 section 2.1: letters, digits, "+", "-" and ".")
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')
"""Clear the parse cache."""
def urlparse(url, scheme='', allow_fragments=1):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    'scheme' and 'allow_fragments' are passed straight to urlsplit().
    'params' is the empty string unless the scheme is listed in
    uses_params and the path contains a ';'."""
    scheme, netloc, url, query, fragment = urlsplit(url, scheme,
                                                    allow_fragments)
    # Bind params up front so the return below never sees an unbound name.
    params = ''
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    return scheme, netloc, url, params, query, fragment
def _splitparams(url):
    """Split ';params' off the last segment of a path.

    Return a 2-tuple (path, params).  Only a ';' that appears after the
    last '/' counts; if there is none, return (url, '')."""
    # rfind() gives -1 when there is no '/', so "+ 1" starts the search at
    # index 0 in that case and just past the last slash otherwise.
    i = url.find(';', url.rfind('/') + 1)
    if i < 0:
        return url, ''
    return url[:i], url[i + 1:]
def _splitnetloc(url, start=0):
    """Split the network location off 'url', beginning at index 'start'.

    Return a 2-tuple (netloc, rest).  The netloc ends at the earliest
    '/', '?' or '#' found at or after 'start', or at the end of the
    string when none is present; 'rest' begins with that delimiter."""
    end = len(url)  # default: the netloc runs to the end of the string
    for c in '/?#':
        pos = url.find(c, start)
        if pos >= 0:
            # Keep the earliest delimiter so that e.g. "//host?a/b" does
            # not swallow "?a" into the netloc.
            end = min(end, pos)
    return url[start:end], url[end:]
def urlsplit(url, scheme='', allow_fragments=1):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes.

    'scheme' is the default used when 'url' does not name one itself.
    Results are memoized in _parse_cache, keyed on all three arguments."""
    key = url, scheme, allow_fragments
    cached = _parse_cache.get(key)
    if cached is not None:
        return cached
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            scheme = 'http'
            url = url[i + 1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            result = scheme, netloc, url, query, fragment
            _parse_cache[key] = result
            return result
        # The text before ':' is a scheme only if every character is a
        # legal scheme character; otherwise the default scheme is kept.
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            scheme, url = url[:i].lower(), url[i + 1:]
    if scheme in uses_netloc and url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
    if allow_fragments and scheme in uses_fragment and '#' in url:
        url, fragment = url.split('#', 1)
    if scheme in uses_query and '?' in url:
        url, query = url.split('?', 1)
    result = scheme, netloc, url, query, fragment
    _parse_cache[key] = result
    return result
def urlunparse(data):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent).

    'data' is a 6-item sequence
    (scheme, netloc, url, params, query, fragment)."""
    # Unpack in the body: tuple parameters in the signature are a syntax
    # error on Python 3 (PEP 3113).
    scheme, netloc, url, params, query, fragment = data
    if params:
        url = "%s;%s" % (url, params)
    return urlunsplit((scheme, netloc, url, query, fragment))
def urlunsplit(data):
    """Combine the pieces of a split URL into a single string.

    'data' is a 5-item sequence (scheme, netloc, url, query, fragment).
    Empty components are left out together with their delimiter."""
    # Unpack in the body: tuple parameters in the signature are a syntax
    # error on Python 3 (PEP 3113).
    scheme, netloc, url, query, fragment = data
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        # A path that follows a netloc must start with '/'.
        if url and url[:1] != '/':
            url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return url
def _resolve_dot_segments(segments):
    """Return the list of path segments with '.' and '..' entries resolved."""
    # NOTE(review): most of this loop was missing from the damaged source;
    # it was rebuilt around the surviving conditions and checked against
    # the expected results listed in the module's self-test data.
    if segments[-1] == '.':
        segments[-1] = ''  # a trailing "." keeps the trailing slash
    segments = [seg for seg in segments if seg != '.']
    while True:
        last = len(segments) - 1
        # Drop the first "name/.." pair, then rescan; the final segment is
        # handled separately below.
        for i in range(1, last):
            if (segments[i] == '..'
                    and segments[i - 1] not in ('', '..')):
                del segments[i - 1:i + 1]
                break
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return segments


def urljoin(base, url, allow_fragments=1):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter.

    If either argument is empty the other one is returned unchanged.
    'url' is returned as-is when its scheme differs from the base's or
    is not listed in uses_relative."""
    if not base:
        return url
    if not url:
        return base
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
        urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
        urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return url
    if scheme in uses_netloc:
        if netloc:
            return urlunparse((scheme, netloc, path,
                               params, query, fragment))
        netloc = bnetloc
    if path[:1] == '/':
        # An absolute path replaces the base path outright.
        return urlunparse((scheme, netloc, path,
                           params, query, fragment))
    if not (path or params or query):
        return urlunparse((scheme, netloc, bpath,
                           bparams, bquery, fragment))
    # Replace the last segment of the base path with the relative path.
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    segments = _resolve_dot_segments(segments)
    return urlunparse((scheme, netloc, '/'.join(segments),
                       params, query, fragment))
"""Removes any existing fragment from URL.
Returns a tuple of the defragmented URL and the fragment. If
the URL contained no fragments, the second element is the
s
, n
, p
, a
, q
, frag
= urlparse(url
)
defrag
= urlunparse((s
, n
, p
, a
, q
, ''))
# Self-test data: the first non-blank line is the base URL; every other
# line reads "<relative-url> = <expected result of joining it to the base>".
# NOTE(review): the string delimiters and the base-URL line were missing
# from the damaged source; the base was inferred from the expected results
# below ("http:" resolves to it) -- confirm.
test_input = """
http://a/b/c/d

http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
./g = <URL:http://a/b/c/g>
g/ = <URL:http://a/b/c/g/>
?y = <URL:http://a/b/c/d?y>
g?y = <URL:http://a/b/c/g?y>
g?y/./x = <URL:http://a/b/c/g?y/./x>
../g = <URL:http://a/b/g>
../../g = <URL:http://a/g>
../../../g = <URL:http://a/../g>
./../g = <URL:http://a/b/g>
./g/. = <URL:http://a/b/c/g/>
/./g = <URL:http://a/./g>
g/./h = <URL:http://a/b/c/g/h>
g/../h = <URL:http://a/b/c/h>
http:g = <URL:http://a/b/c/g>
http: = <URL:http://a/b/c/d>
http:?y = <URL:http://a/b/c/d?y>
http:g?y = <URL:http://a/b/c/g?y>
http:g?y/./x = <URL:http://a/b/c/g?y/./x>
"""
def test():
    """Run the built-in self-test over test_input and print the results.

    For every non-blank line the first word is parsed with urlparse() and
    joined to the base with urljoin(); both results are printed.  When the
    line has the form "<url> = <expected>" and the joined result differs,
    an EXPECTED line is printed as well."""
    # NOTE(review): the header and loop of this function were missing from
    # the damaged source.  Only the built-in-data path is rebuilt here, and
    # taking the first URL as the base is an assumption -- confirm.
    try:
        from StringIO import StringIO  # Python 2
    except ImportError:
        from io import StringIO  # Python 3
    base = ''
    fp = StringIO(test_input)
    for line in fp:
        words = line.split()
        if not words:
            continue
        url = words[0]
        parts = urlparse(url)
        print('%-10s : %s' % (url, parts))
        absolute = urljoin(base, url)
        if not base:
            base = absolute
        wrapped = '<URL:%s>' % absolute
        print('%-10s = %s' % (url, wrapped))
        if len(words) == 3 and words[1] == '=':
            if wrapped != words[2]:
                print('EXPECTED %s !!!!!!!!!!' % words[2])
# Run the self-test when the module is executed as a script.
if __name__ == '__main__':
    test()