""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import urlparse, urllib

__all__ = ["RobotFileParser"]

debug = 0

def _debug(msg):
    if debug: print msg
""" This class provides a set of methods to read, parse and answer
questions about a single robots.txt file.
def __init__(self
, url
=''):
self
.default_entry
= None
self
.disallow_all
= False
"""Returns the time the robots.txt file was last fetched.
This is useful for long-running web spiders that need to
check for new robots.txt files periodically.
"""Sets the time the robots.txt file was last fetched to the
self
.last_checked
= time
.time()
"""Sets the URL referring to a robots.txt file."""
self
.host
, self
.path
= urlparse
.urlparse(url
)[1:3]
"""Reads the robots.txt URL and feeds it to the parser."""
f
= opener
.open(self
.url
)
lines
.append(line
.strip())
self
.errcode
= opener
.errcode
if self
.errcode
== 401 or self
.errcode
== 403:
elif self
.errcode
>= 400:
elif self
.errcode
== 200 and lines
:
    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)
"""parse the input lines from a robots.txt file.
We allow that a user-agent: line is not preceded by
one or more blank lines."""
linenumber
= linenumber
+ 1
_debug("line %d: warning: you should insert"
" allow: or disallow: directives below any"
" user-agent: line" % linenumber
)
# remove optional comment and strip line
line
= line
.split(':', 1)
line
[0] = line
[0].strip().lower()
line
[1] = urllib
.unquote(line
[1].strip())
if line
[0] == "user-agent":
_debug("line %d: warning: you should insert a blank"
" line before any user-agent"
" directive" % linenumber
)
entry
.useragents
.append(line
[1])
elif line
[0] == "disallow":
_debug("line %d: error: you must insert a user-agent:"
" directive before this line" % linenumber
)
entry
.rulelines
.append(RuleLine(line
[1], False))
_debug("line %d: error: you must insert a user-agent:"
" directive before this line" % linenumber
)
entry
.rulelines
.append(RuleLine(line
[1], True))
_debug("line %d: warning: unknown key %s" % (linenumber
,
_debug("line %d: error: malformed line %s"%(linenumber
, line
))
self
.entries
.append(entry
)
_debug("Parsed rules:\n%s" % str(self
))
    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        _debug("Checking robots.txt allowance for:\n  user agent: %s\n  url: %s" %
               (useragent, url))
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True
    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret
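
# Illustrative usage sketch (not part of the original module): how a
# RobotFileParser is typically driven.  The sample robots.txt lines, the
# agent names and the URLs below are invented for demonstration only.
def _example_usage():
    rp = RobotFileParser()
    # parse() can be fed robots.txt lines directly, bypassing set_url()/read()
    rp.parse([
        "User-agent: *",
        "Disallow: /private/",
        "",
        "User-agent: GoodBot",
        "Disallow:",
    ])
    print rp.can_fetch("SomeCrawler/1.0", "http://example.com/private/data")  # False
    print rp.can_fetch("SomeCrawler/1.0", "http://example.com/index.html")    # True
    print rp.can_fetch("GoodBot", "http://example.com/private/data")          # True
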
"""A rule line is a single "Allow:" (allowance==True) or "Disallow:"
(allowance==False) followed by a path."""
def __init__(self
, path
, allowance
):
if path
== '' and not allowance
:
# an empty value means allow all
self
.path
= urllib
.quote(path
)
self
.allowance
= allowance
    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
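
# Illustrative sketch (not part of the original module): RuleLine does a
# simple percent-encoded prefix match on the URL path.  The paths used here
# are arbitrary examples.
def _example_ruleline():
    rule = RuleLine("/cgi-bin/", False)       # a "Disallow: /cgi-bin/" line
    print str(rule)                           # Disallow: /cgi-bin/
    print rule.applies_to("/cgi-bin/search")  # True: path starts with the rule path
    print rule.applies_to("/index.html")      # False: different prefix
    # an empty Disallow value is turned into "allow everything"
    print RuleLine("", False).allowance       # True
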
"""An entry has one or more user-agents and zero or more rulelines"""
for agent
in self
.useragents
:
ret
= ret
+ "User-agent: "+agent
+"\n"
for line
in self
.rulelines
:
ret
= ret
+ str(line
) + "\n"
    def applies_to(self, useragent):
        """Check if this entry applies to the specified agent."""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False
    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        return True
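
# Illustrative sketch (not part of the original module): Entry matches agents
# by substring on the lower-cased name token (the part before any "/version").
# The agent names are invented for the example.
def _example_entry():
    entry = Entry()
    entry.useragents.append("CherryPickerSE")
    entry.rulelines.append(RuleLine("/cgi-bin/", False))
    print entry.applies_to("CherryPickerSE/1.0")    # True: version suffix is ignored
    print entry.applies_to("cherrypickerse")        # True: comparison is case-insensitive
    print entry.applies_to("Mozilla/5.0")           # False: no substring match
    print entry.allowance("/cgi-bin/event-search")  # False: hits the Disallow rule
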

class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                         errmsg, headers)

def _check(a, b):
    if not b:
        ac = "access denied"
    else:
        ac = "access allowed"
    if a != b:
        print "failed"
    else:
        print "ok (%s)" % ac
    print


def _test():
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    # agent name matching is case-insensitive
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # the version suffix after "/" is ignored when matching
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)
if __name__ == '__main__':
    _test()