1""" robotparser.py
2
3 Copyright (C) 2000 Bastian Kleineidam
4
5 You can choose between two licenses when using this package:
6 1) GNU GPLv2
7 2) PSF license for Python 2.2
8
9 The robots.txt Exclusion Protocol is implemented as specified in
10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
11"""
import urlparse, urllib

__all__ = ["RobotFileParser"]

debug = 0

def _debug(msg):
    if debug: print msg


class RobotFileParser:
    """This class provides a set of methods to read, parse, and answer
    questions about a single robots.txt file.
    """
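    # A minimal usage sketch (the URL below is illustrative):
    #
    #     rp = RobotFileParser()
    #     rp.set_url('http://example.com/robots.txt')
    #     rp.read()
    #     if rp.can_fetch('MyCrawler/1.0', 'http://example.com/some/page'):
    #         pass  # safe to fetch the page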

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.
        """
        return self.last_checked

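    # An example refresh policy (an illustrative sketch, not part of this
    # module's API): re-read robots.txt once it is more than a day old.
    #
    #     import time
    #     if time.time() - rp.mtime() > 24 * 3600:
    #         rp.read()
    #         rp.modified()
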
    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.
        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        self.errcode = opener.errcode
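        # a 401/403 response means this crawler is not authorized to see
        # the rules, so be conservative; any other HTTP error means no
        # usable robots.txt, so impose no restrictions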
        if self.errcode == 401 or self.errcode == 403:
            self.disallow_all = True
            _debug("disallow all")
        elif self.errcode >= 400:
            self.allow_all = True
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow a user-agent: line even when it is not preceded by
        one or more blank lines."""
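        # parser states:
        #   0 -- expecting a user-agent line
        #   1 -- saw a user-agent line, no rules for it yet
        #   2 -- collecting allow/disallow rules for the current entry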
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber = linenumber + 1
            if not line:
                if state == 1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], True))
                        # mirror the disallow branch; without this an
                        # entry containing only allow: lines would be
                        # dropped at end of parse
                        state = 2
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                           line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state == 2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        _debug("Checking robots.txt allowance for:\n user agent: %s\n url: %s"
               % (useragent, url))
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
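        # unquote then requote the path so that rule paths and the
        # requested path are compared in one consistent percent-encoding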
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret


class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""
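    # matching is by simple path prefix, for example (illustrative values):
    #     RuleLine('/cgi-bin/', False).applies_to('/cgi-bin/search')  -> True
    #     RuleLine('/cgi-bin/', False).applies_to('/docs/index.html') -> False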
    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


class Entry:
    """An entry has one or more user-agents and zero or more rulelines."""
    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: " + agent + "\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        """Check if this entry applies to the specified agent."""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
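        # the first matching rule line wins; no match means allowed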
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        return True


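# urllib.FancyURLopener normally swallows HTTP errors; this subclass
# records the most recent error code so read() can tell 401/403 apart
# from other failures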
class URLopener(urllib.FancyURLopener):
    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)


def _check(a, b):
    if not b:
        ac = "access denied"
    else:
        ac = "access allowed"
    if a != b:
        print "failed"
    else:
        print "ok (%s)" % ac
    print


def _test():
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # test for re.escape
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)

if __name__ == '__main__':
    _test()