1 | """ robotparser.py |
2 | ||
3 | Copyright (C) 2000 Bastian Kleineidam | |
4 | ||
5 | You can choose between two licenses when using this package: | |
6 | 1) GNU GPLv2 | |
7 | 2) PSF license for Python 2.2 | |
8 | ||
9 | The robots.txt Exclusion Protocol is implemented as specified in | |
10 | http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html | |
11 | """ | |
import urlparse
import urllib

__all__ = ["RobotFileParser"]
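
# Typical usage, as a minimal sketch (the host and agent name below are
# hypothetical):
#
#   rp = RobotFileParser('http://www.example.com/robots.txt')
#   rp.read()
#   if rp.can_fetch('MyCrawler/1.0', 'http://www.example.com/private.html'):
#       pass  # the agent may fetch the page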

debug = 0

def _debug(msg):
    if debug:
        print msg


class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urlparse.urlparse(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = []
        line = f.readline()
        while line:
            lines.append(line.strip())
            line = f.readline()
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            self.disallow_all = True
            _debug("disallow all")
        elif self.errcode >= 400:
            self.allow_all = True
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow a user-agent: line that is not preceded by one or more
        blank lines."""
        # state 0: expecting a user-agent: line
        # state 1: saw user-agent:, expecting allow:/disallow: rules
        # state 2: inside a group with at least one rule line
        state = 0
        linenumber = 0
        entry = Entry()

        for line in lines:
            linenumber = linenumber + 1
            if not line:
                if state == 1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], True))
                        # without this, a group holding only allow: lines
                        # would be dropped at the next blank line
                        state = 2
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                                                                 line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state == 2:
            # route the trailing group through _add_entry so a final
            # catch-all "*" group still becomes the default entry
            self._add_entry(entry)
        _debug("Parsed rules:\n%s" % str(self))

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        _debug("Checking robots.txt allowance for:\n user agent: %s\n url: %s" %
               (useragent, url))
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # search for given user agent matches
        # the first match counts
        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        ret = ""
        for entry in self.entries:
            ret = ret + str(entry) + "\n"
        return ret


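# Example of feeding pre-fetched lines straight to parse(); the rules are
# made up for illustration. The first matching rule line wins, which is why
# the narrower Allow: is listed before the broader Disallow: (a sketch):
#
#   rp = RobotFileParser()
#   rp.parse(["User-agent: *",
#             "Allow: /cgi-bin/public",
#             "Disallow: /cgi-bin/"])
#   rp.can_fetch("AnyBot", "http://www.example.com/cgi-bin/public")   # True
#   rp.can_fetch("AnyBot", "http://www.example.com/cgi-bin/private")  # False

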
class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = urllib.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return (self.allowance and "Allow" or "Disallow") + ": " + self.path


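# RuleLine semantics in brief (a sketch):
#
#   RuleLine("/tmp", False).applies_to("/tmp/a.html")  # True: prefix match
#   RuleLine("", False).allowance  # True: an empty Disallow: means allow all

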
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        ret = ""
        for agent in self.useragents:
            ret = ret + "User-agent: " + agent + "\n"
        for line in self.rulelines:
            ret = ret + str(line) + "\n"
        return ret

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded"""
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                # the first matching rule line decides
                return line.allowance
        return True


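# Agent matching compares only the name token before any "/", lowercased,
# and uses a substring test, so one entry covers several agent versions
# (a sketch, reusing the agent name from the test below):
#
#   e = Entry()
#   e.useragents.append("CherryPickerSE")
#   e.applies_to("CherryPickerSE/1.0")  # True
#   e.applies_to("cherrypickerse/1.5")  # True

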
class URLopener(urllib.FancyURLopener):
    """A FancyURLopener that records the HTTP status code in self.errcode."""

    def __init__(self, *args):
        urllib.FancyURLopener.__init__(self, *args)
        self.errcode = 200

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        self.errcode = errcode
        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                        errmsg, headers)


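# errcode stays 200 unless an HTTP error handler fires during open()
# (a sketch; the URL is hypothetical):
#
#   opener = URLopener()
#   f = opener.open('http://www.example.com/robots.txt')
#   opener.errcode  # 200 on success, e.g. 404 if robots.txt was missing

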
def _check(a, b):
    if not b:
        ac = "access denied"
    else:
        ac = "access allowed"
    if a != b:
        print "failed"
    else:
        print "ok (%s)" % ac


def _test():
    global debug
    rp = RobotFileParser()
    debug = 1

    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()

    # no entry names '*', so the default entry applies; root is allowed
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers
    _check(rp.can_fetch('CherryPickerSE',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.0',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    _check(rp.can_fetch('CherryPickerSE/1.5',
                        'http://www.musi-cal.com/cgi-bin/event-search'
                        '?city=San+Francisco'), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)

    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)

if __name__ == '__main__':
    _test()