Commit | Line | Data |
---|---|---|
86530b38 AT |
1 | import unittest, StringIO, robotparser |
2 | from test import test_support | |
3 | ||
class RobotTestCase(unittest.TestCase):
    """One allow/deny check of a single URL against a parsed robots.txt.

    Arguments:
        index  -- number of the robots.txt example, used only in the name
                  reported by __str__.
        parser -- object providing can_fetch(agent, url).
        url    -- either a URL string, or an (agent, url) tuple that
                  overrides `agent` for this one check.
        good   -- true if the URL is expected to be fetchable, false if it
                  is expected to be refused.
        agent  -- user-agent name used when `url` is a plain string.
    """

    # The constructor needs arguments, so collectors that build TestCase
    # subclasses from a method name alone (pytest, nose) must skip this
    # class; instances are created explicitly by the code that uses it.
    __test__ = False

    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        """Ask the parser about the URL and compare with the expectation."""
        if isinstance(self.url, tuple):
            # Per-URL agent override: (agent, url).
            agent, url = self.url
        else:
            url = self.url
            agent = self.agent
        # assertTrue/assertFalse replace the deprecated failUnless/failIf
        # aliases, which newer unittest versions no longer provide.
        if self.good:
            self.assertTrue(self.parser.can_fetch(agent, url))
        else:
            self.assertFalse(self.parser.can_fetch(agent, url))

    def __str__(self):
        return self.str
29 | ||
# Suite that accumulates every case registered through RobotTest().
tests = unittest.TestSuite()


def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    """Parse `robots_txt` once and register one test case per URL.

    Every entry of `good_urls` is registered as expected-allowed and every
    entry of `bad_urls` as expected-denied; all cases share the same parser
    and are appended to the module-level `tests` suite (good URLs first).
    `index` and `agent` are passed through to each RobotTestCase.
    """
    rules = robotparser.RobotFileParser()
    rules.parse(StringIO.StringIO(robots_txt).readlines())
    for expected, urls in ((1, good_urls), (0, bad_urls)):
        for candidate in urls:
            tests.addTest(
                RobotTestCase(index, rules, candidate, expected, agent))
42 | ||
# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1. A single wildcard record with three Disallow rules, two of which
#    carry a trailing comment.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = [
    '/',
    '/test.html',
]
bad = [
    '/cyberworld/map/index.html',
    '/tmp/xxx',
    '/foo.html',
]

RobotTest(index=1, robots_txt=doc, good_urls=good, bad_urls=bad)
57 | ||
# 2. A wildcard record plus a dedicated "cybermapper" record whose empty
#    Disallow line restricts nothing; the tuple entry below runs its check
#    as that agent instead of the default one.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = [
    '/',
    '/test.html',
    ('cybermapper', '/cyberworld/map/index.html'),
]
bad = [
    '/cyberworld/map/index.html',
]

RobotTest(index=2, robots_txt=doc, good_urls=good, bad_urls=bad)
75 | ||
# 3. Everything is disallowed for every agent, so there are no good URLs.
doc = """
# go away
User-agent: *
Disallow: /
"""

good = []
bad = [
    '/cyberworld/map/index.html',
    '/',
    '/tmp/',
]

RobotTest(index=3, robots_txt=doc, good_urls=good, bad_urls=bad)
87 | ||
# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4. and 5. The same "figtree" record is checked twice: once with the bare
#    agent name and once with a longer agent string containing it.
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = []  # XFAIL '/a/b.html'
bad = [
    '/tmp',
    '/tmp.html',
    '/tmp/a.html',
    '/a%3cd.html',
    '/a%3Cd.html',
    '/a%2fb.html',
    '/~joe/index.html',
]

RobotTest(index=4, robots_txt=doc, good_urls=good, bad_urls=bad,
          agent='figtree')
RobotTest(index=5, robots_txt=doc, good_urls=good, bad_urls=bad,
          agent='FigTree Robot libwww-perl/5.04')
107 | ||
# 6. Wildcard record; here the rule is "/tmp/" (with trailing slash), so
#    plain "/tmp" is expected to stay fetchable.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp']  # XFAIL: '/a%2fb.html'
bad = [
    '/tmp/',
    '/tmp/a.html',
    '/a%3cd.html',
    '/a%3Cd.html',
    '/a/b.html',
    '/%7Ejoe/index.html',
]

RobotTest(index=6, robots_txt=doc, good_urls=good, bad_urls=bad)
123 | ||
# From bug report #523041

# 7. A "/." rule must not block an ordinary path such as "/foo.html".
doc = """
User-Agent: *
Disallow: /.
"""

good = [
    '/foo.html',
]
bad = []  # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(index=7, robots_txt=doc, good_urls=good, bad_urls=bad)
136 | ||
def test_main():
    """Hand the module-level `tests` suite to test_support.run_suite()."""
    suite = tests
    test_support.run_suite(suite)
139 | ||
if __name__ == '__main__':
    # The verbosity flag in test_support is spelled `verbose` (lowercase);
    # assigning to `Verbose` only created an attribute nothing reads.
    test_support.verbose = 1
    # Run through the same entry point the regression-test driver uses
    # rather than repeating its body here.
    test_main()