import StringIO
import robotparser
import unittest

from test import test_support
class RobotTestCase(unittest.TestCase):
    """A single robots.txt expectation: one URL a parser must allow or deny.

    index  -- number of the robots.txt example; only used in the test's name.
    parser -- object exposing can_fetch(agent, url).
    url    -- either a path string, or an (agent, path) tuple that overrides
              the default agent for this one check.
    good   -- true if the URL is expected to be fetchable, false otherwise.
    agent  -- user agent used when url is a plain string.
    """

    # The constructor needs arguments, so collectors that instantiate
    # TestCase subclasses by method name alone cannot build this class;
    # tools that honour __test__ will leave it alone.
    __test__ = False

    def __init__(self, index, parser, url, good, agent):
        unittest.TestCase.__init__(self)
        # Pick exactly one label; the name is what __str__ reports.
        if good:
            self.str = "RobotTest(%d, good, %s)" % (index, url)
        else:
            self.str = "RobotTest(%d, bad, %s)" % (index, url)
        self.parser = parser
        self.url = url
        self.good = good
        self.agent = agent

    def runTest(self):
        """Ask the parser about the URL and compare with the expectation."""
        if isinstance(self.url, tuple):
            # Per-URL agent override: (agent, path).
            agent, url = self.url
        else:
            agent = self.agent
            url = self.url
        allowed = self.parser.can_fetch(agent, url)
        if self.good:
            self.assertTrue(allowed)
        else:
            self.assertFalse(allowed)

    def __str__(self):
        return self.str
# Module-level suite that RobotTest() fills with one case per URL.
tests = unittest.TestSuite()
def RobotTest(index, robots_txt, good_urls, bad_urls,
              agent="test_robotparser"):
    """Parse robots_txt once and queue one test case per URL.

    index      -- number of this example, used in the generated test names.
    robots_txt -- text of the robots.txt document to parse.
    good_urls  -- URLs the parsed rules are expected to allow.
    bad_urls   -- URLs the parsed rules are expected to deny.
    agent      -- default user agent; an entry given as an (agent, url)
                  tuple overrides it for that entry.

    The cases are appended to the module-level ``tests`` suite; nothing is
    returned.
    """
    lines = StringIO.StringIO(robots_txt).readlines()
    parser = robotparser.RobotFileParser()
    # Feed the in-memory document to the parser; no network access is made.
    parser.parse(lines)
    for url in good_urls:
        tests.addTest(RobotTestCase(index, parser, url, 1, agent))
    for url in bad_urls:
        tests.addTest(RobotTestCase(index, parser, url, 0, agent))
# NOTE(review): the robots.txt documents below were only partially present
# in the damaged source (the string delimiters, most "User-agent" lines and
# some "Disallow" lines were missing). They have been rebuilt so that every
# listed good/bad expectation holds; please confirm against the upstream
# copy of this test before relying on the exact text.

# Examples from http://www.robotstxt.org/wc/norobots.html (fetched 2002)

# 1.
doc = """
User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
Disallow: /tmp/ # these will soon disappear
Disallow: /foo.html
"""

good = ['/', '/test.html']
bad = ['/cyberworld/map/index.html', '/tmp/xxx', '/foo.html']

RobotTest(1, doc, good, bad)

# 2.
doc = """
# robots.txt for http://www.example.com/

User-agent: *
Disallow: /cyberworld/map/ # This is an infinite virtual URL space

# Cybermapper knows where to go.
User-agent: cybermapper
Disallow:

"""

good = ['/', '/test.html', ('cybermapper', '/cyberworld/map/index.html')]
bad = ['/cyberworld/map/index.html']

RobotTest(2, doc, good, bad)

# 3.
# NOTE(review): only the `bad` list survived for this case. Since '/' itself
# must be denied, a blanket "Disallow: /" and an empty `good` list are
# assumed.
doc = """
User-agent: *
Disallow: /
"""

good = []
bad = ['/cyberworld/map/index.html', '/', '/tmp/']

RobotTest(3, doc, good, bad)

# Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)

# 4. and 5. (same document, two spellings of the agent name)
doc = """
User-agent: figtree
Disallow: /tmp
Disallow: /a%3cd.html
Disallow: /a%2fb.html
Disallow: /%7ejoe/index.html
"""

good = [] # XFAIL '/a/b.html'
bad = ['/tmp', '/tmp.html', '/tmp/a.html',
       '/a%3cd.html', '/a%3Cd.html', '/a%2fb.html',
       # NOTE(review): the end of this list was cut off in the source; this
       # entry is assumed, to exercise the %7e rule above -- confirm.
       '/~joe/index.html',
       ]

RobotTest(4, doc, good, bad, 'figtree')
RobotTest(5, doc, good, bad, 'FigTree Robot libwww-perl/5.04')

# 6.
doc = """
User-agent: *
Disallow: /tmp/
Disallow: /a%3Cd.html
Disallow: /a/b.html
Disallow: /%7ejoe/index.html
"""

good = ['/tmp',] # XFAIL: '/a%2fb.html'
bad = ['/tmp/', '/tmp/a.html',
       '/a%3cd.html', '/a%3Cd.html', "/a/b.html",
       # NOTE(review): the end of this list was cut off in the source; this
       # entry is assumed, to exercise the %7e rule above -- confirm.
       '/%7Ejoe/index.html',
       ]

RobotTest(6, doc, good, bad)

# From bug report #523041

# 7.
# NOTE(review): neither the document nor the `good` list for this case is
# present in the source; both are assumptions and must be checked against
# the bug report before this case is trusted.
doc = """
User-Agent: *
Disallow: /.
"""

good = ['/foo.html']
bad = [] # Bug report says "/" should be denied, but that is not in the RFC

RobotTest(7, doc, good, bad)
# NOTE(review): the source showed two bare, identical run_suite() calls,
# which would run the suite twice on every import. They are assumed to be
# the bodies of a test_main() entry point and a __main__ guard -- confirm.
def test_main():
    """Run every queued robots.txt case through test_support."""
    test_support.run_suite(tests)


if __name__ == '__main__':
    test_support.run_suite(tests)