diff --git a/Lib/test/test_robotparser.py b/Lib/test/test_robotparser.py
--- a/Lib/test/test_robotparser.py
+++ b/Lib/test/test_robotparser.py
@@ -74,51 +74,54 @@ class RejectAllRobotsTest(BaseRobotTest,
 # go away
 User-agent: *
 Disallow: /
     """
     good = []
     bad = ['/cyberworld/map/index.html', '/', '/tmp/']
 
 
-class CrawlDelayAndRequestRateTest(BaseRobotTest, unittest.TestCase):
+class BaseRequestRateTest(BaseRobotTest):
+
+    def test_request_rate(self):
+        for url in self.good + self.bad:
+            agent, url = self.get_agent_and_url(url)
+            with self.subTest(url=url, agent=agent):
+                if self.crawl_delay:
+                    self.assertEqual(
+                        self.parser.crawl_delay(agent), self.crawl_delay
+                    )
+                if self.request_rate:
+                    self.assertEqual(
+                        self.parser.request_rate(agent).requests,
+                        self.request_rate.requests
+                    )
+                    self.assertEqual(
+                        self.parser.request_rate(agent).seconds,
+                        self.request_rate.seconds
+                    )
+
+
+class CrawlDelayAndRequestRateTest(BaseRequestRateTest, unittest.TestCase):
     robots_txt = """\
 User-agent: figtree
 Crawl-delay: 3
 Request-rate: 9/30
 Disallow: /tmp
 Disallow: /a%3cd.html
 Disallow: /a%2fb.html
 Disallow: /%7ejoe/index.html
     """
     agent = 'figtree'
     request_rate = namedtuple('req_rate', 'requests seconds')(9, 30)
     crawl_delay = 3
     good = [('figtree', '/foo.html')]
     bad = ['/tmp', '/tmp.html', '/tmp/a.html', '/a%3cd.html', '/a%3Cd.html',
            '/a%2fb.html', '/~joe/index.html']
 
-    def test_request_rate(self):
-        for url in self.good:
-            agent, url = self.get_agent_and_url(url)
-            with self.subTest(url=url, agent=agent):
-                if self.crawl_delay:
-                    self.assertEqual(
-                        self.parser.crawl_delay(agent), self.crawl_delay
-                    )
-                if self.request_rate and self.parser.request_rate(agent):
-                    self.assertEqual(
-                        self.parser.request_rate(agent).requests,
-                        self.request_rate.requests
-                    )
-                    self.assertEqual(
-                        self.parser.request_rate(agent).seconds,
-                        self.request_rate.seconds
-                    )
-
 
 class DifferentAgentTest(CrawlDelayAndRequestRateTest):
     agent = 'FigTree Robot libwww-perl/5.04'
     # these are not actually tested, but we still need to parse it
     # in order to accommodate the input parameters
     request_rate = None
     crawl_delay = None
 
@@ -225,16 +228,29 @@ class EmptyQueryStringTest(BaseRobotTest
 User-agent: *
 Allow: /some/path?
 Disallow: /another/path?
""" good = ['/some/path?'] bad = ['/another/path?'] +class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase): + robots_txt = """\ +User-agent: * +Crawl-delay: 1 +Request-rate: 3/15 +Disallow: /cyberworld/map/ + """ + request_rate = namedtuple('req_rate', 'requests seconds')(3, 15) + crawl_delay = 1 + good = ['/', '/test.html'] + bad = ['/cyberworld/map/index.html'] + + class RobotHandler(BaseHTTPRequestHandler): def do_GET(self): self.send_error(403, "Forbidden access") def log_message(self, format, *args): pass diff --git a/Lib/urllib/robotparser.py b/Lib/urllib/robotparser.py --- a/Lib/urllib/robotparser.py +++ b/Lib/urllib/robotparser.py @@ -170,26 +170,30 @@ class RobotFileParser: return entry.allowance(url) # try the default entry last if self.default_entry: return self.default_entry.allowance(url) # agent not found ==> access granted return True def crawl_delay(self, useragent): + if not self.mtime(): + return None for entry in self.entries: if entry.applies_to(useragent): return entry.delay - return None + return self.default_entry.delay def request_rate(self, useragent): + if not self.mtime(): + return None for entry in self.entries: if entry.applies_to(useragent): return entry.req_rate - return None + return self.default_entry.req_rate def __str__(self): return ''.join([str(entry) + "\n" for entry in self.entries]) class RuleLine: """A rule line is a single "Allow:" (allowance==True) or "Disallow:" (allowance==False) followed by a path."""