diff -r de982d8b7b15 Lib/test/test_robotparser.py
--- a/Lib/test/test_robotparser.py	Tue Oct 13 21:26:35 2015 +0300
+++ b/Lib/test/test_robotparser.py	Wed Oct 14 11:33:02 2015 -0700
@@ -39,7 +39,7 @@
         if self.good:
             self.assertTrue(self.parser.can_fetch(agent, url))
             self.assertEqual(self.parser.crawl_delay(agent), self.crawl_delay)
-            # if we have actual values for request rate
+            #if we have actual values for request rate
             if self.request_rate and self.parser.request_rate(agent):
                 self.assertEqual(
                     self.parser.request_rate(agent).requests,
@@ -49,7 +49,11 @@
                     self.parser.request_rate(agent).seconds,
                     self.request_rate.seconds
                 )
-            self.assertEqual(self.parser.request_rate(agent), self.request_rate)
+            else:
+                self.assertEqual(
+                    self.parser.request_rate(agent),
+                    self.request_rate
+                )
         else:
             self.assertFalse(self.parser.can_fetch(agent, url))
 
@@ -103,14 +107,39 @@
 
 """
 
-good = ['/','/test.html',('cybermapper','/cyberworld/map/index.html')]
+good = [('cybermapper','/cyberworld/map/index.html')]
 bad = ['/cyberworld/map/index.html']
 request_rate = None # The parameters should be equal to None since they
 crawl_delay = None # don't apply to the cybermapper user agent
 
 RobotTest(2, doc, good, bad, request_rate, crawl_delay)
 
-# 3.
+# 3
+doc = """
+# robots.txt for http://www.example.com/
+
+User-agent: *
+Crawl-delay: 1
+Request-rate: 3/15
+Disallow: /cyberworld/map/ # This is an infinite virtual URL space
+
+# Cybermapper knows where to go.
+User-agent: cybermapper
+Disallow:
+
+"""
+
+good = ['/','/test.html']
+bad = ['/cyberworld/map/index.html']
+request_rate = namedtuple('req_rate', 'requests seconds')
+request_rate.requests = 3
+request_rate.seconds = 15
+crawl_delay = 1 # don't apply to the cybermapper user agent
+
+RobotTest(3, doc, good, bad, request_rate, crawl_delay)
+
+
+# 4.
 doc = """
 # go away
 User-agent: *
@@ -122,11 +151,11 @@
 request_rate = None
 crawl_delay = None
 
-RobotTest(3, doc, good, bad, request_rate, crawl_delay)
+RobotTest(4, doc, good, bad, request_rate, crawl_delay)
 
 # Examples from http://www.robotstxt.org/wc/norobots-rfc.html (fetched 2002)
 
-# 4.
+# 5.
 doc = """
 User-agent: figtree
 Crawl-delay: 3
@@ -150,8 +179,7 @@
 
 request_rate_bad = None # not actually tested, but we still need to parse it
 crawl_delay_bad = None # in order to accommodate the input parameters
-
-RobotTest(4, doc, good, bad, request_rate, crawl_delay, 'figtree' )
+RobotTest(5, doc, good, bad, request_rate, crawl_delay, 'figtree' )
 RobotTest(5, doc, good, bad, request_rate_bad, crawl_delay_bad, 'FigTree Robot libwww-perl/5.04')
 
 
@@ -168,12 +196,12 @@
 
 good = ['/tmp',] # XFAIL: '/a%2fb.html'
 bad = ['/tmp/','/tmp/a.html',
-       '/a%3cd.html','/a%3Cd.html',"/a/b.html",
+       '/a%3cd.html','/a%3Cd.html','/a/b.html',
        '/%7Ejoe/index.html']
 crawl_delay = 3
 request_rate = None # since request rate has invalid syntax, return None
 
-RobotTest(6, doc, good, bad, None, None)
+RobotTest(6, doc, good, bad, request_rate, crawl_delay)
 
 
 # From bug report #523041
@@ -227,9 +255,9 @@
 good = []
 bad = ['/something.jpg']
 
-RobotTest(10, doc, good, bad, None, None, agent="Googlebot-Mobile")
+RobotTest(9, doc, good, bad, None, None, agent="Googlebot-Mobile")
 
-# 11. Get the order correct.
+# 10. Get the order correct.
doc = """ User-agent: Googlebot-Mobile Allow: / @@ -241,15 +269,15 @@ good = [] bad = ['/something.jpg'] -RobotTest(11, doc, good, bad, None, None, agent="Googlebot") +RobotTest(10, doc, good, bad, None, None, agent="Googlebot") good = ['/something.jpg'] bad = [] -RobotTest(12, doc, good, bad, None, None, agent="Googlebot-Mobile") +RobotTest(10, doc, good, bad, None, None, agent="Googlebot-Mobile") -# 13. Google also got the order wrong in #8. You need to specify the +# 11. Google also got the order wrong in #8. You need to specify the # URLs from more specific to more general. doc = """ User-agent: Googlebot @@ -260,10 +288,10 @@ good = ['/folder1/myfile.html'] bad = ['/folder1/anotherfile.html'] -RobotTest(13, doc, good, bad, None, None, agent="googlebot") +RobotTest(11, doc, good, bad, None, None, agent="googlebot") -# 14. For issue #6325 (query string support) +# 12. For issue #6325 (query string support) doc = """ User-agent: * Disallow: /some/path?name=value @@ -272,9 +300,9 @@ good = ['/some/path'] bad = ['/some/path?name=value'] -RobotTest(14, doc, good, bad, None, None) +RobotTest(12, doc, good, bad, None, None) -# 15. For issue #4108 (obey first * entry) +# 13. For issue #4108 (obey first * entry) doc = """ User-agent: * Disallow: /some/path @@ -286,9 +314,9 @@ good = ['/another/path'] bad = ['/some/path'] -RobotTest(15, doc, good, bad, None, None) +RobotTest(13, doc, good, bad, None, None) -# 16. Empty query (issue #17403). Normalizing the url first. +# 14. Empty query (issue #17403). Normalizing the url first. doc = """ User-agent: * Allow: /some/path? @@ -298,7 +326,7 @@ good = ['/some/path?'] bad = ['/another/path?'] -RobotTest(16, doc, good, bad, None, None) +RobotTest(14, doc, good, bad, None, None) class RobotHandler(BaseHTTPRequestHandler): diff -r de982d8b7b15 Lib/urllib/robotparser.py --- a/Lib/urllib/robotparser.py Tue Oct 13 21:26:35 2015 +0300 +++ b/Lib/urllib/robotparser.py Wed Oct 14 11:33:02 2015 -0700 @@ -178,13 +178,19 @@ for entry in self.entries: if entry.applies_to(useragent): return entry.delay - return None + # Check to see if robots.txt has been read, if not, return None + if not self.last_checked: + return None + return self.default_entry.delay def request_rate(self, useragent): for entry in self.entries: if entry.applies_to(useragent): return entry.req_rate - return None + # Check to see if robots.txt has been read, if not, return None + if not self.last_checked: + return None + return self.default_entry.req_rate def __str__(self): return ''.join([str(entry) + "\n" for entry in self.entries])