--- /usr/lib/python3.2/urllib/robotparser.py 2012-04-24 00:27:37.000000000 +0100
+++ robotparser.py 2012-10-01 14:29:18.680660702 +0100
@@ -1,6 +1,7 @@
 """ robotparser.py
 
     Copyright (C) 2000  Bastian Kleineidam
+                  2012  Nikolay Bogoychev
 
     You can choose between two licenses when using this package:
     1) GNU GPLv2
@@ -81,7 +82,7 @@
         # states:
         #   0: start state
         #   1: saw user-agent line
-        #   2: saw an allow or disallow line
+        #   2: saw an allow or disallow line or a crawl-delay/request-rate line
         state = 0
         entry = Entry()
 
@@ -119,6 +120,15 @@
                     if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
                         state = 2
+                elif line[0] == "crawl-delay":
+                    if state != 0:
+                        entry.delay = int(line[1])
+                        state = 2
+                elif line[0] == "request-rate":
+                    if state != 0:
+                        numbers = line[1].split('/')
+                        entry.req_rate = [int(numbers[0]), int(numbers[1])]
+                        state = 2
         if state == 2:
             self._add_entry(entry)
 
@@ -145,6 +155,24 @@
                 return self.default_entry.allowance(url)
         # agent not found ==> access granted
         return True
+
+    def crawl_delay(self, useragent):
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                if entry.delay == 0:
+                    return -1  # no Crawl-delay entry in robots.txt
+                else:
+                    return entry.delay
+        return -1  # no entry applies to this user agent
+
+    def request_rate(self, useragent):
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                if entry.req_rate == []:
+                    return -1  # no Request-rate entry in robots.txt
+                else:
+                    return entry.req_rate
+        return -1  # no entry applies to this user agent
 
     def __str__(self):
         return ''.join([str(entry) + "\n" for entry in self.entries])
@@ -172,6 +200,8 @@
     def __init__(self):
        self.useragents = []
        self.rulelines = []
+        self.delay = 0      # support for crawl delay and request rate
+        self.req_rate = []
 
     def __str__(self):
         ret = []
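
Usage sketch (not part of the patch): with the patched robotparser.py on the import path, a crawler can honour the new directives as below. The site URL, the "MyCrawler" user-agent token, and the one-second fallback are illustrative assumptions; the -1 sentinel and the [requests, seconds] shape of the request-rate value come from the patch above.

    import time
    import urllib.robotparser

    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("http://www.example.com/robots.txt")  # hypothetical site
    rp.read()

    agent = "MyCrawler"            # hypothetical user-agent token
    delay = rp.crawl_delay(agent)  # new method; -1 means no Crawl-delay entry
    rate = rp.request_rate(agent)  # new method; -1 means no Request-rate entry

    if delay == -1 and rate != -1:
        # Request-rate is parsed as [requests, seconds]; turn it into a
        # per-request pause, e.g. "1/5" -> wait 5 seconds between requests.
        delay = rate[1] / rate[0]
    elif delay == -1:
        delay = 1  # illustrative fallback when robots.txt sets neither directive

    for url in ["http://www.example.com/a.html", "http://www.example.com/b.html"]:
        if rp.can_fetch(agent, url):
            # ... fetch url here ...
            time.sleep(delay)

Note that the patch signals "no directive" with -1 rather than None, so callers must test for the sentinel before doing arithmetic with the returned values.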