--- /usr/lib/python2.7/robotparser.py	2012-04-24 01:02:22.000000000 +0100
+++ robotparser.py	2012-09-27 11:05:16.377662790 +0100
@@ -1,6 +1,10 @@
+#the original robotparser.py with small additions so that it recognizes
+#crawl-delay and request-rate
+
 """ robotparser.py
 
     Copyright (C) 2000  Bastian Kleineidam
+                  2012  Nikolay Bogoychev
 
     You can choose between two licenses when using this package:
     1) GNU GPLv2
@@ -8,6 +12,7 @@
 
     The robots.txt Exclusion Protocol is implemented as specified in
     http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
+
 """
 import urlparse
 import urllib
@@ -26,6 +31,8 @@
         self.default_entry = None
         self.disallow_all = False
         self.allow_all = False
+        #self.delay = 0
+        #self.req_rate = []
         self.set_url(url)
         self.last_checked = 0
 
@@ -81,7 +88,7 @@
         # states:
         #   0: start state
         #   1: saw user-agent line
-        #   2: saw an allow or disallow line
+        #   2: saw an allow or disallow line or crawl-delay/request-rate
         state = 0
         linenumber = 0
         entry = Entry()
@@ -121,6 +128,15 @@
                     if state != 0:
                         entry.rulelines.append(RuleLine(line[1], True))
                         state = 2
+                elif line[0] == "crawl-delay":
+                    if state != 0:
+                        entry.delay = int(line[1])
+                        state = 2
+                elif line[0] == "request-rate":
+                    if state != 0:
+                        numbers = line[1].split('/')    # e.g. "1/5" = 1 request per 5 seconds
+                        entry.req_rate = [int(numbers[0]), int(numbers[1])]
+                        state = 2
         if state == 2:
             self._add_entry(entry)
 
@@ -147,6 +163,24 @@
             return self.default_entry.allowance(url)
         # agent not found ==> access granted
         return True
+
+    def crawl_delay(self, useragent):
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                if entry.delay == 0:
+                    return -1   # no crawl-delay entry in robots.txt
+                else:
+                    return entry.delay
+        return -1   # no entry applies to this user agent
+
+    def request_rate(self, useragent):
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                if entry.req_rate == []:
+                    return -1   # no request-rate entry in robots.txt
+                else:
+                    return entry.req_rate
+        return -1   # no entry applies to this user agent
 
 
     def __str__(self):
@@ -175,6 +209,8 @@
     def __init__(self):
         self.useragents = []
         self.rulelines = []
+        self.delay = 0      #support for crawl-delay and request-rate
+        self.req_rate = []
 
     def __str__(self):
         ret = []
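
For reference, a minimal usage sketch (not part of the patch), assuming the patched file sits on the import path as robotparser.py; the agent name "ExampleBot", the host, and the rules fed to parse() are made-up examples. Note that the new methods only scan self.entries, while _add_entry() stores a catch-all "User-agent: *" group in default_entry, so the sketch queries a named user-agent group.

# Usage sketch for the added crawl_delay() / request_rate() methods.
import robotparser

rules = [
    "User-agent: ExampleBot",
    "Disallow: /private/",
    "Crawl-delay: 3",
    "Request-rate: 1/5",          # one request every five seconds
]

rp = robotparser.RobotFileParser()
rp.parse(rules)

print rp.can_fetch("ExampleBot", "http://www.example.com/index.html")   # True
print rp.crawl_delay("ExampleBot")                                      # 3
print rp.request_rate("ExampleBot")                                     # [1, 5]
print rp.crawl_delay("SomeOtherBot")                                    # -1 (no matching entry)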