Rietveld Code Review Tool
Help | Bug tracker | Discussion group | Source code | Sign in
(42982)

Side by Side Diff: Lib/urllib/robotparser.py

Issue 16099: robotparser doesn't support request rate and crawl delay parameters
Patch Set: Created 6 years, 3 months ago
Left:
Right:
Use n/p to move between diff chunks; N/P to move between comments. Please Sign in to add in-line comments.
Jump to:
View unified diff | Download patch
« Lib/test/test_robotparser.py ('K') | « Lib/test/test_robotparser.py ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show/Hide Comments ('s')
OLDNEW
1 """ robotparser.py 1 """ robotparser.py
2 2
3 Copyright (C) 2000 Bastian Kleineidam 3 Copyright (C) 2000 Bastian Kleineidam
4 4
5 You can choose between two licenses when using this package: 5 You can choose between two licenses when using this package:
6 1) GNU GPLv2 6 1) GNU GPLv2
7 2) PSF license for Python 2.2 7 2) PSF license for Python 2.2
8 8
9 The robots.txt Exclusion Protocol is implemented as specified in 9 The robots.txt Exclusion Protocol is implemented as specified in
10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html 10 http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
(...skipping 100 matching lines...) Expand 10 before | Expand all | Expand 10 after
111 entry = Entry() 111 entry = Entry()
112 entry.useragents.append(line[1]) 112 entry.useragents.append(line[1])
113 state = 1 113 state = 1
114 elif line[0] == "disallow": 114 elif line[0] == "disallow":
115 if state != 0: 115 if state != 0:
116 entry.rulelines.append(RuleLine(line[1], False)) 116 entry.rulelines.append(RuleLine(line[1], False))
117 state = 2 117 state = 2
118 elif line[0] == "allow": 118 elif line[0] == "allow":
119 if state != 0: 119 if state != 0:
120 entry.rulelines.append(RuleLine(line[1], True)) 120 entry.rulelines.append(RuleLine(line[1], True))
121 state = 2
122 elif line[0] == "crawl-delay":
123 if state != 0:
124 entry.delay = int(line[1])
125 state = 2
126 elif line[0] == "request-rate":
127 if state != 0:
128 numbers = line[1].split('/')
129 entry.req_rate = [int(numbers[0]), int(numbers[1])]
121 state = 2 130 state = 2
122 if state == 2: 131 if state == 2:
123 self._add_entry(entry) 132 self._add_entry(entry)
124 133
125 134
def can_fetch(self, useragent, url):
    """Using the parsed robots.txt, decide if useragent can fetch url."""
    if self.disallow_all:
        return False
    if self.allow_all:
        return True
    # Normalize the URL: drop scheme and netloc, then re-quote the remainder
    # so it compares consistently with the quoted paths stored in RuleLine.
    parts = urllib.parse.urlparse(urllib.parse.unquote(url))
    path = urllib.parse.urlunparse(
        ("", "", parts.path, parts.params, parts.query, parts.fragment))
    path = urllib.parse.quote(path) or "/"
    # Search the entries for a user-agent match; the first match decides.
    for entry in self.entries:
        if entry.applies_to(useragent):
            return entry.allowance(path)
    # No specific entry matched: fall back to the default entry, if any.
    if self.default_entry:
        return self.default_entry.allowance(path)
    # Agent not found anywhere ==> access granted.
    return True
157
def crawl_delay(self, useragent):
    """Return the Crawl-delay (in seconds) that applies to *useragent*.

    Returns -1 when no matching entry defines a crawl delay.
    """
    for entry in self.entries:
        if entry.applies_to(useragent):
            # A delay of 0 means no Crawl-delay line was parsed for
            # this entry, so there is nothing to wait for.
            return entry.delay if entry.delay != 0 else -1
    return -1
166
def request_rate(self, useragent):
    """Return the Request-rate for *useragent* as [requests, seconds].

    Returns -1 when no matching entry defines a request rate.
    """
    for entry in self.entries:
        if entry.applies_to(useragent):
            # Idiomatic truthiness test instead of `== []`: an empty
            # list means no Request-rate line was parsed for this entry.
            if not entry.req_rate:
                return -1
            return entry.req_rate
    return -1
148 175
def __str__(self):
    """Render every parsed entry, each followed by a newline."""
    output = []
    for entry in self.entries:
        output.append(str(entry))
        output.append("\n")
    return ''.join(output)
151 178
152 179
class RuleLine:
    """A single "Allow:" (allowance == True) or "Disallow:"
    (allowance == False) directive together with its path."""

    def __init__(self, path, allowance):
        # An empty Disallow value means "allow everything".
        if path == '' and not allowance:
            allowance = True
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        # A bare "*" matches anything; otherwise match by path prefix.
        if self.path == "*":
            return True
        return filename.startswith(self.path)

    def __str__(self):
        verb = "Allow" if self.allowance else "Disallow"
        return verb + ": " + self.path
168 195
169 196
170 class Entry: 197 class Entry:
171 """An entry has one or more user-agents and zero or more rulelines""" 198 """An entry has one or more user-agents and zero or more rulelines"""
def __init__(self):
    # User-agent names this entry applies to.
    self.useragents = []
    # Allow/Disallow rules attached to those agents.
    self.rulelines = []
    # Crawl-delay in seconds; 0 means none was given.
    self.delay = 0
    # Request-rate as [requests, seconds]; empty means none was given.
    self.req_rate = []
175 204
def __str__(self):
    """Render this entry in robots.txt syntax."""
    lines = []
    for agent in self.useragents:
        lines.append("User-agent: " + agent + "\n")
    for rule in self.rulelines:
        lines.append(str(rule) + "\n")
    return ''.join(lines)
183 212
184 def applies_to(self, useragent): 213 def applies_to(self, useragent):
(...skipping 10 matching lines...) Expand all
195 return False 224 return False
196 225
def allowance(self, filename):
    """Return whether this entry permits fetching *filename*.

    Preconditions:
      - our agent applies to this entry
      - filename is URL decoded
    """
    # The first matching rule decides; no matching rule means allowed.
    return next(
        (rule.allowance for rule in self.rulelines if rule.applies_to(filename)),
        True,
    )
OLDNEW
« Lib/test/test_robotparser.py ('K') | « Lib/test/test_robotparser.py ('k') | no next file » | no next file with comments »

RSS Feeds Recent Issues | This issue
This is Rietveld 894c83f36cb7+