--- robotparser.py.old 2008-06-13 15:14:02.000000000 +0300
+++ robotparser.py 2008-06-13 15:23:24.000000000 +0300
@@ -9,7 +9,7 @@
     The robots.txt Exclusion Protocol is implemented as specified in
     http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
 """
-import urlparse,urllib
+import urlparse, urllib
 
 __all__ = ["RobotFileParser"]
 
@@ -59,13 +59,10 @@
         """Reads the robots.txt URL and feeds it to the parser."""
         opener = URLopener()
         f = opener.open(self.url)
-        lines = []
-        line = f.readline()
-        while line:
-            lines.append(line.strip())
-            line = f.readline()
+        lines = [ line.strip() for line in f ]
+        f.close()
         self.errcode = opener.errcode
-        if self.errcode == 401 or self.errcode == 403:
+        if self.errcode in (401, 403):
             self.disallow_all = True
             _debug("disallow all")
         elif self.errcode >= 400:
@@ -91,21 +88,21 @@
         entry = Entry()
 
         for line in lines:
-            linenumber = linenumber + 1
+            linenumber += 1
             if not line:
-                if state==1:
+                if state == 1:
                     _debug("line %d: warning: you should insert"
                            " allow: or disallow: directives below any"
                            " user-agent: line" % linenumber)
                     entry = Entry()
                     state = 0
-                elif state==2:
+                elif state == 2:
                     self._add_entry(entry)
                     entry = Entry()
                     state = 0
             # remove optional comment and strip line
             i = line.find('#')
-            if i>=0:
+            if i >= 0:
                 line = line[:i]
             line = line.strip()
             if not line:
@@ -115,7 +112,7 @@
                 line[0] = line[0].strip().lower()
                 line[1] = urllib.unquote(line[1].strip())
                 if line[0] == "user-agent":
-                    if state==2:
+                    if state == 2:
                         _debug("line %d: warning: you should insert a blank"
                                " line before any user-agent"
                                " directive" % linenumber)
@@ -124,14 +121,14 @@
                     entry.useragents.append(line[1])
                     state = 1
                 elif line[0] == "disallow":
-                    if state==0:
+                    if not state:
                         _debug("line %d: error: you must insert a user-agent:"
                                " directive before this line" % linenumber)
                     else:
                         entry.rulelines.append(RuleLine(line[1], False))
                         state = 2
                 elif line[0] == "allow":
-                    if state==0:
+                    if not state:
                         _debug("line %d: error: you must insert a user-agent:"
                                " directive before this line" % linenumber)
                     else:
@@ -141,7 +138,7 @@
                                line[0]))
             else:
                 _debug("line %d: error: malformed line %s"%(linenumber, line))
-        if state==2:
+        if state == 2:
             self.entries.append(entry)
         _debug("Parsed rules:\n%s" % str(self))
 
@@ -185,10 +182,10 @@
         self.allowance = allowance
 
     def applies_to(self, filename):
-        return self.path=="*" or filename.startswith(self.path)
+        return self.path == "*" or filename.startswith(self.path)
 
     def __str__(self):
-        return (self.allowance and "Allow" or "Disallow")+": "+self.path
+        return "%s: %s" % ((self.allowance and "Allow" or "Disallow"), self.path)
 
 
 class Entry:
@@ -200,9 +197,9 @@
     def __str__(self):
         ret = ""
         for agent in self.useragents:
-            ret = ret + "User-agent: "+agent+"\n"
+            ret = "%sUser-agent: %s\n" % (ret, agent)
         for line in self.rulelines:
-            ret = ret + str(line) + "\n"
+            ret += str(line) + "\n"
        return ret
 
     def applies_to(self, useragent):
@@ -210,7 +207,7 @@
         # split the name token and make it lower case
         useragent = useragent.split("/")[0].lower()
         for agent in self.useragents:
-            if agent=='*':
+            if agent == '*':
                 # we have the catch-all agent
                 return True
             agent = agent.lower()
@@ -243,12 +240,12 @@
         return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
                                                         errmsg, headers)
 
-def _check(a,b):
+def _check(a, b):
     if not b:
         ac = "access denied"
     else:
         ac = "access allowed"
-    if a!=b:
+    if a != b:
         print "failed"
     else:
         print "ok (%s)" % ac
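
For reference, a minimal standalone sketch, outside the patch, of why iterating a file-like object in the rewritten read() is expected to collect the same stripped lines as the readline loop it replaces. StringIO is used here only as a stand-in assumption for the object returned by opener.open():

    from StringIO import StringIO

    # stand-in for the opened robots.txt (assumption, not part of the patch)
    f = StringIO("User-agent: *\nDisallow: /private/\n")

    # old style: explicit readline loop
    lines_old = []
    line = f.readline()
    while line:
        lines_old.append(line.strip())
        line = f.readline()

    # new style: iterate the file-like object directly
    f.seek(0)
    lines_new = [line.strip() for line in f]

    assert lines_old == lines_new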