from urllib import robotparser

# Agent: FooBot. Longest match (/folder1/myfile.html) should be allowed
# https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
robotstxt1 = """
User-agent: FooBot
Disallow: /folder1/
Allow: /folder1/myfile.html
"""

# Agent: FooBot. Longest match (/folder1/myfile.html) should be allowed
# https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
robotstxt2 = """
User-agent: FooBot
Disallow: /
Allow: /folder1/myfile.html
"""

# Default agent. Longest match (/folder1/myfile.html) should be allowed
# https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
robotstxt3 = """
User-agent: *
Disallow: /folder1
Allow: /folder1/myfile.html
"""

# Default agent. Longest match (/folder1/myfile.html) should be allowed
# https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
robotstxt4 = """
User-agent: *
Disallow: /
Allow: /folder1/myfile.html
"""

# http://www.pythontest.net/elsewhere/robots.txt
# Agent Nutch. Should be allowed to fetch /brian/
# https://tools.ietf.org/html/draft-koster-rep-00#section-3.2
# Equivalent test in Lib/test/test_robotparser.py (NetworkTestCase) should be
# assertTrue
robotstxt5 = """
# Used by NetworkTestCase in Lib/test/test_robotparser.py
User-agent: Nutch
Disallow: /
Allow: /brian/

User-agent: *
Disallow: /webstats/
"""

# Agent GoogleBot. Equivalent rules: allow should be used.
# https://tools.ietf.org/html/draft-koster-rep-00#section-2.2.2
robotstxt6 = """
User-agent: GoogleBot
Disallow: /folder1/
Allow: /folder1/
"""


def test(robots, agent, url, expected):
    """Parse the robots.txt text and compare can_fetch() against expected."""
    print(f"{robots}")
    print(f"{agent=}, {url=}, {expected=}")
    parser = robotparser.RobotFileParser()
    parser.parse(robots.split('\n'))
    result = parser.can_fetch(agent, url)
    print(f"Expected {expected}, got {result}")
    print("=" * 80)


test(robotstxt1, 'FooBot', '/folder1/myfile.html', True)
test(robotstxt2, 'FooBot', '/folder1/myfile.html', True)
test(robotstxt3, 'FooBot', '/folder1/myfile.html', True)
test(robotstxt4, 'FooBot', '/folder1/myfile.html', True)
test(robotstxt5, 'Nutch', '/brian/', True)
test(robotstxt6, 'GoogleBot', '/folder1/', True)
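
# For reference, a minimal sketch of the longest-match precedence rule the
# comments above cite (draft-koster-rep-00 section 3.2, later standardized as
# RFC 9309): among all Allow/Disallow rules whose path prefix matches the
# requested path, the longest prefix wins, and an exact tie breaks in favor of
# Allow (section 2.2.2). The (allowance, prefix) rule format and the helper
# name longest_match_allowed are illustrative assumptions for this sketch,
# not part of urllib.robotparser's API.

def longest_match_allowed(rules, path):
    """rules: list of (allowance, prefix) pairs, e.g. (False, '/folder1/').
    Return True (allowed) or False (disallowed) for path under longest-match
    precedence; allowed by default when no rule matches."""
    matches = [(len(prefix), allowance)
               for allowance, prefix in rules
               if path.startswith(prefix)]
    if not matches:
        return True  # no rule applies: crawling is allowed by default
    # max() picks the longest prefix; on equal length it prefers the tuple
    # with allowance True, i.e. Allow wins a tie, per section 2.2.2.
    return max(matches)[1]


# E.g. the robotstxt1 rules: Disallow /folder1/ vs Allow /folder1/myfile.html.
# The Allow rule is the longer match, so the fetch should be permitted.
assert longest_match_allowed(
    [(False, '/folder1/'), (True, '/folder1/myfile.html')],
    '/folder1/myfile.html') is True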