From: <fwi...@us...> - 2011-03-12 16:58:24
Revision: 7213
          http://jython.svn.sourceforge.net/jython/?rev=7213&view=rev
Author:   fwierzbicki
Date:     2011-03-12 16:58:16 +0000 (Sat, 12 Mar 2011)

Log Message:
-----------
from:
http://svn.python.org/projects/python/branches/release26-maint/Lib/robotparser.py@88766
http://svn.python.org/projects/python/branches/release26-maint/Lib/test/test_robotparser.py@88766

Added Paths:
-----------
    trunk/jython/Lib/robotparser.py
    trunk/jython/Lib/test/test_robotparser.py

Added: trunk/jython/Lib/robotparser.py
===================================================================
--- trunk/jython/Lib/robotparser.py                             (rev 0)
+++ trunk/jython/Lib/robotparser.py     2011-03-12 16:58:16 UTC (rev 7213)
@@ -0,0 +1,217 @@
+""" robotparser.py
+
+    Copyright (C) 2000  Bastian Kleineidam
+
+    You can choose between two licenses when using this package:
+    1) GNU GPLv2
+    2) PSF license for Python 2.2
+
+    The robots.txt Exclusion Protocol is implemented as specified in
+    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
+"""
+import urlparse
+import urllib
+
+__all__ = ["RobotFileParser"]
+
+
+class RobotFileParser:
+    """ This class provides a set of methods to read, parse and answer
+    questions about a single robots.txt file.
+
+    """
+
+    def __init__(self, url=''):
+        self.entries = []
+        self.default_entry = None
+        self.disallow_all = False
+        self.allow_all = False
+        self.set_url(url)
+        self.last_checked = 0
+
+    def mtime(self):
+        """Returns the time the robots.txt file was last fetched.
+
+        This is useful for long-running web spiders that need to
+        check for new robots.txt files periodically.
+
+        """
+        return self.last_checked
+
+    def modified(self):
+        """Sets the time the robots.txt file was last fetched to the
+        current time.
+
+        """
+        import time
+        self.last_checked = time.time()
+
+    def set_url(self, url):
+        """Sets the URL referring to a robots.txt file."""
+        self.url = url
+        self.host, self.path = urlparse.urlparse(url)[1:3]
+
+    def read(self):
+        """Reads the robots.txt URL and feeds it to the parser."""
+        opener = URLopener()
+        f = opener.open(self.url)
+        lines = [line.strip() for line in f]
+        f.close()
+        self.errcode = opener.errcode
+        if self.errcode in (401, 403):
+            self.disallow_all = True
+        elif self.errcode >= 400:
+            self.allow_all = True
+        elif self.errcode == 200 and lines:
+            self.parse(lines)
+
+    def _add_entry(self, entry):
+        if "*" in entry.useragents:
+            # the default entry is considered last
+            if self.default_entry is None:
+                # the first default entry wins
+                self.default_entry = entry
+        else:
+            self.entries.append(entry)
+
+    def parse(self, lines):
+        """parse the input lines from a robots.txt file.
+        We allow that a user-agent: line is not preceded by
+        one or more blank lines."""
+        # states:
+        #   0: start state
+        #   1: saw user-agent line
+        #   2: saw an allow or disallow line
+        state = 0
+        linenumber = 0
+        entry = Entry()
+
+        for line in lines:
+            linenumber += 1
+            if not line:
+                if state == 1:
+                    entry = Entry()
+                    state = 0
+                elif state == 2:
+                    self._add_entry(entry)
+                    entry = Entry()
+                    state = 0
+            # remove optional comment and strip line
+            i = line.find('#')
+            if i >= 0:
+                line = line[:i]
+            line = line.strip()
+            if not line:
+                continue
+            line = line.split(':', 1)
+            if len(line) == 2:
+                line[0] = line[0].strip().lower()
+                line[1] = urllib.unquote(line[1].strip())
+                if line[0] == "user-agent":
+                    if state == 2:
+                        self._add_entry(entry)
+                        entry = Entry()
+                    entry.useragents.append(line[1])
+                    state = 1
+                elif line[0] == "disallow":
+                    if state != 0:
+                        entry.rulelines.append(RuleLine(line[1], False))
+                        state = 2
+                elif line[0] == "allow":
+                    if state != 0:
+                        entry.rulelines.append(RuleLine(line[1], True))
+                        state = 2
+        if state == 2:
+            self._add_entry(entry)
+
+
+    def can_fetch(self, useragent, url):
+        """using the parsed robots.txt decide if useragent can fetch url"""
+        if self.disallow_all:
+            return False
+        if self.allow_all:
+            return True
+        # search for given user agent matches
+        # the first match counts
+        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
+        for entry in self.entries:
+            if entry.applies_to(useragent):
+                return entry.allowance(url)
+        # try the default entry last
+        if self.default_entry:
+            return self.default_entry.allowance(url)
+        # agent not found ==> access granted
+        return True
+
+
+    def __str__(self):
+        return ''.join([str(entry) + "\n" for entry in self.entries])
+
+
+class RuleLine:
+    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
+       (allowance==False) followed by a path."""
+    def __init__(self, path, allowance):
+        if path == '' and not allowance:
+            # an empty value means allow all
+            allowance = True
+        self.path = urllib.quote(path)
+        self.allowance = allowance
+
+    def applies_to(self, filename):
+        return self.path == "*" or filename.startswith(self.path)
+
+    def __str__(self):
+        return (self.allowance and "Allow" or "Disallow") + ": " + self.path
+
+
+class Entry:
+    """An entry has one or more user-agents and zero or more rulelines"""
+    def __init__(self):
+        self.useragents = []
+        self.rulelines = []
+
+    def __str__(self):
+        ret = []
+        for agent in self.useragents:
+            ret.extend(["User-agent: ", agent, "\n"])
+        for line in self.rulelines:
+            ret.extend([str(line), "\n"])
+        return ''.join(ret)
+
+    def applies_to(self, useragent):
+        """check if this entry applies to the specified agent"""
+        # split the name token and make it lower case
+        useragent = useragent.split("/")[0].lower()
+        for agent in self.useragents:
+            if agent == '*':
+                # we have the catch-all agent
+                return True
+            agent = agent.lower()
+            if agent in useragent:
+                return True
+        return False
+
+    def allowance(self, filename):
+        """Preconditions:
+        - our agent applies to this entry
+        - filename is URL decoded"""
+        for line in self.rulelines:
+            if line.applies_to(filename):
+                return line.allowance
+        return True
+
+class URLopener(urllib.FancyURLopener):
+    def __init__(self, *args):
+        urllib.FancyURLopener.__init__(self, *args)
+        self.errcode = 200
+
+    def prompt_user_passwd(self, host, realm):
+        ## If robots.txt file is accessible only with a password,
+        ## we act as if the file wasn't there.
+        return None, None
+
+    def http_error_default(self, url, fp, errcode, errmsg, headers):
+        self.errcode = errcode
+        return urllib.FancyURLopener.http_error_default(self, url, fp, errcode,
+                                                        errmsg, headers)
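For reviewers who want to see the module above in action, here is a minimal usage sketch. It is not taken from the committed files; the robots.txt lines and the "ExampleBot" agent name are made up for illustration. It feeds pre-fetched lines to parse() instead of calling read(), so no network access is needed:

    import robotparser

    # Illustrative robots.txt content, already split into stripped lines
    # (read() would normally fetch and strip these from self.url).
    lines = [
        "User-agent: *",
        "Disallow: /cyberworld/map/",
        "Disallow: /tmp/",
    ]

    rp = robotparser.RobotFileParser()
    rp.parse(lines)
    print rp.can_fetch("ExampleBot", "/index.html")                 # -> True
    print rp.can_fetch("ExampleBot", "/cyberworld/map/index.html")  # -> False

Because no rule group names "ExampleBot", the catch-all "*" group stored in default_entry answers both queries.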
+""" + +good = ['/foo.html'] +bad = [] # Bug report says "/" should be denied, but that is not in the RFC + +RobotTest(7, doc, good, bad) + +# From Google: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40364 + +# 8. +doc = """ +User-agent: Googlebot +Allow: /folder1/myfile.html +Disallow: /folder1/ +""" + +good = ['/folder1/myfile.html'] +bad = ['/folder1/anotherfile.html'] + +RobotTest(8, doc, good, bad, agent="Googlebot") + +# 9. This file is incorrect because "Googlebot" is a substring of +# "Googlebot-Mobile", so test 10 works just like test 9. +doc = """ +User-agent: Googlebot +Disallow: / + +User-agent: Googlebot-Mobile +Allow: / +""" + +good = [] +bad = ['/something.jpg'] + +RobotTest(9, doc, good, bad, agent="Googlebot") + +good = [] +bad = ['/something.jpg'] + +RobotTest(10, doc, good, bad, agent="Googlebot-Mobile") + +# 11. Get the order correct. +doc = """ +User-agent: Googlebot-Mobile +Allow: / + +User-agent: Googlebot +Disallow: / +""" + +good = [] +bad = ['/something.jpg'] + +RobotTest(11, doc, good, bad, agent="Googlebot") + +good = ['/something.jpg'] +bad = [] + +RobotTest(12, doc, good, bad, agent="Googlebot-Mobile") + + +# 13. Google also got the order wrong in #8. You need to specify the +# URLs from more specific to more general. +doc = """ +User-agent: Googlebot +Allow: /folder1/myfile.html +Disallow: /folder1/ +""" + +good = ['/folder1/myfile.html'] +bad = ['/folder1/anotherfile.html'] + +RobotTest(13, doc, good, bad, agent="googlebot") + + +# 14. For issue #4108 (obey first * entry) +doc = """ +User-agent: * +Disallow: /some/path + +User-agent: * +Disallow: /another/path +""" + +good = ['/another/path'] +bad = ['/some/path'] + +RobotTest(14, doc, good, bad) + + +class TestCase(unittest.TestCase): + def runTest(self): + test_support.requires('network') + # whole site is password-protected. + url = 'http://mueblesmoraleda.com' + parser = robotparser.RobotFileParser() + parser.set_url(url) + parser.read() + self.assertEqual(parser.can_fetch("*", url+"/robots.txt"), False) + +def test_main(): + test_support.run_unittest(tests) + TestCase().run() + +if __name__=='__main__': + test_support.verbose = 1 + test_main() This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |