From: <al...@us...> - 2008-02-11 21:23:22
|
Revision: 2198 http://archive-access.svn.sourceforge.net/archive-access/?rev=2198&view=rev Author: alexoz Date: 2008-02-11 13:23:12 -0800 (Mon, 11 Feb 2008) Log Message: ----------- Added robots.txt fetch and rule evaluation. * AccessControlException.java, RobotsUnavailableException.java, RuleOracleUnavailableException.java Created exceptions for when the oracle or robots.txt are unreachable. * CachingRuleDao.java, HttpRuleDao.java, RuleDao.java Adjusted to throw the new exceptions. * CachingRobotClient.java, HttpRobotClient.java, RobotClient.java, RobotRules.java Created caching robots.txt client based on RobotRules class from wayback. * AccessControlClient.java Added (optional) robots.txt lookups and prepare calls. Modified Paths: -------------- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/AccessControlClient.java trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/CachingRuleDao.java trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/HttpRuleDao.java trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/RuleDao.java trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/AccessControlClientTest.java Added Paths: ----------- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/AccessControlException.java trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/RobotsUnavailableException.java trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/RuleOracleUnavailableException.java trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/ 
trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/CachingRobotClient.java trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/HttpRobotClient.java trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/RobotClient.java trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/RobotRules.java trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/robotstxt/ trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/robotstxt/HttpRobotClientTest.java Modified: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/AccessControlClient.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/AccessControlClient.java 2008-02-08 22:40:50 UTC (rev 2197) +++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/AccessControlClient.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -1,11 +1,14 @@ package org.archive.accesscontrol; +import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.Date; import org.archive.accesscontrol.model.Rule; import org.archive.accesscontrol.model.RuleSet; +import org.archive.accesscontrol.robotstxt.CachingRobotClient; +import org.archive.accesscontrol.robotstxt.RobotClient; import org.archive.net.PublicSuffixes; import org.archive.util.ArchiveUtils; import org.archive.util.SURT; @@ -21,10 +24,15 @@ */ public class AccessControlClient { protected RuleDao ruleDao; + protected RobotClient robotClient; + private boolean robotLookupsEnabled = true; + private boolean robotPreparationEnabled = true; + 
private String robotUserAgent = "wayback-access-control"; - public AccessControlClient(RuleDao ruleDao) { + public AccessControlClient(RuleDao ruleDao, RobotClient robotClient) { super(); this.ruleDao = ruleDao; + this.robotClient = robotClient; } /** @@ -35,7 +43,7 @@ * "http://localhost:8080/exclusions-oracle/" */ public AccessControlClient(String oracleUrl) { - this(new CachingRuleDao(oracleUrl)); + this(new CachingRuleDao(oracleUrl), new CachingRobotClient()); } /** @@ -51,10 +59,24 @@ * Group name of the user accessing the document. * @return Access-control policy that should be enforced. eg "robots", * "block" or "allow". + * @throws RobotsUnavailableException + * @throws RuleOracleUnavailableException */ public String getPolicy(String url, Date captureDate, Date retrievalDate, - String who) { + String who) throws RobotsUnavailableException, RuleOracleUnavailableException { Rule matchingRule = getRule(url, captureDate, retrievalDate, who); + + if (robotLookupsEnabled && matchingRule != null && "robots".equals(matchingRule.getPolicy())) { + try { + if (robotClient.isRobotPermitted(url, robotUserAgent)) { + return "allow"; + } else { + return "block"; + } + } catch (IOException e) { + throw new RobotsUnavailableException(e); + } + } return matchingRule.getPolicy(); } @@ -70,9 +92,10 @@ * @param who * Group name of the user accessing the document. 
* @return + * @throws RuleOracleUnavailableException */ public Rule getRule(String url, Date captureDate, Date retrievalDate, - String who) { + String who) throws RuleOracleUnavailableException { url = ArchiveUtils.addImpliedHttpIfNecessary(url); String surt = SURT.fromURI(url); String publicSuffix = PublicSuffixes @@ -103,6 +126,10 @@ .reduceSurtToTopmostAssigned(getSurtAuthority(surt))); } ruleDao.prepare(publicSuffixes); + + if (robotPreparationEnabled) { + robotClient.prepare(urls, robotUserAgent); + } } protected String getSurtAuthority(String surt) { @@ -124,4 +151,30 @@ return surt; } } + + + public String getRobotUserAgent() { + return robotUserAgent; + } + + public void setRobotUserAgent(String robotUserAgent) { + this.robotUserAgent = robotUserAgent; + } + + public boolean isRobotLookupsEnabled() { + return robotLookupsEnabled; + } + + public void setRobotLookupsEnabled(boolean robotLookupsEnabled) { + this.robotLookupsEnabled = robotLookupsEnabled; + } + + public boolean isRobotPreparationEnabled() { + return robotPreparationEnabled; + } + + public void setRobotPreparationEnabled(boolean robotPreparationEnabled) { + this.robotPreparationEnabled = robotPreparationEnabled; + } + } Added: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/AccessControlException.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/AccessControlException.java (rev 0) +++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/AccessControlException.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -0,0 +1,30 @@ +package org.archive.accesscontrol; + +public class AccessControlException extends Exception { + + /** + * + */ + private static final long serialVersionUID = 4300180270651774259L; + + public AccessControlException() { + super(); + // TODO Auto-generated 
constructor stub + } + + public AccessControlException(String arg0, Throwable arg1) { + super(arg0, arg1); + // TODO Auto-generated constructor stub + } + + public AccessControlException(String arg0) { + super(arg0); + // TODO Auto-generated constructor stub + } + + public AccessControlException(Throwable arg0) { + super(arg0); + // TODO Auto-generated constructor stub + } + +} Modified: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/CachingRuleDao.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/CachingRuleDao.java 2008-02-08 22:40:50 UTC (rev 2197) +++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/CachingRuleDao.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -36,7 +36,7 @@ this.ruleDao = ruleDao; } - public RuleSet getRuleTree(String surt) { + public RuleSet getRuleTree(String surt) throws RuleOracleUnavailableException { RuleSet rules; synchronized (cache) { rules = cache.get(surt); @@ -65,7 +65,12 @@ break; surt = surts.remove(0); } - getRuleTree(surt); + try { + getRuleTree(surt); + } catch (RuleOracleUnavailableException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } } } } Modified: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/HttpRuleDao.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/HttpRuleDao.java 2008-02-08 22:40:50 UTC (rev 2197) +++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/HttpRuleDao.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -35,9 +35,10 @@ } /** + * @throws RuleOracleUnavailableException * @see RuleDao#getRuleTree(String) */ - public RuleSet getRuleTree(String 
surt) { + public RuleSet getRuleTree(String surt) throws RuleOracleUnavailableException { HttpMethod method = new GetMethod(oracleUrl + "/rules/tree/" + surt); RuleSet rules; @@ -47,8 +48,7 @@ System.out.println(response); rules = (RuleSet) xstream.fromXML(method.getResponseBodyAsStream()); } catch (IOException e) { - e.printStackTrace(); - return null; + throw new RuleOracleUnavailableException(e); } method.releaseConnection(); return rules; Added: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/RobotsUnavailableException.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/RobotsUnavailableException.java (rev 0) +++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/RobotsUnavailableException.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -0,0 +1,30 @@ +package org.archive.accesscontrol; + +public class RobotsUnavailableException extends AccessControlException { + + /** + * + */ + private static final long serialVersionUID = -6268896797166951256L; + + public RobotsUnavailableException() { + super(); + // TODO Auto-generated constructor stub + } + + public RobotsUnavailableException(String arg0, Throwable arg1) { + super(arg0, arg1); + // TODO Auto-generated constructor stub + } + + public RobotsUnavailableException(String arg0) { + super(arg0); + // TODO Auto-generated constructor stub + } + + public RobotsUnavailableException(Throwable arg0) { + super(arg0); + // TODO Auto-generated constructor stub + } + +} Modified: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/RuleDao.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/RuleDao.java 2008-02-08 22:40:50 UTC (rev 2197) 
+++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/RuleDao.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -25,9 +25,10 @@ * * @param surt * @return + * @throws RuleOracleUnavailableException * @throws URIException */ - public RuleSet getRuleTree(String surt); + public RuleSet getRuleTree(String surt) throws RuleOracleUnavailableException; /** * This method allows a RuleDao to prepare for lookups from a given set of Added: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/RuleOracleUnavailableException.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/RuleOracleUnavailableException.java (rev 0) +++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/RuleOracleUnavailableException.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -0,0 +1,30 @@ +package org.archive.accesscontrol; + +public class RuleOracleUnavailableException extends AccessControlException { + + /** + * + */ + private static final long serialVersionUID = 8574598479427378024L; + + public RuleOracleUnavailableException() { + super(); + // TODO Auto-generated constructor stub + } + + public RuleOracleUnavailableException(String arg0, Throwable arg1) { + super(arg0, arg1); + // TODO Auto-generated constructor stub + } + + public RuleOracleUnavailableException(String arg0) { + super(arg0); + // TODO Auto-generated constructor stub + } + + public RuleOracleUnavailableException(Throwable arg0) { + super(arg0); + // TODO Auto-generated constructor stub + } + +} Added: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/CachingRobotClient.java =================================================================== --- 
trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/CachingRobotClient.java (rev 0) +++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/CachingRobotClient.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -0,0 +1,105 @@ +package org.archive.accesscontrol.robotstxt; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.archive.accesscontrol.LruCache; + +/** + * The CachingRobotClient wraps another RobotClient and caches requests. + * + * @author aosborne + * + */ +public class CachingRobotClient extends RobotClient { + protected LruCache<String, RobotRules> cache = new LruCache<String, RobotRules>(); + protected RobotClient client; + private static final int PREPARE_THREAD_COUNT = 15; + + public RobotClient getClient() { + return client; + } + + public void setClient(RobotClient client) { + this.client = client; + } + + public CachingRobotClient() { + this.client = new HttpRobotClient(); + } + + public CachingRobotClient(RobotClient client) { + this.client = client; + } + + @Override + public RobotRules getRulesForUrl(String url, String userAgent) + throws IOException { + String robotsUrl = robotsUrlForUrl(url); + RobotRules rules; + + synchronized(cache) { + rules = cache.get(robotsUrl); + } + if (rules == null) { + rules = client.getRulesForUrl(url, userAgent); + + synchronized(cache) { + cache.put(robotsUrl, rules); + } + } + return rules; + } + + public LruCache<String, RobotRules> getCache() { + return cache; + } + + class FetchThread extends Thread { + private List<String> urls; + private String userAgent; + + public FetchThread(List<String> urls, String userAgent) { + this.urls = urls; + this.userAgent = userAgent; + } + + public void run() { + while (true) { + String url; + synchronized (urls) { + if (urls.isEmpty()) + break; + url = urls.remove(0); + } + try { + 
getRulesForUrl(url, userAgent); + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + /** + * Prepare the cache to lookup info for a given set of urls. The fetches + * happen in parallel so this also makes a good option for speeding up bulk lookups. + */ + public void prepare(Collection<String> urls, String userAgent) { + List<String> safeUrls = new ArrayList<String>(urls); + FetchThread threads[] = new FetchThread[PREPARE_THREAD_COUNT ]; + for (int i = 0; i < PREPARE_THREAD_COUNT ; i++) { + threads[i] = new FetchThread(safeUrls, userAgent); + threads[i].start(); + } + for (int i = 0; i < PREPARE_THREAD_COUNT ; i++) { + try { + threads[i].join(); + } catch (InterruptedException e) { + } + } + } + +} Added: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/HttpRobotClient.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/HttpRobotClient.java (rev 0) +++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/HttpRobotClient.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -0,0 +1,39 @@ +package org.archive.accesscontrol.robotstxt; + +import java.io.IOException; +import java.util.Collection; + +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; +import org.apache.commons.httpclient.methods.GetMethod; + +/** + * HttpRobotClient allows fetching of robots.txt rules over HTTP. 
+ * + * @author aosborne + * + */ +public class HttpRobotClient extends RobotClient { + protected HttpClient http = new HttpClient( + new MultiThreadedHttpConnectionManager()); + + public HttpClient getHttpClient() { + return http; + } + + public RobotRules getRulesForUrl(String url, String userAgent) throws IOException { + String robotsUrl = robotsUrlForUrl(url); + HttpMethod method = new GetMethod(robotsUrl); + method.addRequestHeader("User-Agent", userAgent); + http.executeMethod(method); + RobotRules rules = new RobotRules(); + rules.parse(method.getResponseBodyAsStream()); + return rules; + } + + @Override + public void prepare(Collection<String> urls, String userAgent) { + // no-op + } +} Added: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/RobotClient.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/RobotClient.java (rev 0) +++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/RobotClient.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -0,0 +1,58 @@ +package org.archive.accesscontrol.robotstxt; + +import java.io.IOException; +import java.util.Collection; + +import org.apache.commons.httpclient.URIException; +import org.archive.net.LaxURI; + +/** + * A client for checking whether a robot is allowed by a robots.txt file. + * + * @author aosborne + * + */ +public abstract class RobotClient { + /** + * Returns true if a robot with the given user-agent is allowed to access + * the given url. 
+ * + * @param url + * @param userAgent + * @return + * @throws IOException + */ + public boolean isRobotPermitted(String url, String userAgent) + throws IOException { + RobotRules rules = getRulesForUrl(url, userAgent); + return !rules.blocksPathForUA(new LaxURI(url, false).getPath(), + userAgent); + } + + /** + * Fetch the applicable ruleset for the given url and robot. + * + * @param url + * @param userAgent + * @return + * @throws IOException + */ + public abstract RobotRules getRulesForUrl(String url, String userAgent) + throws IOException; + + public static String robotsUrlForUrl(String url) throws URIException { + LaxURI uri = new LaxURI(url, false); + uri.setPath("/robots.txt"); + uri.setQuery(null); + uri.setFragment(null); + return uri.toString(); + } + + /** + * Prepare the cache to lookup info for a given set of urls. The fetches + * happen in parallel so this also makes a good option for speeding up bulk lookups. + * + * This may be a no-op. + */ + public abstract void prepare(Collection<String> urls, String userAgent); +} Added: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/RobotRules.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/RobotRules.java (rev 0) +++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/RobotRules.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -0,0 +1,180 @@ +/* RobotRules + * + * $Id$ + * + * Created on 2:51:20 PM Mar 12, 2007. + * + * Copyright (C) 2007 Internet Archive. + * + * This file is part of wayback-svn. + * + * wayback-svn is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * wayback-svn is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback-svn; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.accesscontrol.robotstxt; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.logging.Logger; + +/** + * Class which parses a robots.txt file, storing the rules contained therein, + * and then allows for testing if path/userAgent tuples are blocked by those + * rules. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class RobotRules { + + private static final long serialVersionUID = 2917420727021840982L; + private static final Logger LOGGER = Logger.getLogger(RobotRules.class + .getName()); + /** + * Special name for User-agent which matches all values + */ + public static final String GLOBAL_USER_AGENT = "*"; + + private boolean bSyntaxErrors = false; + private HashMap<String, ArrayList<String>> rules = + new HashMap<String, ArrayList<String>>(); + + private LinkedList<String> userAgents = new LinkedList<String>(); + + /** + * @return true if the robots.txt file looked suspicious, currently meaning + * we found a Disallow rule that was not preceded by a "User-agent:" line + */ + public boolean hasSyntaxErrors() { + return bSyntaxErrors; + } + + /** + * @return a List of all UserAgents Found in the Robots.txt document + */ + public List<String> getUserAgentsFound() { + return userAgents; + } + + /** + * Read rules from InputStream argument 
into this RobotRules, as a + * side-effect, sets the bSyntaxErrors property. + * + * @param is + * @throws IOException + */ + public void parse(InputStream is) throws IOException { + + BufferedReader br = new BufferedReader(new InputStreamReader( + (InputStream) is)); + String read; + ArrayList<String> current = null; + while (br != null) { + do { + read = br.readLine(); + // Skip comments & blanks + } while ((read != null) && ((read = read.trim()).startsWith("#") || + read.length() == 0)); + if (read == null) { + br.close(); + br = null; + } else { + int commentIndex = read.indexOf("#"); + if (commentIndex > -1) { + // Strip trailing comment + read = read.substring(0, commentIndex); + } + read = read.trim(); + if (read.matches("(?i)^User-agent:.*")) { + String ua = read.substring(11).trim().toLowerCase(); + if (current == null || current.size() != 0) { + // only create new rules-list if necessary + // otherwise share with previous user-agent + current = new ArrayList<String>(); + } + rules.put(ua, current); + LOGGER.fine("Found User-agent(" + ua + ") rules..."); + continue; + } + if (read.matches("(?i)Disallow:.*")) { + if (current == null) { + // buggy robots.txt + bSyntaxErrors = true; + continue; + } + String path = read.substring(9).trim(); + current.add(path); + continue; + } + // unknown line; do nothing for now + + // TODO: check for "Allow" lines, and flag a syntax error if + // we encounter any unknown lines? 
+ } + } + } + + private boolean blocksPath(String path, String curUA, List<String> uaRules) { + + Iterator<String> disItr = uaRules.iterator(); + while (disItr.hasNext()) { + String disallowedPath = disItr.next(); + if (disallowedPath.length() == 0) { + + LOGGER.fine("UA(" + curUA + + ") has empty disallow: Go for it!"); + return false; + + } else { + LOGGER.fine("UA(" + curUA + ") has (" + + disallowedPath + ") blocked...(" + + disallowedPath.length() + ")"); + if (disallowedPath.equals("/") || path.startsWith(disallowedPath)) { + LOGGER.fine("THIS APPLIES!!!"); + return true; + } + } + } + return false; + } + + /** + * Checks first the specified ua UserAgent, if rules are present for it, + * and then falls back to using rules for the '*' UserAgent. + * + * @param path + * @param ua + * @return boolean value where true indicates the path is blocked for ua + */ + public boolean blocksPathForUA(String path, String ua) { + + if(rules.containsKey(ua.toLowerCase())) { + + return blocksPath(path,ua,rules.get(ua.toLowerCase())); + + } else if(rules.containsKey(GLOBAL_USER_AGENT)) { + + return blocksPath(path,GLOBAL_USER_AGENT, + rules.get(GLOBAL_USER_AGENT)); + } + return false; + } +} Modified: trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/AccessControlClientTest.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/AccessControlClientTest.java 2008-02-08 22:40:50 UTC (rev 2197) +++ trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/AccessControlClientTest.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -1,7 +1,5 @@ package org.archive.accesscontrol; -import java.util.Date; - import org.archive.accesscontrol.AccessControlClient; import junit.framework.TestCase; @@ -13,7 +11,7 @@ protected void setUp() throws Exception { super.setUp(); 
System.out.println("hello world"); - client = new AccessControlClient(new HttpRuleDao(ORACLE_URL)); + client = new AccessControlClient(ORACLE_URL); } protected void tearDown() throws Exception { Added: trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/robotstxt/HttpRobotClientTest.java =================================================================== --- trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/robotstxt/HttpRobotClientTest.java (rev 0) +++ trunk/archive-access/projects/access-control/access-control/src/test/java/org/archive/accesscontrol/robotstxt/HttpRobotClientTest.java 2008-02-11 21:23:12 UTC (rev 2198) @@ -0,0 +1,25 @@ +package org.archive.accesscontrol.robotstxt; + +import org.apache.commons.httpclient.URIException; + +import junit.framework.TestCase; + +public class HttpRobotClientTest extends TestCase { + public void testRobotUrlForUrl() throws URIException { + assertEquals("http://example.com/robots.txt", HttpRobotClient.robotsUrlForUrl("http://example.com/")); + assertEquals("http://example.com/robots.txt", HttpRobotClient.robotsUrlForUrl("http://example.com/foo/bar.html?boozle#bazzle")); + assertEquals("https://example.com/robots.txt", HttpRobotClient.robotsUrlForUrl("https://example.com/foo/bar.html?boozle#bazzle")); + assertEquals("https://us...@ex.../robots.txt", HttpRobotClient.robotsUrlForUrl("https://us...@ex.../foo/bar.html?boozle#bazzle")); + assertEquals("http://user:pa...@ex.../robots.txt", HttpRobotClient.robotsUrlForUrl("http://user:pa...@ex.../foo/bar.html?boozle#bazzle")); + assertEquals("http://user:pa...@ex...:2311/robots.txt", HttpRobotClient.robotsUrlForUrl("http://user:pa...@ex...:2311/foo/bar.html?boozle#bazzle")); + } + + public void testBasic() throws Exception { + HttpRobotClient client = new HttpRobotClient(); + assertFalse(client.isRobotPermitted("http://web.archive.org/cgi-bin/fishbowl", "wayback-access-control-test")); 
+ assertTrue(client.isRobotPermitted("http://www.archive.org/index.html", "wayback-access-control-test")); + assertTrue(client.isRobotPermitted("http://google.com/fish.html", "wayback-access-control-test")); + assertFalse(client.isRobotPermitted("http://google.com/news", "wayback-access-control-test")); + + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |