From: <al...@us...> - 2008-02-13 00:50:03
Revision: 2200
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2200&view=rev
Author:   alexoz
Date:     2008-02-12 16:50:02 -0800 (Tue, 12 Feb 2008)

Log Message:
-----------
Catch 404s, timeouts, etc. when fetching robots.txt separately and throw RobotsUnavailableException.

Modified Paths:
--------------
    trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/CachingRobotClient.java
    trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/HttpRobotClient.java
    trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/RobotClient.java

Modified: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/CachingRobotClient.java
===================================================================
--- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/CachingRobotClient.java	2008-02-11 21:29:21 UTC (rev 2199)
+++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/CachingRobotClient.java	2008-02-13 00:50:02 UTC (rev 2200)
@@ -4,16 +4,20 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
+import java.util.logging.Logger;
 
 import org.archive.accesscontrol.LruCache;
+import org.archive.accesscontrol.RobotsUnavailableException;
 
 /**
- * The CachingRobotClient wraps another RobotClient and caches requests.
+ * The CchingRobotClient wraps another RobotClient and caches requests.
  *
  * @author aosborne
  *
  */
 public class CachingRobotClient extends RobotClient {
+    private static final Logger LOGGER = Logger.getLogger(
+            CachingRobotClient.class.getName());
     protected LruCache<String, RobotRules> cache = new LruCache<String, RobotRules>();
     protected RobotClient client;
     private static final int PREPARE_THREAD_COUNT = 15;
@@ -36,7 +40,7 @@
 
     @Override
     public RobotRules getRulesForUrl(String url, String userAgent)
-            throws IOException {
+            throws IOException, RobotsUnavailableException {
         String robotsUrl = robotsUrlForUrl(url);
         RobotRules rules;
 
@@ -78,6 +82,9 @@
                 getRulesForUrl(url, userAgent);
             } catch (IOException e) {
                 e.printStackTrace();
+            } catch (RobotsUnavailableException e) {
+                // TODO Auto-generated catch block
+                e.printStackTrace();
             }
         }
     }

Modified: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/HttpRobotClient.java
===================================================================
--- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/HttpRobotClient.java	2008-02-11 21:29:21 UTC (rev 2199)
+++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/HttpRobotClient.java	2008-02-13 00:50:02 UTC (rev 2200)
@@ -1,12 +1,19 @@
 package org.archive.accesscontrol.robotstxt;
 
 import java.io.IOException;
+import java.net.ConnectException;
+import java.net.NoRouteToHostException;
+import java.net.UnknownHostException;
 import java.util.Collection;
+import java.util.logging.Logger;
 
+import org.apache.commons.httpclient.ConnectTimeoutException;
 import org.apache.commons.httpclient.HttpClient;
+import org.apache.commons.httpclient.HttpException;
 import org.apache.commons.httpclient.HttpMethod;
 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
 import org.apache.commons.httpclient.methods.GetMethod;
+import org.archive.accesscontrol.RobotsUnavailableException;
 
 /**
  * HttpRobotClient allows fetching of robots.txt rules over HTTP.
@@ -15,6 +22,8 @@
  *
  */
 public class HttpRobotClient extends RobotClient {
+    private static final Logger LOGGER = Logger.getLogger(
+            RobotClient.class.getName());
     protected HttpClient http = new HttpClient(
             new MultiThreadedHttpConnectionManager());
 
@@ -22,11 +31,32 @@
         return http;
     }
 
-    public RobotRules getRulesForUrl(String url, String userAgent) throws IOException {
+    public RobotRules getRulesForUrl(String url, String userAgent) throws IOException, RobotsUnavailableException {
         String robotsUrl = robotsUrlForUrl(url);
         HttpMethod method = new GetMethod(robotsUrl);
         method.addRequestHeader("User-Agent", userAgent);
-        http.executeMethod(method);
+        try {
+            int code = http.executeMethod(method);
+            // TODO: Constant 200
+            if (code != 200) {
+                throw new RobotsUnavailableException(robotsUrl);
+            }
+        } catch (HttpException e) {
+            e.printStackTrace();
+            throw new RobotsUnavailableException(robotsUrl);
+        } catch (UnknownHostException e) {
+            LOGGER.info("Unknown host for URL " + robotsUrl);
+            throw new RobotsUnavailableException(robotsUrl);
+        } catch (ConnectTimeoutException e) {
+            LOGGER.info("Connection Timeout for URL " + robotsUrl);
+            throw new RobotsUnavailableException(robotsUrl);
+        } catch (NoRouteToHostException e) {
+            LOGGER.info("No route to host for URL " + robotsUrl);
+            throw new RobotsUnavailableException(robotsUrl);
+        } catch (ConnectException e) {
+            LOGGER.info("ConnectException URL " + robotsUrl);
+            throw new RobotsUnavailableException(robotsUrl);
+        }
         RobotRules rules = new RobotRules();
         rules.parse(method.getResponseBodyAsStream());
         return rules;

Modified: trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/RobotClient.java
===================================================================
--- trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/RobotClient.java	2008-02-11 21:29:21 UTC (rev 2199)
+++ trunk/archive-access/projects/access-control/access-control/src/main/java/org/archive/accesscontrol/robotstxt/RobotClient.java	2008-02-13 00:50:02 UTC (rev 2200)
@@ -4,6 +4,7 @@
 import java.util.Collection;
 
 import org.apache.commons.httpclient.URIException;
+import org.archive.accesscontrol.RobotsUnavailableException;
 import org.archive.net.LaxURI;
 
 /**
@@ -21,9 +22,10 @@
      * @param userAgent
      * @return
     * @throws IOException
+     * @throws RobotsUnavailableException
      */
     public boolean isRobotPermitted(String url, String userAgent)
-            throws IOException {
+            throws IOException, RobotsUnavailableException {
         RobotRules rules = getRulesForUrl(url, userAgent);
         return !rules.blocksPathForUA(new LaxURI(url, false).getPath(),
                 userAgent);
@@ -35,10 +37,11 @@
      * @param url
     * @param userAgent
      * @return
-     * @throws IOException
+     * @throws IOException a local problem occurred when attempting to fetch the robots.txt
+     * @throws RobotsUnavailableException a remote problem, we found no robots.txt or the server is down.
      */
     public abstract RobotRules getRulesForUrl(String url, String userAgent)
-            throws IOException;
+            throws IOException, RobotsUnavailableException;
 
     public static String robotsUrlForUrl(String url) throws URIException {
         LaxURI uri = new LaxURI(url, false);
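
The RobotsUnavailableException class itself is imported but not shown in this diff. Judging from its use above -- it is constructed with the robots.txt URL, and callers are forced to catch or declare it -- it must be a checked exception. A minimal sketch, assuming the message format and the direct Exception superclass (neither is confirmed by the commit):

    package org.archive.accesscontrol;

    /**
     * Sketch only -- the real class is not part of this diff. Thrown when a
     * robots.txt could not be fetched for a remote reason: 404, timeout,
     * unknown host, refused connection, or no route to host.
     */
    public class RobotsUnavailableException extends Exception {
        public RobotsUnavailableException(String robotsUrl) {
            // Assumed message format; the commit only shows the
            // single-String constructor being invoked.
            super("robots.txt unavailable: " + robotsUrl);
        }
    }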
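
On the caller side, the effect of the change is that local I/O failures and remote robots.txt failures can now take different paths. A hypothetical usage sketch -- the CachingRobotClient(RobotClient) wiring, the URL, and the user-agent string are illustrative assumptions, not shown in the diff:

    import java.io.IOException;

    import org.archive.accesscontrol.RobotsUnavailableException;
    import org.archive.accesscontrol.robotstxt.CachingRobotClient;
    import org.archive.accesscontrol.robotstxt.HttpRobotClient;
    import org.archive.accesscontrol.robotstxt.RobotClient;

    public class RobotCheckExample {
        public static void main(String[] args) {
            // Assumed wiring: a caching client delegating to an HTTP client.
            RobotClient client = new CachingRobotClient(new HttpRobotClient());
            try {
                boolean permitted = client.isRobotPermitted(
                        "http://example.org/page.html", "example-agent");
                System.out.println("Permitted: " + permitted);
            } catch (RobotsUnavailableException e) {
                // Remote problem: 404, timeout, DNS failure, server down.
                System.out.println("No usable robots.txt: " + e.getMessage());
            } catch (IOException e) {
                // Local problem while attempting the fetch.
                e.printStackTrace();
            }
        }
    }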