Revision: 2949 http://archive-access.svn.sourceforge.net/archive-access/?rev=2949&view=rev Author: bradtofel Date: 2010-01-13 01:28:52 +0000 (Wed, 13 Jan 2010) Log Message: ----------- FEATURE: added logging Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2010-01-13 01:26:57 UTC (rev 2948) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2010-01-13 01:28:52 UTC (rev 2949) @@ -31,6 +31,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.List; +import java.util.logging.Logger; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -38,6 +39,7 @@ import org.archive.wayback.core.Resource; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.LiveDocumentNotAvailableException; +import org.archive.wayback.exception.LiveWebCacheUnavailableException; import org.archive.wayback.liveweb.LiveWebCache; import org.archive.wayback.util.ObjectFilter; @@ -58,6 +60,8 @@ */ public class RobotExclusionFilter implements ObjectFilter<CaptureSearchResult> { + private final static Logger LOGGER = Logger.getLogger(RobotExclusionFilter.class.getName()); + private final static String HTTP_PREFIX = "http://"; private final static String ROBOT_SUFFIX = "/robots.txt"; @@ -142,18 +146,28 @@ firstUrlString = urlString; } if(rulesCache.containsKey(urlString)) { + LOGGER.fine("ROBOT: Cached("+urlString+")"); rules = rulesCache.get(urlString); } else { try { - + LOGGER.fine("ROBOT: NotCached("+urlString+")"); + tmpRules = new RobotRules(); Resource resource = webCache.getCachedResource(new URL(urlString), maxCacheMS,true); + if(resource.getStatusCode() != 200) { + LOGGER.info("ROBOT: NotAvailable("+urlString+")"); + throw new LiveDocumentNotAvailableException(urlString); + } tmpRules.parse(resource); rulesCache.put(firstUrlString,tmpRules); rules = tmpRules; + LOGGER.info("ROBOT: Downloaded("+urlString+")"); } catch (LiveDocumentNotAvailableException e) { + // cache an empty rule: all OK +// rulesCache.put(firstUrlString, emptyRules); +// rules = emptyRules; continue; } catch (MalformedURLException e) { e.printStackTrace(); @@ -161,6 +175,9 @@ } catch (IOException e) { e.printStackTrace(); return null; + } catch (LiveWebCacheUnavailableException e) { + e.printStackTrace(); + return null; } } } @@ -186,6 +203,8 @@ url = new URL(ArchiveUtils.addImpliedHttpIfNecessary(resultURL)); if(!rules.blocksPathForUA(url.getPath(), userAgent)) { filterResult = ObjectFilter.FILTER_INCLUDE; + } else { + LOGGER.info("ROBOT: BLOCKED("+resultURL+")"); } } catch (MalformedURLException e) { e.printStackTrace(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |