From: <bra...@us...> - 2011-03-09 05:56:23
|
Revision: 3429 http://archive-access.svn.sourceforge.net/archive-access/?rev=3429&view=rev Author: bradtofel Date: 2011-03-09 05:56:17 +0000 (Wed, 09 Mar 2011) Log Message: ----------- FEATURE: Now detects a LiveWebTimeout or LiveRobotsTimeout request and aborts subsequent attempts, which are also likely to time out within this request. More of a BugFix, since this caused dramatic problems: the thread would hang waiting for a robots request to time out for each search result. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2011-03-09 05:53:57 UTC (rev 3428) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2011-03-09 05:56:17 UTC (rev 3429) @@ -35,6 +35,7 @@ import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.LiveDocumentNotAvailableException; import org.archive.wayback.exception.LiveWebCacheUnavailableException; +import org.archive.wayback.exception.LiveWebTimeoutException; import org.archive.wayback.liveweb.LiveWebCache; import org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.util.ObjectFilter; @@ -188,7 +189,7 @@ rulesCache.put(firstUrlString,tmpRules); rules = tmpRules; LOGGER.info("ROBOT: Downloaded("+urlString+")"); - + } catch (LiveDocumentNotAvailableException e) { LOGGER.info("ROBOT: 
LiveDocumentNotAvailableException("+urlString+")"); @@ -201,7 +202,12 @@ return null; } catch (LiveWebCacheUnavailableException e) { LOGGER.info("ROBOT: LiveWebCacheUnavailableException("+urlString+")"); + filterGroup.setLiveWebGone(); return null; + } catch (LiveWebTimeoutException e) { + LOGGER.info("ROBOT: LiveDocumentTimedOutException("+urlString+")"); + filterGroup.setRobotTimedOut(); + return null; } } } @@ -226,7 +232,11 @@ } int filterResult = ObjectFilter.FILTER_EXCLUDE; RobotRules rules = getRules(r); - if(rules != null) { + if(rules == null) { + if(filterGroup.getRobotTimedOut() || filterGroup.getLiveWebGone()) { + return ObjectFilter.FILTER_ABORT; + } + } else { String resultURL = r.getOriginalUrl(); URL url; try { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java 2011-03-09 05:53:57 UTC (rev 3428) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java 2011-03-09 05:56:17 UTC (rev 3429) @@ -26,8 +26,11 @@ import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.AccessControlException; import org.archive.wayback.exception.AdministrativeAccessControlException; +import org.archive.wayback.exception.LiveWebCacheUnavailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; import org.archive.wayback.exception.RobotAccessControlException; +import org.archive.wayback.exception.RobotNotAvailableException; +import org.archive.wayback.exception.RobotTimedOutAccessControlException; import org.archive.wayback.resourceindex.filters.CounterFilter; import 
org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.util.ObjectFilter; @@ -41,6 +44,8 @@ String requestUrl = null; private boolean sawRobots = false; private boolean passedRobots = false; + private boolean robotTimedOut = false; + private boolean liveWebGone = false; private boolean sawAdministrative = false; private boolean passedAdministrative = false; @@ -67,7 +72,16 @@ } public void annotateResults(SearchResults results) - throws AccessControlException, ResourceNotInArchiveException { + throws AccessControlException, ResourceNotInArchiveException, + RobotNotAvailableException { + if(robotTimedOut) { + throw new RobotTimedOutAccessControlException("Unable to check" + + " robots.txt for " + requestUrl); + } + if(liveWebGone) { + throw new RobotNotAvailableException("The URL " + requestUrl + + " is blocked by the sites robots.txt file"); + } if(sawRobots && !passedRobots) { throw new RobotAccessControlException("The URL " + requestUrl + " is blocked by the sites robots.txt file"); @@ -91,4 +105,18 @@ public void setSawAdministrative() { sawAdministrative = true; } + + public void setRobotTimedOut() { + robotTimedOut = true; + } + public boolean getRobotTimedOut() { + return robotTimedOut; + } + + public void setLiveWebGone() { + liveWebGone = true; + } + public boolean getLiveWebGone() { + return liveWebGone; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |