Revision: 2949
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2949&view=rev
Author:   bradtofel
Date:     2010-01-13 01:28:52 +0000 (Wed, 13 Jan 2010)

Log Message:
-----------
FEATURE: added logging

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-01-13 01:26:57 UTC (rev 2948)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-01-13 01:28:52 UTC (rev 2949)
@@ -31,6 +31,7 @@
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
+import java.util.logging.Logger;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -38,6 +39,7 @@
 import org.archive.wayback.core.Resource;
 import org.archive.wayback.core.CaptureSearchResult;
 import org.archive.wayback.exception.LiveDocumentNotAvailableException;
+import org.archive.wayback.exception.LiveWebCacheUnavailableException;
 import org.archive.wayback.liveweb.LiveWebCache;
 import org.archive.wayback.util.ObjectFilter;
 
@@ -58,6 +60,8 @@
  */
 public class RobotExclusionFilter implements ObjectFilter<CaptureSearchResult> {
 
+    private final static Logger LOGGER = Logger.getLogger(RobotExclusionFilter.class.getName());
+
     private final static String HTTP_PREFIX = "http://";
     private final static String ROBOT_SUFFIX = "/robots.txt";
 
@@ -142,18 +146,28 @@
             firstUrlString = urlString;
         }
         if(rulesCache.containsKey(urlString)) {
+            LOGGER.fine("ROBOT: Cached("+urlString+")");
             rules = rulesCache.get(urlString);
         } else {
             try {
-
+                LOGGER.fine("ROBOT: NotCached("+urlString+")");
+
                 tmpRules = new RobotRules();
                 Resource resource = webCache.getCachedResource(new URL(urlString),
                         maxCacheMS,true);
+                if(resource.getStatusCode() != 200) {
+                    LOGGER.info("ROBOT: NotAvailable("+urlString+")");
+                    throw new LiveDocumentNotAvailableException(urlString);
+                }
                 tmpRules.parse(resource);
                 rulesCache.put(firstUrlString,tmpRules);
                 rules = tmpRules;
+                LOGGER.info("ROBOT: Downloaded("+urlString+")");
             } catch (LiveDocumentNotAvailableException e) {
+                // cache an empty rule: all OK
+//              rulesCache.put(firstUrlString, emptyRules);
+//              rules = emptyRules;
                 continue;
             } catch (MalformedURLException e) {
                 e.printStackTrace();
@@ -161,6 +175,9 @@
             } catch (IOException e) {
                 e.printStackTrace();
                 return null;
+            } catch (LiveWebCacheUnavailableException e) {
+                e.printStackTrace();
+                return null;
             }
         }
     }
@@ -186,6 +203,8 @@
             url = new URL(ArchiveUtils.addImpliedHttpIfNecessary(resultURL));
             if(!rules.blocksPathForUA(url.getPath(), userAgent)) {
                 filterResult = ObjectFilter.FILTER_INCLUDE;
+            } else {
+                LOGGER.info("ROBOT: BLOCKED("+resultURL+")");
             }
         } catch (MalformedURLException e) {
             e.printStackTrace();
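The FINE-level messages added in this revision sit below java.util.logging's default INFO threshold, so they will not appear without extra configuration. A minimal, hypothetical harness (not part of this commit) that raises both the logger and a console handler to FINE:

import java.util.logging.ConsoleHandler;
import java.util.logging.Level;
import java.util.logging.Logger;

// Hypothetical harness: surfaces the new "ROBOT: Cached(...)" and
// "ROBOT: NotCached(...)" messages by raising both the logger level
// and a handler level to FINE.
public class RobotLogLevelDemo {
    public static void main(String[] args) {
        Logger robotLogger = Logger.getLogger(
            "org.archive.wayback.accesscontrol.robotstxt.RobotExclusionFilter");
        ConsoleHandler handler = new ConsoleHandler();
        handler.setLevel(Level.FINE);      // the handler must also pass FINE records
        robotLogger.setLevel(Level.FINE);  // the logger level gates what is published
        robotLogger.addHandler(handler);
        robotLogger.setUseParentHandlers(false); // avoid duplicate output via the root handler
        robotLogger.fine("ROBOT: Cached(http://example.com/robots.txt)");
    }
}

In production the same effect is usually achieved with a logging.properties entry rather than code.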
Revision: 2994
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2994&view=rev
Author:   bradtofel
Date:     2010-03-20 01:19:20 +0000 (Sat, 20 Mar 2010)

Log Message:
-----------
BUGFIX(unreported): was not actually caching robots.txt correctly, causing MANY robots.txt requests.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-03-20 01:16:21 UTC (rev 2993)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-03-20 01:19:20 UTC (rev 2994)
@@ -99,17 +99,29 @@
     private String hostToRobotUrlString(String host) {
         sb.setLength(0);
         sb.append(HTTP_PREFIX).append(host).append(ROBOT_SUFFIX);
-        return sb.toString();
+        String robotUrl = sb.toString();
+        LOGGER.fine("Adding robot URL:" + robotUrl);
+        return robotUrl;
     }
 
     /*
-     * Return a List of all robots.txt urls to attempt for this url:
-     * If originalURL starts with "www.DOMAIN":
-     *     [originalURL,DOMAIN]
-     * If url starts with "www[0-9]+.DOMAIN":
-     *     [originalURL,www.DOMAIN,DOMAIN]
+     * Return a List of all robots.txt urls to attempt for this HOST:
+     * If HOST starts with "www.DOMAIN":
+     *     [
+     *        http://HOST/robots.txt,
+     *        http://DOMAIN/robots.txt
+     *     ]
+     * If HOST starts with "www[0-9]+.DOMAIN":
+     *     [
+     *        http://HOST/robots.txt,
+     *        http://www.DOMAIN/robots.txt,
+     *        http://DOMAIN/robots.txt
+     *     ]
      * Otherwise:
-     *     [originalURL,www.originalURL]
+     *     [
+     *        http://HOST/robots.txt,
+     *        http://www.HOST/robots.txt
+     *     ]
     */
     protected List<String> searchResultToRobotUrlStrings(String resultHost) {
         ArrayList<String> list = new ArrayList<String>();
@@ -135,22 +147,41 @@
     private RobotRules getRules(CaptureSearchResult result) {
         RobotRules rules = null;
         RobotRules tmpRules = null;
-        String host = result.getOriginalHost();
+        String host;
+        try {
+            host = result.getOriginalHost();
+        } catch(Exception e) {
+            LOGGER.warning("ROBOT: Failed to get host from("+result.getOriginalUrl()+")");
+            return null;
+        }
         List<String> urlStrings = searchResultToRobotUrlStrings(host);
         Iterator<String> itr = urlStrings.iterator();
         String firstUrlString = null;
-
+//      StringBuilder sb = new StringBuilder();
+//      for(String ttt : urlStrings) {
+//          sb.append("RU(").append(ttt).append(")");
+//      }
+//      LOGGER.info("RobotUrls for("+host+")"+sb.toString());
+        // loop through them all. As soon as we get a response, store that
+        // in the cache for the FIRST url we tried and return it..
+        // If we get no responses for any of the robot URLs, use "empty" rules,
+        // and record that in the cache, too.
+
         while(rules == null && itr.hasNext()) {
             String urlString = (String) itr.next();
             if(firstUrlString == null) {
                 firstUrlString = urlString;
             }
             if(rulesCache.containsKey(urlString)) {
-                LOGGER.fine("ROBOT: Cached("+urlString+")");
+                LOGGER.info("ROBOT: Cached("+urlString+")");
                 rules = rulesCache.get(urlString);
+                if(!urlString.equals(firstUrlString)) {
+                    LOGGER.info("Adding extra url("+firstUrlString+") for prev cached rules("+urlString+")");
+                    rulesCache.put(firstUrlString, rules);
+                }
             } else {
                 try {
-                    LOGGER.fine("ROBOT: NotCached("+urlString+")");
+                    LOGGER.info("ROBOT: NotCached("+urlString+")");
 
                     tmpRules = new RobotRules();
                     Resource resource = webCache.getCachedResource(new URL(urlString),
@@ -165,18 +196,19 @@
                     LOGGER.info("ROBOT: Downloaded("+urlString+")");
                 } catch (LiveDocumentNotAvailableException e) {
-                    // cache an empty rule: all OK
-//                  rulesCache.put(firstUrlString, emptyRules);
-//                  rules = emptyRules;
-                    continue;
+                    LOGGER.info("ROBOT: LiveDocumentNotAvailableException("+urlString+")");
+
                 } catch (MalformedURLException e) {
                     e.printStackTrace();
+                    LOGGER.info("ROBOT: MalformedURLException("+urlString+")");
                     return null;
                 } catch (IOException e) {
-                    e.printStackTrace();
+                    e.printStackTrace(System.err);
+                    LOGGER.info("ROBOT: IOException("+urlString+"):"+e.getLocalizedMessage());
                     return null;
                 } catch (LiveWebCacheUnavailableException e) {
                     e.printStackTrace();
+                    LOGGER.info("ROBOT: LiveWebCacheUnavailableException("+urlString+")");
                     return null;
                 }
             }
@@ -185,6 +217,7 @@
             // special-case, allow empty rules if no longer available.
             rulesCache.put(firstUrlString,emptyRules);
             rules = emptyRules;
+            LOGGER.info("No rules available, using emptyRules for:" + firstUrlString);
         }
         return rules;
     }
@@ -203,6 +236,7 @@
             url = new URL(ArchiveUtils.addImpliedHttpIfNecessary(resultURL));
             if(!rules.blocksPathForUA(url.getPath(), userAgent)) {
                 filterResult = ObjectFilter.FILTER_INCLUDE;
+                LOGGER.fine("ROBOT: ALLOWED("+resultURL+")");
             } else {
                 LOGGER.info("ROBOT: BLOCKED("+resultURL+")");
             }
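The fix is visible in the Cached(...) branch: when rules are found under a fallback candidate URL, they are now also cached under the first candidate, so the next lookup for the same host hits the cache instead of refetching. The candidate expansion the revised comment describes can be sketched standalone (hypothetical code mirroring, not taken from, searchResultToRobotUrlStrings):

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

// Standalone sketch of the candidate expansion: for "www.example.com"
// try the host itself and the bare domain; for "www2.example.com" also
// try "www."; for a bare host, also try "www." + host.
public class RobotUrlCandidates {
    private static final Pattern WWWN = Pattern.compile("^www[0-9]+\\.");

    static List<String> candidates(String host) {
        List<String> list = new ArrayList<String>();
        list.add("http://" + host + "/robots.txt");
        if (host.startsWith("www.")) {
            list.add("http://" + host.substring(4) + "/robots.txt");
        } else if (WWWN.matcher(host).find()) {
            String domain = host.substring(host.indexOf('.') + 1);
            list.add("http://www." + domain + "/robots.txt");
            list.add("http://" + domain + "/robots.txt");
        } else {
            list.add("http://www." + host + "/robots.txt");
        }
        return list;
    }

    public static void main(String[] args) {
        System.out.println(candidates("www2.example.com"));
        // [http://www2.example.com/robots.txt,
        //  http://www.example.com/robots.txt,
        //  http://example.com/robots.txt]
    }
}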
Revision: 3018
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3018&view=rev
Author:   bradtofel
Date:     2010-04-02 03:19:24 +0000 (Fri, 02 Apr 2010)

Log Message:
-----------
test for null pointer before using filterGroup property

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-04-02 03:18:34 UTC (rev 3017)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-04-02 03:19:24 UTC (rev 3018)
@@ -230,7 +230,9 @@
     */
    public int filterObject(CaptureSearchResult r) {
        if(!notifiedSeen) {
-           filterGroup.setSawRobots();
+           if(filterGroup != null) {
+               filterGroup.setSawRobots();
+           }
            notifiedSeen = true;
        }
        int filterResult = ObjectFilter.FILTER_EXCLUDE;
@@ -242,7 +244,9 @@
            url = new URL(ArchiveUtils.addImpliedHttpIfNecessary(resultURL));
            if(!rules.blocksPathForUA(url.getPath(), userAgent)) {
                if(!notifiedPassed) {
-                   filterGroup.setPassedRobots();
+                   if(filterGroup != null) {
+                       filterGroup.setPassedRobots();
+                   }
                    notifiedPassed = true;
                }
                filterResult = ObjectFilter.FILTER_INCLUDE;
Revision: 3023
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3023&view=rev
Author:   bradtofel
Date:     2010-04-05 23:23:21 +0000 (Mon, 05 Apr 2010)

Log Message:
-----------
LOGGING tweaks

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-04-05 23:22:41 UTC (rev 3022)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-04-05 23:23:21 UTC (rev 3023)
@@ -176,7 +176,7 @@
            firstUrlString = urlString;
        }
        if(rulesCache.containsKey(urlString)) {
-           LOGGER.info("ROBOT: Cached("+urlString+")");
+           LOGGER.fine("ROBOT: Cached("+urlString+")");
            rules = rulesCache.get(urlString);
            if(!urlString.equals(firstUrlString)) {
                LOGGER.info("Adding extra url("+firstUrlString+") for prev cached rules("+urlString+")");
@@ -184,7 +184,7 @@
            }
        } else {
            try {
-               LOGGER.info("ROBOT: NotCached("+urlString+")");
+               LOGGER.info("ROBOT: NotCached - Downloading("+urlString+")");
 
                tmpRules = new RobotRules();
                Resource resource = webCache.getCachedResource(new URL(urlString),
Revision: 3031
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3031&view=rev
Author:   bradtofel
Date:     2010-04-09 02:02:57 +0000 (Fri, 09 Apr 2010)

Log Message:
-----------
LOGGING: reduced stacktrace output, upped an info log to a warning
TWEAK: removed comments and made some whitespace changes

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-04-09 02:01:13 UTC (rev 3030)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-04-09 02:02:57 UTC (rev 3031)
@@ -61,12 +61,12 @@
  */
 public class RobotExclusionFilter extends ExclusionFilter {
 
-   private final static Logger LOGGER = Logger.getLogger(RobotExclusionFilter.class.getName());
-
+   private final static Logger LOGGER =
+       Logger.getLogger(RobotExclusionFilter.class.getName());
+
    private final static String HTTP_PREFIX = "http://";
    private final static String ROBOT_SUFFIX = "/robots.txt";
-
    private static String WWWN_REGEX = "^www[0-9]+\\.";
    private final static Pattern WWWN_PATTERN = Pattern.compile(WWWN_REGEX);
    private LiveWebCache webCache = null;
@@ -160,11 +160,7 @@
        List<String> urlStrings = searchResultToRobotUrlStrings(host);
        Iterator<String> itr = urlStrings.iterator();
        String firstUrlString = null;
-//     StringBuilder sb = new StringBuilder();
-//     for(String ttt : urlStrings) {
-//         sb.append("RU(").append(ttt).append(")");
-//     }
-//     LOGGER.info("RobotUrls for("+host+")"+sb.toString());
+
        // loop through them all. As soon as we get a response, store that
        // in the cache for the FIRST url we tried and return it..
        // If we get no responses for any of the robot URLs, use "empty" rules,
        // and record that in the cache, too.
@@ -202,15 +198,13 @@
                    LOGGER.info("ROBOT: LiveDocumentNotAvailableException("+urlString+")");
 
                } catch (MalformedURLException e) {
-                   e.printStackTrace();
+//                 e.printStackTrace();
                    LOGGER.info("ROBOT: MalformedURLException("+urlString+")");
                    return null;
                } catch (IOException e) {
-                   e.printStackTrace(System.err);
-                   LOGGER.info("ROBOT: IOException("+urlString+"):"+e.getLocalizedMessage());
+                   LOGGER.warning("ROBOT: IOException("+urlString+"):"+e.getLocalizedMessage());
                    return null;
                } catch (LiveWebCacheUnavailableException e) {
-                   e.printStackTrace();
                    LOGGER.info("ROBOT: LiveWebCacheUnavailableException("+urlString+")");
                    return null;
                }
Revision: 3247
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3247&view=rev
Author:   bradtofel
Date:     2010-09-03 23:19:28 +0000 (Fri, 03 Sep 2010)

Log Message:
-----------
FEATURE: explicitly allowing download of /robots.txt paths, without consulting the robots.txt file.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-09-03 22:32:51 UTC (rev 3246)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-09-03 23:19:28 UTC (rev 3247)
@@ -236,7 +236,9 @@
        URL url;
        try {
            url = new URL(ArchiveUtils.addImpliedHttpIfNecessary(resultURL));
-           if(!rules.blocksPathForUA(url.getPath(), userAgent)) {
+           String path = url.getPath();
+           if(path.equals(ROBOT_SUFFIX) ||
+                   !rules.blocksPathForUA(path, userAgent)) {
                if(!notifiedPassed) {
                    if(filterGroup != null) {
                        filterGroup.setPassedRobots();
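The special case turns on an exact path comparison: only a capture whose path is exactly "/robots.txt" bypasses the rules. A minimal sketch of just that check (URL parsing only; the real filter also updates filterGroup state):

import java.net.MalformedURLException;
import java.net.URL;

// Sketch of the new special case: a capture whose path is exactly
// "/robots.txt" is always passed, without consulting the rules.
public class RobotsPathCheckDemo {
    private static final String ROBOT_SUFFIX = "/robots.txt";

    static boolean isRobotsTxtCapture(String urlString) throws MalformedURLException {
        URL url = new URL(urlString);
        return url.getPath().equals(ROBOT_SUFFIX);
    }

    public static void main(String[] args) throws MalformedURLException {
        System.out.println(isRobotsTxtCapture("http://example.com/robots.txt")); // true
        System.out.println(isRobotsTxtCapture("http://example.com/page.html"));  // false
    }
}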
Revision: 3453
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3453&view=rev
Author:   bradtofel
Date:     2011-05-25 01:40:30 +0000 (Wed, 25 May 2011)

Log Message:
-----------
OPTIMIZ: now uses UrlOperations.getUrlPath() instead of constructing a URL object when determining if URLs are /robots.txt

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2011-05-25 01:37:48 UTC (rev 3452)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2011-05-25 01:40:30 UTC (rev 3453)
@@ -39,6 +39,7 @@
 import org.archive.wayback.liveweb.LiveWebCache;
 import org.archive.wayback.resourceindex.filters.ExclusionFilter;
 import org.archive.wayback.util.ObjectFilter;
+import org.archive.wayback.util.url.UrlOperations;
 
 /**
  * CaptureSearchResult Filter that uses a LiveWebCache to retrieve robots.txt
@@ -230,6 +231,17 @@
            }
            notifiedSeen = true;
        }
+       String resultURL = r.getOriginalUrl();
+       String path = UrlOperations.getURLPath(resultURL);
+       if(path.equals(ROBOT_SUFFIX)) {
+           if(!notifiedPassed) {
+               if(filterGroup != null) {
+                   filterGroup.setPassedRobots();
+               }
+               notifiedPassed = true;
+           }
+           return ObjectFilter.FILTER_INCLUDE;
+       }
        int filterResult = ObjectFilter.FILTER_EXCLUDE;
        RobotRules rules = getRules(r);
        if(rules == null) {
@@ -237,26 +249,17 @@
                return ObjectFilter.FILTER_ABORT;
            }
        } else {
-           String resultURL = r.getOriginalUrl();
-           URL url;
-           try {
-               url = new URL(ArchiveUtils.addImpliedHttpIfNecessary(resultURL));
-               String path = url.getPath();
-               if(path.equals(ROBOT_SUFFIX) ||
-                       !rules.blocksPathForUA(path, userAgent)) {
-                   if(!notifiedPassed) {
-                       if(filterGroup != null) {
-                           filterGroup.setPassedRobots();
-                       }
-                       notifiedPassed = true;
+           if(!rules.blocksPathForUA(path, userAgent)) {
+               if(!notifiedPassed) {
+                   if(filterGroup != null) {
+                       filterGroup.setPassedRobots();
                    }
-                   filterResult = ObjectFilter.FILTER_INCLUDE;
-                   LOGGER.fine("ROBOT: ALLOWED("+resultURL+")");
-               } else {
-                   LOGGER.info("ROBOT: BLOCKED("+resultURL+")");
+                   notifiedPassed = true;
                }
-           } catch (MalformedURLException e) {
-               e.printStackTrace();
+               filterResult = ObjectFilter.FILTER_INCLUDE;
+               LOGGER.fine("ROBOT: ALLOWED("+resultURL+")");
+           } else {
+               LOGGER.info("ROBOT: BLOCKED("+resultURL+")");
            }
        }
        return filterResult;
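The win here is avoiding a java.net.URL allocation (and its checked MalformedURLException) for every capture just to read the path. An illustrative stand-in for the idea, not the actual UrlOperations.getURLPath() implementation:

// Sketch only: extracts the path with string operations instead of
// constructing a java.net.URL. For brevity it ignores query strings
// and other edge cases, which the real helper may treat differently.
public class UrlPathDemo {
    static String getUrlPath(String url) {
        int schemeEnd = url.indexOf("://");
        int hostStart = (schemeEnd < 0) ? 0 : schemeEnd + 3;
        int pathStart = url.indexOf('/', hostStart);
        return (pathStart < 0) ? "/" : url.substring(pathStart);
    }

    public static void main(String[] args) {
        System.out.println(getUrlPath("http://example.com/robots.txt")); // "/robots.txt"
        System.out.println(getUrlPath("http://example.com"));            // "/"
    }
}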
Revision: 3485
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3485&view=rev
Author:   bradtofel
Date:     2011-07-08 04:47:51 +0000 (Fri, 08 Jul 2011)

Log Message:
-----------
LOGGING: changed logging levels for most messages, added PerformanceLogger

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2011-07-08 04:40:04 UTC (rev 3484)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2011-07-08 04:47:51 UTC (rev 3485)
@@ -40,6 +40,7 @@
 import org.archive.wayback.resourceindex.filters.ExclusionFilter;
 import org.archive.wayback.util.ObjectFilter;
 import org.archive.wayback.util.url.UrlOperations;
+import org.archive.wayback.webapp.PerformanceLogger;
 
 /**
  * CaptureSearchResult Filter that uses a LiveWebCache to retrieve robots.txt
@@ -172,16 +173,20 @@
                LOGGER.fine("ROBOT: Cached("+urlString+")");
                rules = rulesCache.get(urlString);
                if(!urlString.equals(firstUrlString)) {
-                   LOGGER.info("Adding extra url("+firstUrlString+") for prev cached rules("+urlString+")");
+                   LOGGER.fine("Adding extra url("+firstUrlString+") for prev cached rules("+urlString+")");
                    rulesCache.put(firstUrlString, rules);
                }
            } else {
                try {
-                   LOGGER.info("ROBOT: NotCached - Downloading("+urlString+")");
+                   LOGGER.fine("ROBOT: NotCached - Downloading("+urlString+")");
 
                    tmpRules = new RobotRules();
+                   long start = System.currentTimeMillis();
                    Resource resource = webCache.getCachedResource(new URL(urlString),
                            maxCacheMS,true);
+                   long elapsed = System.currentTimeMillis() - start;
+                   PerformanceLogger.noteElapsed("RobotRequest", elapsed, urlString);
+
                    if(resource.getStatusCode() != 200) {
                        LOGGER.info("ROBOT: NotAvailable("+urlString+")");
                        throw new LiveDocumentNotAvailableException(urlString);
@@ -189,24 +194,24 @@
                    tmpRules.parse(resource);
                    rulesCache.put(firstUrlString,tmpRules);
                    rules = tmpRules;
-                   LOGGER.info("ROBOT: Downloaded("+urlString+")");
+                   LOGGER.fine("ROBOT: Downloaded("+urlString+")");
 
                } catch (LiveDocumentNotAvailableException e) {
                    LOGGER.info("ROBOT: LiveDocumentNotAvailableException("+urlString+")");
 
                } catch (MalformedURLException e) {
 //                 e.printStackTrace();
-                   LOGGER.info("ROBOT: MalformedURLException("+urlString+")");
+                   LOGGER.warning("ROBOT: MalformedURLException("+urlString+")");
                    return null;
                } catch (IOException e) {
                    LOGGER.warning("ROBOT: IOException("+urlString+"):"+e.getLocalizedMessage());
                    return null;
                } catch (LiveWebCacheUnavailableException e) {
-                   LOGGER.info("ROBOT: LiveWebCacheUnavailableException("+urlString+")");
+                   LOGGER.severe("ROBOT: LiveWebCacheUnavailableException("+urlString+")");
                    filterGroup.setLiveWebGone();
                    return null;
                } catch (LiveWebTimeoutException e) {
-                   LOGGER.info("ROBOT: LiveDocumentTimedOutException("+urlString+")");
+                   LOGGER.severe("ROBOT: LiveDocumentTimedOutException("+urlString+")");
                    filterGroup.setRobotTimedOut();
                    return null;
                }
@@ -216,7 +221,7 @@
            // special-case, allow empty rules if no longer available.
            rulesCache.put(firstUrlString,emptyRules);
            rules = emptyRules;
-           LOGGER.info("No rules available, using emptyRules for:" + firstUrlString);
+           LOGGER.fine("No rules available, using emptyRules for:" + firstUrlString);
        }
        return rules;
    }
@@ -257,9 +262,9 @@
                    notifiedPassed = true;
                }
                filterResult = ObjectFilter.FILTER_INCLUDE;
-               LOGGER.fine("ROBOT: ALLOWED("+resultURL+")");
+               LOGGER.finer("ROBOT: ALLOWED("+resultURL+")");
            } else {
-               LOGGER.info("ROBOT: BLOCKED("+resultURL+")");
+               LOGGER.fine("ROBOT: BLOCKED("+resultURL+")");
            }
        }
        return filterResult;
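The new timing code brackets only the live-web fetch, then hands the elapsed milliseconds to PerformanceLogger. The pattern in isolation, with a hypothetical noteElapsed() standing in for Wayback's helper:

// Sketch of the timing pattern introduced here: capture a start time,
// perform the (possibly slow) fetch, then record the elapsed
// milliseconds together with an identifying label and detail string.
public class TimingDemo {
    // Hypothetical stand-in for PerformanceLogger.noteElapsed(...).
    static void noteElapsed(String what, long elapsedMs, String detail) {
        System.out.println(what + " took " + elapsedMs + "ms: " + detail);
    }

    public static void main(String[] args) throws InterruptedException {
        String urlString = "http://example.com/robots.txt";
        long start = System.currentTimeMillis();
        Thread.sleep(50); // stand-in for webCache.getCachedResource(...)
        long elapsed = System.currentTimeMillis() - start;
        noteElapsed("RobotRequest", elapsed, urlString);
    }
}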
Revision: 3624
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3624&view=rev
Author:   ikreymer
Date:     2012-03-01 08:47:49 +0000 (Thu, 01 Mar 2012)

Log Message:
-----------
BUGFIX: Fix an NPE due to null filterGroups

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2012-02-29 22:37:07 UTC (rev 3623)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2012-03-01 08:47:49 UTC (rev 3624)
@@ -207,11 +207,15 @@
                    return null;
                } catch (LiveWebCacheUnavailableException e) {
                    LOGGER.severe("ROBOT: LiveWebCacheUnavailableException("+urlString+")");
-                   filterGroup.setLiveWebGone();
+                   if (filterGroup != null) {
+                       filterGroup.setLiveWebGone();
+                   }
                    return null;
                } catch (LiveWebTimeoutException e) {
                    LOGGER.severe("ROBOT: LiveDocumentTimedOutException("+urlString+")");
-                   filterGroup.setRobotTimedOut();
+                   if (filterGroup != null) {
+                       filterGroup.setRobotTimedOut();
+                   }
                    return null;
                } finally {
                    long elapsed = System.currentTimeMillis() - start;
@@ -252,7 +256,7 @@
        int filterResult = ObjectFilter.FILTER_EXCLUDE;
        RobotRules rules = getRules(r);
        if(rules == null) {
-           if(filterGroup.getRobotTimedOut() || filterGroup.getLiveWebGone()) {
+           if((filterGroup == null) || (filterGroup.getRobotTimedOut() || filterGroup.getLiveWebGone())) {
                return ObjectFilter.FILTER_ABORT;
            }
        } else {