Revision: 2994
http://archive-access.svn.sourceforge.net/archive-access/?rev=2994&view=rev
Author: bradtofel
Date: 2010-03-20 01:19:20 +0000 (Sat, 20 Mar 2010)
Log Message:
-----------
BUGFIX(unreported): was not actually caching robots.txt rules correctly, causing MANY redundant robots.txt requests.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2010-03-20 01:16:21 UTC (rev 2993)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java 2010-03-20 01:19:20 UTC (rev 2994)
@@ -99,17 +99,29 @@
private String hostToRobotUrlString(String host) {
sb.setLength(0);
sb.append(HTTP_PREFIX).append(host).append(ROBOT_SUFFIX);
- return sb.toString();
+ String robotUrl = sb.toString();
+ LOGGER.fine("Adding robot URL:" + robotUrl);
+ return robotUrl;
}
/*
- * Return a List of all robots.txt urls to attempt for this url:
- * If originalURL starts with "www.DOMAIN":
- * [originalURL,DOMAIN]
- * If url starts with "www[0-9]+.DOMAIN":
- * [originalURL,www.DOMAIN,DOMAIN]
+ * Return a List of all robots.txt urls to attempt for this HOST:
+ * If HOST starts with "www.DOMAIN":
+ * [
+ * http://HOST/robots.txt,
+ * http://DOMAIN/robots.txt
+ * ]
+ * If HOST starts with "www[0-9]+.DOMAIN":
+ * [
+ * http://HOST/robots.txt,
+ * http://www.DOMAIN/robots.txt,
+ * http://DOMAIN/robots.txt
+ * ]
* Otherwise:
- * [originalURL,www.originalURL]
+ * [
+ * http://HOST/robots.txt,
+ * http://www.HOST/robots.txt
+ * ]
*/
protected List<String> searchResultToRobotUrlStrings(String resultHost) {
ArrayList<String> list = new ArrayList<String>();
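
[Editor's note] As a rough illustration of the fallback order documented in the rewritten comment above: the body of searchResultToRobotUrlStrings is not shown in this hunk, so the standalone sketch below is only an approximation, and the class and method names (RobotUrlListSketch, robotUrlsForHost) are placeholders, not Wayback code. The prefix/suffix values are taken from the method shown at the top of the hunk.

import java.util.ArrayList;
import java.util.List;

// Sketch only: approximates the documented fallback order; not the
// committed implementation of searchResultToRobotUrlStrings.
class RobotUrlListSketch {
    private static final String HTTP_PREFIX = "http://";
    private static final String ROBOT_SUFFIX = "/robots.txt";

    static List<String> robotUrlsForHost(String host) {
        List<String> urls = new ArrayList<String>();
        // Always try the host itself first.
        urls.add(HTTP_PREFIX + host + ROBOT_SUFFIX);
        if (host.startsWith("www.")) {
            // www.DOMAIN -> also try the bare DOMAIN
            urls.add(HTTP_PREFIX + host.substring(4) + ROBOT_SUFFIX);
        } else if (host.matches("^www[0-9]+\\..+")) {
            // www123.DOMAIN -> also try www.DOMAIN, then bare DOMAIN
            String domain = host.substring(host.indexOf('.') + 1);
            urls.add(HTTP_PREFIX + "www." + domain + ROBOT_SUFFIX);
            urls.add(HTTP_PREFIX + domain + ROBOT_SUFFIX);
        } else {
            // anything else -> also try the www. variant
            urls.add(HTTP_PREFIX + "www." + host + ROBOT_SUFFIX);
        }
        return urls;
    }
}
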
@@ -135,22 +147,41 @@
private RobotRules getRules(CaptureSearchResult result) {
RobotRules rules = null;
RobotRules tmpRules = null;
- String host = result.getOriginalHost();
+ String host;
+ try {
+ host = result.getOriginalHost();
+ } catch(Exception e) {
+ LOGGER.warning("ROBOT: Failed to get host from("+result.getOriginalUrl()+")");
+ return null;
+ }
List<String> urlStrings = searchResultToRobotUrlStrings(host);
Iterator<String> itr = urlStrings.iterator();
String firstUrlString = null;
-
+// StringBuilder sb = new StringBuilder();
+// for(String ttt : urlStrings) {
+// sb.append("RU(").append(ttt).append(")");
+// }
+// LOGGER.info("RobotUrls for("+host+")"+sb.toString());
+ // loop through them all. As soon as we get a response, store that
+ // in the cache for the FIRST url we tried and return it.
+ // If we get no responses for any of the robot URLs, use "empty" rules,
+ // and record that in the cache, too.
+
while(rules == null && itr.hasNext()) {
String urlString = (String) itr.next();
if(firstUrlString == null) {
firstUrlString = urlString;
}
if(rulesCache.containsKey(urlString)) {
- LOGGER.fine("ROBOT: Cached("+urlString+")");
+ LOGGER.info("ROBOT: Cached("+urlString+")");
rules = rulesCache.get(urlString);
+ if(!urlString.equals(firstUrlString)) {
+ LOGGER.info("Adding extra url("+firstUrlString+") for prev cached rules("+urlString+")");
+ rulesCache.put(firstUrlString, rules);
+ }
} else {
try {
- LOGGER.fine("ROBOT: NotCached("+urlString+")");
+ LOGGER.info("ROBOT: NotCached("+urlString+")");
tmpRules = new RobotRules();
Resource resource = webCache.getCachedResource(new URL(urlString),
@@ -165,18 +196,19 @@
LOGGER.info("ROBOT: Downloaded("+urlString+")");
} catch (LiveDocumentNotAvailableException e) {
- // cache an empty rule: all OK
-// rulesCache.put(firstUrlString, emptyRules);
-// rules = emptyRules;
- continue;
+ LOGGER.info("ROBOT: LiveDocumentNotAvailableException("+urlString+")");
+
} catch (MalformedURLException e) {
e.printStackTrace();
+ LOGGER.info("ROBOT: MalformedURLException("+urlString+")");
return null;
} catch (IOException e) {
- e.printStackTrace();
+ e.printStackTrace(System.err);
+ LOGGER.info("ROBOT: IOException("+urlString+"):"+e.getLocalizedMessage());
return null;
} catch (LiveWebCacheUnavailableException e) {
e.printStackTrace();
+ LOGGER.info("ROBOT: LiveWebCacheUnavailableException("+urlString+")");
return null;
}
}
@@ -185,6 +217,7 @@
// special-case, allow empty rules if no longer available.
rulesCache.put(firstUrlString,emptyRules);
rules = emptyRules;
+ LOGGER.info("No rules available, using emptyRules for:" + firstUrlString);
}
return rules;
}
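
[Editor's note] The substance of the fix is in this hunk: when rules are found in the cache under one of the fallback URLs, they are now also stored under the first URL that was tried, and downloaded or empty rules are likewise cached under that first key, so repeat lookups for the same host no longer trigger fresh robots.txt fetches. A minimal sketch of that caching pattern follows; the names (RulesCacheSketch, Fetcher) are placeholders for illustration and are not Wayback APIs.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Sketch of the "cache under the first key tried" pattern from the hunk above.
class RulesCacheSketch<R> {
    private final Map<String, R> cache = new HashMap<String, R>();

    interface Fetcher<T> {
        T fetch(String url) throws Exception; // e.g. download and parse robots.txt
    }

    R getRules(List<String> candidateUrls, Fetcher<R> fetcher, R emptyRules) {
        String firstUrl = candidateUrls.isEmpty() ? null : candidateUrls.get(0);
        for (String url : candidateUrls) {
            R cached = cache.get(url);
            if (cached != null) {
                // Fix: also record the hit under the first key we tried, so the
                // next request for this host resolves on the first lookup.
                if (!url.equals(firstUrl)) {
                    cache.put(firstUrl, cached);
                }
                return cached;
            }
            try {
                R fetched = fetcher.fetch(url);
                cache.put(firstUrl, fetched); // cache under the first key tried
                return fetched;
            } catch (Exception e) {
                // not available at this URL: try the next candidate
            }
        }
        if (firstUrl != null) {
            cache.put(firstUrl, emptyRules); // nothing available: cache empty rules
        }
        return emptyRules;
    }
}
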
@@ -203,6 +236,7 @@
url = new URL(ArchiveUtils.addImpliedHttpIfNecessary(resultURL));
if(!rules.blocksPathForUA(url.getPath(), userAgent)) {
filterResult = ObjectFilter.FILTER_INCLUDE;
+ LOGGER.fine("ROBOT: ALLOWED("+resultURL+")");
} else {
LOGGER.info("ROBOT: BLOCKED("+resultURL+")");
}
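
[Editor's note] The last hunk only adds logging around the existing allow/block decision, which delegates to RobotRules.blocksPathForUA. As background, that decision boils down to prefix-matching the request path against the Disallow rules recorded for the matching User-agent. The toy matcher below illustrates the idea; it is not the RobotRules implementation.

import java.util.Arrays;
import java.util.List;

// Toy path matcher: illustrates the prefix-match semantics of robots.txt
// Disallow rules; not the org.archive.wayback RobotRules implementation.
class DisallowCheckSketch {
    static boolean blocksPath(List<String> disallowPrefixes, String path) {
        for (String prefix : disallowPrefixes) {
            if (prefix.isEmpty()) {
                continue; // "Disallow:" with no value allows everything
            }
            if (path.startsWith(prefix)) {
                return true; // path falls under a disallowed prefix -> block
            }
        }
        return false; // no rule matched -> allow
    }

    public static void main(String[] args) {
        List<String> disallow = Arrays.asList("/cgi-bin/", "/private/");
        System.out.println(blocksPath(disallow, "/private/data.html")); // true
        System.out.println(blocksPath(disallow, "/index.html"));        // false
    }
}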