Revision: 2994
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2994&view=rev
Author:   bradtofel
Date:     2010-03-20 01:19:20 +0000 (Sat, 20 Mar 2010)

Log Message:
-----------
BUGFIX(unreported): was not actually caching a robots.txt correctly, causing MANY robots.txt requests.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-03-20 01:16:21 UTC (rev 2993)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotExclusionFilter.java	2010-03-20 01:19:20 UTC (rev 2994)
@@ -99,17 +99,29 @@
     private String hostToRobotUrlString(String host) {
         sb.setLength(0);
         sb.append(HTTP_PREFIX).append(host).append(ROBOT_SUFFIX);
-        return sb.toString();
+        String robotUrl = sb.toString();
+        LOGGER.fine("Adding robot URL:" + robotUrl);
+        return robotUrl;
     }
 
     /*
-     * Return a List of all robots.txt urls to attempt for this url:
-     * If originalURL starts with "www.DOMAIN":
-     *    [originalURL,DOMAIN]
-     * If url starts with "www[0-9]+.DOMAIN":
-     *    [originalURL,www.DOMAIN,DOMAIN]
+     * Return a List of all robots.txt urls to attempt for this HOST:
+     * If HOST starts with "www.DOMAIN":
+     *    [
+     *        http://HOST/robots.txt,
+     *        http://DOMAIN/robots.txt
+     *    ]
+     * If HOST starts with "www[0-9]+.DOMAIN":
+     *    [
+     *        http://HOST/robots.txt,
+     *        http://www.DOMAIN/robots.txt,
+     *        http://DOMAIN/robots.txt
+     *    ]
     * Otherwise:
-     *    [originalURL,www.originalURL]
+     *    [
+     *        http://HOST/robots.txt,
+     *        http://www.HOST/robots.txt
+     *    ]
     */
     protected List<String> searchResultToRobotUrlStrings(String resultHost) {
         ArrayList<String> list = new ArrayList<String>();
@@ -135,22 +147,41 @@
     private RobotRules getRules(CaptureSearchResult result) {
         RobotRules rules = null;
         RobotRules tmpRules = null;
-        String host = result.getOriginalHost();
+        String host;
+        try {
+            host = result.getOriginalHost();
+        } catch(Exception e) {
+            LOGGER.warning("ROBOT: Failed to get host from("+result.getOriginalUrl()+")");
+            return null;
+        }
         List<String> urlStrings = searchResultToRobotUrlStrings(host);
         Iterator<String> itr = urlStrings.iterator();
         String firstUrlString = null;
-
+//        StringBuilder sb = new StringBuilder();
+//        for(String ttt : urlStrings) {
+//            sb.append("RU(").append(ttt).append(")");
+//        }
+//        LOGGER.info("RobotUrls for("+host+")"+sb.toString());
+        // loop through them all. As soon as we get a response, store that
+        // in the cache for the FIRST url we tried and return it..
+        // If we get no responses for any of the robot URLs, use "empty" rules,
+        // and record that in the cache, too.
+
         while(rules == null && itr.hasNext()) {
             String urlString = (String) itr.next();
             if(firstUrlString == null) {
                 firstUrlString = urlString;
             }
             if(rulesCache.containsKey(urlString)) {
-                LOGGER.fine("ROBOT: Cached("+urlString+")");
+                LOGGER.info("ROBOT: Cached("+urlString+")");
                 rules = rulesCache.get(urlString);
+                if(!urlString.equals(firstUrlString)) {
+                    LOGGER.info("Adding extra url("+firstUrlString+") for prev cached rules("+urlString+")");
+                    rulesCache.put(firstUrlString, rules);
+                }
             } else {
                 try {
-                    LOGGER.fine("ROBOT: NotCached("+urlString+")");
+                    LOGGER.info("ROBOT: NotCached("+urlString+")");
                     tmpRules = new RobotRules();
                     Resource resource = webCache.getCachedResource(new URL(urlString),
@@ -165,18 +196,19 @@
                     LOGGER.info("ROBOT: Downloaded("+urlString+")");
 
                 } catch (LiveDocumentNotAvailableException e) {
-                    // cache an empty rule: all OK
-//                    rulesCache.put(firstUrlString, emptyRules);
-//                    rules = emptyRules;
-                    continue;
+                    LOGGER.info("ROBOT: LiveDocumentNotAvailableException("+urlString+")");
+
                 } catch (MalformedURLException e) {
                     e.printStackTrace();
+                    LOGGER.info("ROBOT: MalformedURLException("+urlString+")");
                     return null;
                 } catch (IOException e) {
-                    e.printStackTrace();
+                    e.printStackTrace(System.err);
+                    LOGGER.info("ROBOT: IOException("+urlString+"):"+e.getLocalizedMessage());
                     return null;
                 } catch (LiveWebCacheUnavailableException e) {
                     e.printStackTrace();
+                    LOGGER.info("ROBOT: LiveWebCacheUnavailableException("+urlString+")");
                     return null;
                 }
             }
@@ -185,6 +217,7 @@
             // special-case, allow empty rules if no longer available.
             rulesCache.put(firstUrlString,emptyRules);
             rules = emptyRules;
+            LOGGER.info("No rules available, using emptyRules for:" + firstUrlString);
         }
         return rules;
     }
@@ -203,6 +236,7 @@
                 url = new URL(ArchiveUtils.addImpliedHttpIfNecessary(resultURL));
                 if(!rules.blocksPathForUA(url.getPath(), userAgent)) {
                     filterResult = ObjectFilter.FILTER_INCLUDE;
+                    LOGGER.fine("ROBOT: ALLOWED("+resultURL+")");
                 } else {
                     LOGGER.info("ROBOT: BLOCKED("+resultURL+")");
                 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
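Editor's note: the rewritten comment block in the first hunk spells out which robots.txt URLs are attempted for a given host. The following is a minimal, self-contained Java sketch of that candidate ordering only; the class and method names are invented for illustration and are not the Wayback implementation, which builds the list in searchResultToRobotUrlStrings() from HTTP_PREFIX and ROBOT_SUFFIX.

import java.util.ArrayList;
import java.util.List;

// Illustrative sketch only: mirrors the candidate-URL ordering described in
// the updated comment, using hypothetical names (not the Wayback classes).
public class RobotUrlCandidates {

    // Return the robots.txt URLs to attempt for a host, most specific first.
    static List<String> candidates(String host) {
        List<String> list = new ArrayList<String>();
        list.add("http://" + host + "/robots.txt");
        if (host.matches("www[0-9]+\\..+")) {
            // www3.example.com -> also try www.example.com and example.com
            String domain = host.substring(host.indexOf('.') + 1);
            list.add("http://www." + domain + "/robots.txt");
            list.add("http://" + domain + "/robots.txt");
        } else if (host.startsWith("www.")) {
            // www.example.com -> also try example.com
            list.add("http://" + host.substring(4) + "/robots.txt");
        } else {
            // example.com -> also try www.example.com
            list.add("http://www." + host + "/robots.txt");
        }
        return list;
    }

    public static void main(String[] args) {
        System.out.println(candidates("www.example.com"));
        System.out.println(candidates("www2.example.com"));
        System.out.println(candidates("example.com"));
    }
}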
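The core of the fix is in getRules(): whichever candidate URL finally yields rules (or the empty-rules fallback when nothing answers), the result is now recorded in rulesCache under the FIRST candidate URL tried, so later lookups for the same host hit the cache instead of triggering another robots.txt request. Below is a minimal sketch of that caching pattern, assuming a plain HashMap cache, a String stand-in for RobotRules, and a hypothetical fetch() helper in place of the live-web cache; it is an illustration of the idea, not the actual RobotExclusionFilter code.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Illustrative sketch of "cache the result under the first candidate URL".
public class FirstUrlRulesCache {

    private final Map<String, String> rulesCache = new HashMap<String, String>();
    private static final String EMPTY_RULES = "";

    // Resolve rules for a host's candidate URLs, always recording the result
    // under the FIRST candidate so the next lookup is a cache hit.
    String getRules(List<String> candidateUrls) {
        String first = null;
        for (String url : candidateUrls) {
            if (first == null) {
                first = url;
            }
            String rules = rulesCache.get(url);
            if (rules == null) {
                rules = fetch(url); // may return null if the document is unavailable
            }
            if (rules != null) {
                rulesCache.put(first, rules); // key the result to the first URL tried
                return rules;
            }
        }
        // nothing answered: cache empty rules so we stop re-requesting robots.txt
        if (first != null) {
            rulesCache.put(first, EMPTY_RULES);
        }
        return EMPTY_RULES;
    }

    // Hypothetical fetch; the real filter goes through a live-web cache.
    private String fetch(String url) {
        return null; // placeholder: pretend the document was not available
    }
}

Keying the cache entry to the first candidate URL matters because the first candidate is what every subsequent capture for that host will ask for first; without that put (the pre-fix behavior fell through on LiveDocumentNotAvailableException without caching anything), each lookup walked the candidate list and re-fetched robots.txt.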