From: <bra...@us...> - 2011-11-16 22:18:05
Revision: 3559
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3559&view=rev
Author:   bradtofel
Date:     2011-11-16 22:17:57 +0000 (Wed, 16 Nov 2011)

Log Message:
-----------
INITIAL REV: not yet fully tested, but much-improved robots.txt handling. Uses a copy of the current Heritrix 3 (H3) robots.txt handling: supports Allow as well as Disallow directives, parses more robustly, and cleans up the code with a clearer separation of responsibilities.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java
    trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,11 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +public class FixedRobotsDirectives extends RobotsDirectives { + private boolean result; + public FixedRobotsDirectives(boolean result) { + this.result = result; + } + public boolean allows(String path) { + return result; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,164 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.Resource; +import org.archive.wayback.exception.LiveDocumentNotAvailableException; +import org.archive.wayback.exception.LiveWebCacheUnavailableException; +import org.archive.wayback.exception.LiveWebTimeoutException; +import org.archive.wayback.liveweb.LiveWebCache; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.UrlOperations; +import org.archive.wayback.webapp.PerformanceLogger; + +public class 
HRobotExclusionFilter extends ExclusionFilter { + + private final static String ROBOT_SUFFIX = "/robots.txt"; + private final static Logger LOGGER = + Logger.getLogger(HRobotExclusionFilter.class.getName()); + + // TODO: this is not the right thing! + private Charset cs = Charset.forName("UTF-8"); + + private RobotsDirectiveAggregation aggregation = null; + private LiveWebCache webCache = null; + + private String userAgent = null; + private boolean notifiedSeen = false; + private boolean notifiedPassed = false; + private static final FixedRobotsDirectives ALLOW_ROBOT_DIRECTIVE = + new FixedRobotsDirectives(true); + + /** + * Construct a new HRobotExclusionFilter that uses webCache to pull + * robots.txt documents. filtering is based on userAgent, and cached + * documents newer than maxCacheMS in the webCache are considered valid. + * + * @param webCache LiveWebCache from which documents can be retrieved + * @param userAgent String user agent to use for requests to the live web. + * @param maxCacheMS long number of milliseconds to cache documents in the + * LiveWebCache + */ + public HRobotExclusionFilter(LiveWebCache webCache, String userAgent, + long maxCacheMS) { + aggregation = new RobotsDirectiveAggregation(); + this.webCache = webCache; + this.userAgent = userAgent; + } + + private void updateAggregation(String host) + throws LiveWebCacheUnavailableException, + LiveWebTimeoutException, MalformedURLException, IOException { + + List<String> missing = aggregation.getMissingRobotUrls(host); + for(String robotUrl : missing) { + long start = System.currentTimeMillis(); + Resource resource; + try { + resource = webCache.getCachedResource(new URL(robotUrl), + 0,true); + if(resource.getStatusCode() != 200) { + LOGGER.info("ROBOT: Non200("+robotUrl+")"); + // consider it an allow: + aggregation.addDirectives(robotUrl, ALLOW_ROBOT_DIRECTIVE); + } else { + InputStreamReader isr = new InputStreamReader(resource, cs); + BufferedReader br = new BufferedReader(isr); + Robotstxt robotsTxt = new Robotstxt(br); + RobotsDirectives directives = robotsTxt.getDirectivesFor(userAgent); + aggregation.addDirectives(robotUrl, directives); + } + } catch (LiveDocumentNotAvailableException e) { + if(LOGGER.isLoggable(Level.INFO)) { + LOGGER.info("ROBOT: LiveDocumentNotAvailableException(" + + robotUrl + ")"); + } + // consider it an allow: + aggregation.addDirectives(robotUrl, ALLOW_ROBOT_DIRECTIVE); + } + long elapsed = System.currentTimeMillis() - start; + PerformanceLogger.noteElapsed("RobotRequest", elapsed, robotUrl); + } + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultFilter#filterSearchResult(org.archive.wayback.core.SearchResult) + */ + public int filterObject(CaptureSearchResult r) { + if(!notifiedSeen) { + if(filterGroup != null) { + filterGroup.setSawRobots(); + } + notifiedSeen = true; + } + String originalURL = r.getOriginalUrl(); + String path = UrlOperations.getURLPath(originalURL); + if(path.equals(ROBOT_SUFFIX)) { + if(!notifiedPassed) { + if(filterGroup != null) { + filterGroup.setPassedRobots(); + } + notifiedPassed = true; + } + return ObjectFilter.FILTER_INCLUDE; + } + String host = UrlOperations.urlToHost(originalURL); + boolean updated = false; + try { + updateAggregation(host); + if(!aggregation.isBlocked(path)) { + if(LOGGER.isLoggable(Level.INFO)) { + LOGGER.fine("ROBOT: BLOCKED(" + originalURL + ")"); + } + if(LOGGER.isLoggable(Level.FINE)) { + LOGGER.finer("ROBOT: ALLOWED(" + originalURL + ")"); + } + if(!notifiedPassed) { + if(filterGroup != null) { + 
filterGroup.setPassedRobots(); + } + notifiedPassed = true; + } + return ObjectFilter.FILTER_INCLUDE; + } + +// } catch (LiveDocumentNotAvailableException e) { + } catch (LiveWebCacheUnavailableException e) { + LOGGER.severe("ROBOT: LiveWebCacheUnavailableException(" + + originalURL + ")"); + filterGroup.setLiveWebGone(); + + } catch (LiveWebTimeoutException e) { + LOGGER.severe("ROBOT: LiveDocumentTimedOutException(" + + originalURL + ")"); + filterGroup.setRobotTimedOut(); + + } catch (MalformedURLException e) { + + LOGGER.warning("ROBOT: MalformedURLException(" + + originalURL + ")"); + + } catch (IOException e) { + e.printStackTrace(); + return ObjectFilter.FILTER_EXCLUDE; + } + + if(filterGroup.getRobotTimedOut() || filterGroup.getLiveWebGone()) { + return ObjectFilter.FILTER_ABORT; + } + if(LOGGER.isLoggable(Level.INFO)) { + LOGGER.fine("ROBOT: BLOCKED(" + originalURL + ")"); + } + return ObjectFilter.FILTER_EXCLUDE; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,111 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Class which acts as an aggregation of RobotsDirectives. + * + * If given a host String, will return a list of additional robot URLs that + * need to be added to the current aggregation. + * + * Allows a user to then add new RobotsDirectives for one or more robot URLs. + * + * Finally, allows the aggregation to be queried to see if any of the + * directives block a particular path. 
+ * + * + * @author brad + * + */ +public class RobotsDirectiveAggregation { + private final static Logger LOGGER = + Logger.getLogger(RobotsDirectiveAggregation.class.getName()); + + private final static String HTTP_PREFIX = "http://"; + private final static String ROBOT_SUFFIX = "/robots.txt"; + + private static String WWWN_REGEX = "^www[0-9]+\\."; + private final static Pattern WWWN_PATTERN = Pattern.compile(WWWN_REGEX); + + private HashMap<String,RobotsDirectives> cache = + new HashMap<String, RobotsDirectives>(); + + private StringBuilder sb = new StringBuilder(); + + private String hostToRobotUrlString(final String host) { + sb.setLength(0); + sb.append(HTTP_PREFIX).append(host).append(ROBOT_SUFFIX); + String robotUrl = sb.toString(); + LOGGER.fine("Adding robot URL:" + robotUrl); + return robotUrl; + } + /* + */ + /** + * @param resultHost + * @return a List of all robots.txt urls to attempt for this HOST: + * If HOST starts with "www.DOMAIN": + * [ + * http://HOST/robots.txt, + * http://DOMAIN/robots.txt + * ] + * If HOST starts with "www[0-9]+.DOMAIN": + * [ + * http://HOST/robots.txt, + * http://www.DOMAIN/robots.txt, + * http://DOMAIN/robots.txt + * ] + * Otherwise: + * [ + * http://HOST/robots.txt, + * http://www.HOST/robots.txt + * ] + */ + List<String> hostToRobotUrlStrings(final String resultHost) { + ArrayList<String> list = new ArrayList<String>(); + list.add(hostToRobotUrlString(resultHost)); + + if(resultHost.startsWith("www")) { + if(resultHost.startsWith("www.")) { + list.add(hostToRobotUrlString(resultHost.substring(4))); + } else { + Matcher m = WWWN_PATTERN.matcher(resultHost); + if(m.find()) { + String massagedHost = resultHost.substring(m.end()); + list.add(hostToRobotUrlString("www." + massagedHost)); + list.add(hostToRobotUrlString(massagedHost)); + } + } + } else { + list.add(hostToRobotUrlString("www." + resultHost)); + } + return list; + } + + public List<String> getMissingRobotUrls(String host) { + ArrayList<String> missing = new ArrayList<String>(); + List<String> needed = hostToRobotUrlStrings(host); + for(String need : needed) { + if(!cache.containsKey(need)) { + missing.add(need); + } + } + return missing; + } + public void addDirectives(String url, RobotsDirectives directives) { + cache.put(url, directives); + } + public boolean isBlocked(String path) { + for(RobotsDirectives directives : cache.values()) { + if(!directives.allows(path)) { + return true; + } + } + return false; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,75 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.Serializable; +import java.util.concurrent.ConcurrentSkipListSet; + +/** + * Represents the directives that apply to a user-agent (or set of + * user-agents) + */ +public class RobotsDirectives implements Serializable { + private static final long serialVersionUID = 5386542759286155383L; + + ConcurrentSkipListSet<String> disallows = new ConcurrentSkipListSet<String>(); + ConcurrentSkipListSet<String> allows = new ConcurrentSkipListSet<String>(); + float crawlDelay = -1; + + public boolean allows(String path) { + return !(longestPrefixLength(disallows, path) > longestPrefixLength(allows, path)); + } + + /** + * @param prefixSet + * @param str + * @return length of longest entry in {@code prefixSet} that prefixes {@code str}, or zero + * if no entry prefixes {@code str} + */ + protected int longestPrefixLength(ConcurrentSkipListSet<String> prefixSet, + String str) { + String possiblePrefix = prefixSet.floor(str); + if (possiblePrefix != null && str.startsWith(possiblePrefix)) { + return possiblePrefix.length(); + } else { + return 0; + } + } + + public void addDisallow(String path) { + if(path.length()==0) { + // ignore empty-string disallows + // (they really mean allow, when alone) + return; + } + disallows.add(path); + } + + public void addAllow(String path) { + allows.add(path); + } + + public void setCrawlDelay(float i) { + crawlDelay=i; + } + + public float getCrawlDelay() { + return crawlDelay; + } +} \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,234 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Serializable; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.io.IOUtils; +import org.archive.io.ReadSource; + +/** + * Utility class for parsing and representing 'robots.txt' format + * directives, into a list of named user-agents and map from user-agents + * to RobotsDirectives. + */ +public class Robotstxt implements Serializable { + static final long serialVersionUID = 7025386509301303890L; + private static final Logger logger = + Logger.getLogger(Robotstxt.class.getName()); + + // all user agents contained in this robots.txt + // in order of declaration + // TODO: consider discarding irrelevant entries + LinkedList<String> namedUserAgents = new LinkedList<String>(); + // map user-agents to directives + Map<String,RobotsDirectives> agentsToDirectives = + new HashMap<String,RobotsDirectives>(); + RobotsDirectives wildcardDirectives = null; + + boolean hasErrors = false; + + static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives(); + /** empty, reusable instance for all sites providing no rules */ + public static Robotstxt NO_ROBOTS = new Robotstxt(); + + public Robotstxt() { + } + + public Robotstxt(BufferedReader reader) throws IOException { + initializeFromReader(reader); + } + + public Robotstxt(ReadSource customRobots) { + BufferedReader reader = new BufferedReader(customRobots.obtainReader()); + try { + initializeFromReader(reader); + } catch (IOException e) { + logger.log(Level.SEVERE, + "robots ReadSource problem: potential for inadvertent overcrawling", + e); + } finally { + IOUtils.closeQuietly(reader); + } + } + + protected void initializeFromReader(BufferedReader reader) throws IOException { + String read; + // current is the disallowed paths for the preceding User-Agent(s) + RobotsDirectives current = null; + // whether a non-'User-Agent' directive has been encountered + boolean hasDirectivesYet = false; + while (reader != null) { + do { + read = reader.readLine(); + // Skip comments & blanks + } while ((read != null) && ((read = read.trim()).startsWith("#") || + read.length() == 0)); + if (read == null) { + reader.close(); + reader = null; + } else { + // remove any html markup + read = read.replaceAll("<[^>]+>",""); + int commentIndex = read.indexOf("#"); + if (commentIndex > -1) { + // Strip trailing comment + read = read.substring(0, commentIndex); + } + read = read.trim(); + if (read.matches("(?i)^User-agent:.*")) { + String ua = read.substring(11).trim().toLowerCase(); + if (current == null || hasDirectivesYet ) { + // only create new rules-list if necessary + // otherwise share with previous user-agent + current = new RobotsDirectives(); + hasDirectivesYet = false; + } + if (ua.equals("*")) { + wildcardDirectives = current; + } else { + namedUserAgents.addLast(ua); + agentsToDirectives.put(ua, current); + } + continue; + } + if (read.matches("(?i)Disallow:.*")) { + if (current == null) { + // buggy robots.txt + hasErrors = true; + continue; + } + String path = read.substring(9).trim(); + // tolerate common error of ending path with '*' character + // (not allowed by original spec; redundant but harmless with + // Google's wildcarding extensions -- which we don't yet fully + // support). 
+ if(path.endsWith("*")) { + path = path.substring(0,path.length()-1); + } + current.addDisallow(path); + hasDirectivesYet = true; + continue; + } + if (read.matches("(?i)Crawl-delay:.*")) { + if (current == null) { + // buggy robots.txt + hasErrors = true; + continue; + } + // consider a crawl-delay, even though we don't + // yet understand it, as sufficient to end a + // grouping of User-Agent lines + hasDirectivesYet = true; + String val = read.substring(12).trim(); + val = val.split("[^\\d\\.]+")[0]; + try { + current.setCrawlDelay(Float.parseFloat(val)); + } catch (NumberFormatException nfe) { + // ignore + } + continue; + } + if (read.matches("(?i)Allow:.*")) { + if (current == null) { + // buggy robots.txt + hasErrors = true; + continue; + } + String path = read.substring(6).trim(); + // tolerate common error of ending path with '*' character + // (not allowed by original spec; redundant but harmless with + // Google's wildcarding extensions -- which we don't yet fully + // support). + if(path.endsWith("*")) { + path = path.substring(0,path.length()-1); + } + current.addAllow(path); + hasDirectivesYet = true; + continue; + } + // unknown line; do nothing for now + } + } + } + + /** + * Does this policy effectively allow everything? (No + * disallows or timing (crawl-delay) directives?) + * @return + */ + public boolean allowsAll() { + // TODO: refine so directives that are all empty are also + // recognized as allowing all + return agentsToDirectives.isEmpty(); + } + + public List<String> getNamedUserAgents() { + return namedUserAgents; + } + + /** + * Return the RobotsDirectives, if any, appropriate for the given User-Agent + * string. If useFallbacks is true, a wildcard ('*') directives or the default + * of NO_DIRECTIVES will be returned, as appropriate, if there is no better + * match. If useFallbacks is false, a null will be returned if no declared + * directives targeted the given User-Agent. + * + * @param ua String User-Agent to lookup + * @param useFallbacks if true, fall-back to wildcard directives or + * default allow as needed + * @return directives to use, or null if useFallbacks is false and no + * non-wildcard directives match the supplied User-Agent + */ + public RobotsDirectives getDirectivesFor(String ua, boolean useFallbacks) { + // find matching ua + for(String uaListed : namedUserAgents) { + if(ua.indexOf(uaListed)>-1) { + return agentsToDirectives.get(uaListed); + } + } + if(useFallbacks==false) { + return null; + } + if (wildcardDirectives!=null) { + return wildcardDirectives; + } + // no applicable user-agents, so empty directives + return NO_DIRECTIVES; + } + + /** + * Return directives to use for the given User-Agent, resorting to wildcard + * rules or the default no-directives if necessary. 
+ * + * @param userAgent String User-Agent to lookup + * @return directives to use + */ + public RobotsDirectives getDirectivesFor(String userAgent) { + return getDirectivesFor(userAgent, true); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,109 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang.StringEscapeUtils; + +import com.google.common.base.Strings; +import com.google.common.collect.Lists; + +import junit.framework.TestCase; + +public class RobotsDirectiveAggregationTest extends TestCase { + + private String[] mapRobotUrls(String[] in ) { + String res[] = new String[in.length]; + for(int i = 0; i < in.length; i++) { + res[i] = "http://" + in[i] + "/robots.txt"; + } + return res; + } + + + /** + * + */ + public void testHostToRobotUrlStrings() { + RobotsDirectiveAggregation f = new RobotsDirectiveAggregation(); + String test1[] = {"www.foo.com","foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www.foo.com"),mapRobotUrls(test1)); + + String test2[] = {"foo.com","www.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("foo.com"),mapRobotUrls(test2)); + + String test3[] = {"fool.foo.com","www.fool.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("fool.foo.com"),mapRobotUrls(test3)); + + String test4[] = {"www4.foo.com","www.foo.com","foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www4.foo.com"),mapRobotUrls(test4)); + + String test5[] = {"www4w.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www4w.foo.com"),mapRobotUrls(test5)); + + String test6[] = {"www.www.foo.com","www.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www.www.foo.com"),mapRobotUrls(test6)); + } + private String strJoin(Iterable<String> i, char del) { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for(String s : i) { + if(first) { + first = false; + } else { + sb.append(del); + } + sb.append(s); + } + return sb.toString(); + } + private List<String> sortA(String[] a) { + Arrays.sort(a); + return Lists.newArrayList(a); + } + private List<String> sortL(List<String> a) { + String[] Empty = new String[0]; + String[] tmp; + tmp = a.toArray(Empty); + Arrays.sort(tmp); + return Lists.newArrayList(tmp); + } + private void compareListTo(List<String> list, String strings[]) { + + boolean match = list.size() == strings.length; + List<String> ls = sortL(list); + List<String> ss = sortA(strings); + if(match) { + for(int i = 0; i < strings.length; i++) { + if(!ls.get(i).equals(ss.get(i))) { + match = false; + break; + } + } + } + if(!match) { + String a1 = strJoin(ls,','); + String a2 = strJoin(ss,','); + String msg = String.format("ArrayCMP (%s) != (%s)",a1,a2); + assertTrue(msg,false); + } + } + + public void testInteraction() { + RobotsDirectiveAggregation agg = new RobotsDirectiveAggregation(); + String test1[] = {"http://foo.com/robots.txt","http://www.foo.com/robots.txt"}; + compareListTo(agg.getMissingRobotUrls("foo.com"),test1); + 
compareListTo(agg.getMissingRobotUrls("www.foo.com"),test1); + agg.addDirectives("http://foo.com/robots.txt", new FixedRobotsDirectives(true)); + String test2[] = {"http://www.foo.com/robots.txt"}; + compareListTo(agg.getMissingRobotUrls("foo.com"),test2); + assertFalse(agg.isBlocked("/foo")); + + agg.addDirectives("http://www.foo.com/robots.txt", new FixedRobotsDirectives(false)); + String test3[] = {}; + compareListTo(agg.getMissingRobotUrls("foo.com"),test3); + assertTrue(agg.isBlocked("/foo")); + + } +}
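
For anyone trying to follow how the new pieces fit together, here is a rough usage sketch based only on the classes added above. The host name, robots.txt body, and user-agent string are invented examples, and a real deployment goes through HRobotExclusionFilter and its LiveWebCache rather than calling these classes directly.

// Rough sketch only -- host, robots.txt body, and user-agent are made up.
import java.io.BufferedReader;
import java.io.StringReader;
import java.util.List;

import org.archive.wayback.accesscontrol.robotstxt.RobotsDirectiveAggregation;
import org.archive.wayback.accesscontrol.robotstxt.RobotsDirectives;
import org.archive.wayback.accesscontrol.robotstxt.Robotstxt;

public class RobotsHandlingSketch {
	public static void main(String[] args) throws Exception {
		RobotsDirectiveAggregation agg = new RobotsDirectiveAggregation();

		// 1) Ask which robots.txt URLs are not yet cached for the capture's
		//    host; for "www.example.com" this yields the host's own
		//    robots.txt plus the bare-domain variant.
		List<String> missing = agg.getMissingRobotUrls("www.example.com");

		// 2) Parse a fetched robots.txt body and pull out the directives
		//    that apply to our user-agent ('*' rules are the fallback).
		//    HRobotExclusionFilter does this per URL via its LiveWebCache;
		//    here a single made-up body stands in for all of them.
		String body = "User-agent: *\n"
				+ "Disallow: /private/\n"
				+ "Allow: /private/ok.html\n";
		Robotstxt robotsTxt = new Robotstxt(new BufferedReader(new StringReader(body)));
		RobotsDirectives directives = robotsTxt.getDirectivesFor("ia_archiver");
		for (String robotUrl : missing) {
			agg.addDirectives(robotUrl, directives);
		}

		// 3) A path is blocked if ANY directives in the aggregation disallow
		//    it; a longer matching Allow prefix wins over a shorter Disallow.
		System.out.println(agg.isBlocked("/private/secret.html")); // true
		System.out.println(agg.isBlocked("/private/ok.html"));     // false
		System.out.println(agg.isBlocked("/some/other/page"));     // false
	}
}

The longest-prefix comparison in RobotsDirectives.allows() is what lets a more specific Allow override a broader Disallow, which is the Allow support mentioned in the log message.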