From: <bra...@us...> - 2011-11-16 22:18:05
Revision: 3559
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3559&view=rev
Author:   bradtofel
Date:     2011-11-16 22:17:57 +0000 (Wed, 16 Nov 2011)

Log Message:
-----------
INITIAL REV: not yet fully tested, but much-improved robots.txt handling. Uses a copy of the current Heritrix 3 (H3) robots.txt handling: supports Allow as well as Disallow directives, parses more robustly, and cleans up the code with a clearer separation of responsibilities.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java
    trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,11 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +public class FixedRobotsDirectives extends RobotsDirectives { + private boolean result; + public FixedRobotsDirectives(boolean result) { + this.result = result; + } + public boolean allows(String path) { + return result; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,164 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.Resource; +import org.archive.wayback.exception.LiveDocumentNotAvailableException; +import org.archive.wayback.exception.LiveWebCacheUnavailableException; +import org.archive.wayback.exception.LiveWebTimeoutException; +import org.archive.wayback.liveweb.LiveWebCache; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.UrlOperations; +import org.archive.wayback.webapp.PerformanceLogger; + +public class 
HRobotExclusionFilter extends ExclusionFilter { + + private final static String ROBOT_SUFFIX = "/robots.txt"; + private final static Logger LOGGER = + Logger.getLogger(HRobotExclusionFilter.class.getName()); + + // TODO: this is not the right thing! + private Charset cs = Charset.forName("UTF-8"); + + private RobotsDirectiveAggregation aggregation = null; + private LiveWebCache webCache = null; + + private String userAgent = null; + private boolean notifiedSeen = false; + private boolean notifiedPassed = false; + private static final FixedRobotsDirectives ALLOW_ROBOT_DIRECTIVE = + new FixedRobotsDirectives(true); + + /** + * Construct a new HRobotExclusionFilter that uses webCache to pull + * robots.txt documents. filtering is based on userAgent, and cached + * documents newer than maxCacheMS in the webCache are considered valid. + * + * @param webCache LiveWebCache from which documents can be retrieved + * @param userAgent String user agent to use for requests to the live web. + * @param maxCacheMS long number of milliseconds to cache documents in the + * LiveWebCache + */ + public HRobotExclusionFilter(LiveWebCache webCache, String userAgent, + long maxCacheMS) { + aggregation = new RobotsDirectiveAggregation(); + this.webCache = webCache; + this.userAgent = userAgent; + } + + private void updateAggregation(String host) + throws LiveWebCacheUnavailableException, + LiveWebTimeoutException, MalformedURLException, IOException { + + List<String> missing = aggregation.getMissingRobotUrls(host); + for(String robotUrl : missing) { + long start = System.currentTimeMillis(); + Resource resource; + try { + resource = webCache.getCachedResource(new URL(robotUrl), + 0,true); + if(resource.getStatusCode() != 200) { + LOGGER.info("ROBOT: Non200("+robotUrl+")"); + // consider it an allow: + aggregation.addDirectives(robotUrl, ALLOW_ROBOT_DIRECTIVE); + } else { + InputStreamReader isr = new InputStreamReader(resource, cs); + BufferedReader br = new BufferedReader(isr); + Robotstxt robotsTxt = new Robotstxt(br); + RobotsDirectives directives = robotsTxt.getDirectivesFor(userAgent); + aggregation.addDirectives(robotUrl, directives); + } + } catch (LiveDocumentNotAvailableException e) { + if(LOGGER.isLoggable(Level.INFO)) { + LOGGER.info("ROBOT: LiveDocumentNotAvailableException(" + + robotUrl + ")"); + } + // consider it an allow: + aggregation.addDirectives(robotUrl, ALLOW_ROBOT_DIRECTIVE); + } + long elapsed = System.currentTimeMillis() - start; + PerformanceLogger.noteElapsed("RobotRequest", elapsed, robotUrl); + } + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultFilter#filterSearchResult(org.archive.wayback.core.SearchResult) + */ + public int filterObject(CaptureSearchResult r) { + if(!notifiedSeen) { + if(filterGroup != null) { + filterGroup.setSawRobots(); + } + notifiedSeen = true; + } + String originalURL = r.getOriginalUrl(); + String path = UrlOperations.getURLPath(originalURL); + if(path.equals(ROBOT_SUFFIX)) { + if(!notifiedPassed) { + if(filterGroup != null) { + filterGroup.setPassedRobots(); + } + notifiedPassed = true; + } + return ObjectFilter.FILTER_INCLUDE; + } + String host = UrlOperations.urlToHost(originalURL); + boolean updated = false; + try { + updateAggregation(host); + if(!aggregation.isBlocked(path)) { + if(LOGGER.isLoggable(Level.INFO)) { + LOGGER.fine("ROBOT: BLOCKED(" + originalURL + ")"); + } + if(LOGGER.isLoggable(Level.FINE)) { + LOGGER.finer("ROBOT: ALLOWED(" + originalURL + ")"); + } + if(!notifiedPassed) { + if(filterGroup != null) { + 
filterGroup.setPassedRobots(); + } + notifiedPassed = true; + } + return ObjectFilter.FILTER_INCLUDE; + } + +// } catch (LiveDocumentNotAvailableException e) { + } catch (LiveWebCacheUnavailableException e) { + LOGGER.severe("ROBOT: LiveWebCacheUnavailableException(" + + originalURL + ")"); + filterGroup.setLiveWebGone(); + + } catch (LiveWebTimeoutException e) { + LOGGER.severe("ROBOT: LiveDocumentTimedOutException(" + + originalURL + ")"); + filterGroup.setRobotTimedOut(); + + } catch (MalformedURLException e) { + + LOGGER.warning("ROBOT: MalformedURLException(" + + originalURL + ")"); + + } catch (IOException e) { + e.printStackTrace(); + return ObjectFilter.FILTER_EXCLUDE; + } + + if(filterGroup.getRobotTimedOut() || filterGroup.getLiveWebGone()) { + return ObjectFilter.FILTER_ABORT; + } + if(LOGGER.isLoggable(Level.INFO)) { + LOGGER.fine("ROBOT: BLOCKED(" + originalURL + ")"); + } + return ObjectFilter.FILTER_EXCLUDE; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,111 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Class which acts as an aggregation of RobotsDirectives. + * + * If given a host String, will return a list of additional robot URLs that + * need to be added to the current aggregation. + * + * Allows a user to then add new RobotsDirectives for one or more robot URLs. + * + * Finally, allows the aggregation to be queried to see if any of the + * directives block a particular path. 
+ * + * + * @author brad + * + */ +public class RobotsDirectiveAggregation { + private final static Logger LOGGER = + Logger.getLogger(RobotsDirectiveAggregation.class.getName()); + + private final static String HTTP_PREFIX = "http://"; + private final static String ROBOT_SUFFIX = "/robots.txt"; + + private static String WWWN_REGEX = "^www[0-9]+\\."; + private final static Pattern WWWN_PATTERN = Pattern.compile(WWWN_REGEX); + + private HashMap<String,RobotsDirectives> cache = + new HashMap<String, RobotsDirectives>(); + + private StringBuilder sb = new StringBuilder(); + + private String hostToRobotUrlString(final String host) { + sb.setLength(0); + sb.append(HTTP_PREFIX).append(host).append(ROBOT_SUFFIX); + String robotUrl = sb.toString(); + LOGGER.fine("Adding robot URL:" + robotUrl); + return robotUrl; + } + /* + */ + /** + * @param resultHost + * @return a List of all robots.txt urls to attempt for this HOST: + * If HOST starts with "www.DOMAIN": + * [ + * http://HOST/robots.txt, + * http://DOMAIN/robots.txt + * ] + * If HOST starts with "www[0-9]+.DOMAIN": + * [ + * http://HOST/robots.txt, + * http://www.DOMAIN/robots.txt, + * http://DOMAIN/robots.txt + * ] + * Otherwise: + * [ + * http://HOST/robots.txt, + * http://www.HOST/robots.txt + * ] + */ + List<String> hostToRobotUrlStrings(final String resultHost) { + ArrayList<String> list = new ArrayList<String>(); + list.add(hostToRobotUrlString(resultHost)); + + if(resultHost.startsWith("www")) { + if(resultHost.startsWith("www.")) { + list.add(hostToRobotUrlString(resultHost.substring(4))); + } else { + Matcher m = WWWN_PATTERN.matcher(resultHost); + if(m.find()) { + String massagedHost = resultHost.substring(m.end()); + list.add(hostToRobotUrlString("www." + massagedHost)); + list.add(hostToRobotUrlString(massagedHost)); + } + } + } else { + list.add(hostToRobotUrlString("www." + resultHost)); + } + return list; + } + + public List<String> getMissingRobotUrls(String host) { + ArrayList<String> missing = new ArrayList<String>(); + List<String> needed = hostToRobotUrlStrings(host); + for(String need : needed) { + if(!cache.containsKey(need)) { + missing.add(need); + } + } + return missing; + } + public void addDirectives(String url, RobotsDirectives directives) { + cache.put(url, directives); + } + public boolean isBlocked(String path) { + for(RobotsDirectives directives : cache.values()) { + if(!directives.allows(path)) { + return true; + } + } + return false; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,75 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.Serializable; +import java.util.concurrent.ConcurrentSkipListSet; + +/** + * Represents the directives that apply to a user-agent (or set of + * user-agents) + */ +public class RobotsDirectives implements Serializable { + private static final long serialVersionUID = 5386542759286155383L; + + ConcurrentSkipListSet<String> disallows = new ConcurrentSkipListSet<String>(); + ConcurrentSkipListSet<String> allows = new ConcurrentSkipListSet<String>(); + float crawlDelay = -1; + + public boolean allows(String path) { + return !(longestPrefixLength(disallows, path) > longestPrefixLength(allows, path)); + } + + /** + * @param prefixSet + * @param str + * @return length of longest entry in {@code prefixSet} that prefixes {@code str}, or zero + * if no entry prefixes {@code str} + */ + protected int longestPrefixLength(ConcurrentSkipListSet<String> prefixSet, + String str) { + String possiblePrefix = prefixSet.floor(str); + if (possiblePrefix != null && str.startsWith(possiblePrefix)) { + return possiblePrefix.length(); + } else { + return 0; + } + } + + public void addDisallow(String path) { + if(path.length()==0) { + // ignore empty-string disallows + // (they really mean allow, when alone) + return; + } + disallows.add(path); + } + + public void addAllow(String path) { + allows.add(path); + } + + public void setCrawlDelay(float i) { + crawlDelay=i; + } + + public float getCrawlDelay() { + return crawlDelay; + } +} \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,234 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Serializable; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.io.IOUtils; +import org.archive.io.ReadSource; + +/** + * Utility class for parsing and representing 'robots.txt' format + * directives, into a list of named user-agents and map from user-agents + * to RobotsDirectives. + */ +public class Robotstxt implements Serializable { + static final long serialVersionUID = 7025386509301303890L; + private static final Logger logger = + Logger.getLogger(Robotstxt.class.getName()); + + // all user agents contained in this robots.txt + // in order of declaration + // TODO: consider discarding irrelevant entries + LinkedList<String> namedUserAgents = new LinkedList<String>(); + // map user-agents to directives + Map<String,RobotsDirectives> agentsToDirectives = + new HashMap<String,RobotsDirectives>(); + RobotsDirectives wildcardDirectives = null; + + boolean hasErrors = false; + + static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives(); + /** empty, reusable instance for all sites providing no rules */ + public static Robotstxt NO_ROBOTS = new Robotstxt(); + + public Robotstxt() { + } + + public Robotstxt(BufferedReader reader) throws IOException { + initializeFromReader(reader); + } + + public Robotstxt(ReadSource customRobots) { + BufferedReader reader = new BufferedReader(customRobots.obtainReader()); + try { + initializeFromReader(reader); + } catch (IOException e) { + logger.log(Level.SEVERE, + "robots ReadSource problem: potential for inadvertent overcrawling", + e); + } finally { + IOUtils.closeQuietly(reader); + } + } + + protected void initializeFromReader(BufferedReader reader) throws IOException { + String read; + // current is the disallowed paths for the preceding User-Agent(s) + RobotsDirectives current = null; + // whether a non-'User-Agent' directive has been encountered + boolean hasDirectivesYet = false; + while (reader != null) { + do { + read = reader.readLine(); + // Skip comments & blanks + } while ((read != null) && ((read = read.trim()).startsWith("#") || + read.length() == 0)); + if (read == null) { + reader.close(); + reader = null; + } else { + // remove any html markup + read = read.replaceAll("<[^>]+>",""); + int commentIndex = read.indexOf("#"); + if (commentIndex > -1) { + // Strip trailing comment + read = read.substring(0, commentIndex); + } + read = read.trim(); + if (read.matches("(?i)^User-agent:.*")) { + String ua = read.substring(11).trim().toLowerCase(); + if (current == null || hasDirectivesYet ) { + // only create new rules-list if necessary + // otherwise share with previous user-agent + current = new RobotsDirectives(); + hasDirectivesYet = false; + } + if (ua.equals("*")) { + wildcardDirectives = current; + } else { + namedUserAgents.addLast(ua); + agentsToDirectives.put(ua, current); + } + continue; + } + if (read.matches("(?i)Disallow:.*")) { + if (current == null) { + // buggy robots.txt + hasErrors = true; + continue; + } + String path = read.substring(9).trim(); + // tolerate common error of ending path with '*' character + // (not allowed by original spec; redundant but harmless with + // Google's wildcarding extensions -- which we don't yet fully + // support). 
+ if(path.endsWith("*")) { + path = path.substring(0,path.length()-1); + } + current.addDisallow(path); + hasDirectivesYet = true; + continue; + } + if (read.matches("(?i)Crawl-delay:.*")) { + if (current == null) { + // buggy robots.txt + hasErrors = true; + continue; + } + // consider a crawl-delay, even though we don't + // yet understand it, as sufficient to end a + // grouping of User-Agent lines + hasDirectivesYet = true; + String val = read.substring(12).trim(); + val = val.split("[^\\d\\.]+")[0]; + try { + current.setCrawlDelay(Float.parseFloat(val)); + } catch (NumberFormatException nfe) { + // ignore + } + continue; + } + if (read.matches("(?i)Allow:.*")) { + if (current == null) { + // buggy robots.txt + hasErrors = true; + continue; + } + String path = read.substring(6).trim(); + // tolerate common error of ending path with '*' character + // (not allowed by original spec; redundant but harmless with + // Google's wildcarding extensions -- which we don't yet fully + // support). + if(path.endsWith("*")) { + path = path.substring(0,path.length()-1); + } + current.addAllow(path); + hasDirectivesYet = true; + continue; + } + // unknown line; do nothing for now + } + } + } + + /** + * Does this policy effectively allow everything? (No + * disallows or timing (crawl-delay) directives?) + * @return + */ + public boolean allowsAll() { + // TODO: refine so directives that are all empty are also + // recognized as allowing all + return agentsToDirectives.isEmpty(); + } + + public List<String> getNamedUserAgents() { + return namedUserAgents; + } + + /** + * Return the RobotsDirectives, if any, appropriate for the given User-Agent + * string. If useFallbacks is true, a wildcard ('*') directives or the default + * of NO_DIRECTIVES will be returned, as appropriate, if there is no better + * match. If useFallbacks is false, a null will be returned if no declared + * directives targeted the given User-Agent. + * + * @param ua String User-Agent to lookup + * @param useFallbacks if true, fall-back to wildcard directives or + * default allow as needed + * @return directives to use, or null if useFallbacks is false and no + * non-wildcard directives match the supplied User-Agent + */ + public RobotsDirectives getDirectivesFor(String ua, boolean useFallbacks) { + // find matching ua + for(String uaListed : namedUserAgents) { + if(ua.indexOf(uaListed)>-1) { + return agentsToDirectives.get(uaListed); + } + } + if(useFallbacks==false) { + return null; + } + if (wildcardDirectives!=null) { + return wildcardDirectives; + } + // no applicable user-agents, so empty directives + return NO_DIRECTIVES; + } + + /** + * Return directives to use for the given User-Agent, resorting to wildcard + * rules or the default no-directives if necessary. 
+ * + * @param userAgent String User-Agent to lookup + * @return directives to use + */ + public RobotsDirectives getDirectivesFor(String userAgent) { + return getDirectivesFor(userAgent, true); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,109 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang.StringEscapeUtils; + +import com.google.common.base.Strings; +import com.google.common.collect.Lists; + +import junit.framework.TestCase; + +public class RobotsDirectiveAggregationTest extends TestCase { + + private String[] mapRobotUrls(String[] in ) { + String res[] = new String[in.length]; + for(int i = 0; i < in.length; i++) { + res[i] = "http://" + in[i] + "/robots.txt"; + } + return res; + } + + + /** + * + */ + public void testHostToRobotUrlStrings() { + RobotsDirectiveAggregation f = new RobotsDirectiveAggregation(); + String test1[] = {"www.foo.com","foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www.foo.com"),mapRobotUrls(test1)); + + String test2[] = {"foo.com","www.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("foo.com"),mapRobotUrls(test2)); + + String test3[] = {"fool.foo.com","www.fool.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("fool.foo.com"),mapRobotUrls(test3)); + + String test4[] = {"www4.foo.com","www.foo.com","foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www4.foo.com"),mapRobotUrls(test4)); + + String test5[] = {"www4w.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www4w.foo.com"),mapRobotUrls(test5)); + + String test6[] = {"www.www.foo.com","www.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www.www.foo.com"),mapRobotUrls(test6)); + } + private String strJoin(Iterable<String> i, char del) { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for(String s : i) { + if(first) { + first = false; + } else { + sb.append(del); + } + sb.append(s); + } + return sb.toString(); + } + private List<String> sortA(String[] a) { + Arrays.sort(a); + return Lists.newArrayList(a); + } + private List<String> sortL(List<String> a) { + String[] Empty = new String[0]; + String[] tmp; + tmp = a.toArray(Empty); + Arrays.sort(tmp); + return Lists.newArrayList(tmp); + } + private void compareListTo(List<String> list, String strings[]) { + + boolean match = list.size() == strings.length; + List<String> ls = sortL(list); + List<String> ss = sortA(strings); + if(match) { + for(int i = 0; i < strings.length; i++) { + if(!ls.get(i).equals(ss.get(i))) { + match = false; + break; + } + } + } + if(!match) { + String a1 = strJoin(ls,','); + String a2 = strJoin(ss,','); + String msg = String.format("ArrayCMP (%s) != (%s)",a1,a2); + assertTrue(msg,false); + } + } + + public void testInteraction() { + RobotsDirectiveAggregation agg = new RobotsDirectiveAggregation(); + String test1[] = {"http://foo.com/robots.txt","http://www.foo.com/robots.txt"}; + compareListTo(agg.getMissingRobotUrls("foo.com"),test1); + 
compareListTo(agg.getMissingRobotUrls("www.foo.com"),test1); + agg.addDirectives("http://foo.com/robots.txt", new FixedRobotsDirectives(true)); + String test2[] = {"http://www.foo.com/robots.txt"}; + compareListTo(agg.getMissingRobotUrls("foo.com"),test2); + assertFalse(agg.isBlocked("/foo")); + + agg.addDirectives("http://www.foo.com/robots.txt", new FixedRobotsDirectives(false)); + String test3[] = {}; + compareListTo(agg.getMissingRobotUrls("foo.com"),test3); + assertTrue(agg.isBlocked("/foo")); + + } +}
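
For anyone trying to follow how the new pieces fit together, here is a rough usage sketch based only on the classes added above. The host name, robots.txt body, and user-agent string are invented examples, and a real deployment goes through HRobotExclusionFilter and its LiveWebCache rather than calling these classes directly.

// Rough sketch only -- host, robots.txt body, and user-agent are made up.
import java.io.BufferedReader;
import java.io.StringReader;
import java.util.List;

import org.archive.wayback.accesscontrol.robotstxt.RobotsDirectiveAggregation;
import org.archive.wayback.accesscontrol.robotstxt.RobotsDirectives;
import org.archive.wayback.accesscontrol.robotstxt.Robotstxt;

public class RobotsHandlingSketch {
	public static void main(String[] args) throws Exception {
		RobotsDirectiveAggregation agg = new RobotsDirectiveAggregation();

		// 1) Ask which robots.txt URLs are not yet cached for the capture's
		//    host; for "www.example.com" this yields the host's own
		//    robots.txt plus the bare-domain variant.
		List<String> missing = agg.getMissingRobotUrls("www.example.com");

		// 2) Parse a fetched robots.txt body and pull out the directives
		//    that apply to our user-agent ('*' rules are the fallback).
		//    HRobotExclusionFilter does this per URL via its LiveWebCache;
		//    here a single made-up body stands in for all of them.
		String body = "User-agent: *\n"
				+ "Disallow: /private/\n"
				+ "Allow: /private/ok.html\n";
		Robotstxt robotsTxt = new Robotstxt(new BufferedReader(new StringReader(body)));
		RobotsDirectives directives = robotsTxt.getDirectivesFor("ia_archiver");
		for (String robotUrl : missing) {
			agg.addDirectives(robotUrl, directives);
		}

		// 3) A path is blocked if ANY directives in the aggregation disallow
		//    it; a longer matching Allow prefix wins over a shorter Disallow.
		System.out.println(agg.isBlocked("/private/secret.html")); // true
		System.out.println(agg.isBlocked("/private/ok.html"));     // false
		System.out.println(agg.isBlocked("/some/other/page"));     // false
	}
}

The longest-prefix comparison in RobotsDirectives.allows() is what lets a more specific Allow override a broader Disallow, which is the Allow support mentioned in the log message.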