Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv7488/src/org/archive/crawler/basic

Modified Files:
      Tag: gjm-refactor
	SimpleSelector.java CrawlStateUpdater.java SimpleScheduler.java ARCWriter.java
Added Files:
      Tag: gjm-refactor
	Scope.java PreconditionEnforcer.java Preselector.java Postselector.java Frontier.java
Removed Files:
      Tag: gjm-refactor
	SimpleFrontier.java FetcherDNS.java SimplePreconditionEnforcer.java SimplePostselector.java
	SimplePreselector.java FetcherHTTPSimple.java BasicScope.java StatisticsTracker.java
Log Message:
renaming, repackaging, streamlining

--- NEW FILE: Scope.java ---

/*
 * BasicScope.java
 * Created on Oct 1, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/Scope.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.basic;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.filter.HopsFilter;
import org.archive.crawler.filter.SeedExtensionFilter;
import org.archive.crawler.filter.TransclusionFilter;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Filter;

/**
 * A core CrawlScope suitable for the most common crawl needs.
 *
 * Roughly, its logic is that a URI is included if:
 *
 *    (( isSeed(uri) || focusFilter.accepts(uri) )
 *      || transitiveFilter.accepts(uri) )
 *     && ! excludeFilter.accepts(uri)
 *
 * The focusFilter may be specified by either:
 *  - adding a 'mode' attribute to the <scope> element, in which case a
 *    SeedExtensionFilter will be used, with the <scope> element
 *    providing its configuration
 *  - adding a <focus> subelement
 * If unspecified, the focusFilter defaults to an accepts-all filter.
 *
 * The transitiveFilter may be specified by supplying a <transitive>
 * subelement. If unspecified, a TransclusionFilter will be used, with
 * the <scope> element providing its configuration.
 *
 * The excludeFilter may be specified by supplying an <exclude>
 * subelement. If unspecified, an accepts-none filter will be used,
 * meaning that no URIs will pass the filter, and thus none will be
 * excluded.
 *
 * @author gojomo
 */
public class Scope extends CrawlScope {
    Filter focusFilter;
    Filter transitiveFilter;
    Filter excludeFilter;

    public void initialize(CrawlController controller) {
        super.initialize(controller);
        // set up focusFilter
        if (getNodeAt("@mode") != null) {
            // SeedExtensionFilter implied
            focusFilter = new SeedExtensionFilter();
            focusFilter.setNode(xNode);
        } else {
            focusFilter = (Filter) instantiate("focus");
        }
        if (focusFilter != null) {
            focusFilter.initialize(controller);
            // only set up transitiveFilter if focusFilter set
            transitiveFilter = (Filter) instantiate("transitive");
            if (transitiveFilter == null) {
                transitiveFilter = new TransclusionFilter();
                transitiveFilter.setNode(xNode);
                transitiveFilter.initialize(controller);
            }
        }
        // set up exclude filter
        if (getNodeAt("@max-link-hops") != null) {
            // HopsFilter implied
            excludeFilter = new HopsFilter();
            excludeFilter.setNode(xNode);
        } else {
            excludeFilter = (Filter) instantiate("exclude");
        }
    }

    protected boolean innerAccepts(Object o) {
        return ((isSeed(o) || focusAccepts(o)) || transitiveAccepts(o))
                && !excludeAccepts(o);
    }

    private boolean excludeAccepts(Object o) {
        if (excludeFilter == null) {
            return false;
        }
        return excludeFilter.accepts(o);
    }

    private boolean transitiveAccepts(Object o) {
        if (transitiveFilter == null) {
            return true;
        }
        return transitiveFilter.accepts(o);
    }

    private boolean focusAccepts(Object o) {
        if (focusFilter == null) {
            return true;
        }
        return focusFilter.accepts(o);
    }

    private boolean isSeed(Object o) {
        return o instanceof CandidateURI && ((CandidateURI) o).getIsSeed();
    }
}

--- NEW FILE: PreconditionEnforcer.java ---

/*
 * SimplePolitenessEnforcer.java
 * Created on May 22, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/PreconditionEnforcer.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.basic;

import java.util.logging.Logger;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.framework.Processor;

/**
 * Ensures the preconditions for a fetch -- such as a DNS lookup or
 * acquiring a robots.txt policy -- are satisfied before a URI is
 * passed to subsequent stages.
 *
 * @author gojomo
 */
public class PreconditionEnforcer extends Processor implements FetchStatusCodes {
    private static String XP_DELAY_FACTOR = "params/@delay-factor";
    private static String XP_MINIMUM_DELAY = "params/@minimum-delay";
    private static int DEFAULT_DELAY_FACTOR = 10;
    private static int DEFAULT_MINIMUM_DELAY = 2000;
    private static Logger logger =
        Logger.getLogger("org.archive.crawler.basic.SimplePolitenessEnforcer");

    protected void innerProcess(CrawlURI curi) {
        if (considerDnsPreconditions(curi)) {
            return;
        }
        // make sure we only process schemes we understand (i.e. not dns)
        if (!curi.getUURI().getUri().getScheme().equals("http")) {
            logger.fine("PolitenessEnforcer doesn't understand uri's of type "
                + curi.getUURI().getUri().getScheme() + " (ignoring)");
            return;
        }
        if (considerRobotsPreconditions(curi)) {
            return;
        }
        // OK, it's allowed
        // for all curis that will in fact be fetched, set appropriate delays
        // TODOSOMEDAY: allow per-host, per-protocol, etc. factors
        // curi.setDelayFactor(getDelayFactorFor(curi));
        // curi.setMinimumDelay(getMinimumDelayFor(curi));
        return;
    }

    private boolean considerRobotsPreconditions(CrawlURI curi) {
        // treat /robots.txt fetches specially
        if (curi.getUURI().getUri().getPath().equals("/robots.txt")) {
            // allow processing to continue
            return false;
        }
        // require /robots.txt if not present
        if (curi.getServer().getRobotsExpires() < 0 // "cheap" test of default
                || curi.getServer().getRobotsExpires() < System.currentTimeMillis()) {
            logger.fine("No valid robots for " + curi.getServer() + "; deferring " + curi);
            curi.setPrerequisiteUri("/robots.txt");
            curi.incrementDeferrals();
            curi.skipToProcessor(controller.getPostprocessor());
            return true;
        }
        // test against robots.txt if available
        String ua = controller.getOrder().getUserAgent();
        if (curi.getServer().getRobots().disallows(curi.getUURI().getUri().getPath(), ua)) {
            // don't fetch
            curi.skipToProcessor(controller.getPostprocessor()); // turn off later stages
            curi.setFetchStatus(S_ROBOTS_PRECLUDED);
            curi.getAList().putString("error", "robots.txt exclusion");
            logger.fine("robots.txt precluded " + curi);
            return true;
        }
        return false;
    }

    /**
     * @param curi
     * @return true if no further processing in this module should occur
     */
    private boolean considerDnsPreconditions(CrawlURI curi) {
        if (curi.getServer() == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            curi.skipToProcessor(controller.getPostprocessor());
            return true;
        }
        // if we haven't done a dns lookup and this isn't a dns uri,
        // shoot that off and defer further processing
        if (!curi.getServer().getHost().hasBeenLookedUp()
                && !curi.getUURI().getUri().getScheme().equals("dns")) {
            logger.fine("deferring processing of " + curi.toString() + " for dns lookup.");
            String hostname = curi.getServer().getHostname();
            curi.setPrerequisiteUri("dns:" + hostname);
            curi.incrementDeferrals();
            curi.skipToProcessor(controller.getPostprocessor());
            return true;
        }
        // if we've done a dns lookup and it didn't resolve a host,
        // cancel all processing of this URI
        if (curi.getServer().getHost().hasBeenLookedUp()
                && curi.getServer().getHost().getIP() == null) {
            logger.fine("no dns for " + curi.getServer().toString()
                + " cancelling processing for " + curi.toString());
            // TODO: currently we're using FetchAttempts to denote both fetch
            // attempts and the choice to not attempt (here). Eventually these
            // will probably have to be treated separately, to allow us to
            // treat dns failures and connection failures (downed hosts,
            // route failures, etc.) separately.
            curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE);
            curi.incrementFetchAttempts();
            curi.skipToProcessor(controller.getPostprocessor());
            return true;
        }
        return false;
    }

    private int getMinimumDelayFor(CrawlURI curi) {
        return getIntAt(XP_MINIMUM_DELAY, DEFAULT_MINIMUM_DELAY);
    }

    private int getDelayFactorFor(CrawlURI curi) {
        return getIntAt(XP_DELAY_FACTOR, DEFAULT_DELAY_FACTOR);
    }
}

--- NEW FILE: Preselector.java ---

/*
 * SimplePreselector.java
 * Created on Sep 22, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/Preselector.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.basic;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Processor;

/**
 * Gives a yes/no on whether a CrawlURI should be processed at all.
 *
 * Usually, failing a processor filter causes that processor to be
 * skipped. Failing this processor's filter means a CrawlURI will be
 * marked OUT_OF_SCOPE.
 *
 * @author gojomo
 */
public class Preselector extends Processor implements FetchStatusCodes {
    private boolean recheckScope;
    private static String XP_RECHECK_SCOPE = "@scope";
//  private static String XP_MAX_LINK_DEPTH = "params/@max-link-depth";
//  private static String XP_MAX_EMBED_DEPTH = "params/@max-embed-depth";
//  private int maxLinkDepth = -1;
//  private int maxEmbedDepth = -1;

    protected void innerProcess(CrawlURI curi) {
        if (recheckScope) {
            CrawlScope scope = controller.getScope();
            if (curi.getScopeVersion() == scope.getVersion()) {
                // already checked
                return;
            }
            if (scope.accepts(curi)) {
                curi.setScopeVersion(scope.getVersion());
                return;
            }
            // scope rejected
            curi.setFetchStatus(S_OUT_OF_SCOPE);
            curi.skipToProcessor(controller.getPostprocessor());
        }
//      super.innerProcess(curi);
//
//      // check for too-deep
//      if (maxLinkDepth >= 0 && curi.getLinkHopCount() > maxLinkDepth) {
//          curi.setFetchStatus(S_TOO_MANY_LINK_HOPS);
//          curi.cancelFurtherProcessing();
//          return;
//      }
//      if (maxEmbedDepth >= 0 && curi.getEmbedHopCount() > maxEmbedDepth) {
//          curi.setFetchStatus(S_TOO_MANY_EMBED_HOPS);
//          curi.cancelFurtherProcessing();
//          return;
//      }
    }

//  protected void innerRejectProcess(CrawlURI curi) {
//      super.innerRejectProcess(curi);
//      // filter-rejection means out-of-scope for everything but embeds
//      if (curi.getEmbedHopCount() < 1) {
//          curi.setFetchStatus(S_OUT_OF_SCOPE);
//          curi.cancelFurtherProcessing();
//      } else {
//          // never mind; scope filters don't apply
//      }
//  }

    public void initialize(CrawlController c) {
        super.initialize(c);
        recheckScope = getBooleanAt("@scope", false);
        // maxLinkDepth = getIntAt(XP_MAX_LINK_DEPTH, maxLinkDepth);
        // maxEmbedDepth = getIntAt(XP_MAX_EMBED_DEPTH, maxEmbedDepth);
    }
}

--- NEW FILE: Postselector.java ---

/*
 * SimplePostselector.java
 * Created on Oct 2, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/Postselector.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.basic;

import java.net.URISyntaxException;
import java.util.logging.Level;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.UURI;
import org.archive.crawler.framework.Processor;

/**
 * Determines which links etc. get fed back into the Frontier,
 * and if/when failures get retried, etc.
 *
 * @author gojomo
 */
public class Postselector extends Processor
        implements CoreAttributeConstants, FetchStatusCodes {
    // limits on retries TODO: separate into retryPolicy?
    private int maxDeferrals = 10; // should be at least max-retries plus 3 or so

    protected void innerProcess(CrawlURI curi) {
        // handle any prerequisites
        if (curi.getAList().containsKey(A_PREREQUISITE_URI)) {
            handlePrerequisites(curi);
            return;
        }
    }

    protected void handlePrerequisites(CrawlURI curi) {
        try {
            UURI prereq = UURI.createUURI(curi.getPrerequisiteUri(),
                curi.getUURI().getUri());
            CandidateURI caUri = new CandidateURI(prereq);
            caUri.setVia(curi);
            caUri.setPathFromSeed(curi.getPathFromSeed() + "P");
            if (curi.getDeferrals() > maxDeferrals) {
                // too many deferrals, equals failure
                curi.setFetchStatus(S_PREREQUISITE_FAILURE);
                // failureDisposition(curi);
                return;
            }
            if (!scheduleHigh(caUri)) {
                // prerequisite cannot be scheduled (perhaps excluded by
                // scope); must give up on curi
                curi.setFetchStatus(S_PREREQUISITE_FAILURE);
                // failureDisposition(curi);
                return;
            }
        } catch (URISyntaxException ex) {
            Object[] array = { curi, curi.getPrerequisiteUri() };
            controller.uriErrors.log(Level.INFO, ex.getMessage(), array);
        }
    }

    private boolean scheduleHigh(CandidateURI caUri) {
        if (controller.getScope().accepts(caUri)) {
            controller.getFrontier().scheduleHigh(caUri);
            return true;
        }
        return false;
    }
}

--- NEW FILE: Frontier.java ---

/*
 * SimpleFrontier.java
 * Created on Oct 1, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/Frontier.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.basic;

import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.MemUURISet;
import org.archive.crawler.datamodel.UURI;
import org.archive.crawler.datamodel.UURISet;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.URIFrontier;
import org.archive.crawler.framework.XMLConfig;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;

/**
 * A basic in-memory, mostly breadth-first frontier, which refrains
 * from emitting more than one CrawlURI of the same 'key' (host) at
 * once, and respects minimum-delay and delay-factor specifications
 * for politeness.
 *
 * @author gojomo
 */
public class Frontier extends XMLConfig
        implements URIFrontier, FetchStatusCodes, CoreAttributeConstants {
    private static String XP_DELAY_FACTOR = "@delay-factor";
    private static String XP_MINIMUM_DELAY = "@minimum-delay";
    private static int DEFAULT_DELAY_FACTOR = 5;
    private static int DEFAULT_MINIMUM_DELAY = 2000;
    private static Logger logger =
        Logger.getLogger("org.archive.crawler.basic.SimpleFrontier");

    CrawlController controller;

    // HashMap allCuris = new HashMap(); // of UURI -> CrawlURI

    // TODO: update to use fingerprints only
    UURISet alreadyIncluded = new MemUURISet();

    // every CandidateURI not yet in process or another queue;
    // all seeds start here; may contain duplicates
    LinkedList pendingQueue = new LinkedList(); // of CandidateURIs

    // every CandidateURI not yet in process or another queue;
    // all seeds start here; may contain duplicates
    LinkedList pendingHighQueue = new LinkedList(); // of CandidateURIs

    // every CrawlURI handed out for processing but not yet returned
    HashMap inProcessMap = new HashMap(); // of String (classKey) -> CrawlURI

    // all active per-class queues
    HashMap allClassQueuesMap = new HashMap(); // of String (classKey) -> KeyedQueue

    // all per-class queues whose first item may be handed out (that is,
    // no CrawlURI of the same class is currently in-process)
    LinkedList readyClassQueues = new LinkedList(); // of String (queueKey) -> KeyedQueue

    // all per-class queues that are on hold because a CrawlURI of their
    // class is already in process
    LinkedList heldClassQueues = new LinkedList(); // of String (queueKey) -> KeyedQueue

    // all per-class queues that are on hold until a certain time
    SortedSet snoozeQueues = new TreeSet(new SchedulingComparator()); // of KeyedQueue, sorted by wakeTime

    // CrawlURIs held until some specific other CrawlURI is emitted
    HashMap heldCuris = new HashMap(); // of UURI -> CrawlURI

    // limits on retries TODO: separate into retryPolicy?
    private int maxRetries = 3;
    private int retryDelay = 15000;

    private long minimumDelay;
    private long delayFactor;

    // top-level stats
    int completionCount = 0;
    int failedCount = 0;

    public void initialize(CrawlController c) throws FatalConfigurationException {
        delayFactor = getIntAt(XP_DELAY_FACTOR, DEFAULT_DELAY_FACTOR);
        minimumDelay = getIntAt(XP_MINIMUM_DELAY, DEFAULT_MINIMUM_DELAY);
        this.controller = c;
        Iterator iter = c.getScope().getSeeds().iterator();
        while (iter.hasNext()) {
            UURI u = (UURI) iter.next();
            CandidateURI caUri = new CandidateURI(u);
            caUri.setIsSeed(true);
            schedule(caUri);
        }
    }

    public synchronized void schedule(CandidateURI caUri) {
        pendingQueue.addLast(caUri);
    }

    public void scheduleHigh(CandidateURI caUri) {
        pendingHighQueue.addLast(caUri);
    }

    public CrawlURI next(int timeout) {
        long now = System.currentTimeMillis();
        long waitMax = 0;
        CrawlURI curi = null;

        // first, empty the high-priority queue
        CandidateURI caUri;
        while ((caUri = dequeueFromPendingHigh()) != null) {
            if (alreadyIncluded.contains(caUri)) {
                continue;
            }
            curi = new CrawlURI(caUri);
            if (!enqueueIfNecessary(curi)) {
                // OK to emit
                return emitCuri(curi);
            }
        }
        // if reached, the pendingHighQueue is empty

        // if enough time has passed to wake any snoozing queues, do it
        wakeReadyQueues(now);

        // now, see if any holding queues are ready with a CrawlURI
        if (!readyClassQueues.isEmpty()) {
            curi = dequeueFromReady();
            return emitCuri(curi);
        }

        // if that fails, check the pending queue
        while ((caUri = dequeueFromPending()) != null) {
            if (alreadyIncluded.contains(caUri)) {
                continue;
            }
            curi = new CrawlURI(caUri);
            if (!enqueueIfNecessary(curi)) {
                // OK to emit
                return emitCuri(curi);
            }
        }

        // consider if URIs are exhausted
        if (isEmpty()) {
            // nothing left to crawl
            logger.info("nothing left to crawl");
            // TODO: halt/spread the word???
            return null;
        }

        // nothing to return, but there are still URIs held for the future;
        // block until something changes, or timeout occurs
        waitMax = Math.min(earliestWakeTime() - now, timeout);
        try {
            if (waitMax < 0) {
                logger.warning("negative wait " + waitMax + " ignored");
            } else {
                synchronized (this) {
                    wait(waitMax);
                }
            }
        } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        return null;
    }

    public synchronized void finished(CrawlURI curi) {
        logger.fine(this + ".finished(" + curi + ")");
        try {
            noteProcessingDone(curi);
            // snooze queues as necessary
            updateScheduling(curi);
            notify(); // new items might be available

            // consider errors which halt further processing
            if (isDispositiveFailure(curi)) {
                failureDisposition(curi);
                return;
            }

            // NOW HANDLED BY POSTSELECTOR
            // // handle any prerequisites
            // if (curi.getAList().containsKey(A_PREREQUISITE_URI)) {
            //     handlePrerequisites(curi);
            //     return;
            // }

            // consider errors which can be retried
            if (needsRetrying(curi)) {
                scheduleForRetry(curi);
                return;
            }

            // NOW HANDLED BY POSTSELECTOR
            // URI baseUri = getBaseURI(curi);
            // // handle http headers
            // if (curi.getAList().containsKey(A_HTTP_HEADER_URIS)) {
            //     handleHttpHeaders(curi, baseUri);
            // }
            // // handle embeds
            // if (curi.getAList().containsKey(A_HTML_EMBEDS)) {
            //     handleEmbeds(curi, baseUri);
            // }
            // // handle links
            // if (curi.getAList().containsKey(A_HTML_LINKS)) {
            //     handleLinks(curi, baseUri);
            // }

            // SUCCESS: note & log
            successDisposition(curi);
        } catch (RuntimeException e) {
            curi.setFetchStatus(S_INTERNAL_ERROR);
            // store exception temporarily for logging
            curi.getAList().putObject(A_RUNTIME_EXCEPTION, (Object) e);
            failureDisposition(curi);
        }
    }

    /**
     * The CrawlURI has been successfully crawled, and will be
     * attempted no more.
     *
     * @param curi
     */
    protected void successDisposition(CrawlURI curi) {
        completionCount++;
        if ((completionCount % 500) == 0) {
            logger.info("==========> " + completionCount + " <========== HTTP URIs completed");
        }
        Object array[] = { curi };
        controller.uriProcessing.log(Level.INFO, curi.getUURI().getUri().toString(), array);
        // note that CURI has passed out of scheduling
        curi.setStoreState(URIStoreable.FINISHED);
        if (curi.getDontRetryBefore() < 0) {
            // if not otherwise set, retire this URI forever
            curi.setDontRetryBefore(Long.MAX_VALUE);
        }
        curi.stripToMinimal();
    }

    /**
     * The store is empty only if all queues are empty and
     * no URIs are in-process.
     */
    public boolean isEmpty() {
        return pendingQueue.isEmpty()
            && readyClassQueues.isEmpty()
            && heldClassQueues.isEmpty()
            && snoozeQueues.isEmpty()
            && inProcessMap.isEmpty();
    }

    public long size() {
        // TODO Auto-generated method stub
        return 0;
    }

    protected void wakeReadyQueues(long now) {
        while (!snoozeQueues.isEmpty()
                && ((URIStoreable) snoozeQueues.first()).getWakeTime() <= now) {
            URIStoreable awoken = (URIStoreable) snoozeQueues.first();
            if (!snoozeQueues.remove(awoken)) {
                logger.severe("first() item couldn't be remove()d!");
            }
            if (awoken instanceof KeyedQueue) {
                assert inProcessMap.get(awoken.getClassKey()) == null
                    : "false ready: class peer still in process";
                if (((KeyedQueue) awoken).isEmpty()) {
                    // just drop queue
                    discardQueue(awoken);
                    return;
                }
                readyClassQueues.add(awoken);
                awoken.setStoreState(URIStoreable.READY);
            } else if (awoken instanceof CrawlURI) {
                // TODO: think about whether this is right
                pushToPending((CrawlURI) awoken);
            } else {
                assert false : "something evil has awoken!";
            }
        }
    }

    private void discardQueue(URIStoreable awoken) {
        allClassQueuesMap.remove(((KeyedQueue) awoken).getClassKey());
        awoken.setStoreState(URIStoreable.FINISHED);
    }

    private CrawlURI dequeueFromReady() {
        KeyedQueue firstReadyQueue = (KeyedQueue) readyClassQueues.getFirst();
        CrawlURI readyCuri = (CrawlURI) firstReadyQueue.removeFirst();
        return readyCuri;
    }

    private CrawlURI emitCuri(CrawlURI curi) {
        if (curi != null) {
            if (curi.getStoreState() == URIStoreable.FINISHED) {
                System.out.println("break here");
            }
            assert curi.getStoreState() != URIStoreable.FINISHED
                : "state " + curi.getStoreState() + " instead of ready for " + curi;
            //assert curi.getAList() != null : "null alist in curi " + curi + " state " + curi.getStoreState();
            noteInProcess(curi);
            curi.setServer(controller.getHostCache().getServerFor(curi));
        }
        logger.fine(this + ".emitCuri(" + curi + ")");
        return curi;
    }

    protected void noteInProcess(CrawlURI curi) {
        assert inProcessMap.get(curi.getClassKey()) == null
            : "two CrawlURIs with same classKey in process";
        inProcessMap.put(curi.getClassKey(), curi);
        curi.setStoreState(URIStoreable.IN_PROCESS);
        KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey());
        if (classQueue == null) {
            releaseHeld(curi);
            return;
        }
        assert classQueue.getStoreState() == URIStoreable.READY
            : "odd state " + classQueue.getStoreState() + " for classQueue "
              + classQueue + " of to-be-emitted CrawlURI";
        readyClassQueues.remove(classQueue);
        enqueueToHeld(classQueue);
        releaseHeld(curi);
    }

    private void enqueueToHeld(KeyedQueue classQueue) {
        heldClassQueues.add(classQueue);
        classQueue.setStoreState(URIStoreable.HELD);
    }

    private void releaseHeld(CrawlURI curi) {
        CrawlURI released = (CrawlURI) heldCuris.get(curi.getUURI());
        if (released != null) {
            heldCuris.remove(curi.getUURI());
            reinsert(released);
        }
    }

    public void reinsert(CrawlURI curi) {
        if (enqueueIfNecessary(curi)) {
            // added to classQueue
            return;
        }
        // no classQueue
        pushToPending(curi);
    }

    protected CandidateURI dequeueFromPendingHigh() {
        if (pendingHighQueue.isEmpty()) {
            return null;
        }
        return (CandidateURI) pendingHighQueue.removeFirst();
    }

    protected CandidateURI dequeueFromPending() {
        if (pendingQueue.isEmpty()) {
            return null;
        }
        return (CandidateURI) pendingQueue.removeFirst();
    }

    /**
     * @param curi
     * @return true if enqueued
     */
    public boolean enqueueIfNecessary(CrawlURI curi) {
        KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey());
        if (classQueue != null) {
            // must enqueue
            classQueue.add(curi);
            curi.setStoreState(classQueue.getStoreState());
            return true;
        }
        CrawlURI classmateInProgress = (CrawlURI) inProcessMap.get(curi.getClassKey());
        if (classmateInProgress != null) {
            // must create queue, and enqueue
            classQueue = new KeyedQueue(curi.getClassKey());
            allClassQueuesMap.put(classQueue.getClassKey(), classQueue);
            enqueueToHeld(classQueue);
            classQueue.add(curi);
            curi.setStoreState(classQueue.getStoreState());
            return true;
        }
        return false;
    }

    public long earliestWakeTime() {
        if (!snoozeQueues.isEmpty()) {
            return ((URIStoreable) snoozeQueues.first()).getWakeTime();
        }
        return Long.MAX_VALUE;
    }

    private synchronized void pushToPending(CrawlURI curi) {
        pendingQueue.addFirst(curi);
        curi.setStoreState(URIStoreable.PENDING);
    }

    public int discoveredUriCount() {
        // TODO Auto-generated method stub
        return 0;
    }

    public int successfullyFetchedCount() {
        // TODO Auto-generated method stub
        return 0;
    }

    public int failedFetchCount() {
        // TODO Auto-generated method stub
        return 0;
    }

    public void noteProcessingDone(CrawlURI curi) {
        assert inProcessMap.get(curi.getClassKey()) == curi
            : "CrawlURI returned not in process";
        inProcessMap.remove(curi.getClassKey());
        KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey());
        if (classQueue == null) {
            return;
        }
        assert classQueue.getStoreState() == URIStoreable.HELD
            : "odd state for classQueue of remitted CrawlURI";
        heldClassQueues.remove(classQueue);
        if (classQueue.isEmpty()) {
            // just drop it
            discardQueue(classQueue);
            return;
        }
        readyClassQueues.add(classQueue);
        classQueue.setStoreState(URIStoreable.READY);
        // TODO: since usually the queue will be snoozed, this juggling is often superfluous
    }

    /**
     * Update any scheduling structures with the new information in this
     * CrawlURI. Chiefly this means making the necessary arrangements for
     * no other URIs at the same host to be visited within the appropriate
     * politeness window.
     *
     * @param curi
     */
    protected void updateScheduling(CrawlURI curi) {
        long durationToWait = 0;
        if (curi.getAList().containsKey(A_FETCH_BEGAN_TIME)
                && curi.getAList().containsKey(A_FETCH_COMPLETED_TIME)) {
            long completeTime = curi.getAList().getLong(A_FETCH_COMPLETED_TIME);
            durationToWait = delayFactor
                * (completeTime - curi.getAList().getLong(A_FETCH_BEGAN_TIME));
            if (minimumDelay > durationToWait) {
                durationToWait = minimumDelay;
            }
            // TODO: maximum delay?
            if (durationToWait > 0) {
                snoozeQueueUntil(curi.getClassKey(), completeTime + durationToWait);
            }
        }
    }

    /**
     * The CrawlURI has encountered a problem, and will not be retried.
     *
     * @param curi
     */
    protected void failureDisposition(CrawlURI curi) {
        failedCount++;
        // send to basic log
        Object array[] = { curi };
        controller.uriProcessing.log(Level.INFO, curi.getUURI().getUri().toString(), array);
        // if exception, also send to crawlErrors
        if (curi.getFetchStatus() == S_INTERNAL_ERROR) {
            controller.crawlErrors.log(Level.INFO, curi.getUURI().getUri().toString(), array);
        }
        if (shouldBeForgotten(curi)) {
            // curi is dismissed without prejudice: it can be reconstituted
            forget(curi);
        } else {
            curi.setStoreState(URIStoreable.FINISHED);
            if (curi.getDontRetryBefore() < 0) {
                // if not otherwise set, retire this URI forever
                curi.setDontRetryBefore(Long.MAX_VALUE);
            }
            curi.stripToMinimal();
        }
    }

    /**
     * Has the CrawlURI suffered a failure which completes its processing?
     */
    private boolean isDispositiveFailure(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
            case S_DOMAIN_UNRESOLVABLE :
                // network errors; perhaps some of these should be
                // scheduled for retries
            case S_ROBOTS_PRECLUDED :
                // they don't want us to have it
            case S_INTERNAL_ERROR :
                // something unexpectedly bad happened
            case S_UNFETCHABLE_URI :
                // no chance to fetch
            case S_OUT_OF_SCOPE :
                // filtered out
            case S_TOO_MANY_EMBED_HOPS :
                // too far from last true link
            case S_TOO_MANY_LINK_HOPS :
                // too far from seeds
                return true;
            case S_UNATTEMPTED :
                // this uri is virgin, let it carry on
            default :
                return false;
        }
    }

    private boolean needsRetrying(CrawlURI curi) {
        // if (curi.getFetchAttempts() >= maxRetries) { return false; }
        switch (curi.getFetchStatus()) {
            case S_CONNECT_FAILED :
            case S_CONNECT_LOST :
            case S_UNATTEMPTED :
            case S_TIMEOUT :
                // these are all worth a retry
                return true;
            default :
                return false;
        }
    }

    private void scheduleForRetry(CrawlURI curi) {
        logger.fine("inserting snoozed " + curi + " for " + retryDelay);
        insertSnoozed(curi, retryDelay);
    }

    public void snoozeQueueUntil(Object classKey, long wake) {
        KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(classKey);
        if (classQueue == null) {
            classQueue = new KeyedQueue(classKey);
            allClassQueuesMap.put(classQueue.getClassKey(), classQueue);
        } else {
            assert classQueue.getStoreState() == URIStoreable.READY
                : "snoozing queue should have been READY";
            readyClassQueues.remove(classQueue);
        }
        classQueue.setWakeTime(wake);
        snoozeQueues.add(classQueue);
        classQueue.setStoreState(URIStoreable.SNOOZED);
    }

    private boolean shouldBeForgotten(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
            case S_TOO_MANY_EMBED_HOPS :
            case S_TOO_MANY_LINK_HOPS :
                return true;
            default :
                return false;
        }
    }

    /**
     * Forget the given CrawlURI. This allows a new instance to be created
     * in the future, if it is reencountered under different circumstances.
     *
     * @param curi
     */
    public void forget(CrawlURI curi) {
        alreadyIncluded.remove(curi.getUURI());
        curi.setStoreState(URIStoreable.FORGOTTEN);
    }

    /**
     * Revisit the CrawlURI -- but not before the delay time has passed.
     *
     * @param curi
     * @param retryDelay
     */
    public void insertSnoozed(CrawlURI curi, long retryDelay) {
        curi.setWakeTime(System.currentTimeMillis() + retryDelay);
        curi.setStoreState(URIStoreable.SNOOZED);
        snoozeQueues.add(curi);
    }
}

Index: SimpleSelector.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleSelector.java,v
retrieving revision 1.27.2.1
retrieving revision 1.27.2.2
diff -C2 -d -r1.27.2.1 -r1.27.2.2
*** SimpleSelector.java	3 Oct 2003 17:11:30 -0000	1.27.2.1
--- SimpleSelector.java	4 Oct 2003 00:49:24 -0000	1.27.2.2
***************
*** 444,449 ****
  	}
  
- 
- 
  	/**
  	 * @param curi
--- 444,447 ----
***************
*** 460,463 ****
--- 458,463 ----
  	}
  
+ 
+ 
  	/* (non-Javadoc)
  	 * @see org.archive.crawler.framework.URISelector#initialize(org.archive.crawler.framework.CrawlController)
***************
*** 498,502 ****
  		return true;
  	}
- 
- 
  }
--- 498,500 ----

Index: CrawlStateUpdater.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/CrawlStateUpdater.java,v
retrieving revision 1.5
retrieving revision 1.5.2.1
diff -C2 -d -r1.5 -r1.5.2.1
*** CrawlStateUpdater.java	6 Aug 2003 01:20:56 -0000	1.5
--- CrawlStateUpdater.java	4 Oct 2003 00:49:24 -0000	1.5.2.1
***************
*** 8,11 ****
--- 8,12 ----
  import org.apache.commons.httpclient.methods.GetMethod;
+ import org.archive.crawler.datamodel.*;
  import org.archive.crawler.datamodel.CoreAttributeConstants;
  import org.archive.crawler.datamodel.CrawlURI;

Index: SimpleScheduler.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleScheduler.java,v
retrieving revision 1.13.2.2
retrieving revision 1.13.2.3
diff -C2 -d -r1.13.2.2 -r1.13.2.3
*** SimpleScheduler.java	3 Oct 2003 17:11:30 -0000	1.13.2.2
--- SimpleScheduler.java	4 Oct 2003 00:49:24 -0000	1.13.2.3
***************
*** 11,18 ****
  import org.archive.crawler.datamodel.CrawlURI;
- import org.archive.crawler.datamodel.FatalConfigurationException;
  import org.archive.crawler.datamodel.UURI;
  import org.archive.crawler.framework.CrawlController;
  import org.archive.crawler.framework.ToeThread;
  
  /**
--- 11,18 ----
  import org.archive.crawler.datamodel.CrawlURI;
  import org.archive.crawler.datamodel.UURI;
  import org.archive.crawler.framework.CrawlController;
  import org.archive.crawler.framework.ToeThread;
+ import org.archive.crawler.framework.exceptions.FatalConfigurationException;
  
  /**

Index: ARCWriter.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/ARCWriter.java,v
retrieving revision 1.32.2.1
retrieving revision 1.32.2.2
diff -C2 -d -r1.32.2.1 -r1.32.2.2
*** ARCWriter.java	3 Oct 2003 01:54:35 -0000	1.32.2.1
--- ARCWriter.java	4 Oct 2003 00:49:24 -0000	1.32.2.2
***************
*** 17,25 ****
  import org.archive.crawler.datamodel.CrawlOrder;
  import org.archive.crawler.datamodel.CrawlURI;
  import org.archive.crawler.framework.CrawlController;
  import org.archive.crawler.framework.Processor;
  import org.archive.util.ArchiveUtils;
  import org.archive.util.IAGZIPOutputStream;
- import org.w3c.dom.Node;
  import org.xbill.DNS.Record;
--- 17,25 ----
  import org.archive.crawler.datamodel.CrawlOrder;
  import org.archive.crawler.datamodel.CrawlURI;
+ import org.archive.crawler.datamodel.StatisticsTracker;
  import org.archive.crawler.framework.CrawlController;
  import org.archive.crawler.framework.Processor;
  import org.archive.util.ArchiveUtils;
  import org.archive.util.IAGZIPOutputStream;
  import org.xbill.DNS.Record;
***************
*** 36,40 ****
  	private int arcMaxSize = 100000000;	// max size we want arc files to be (bytes)
! 	private String arcPrefix = "archive";	// file prefix for arcs
  	private String outputDir = "";		// where should we put them?
  	private File file = null;			// file handle
--- 36,40 ----
  	private int arcMaxSize = 100000000;	// max size we want arc files to be (bytes)
! 	private String arcPrefix = "IAH";	// file prefix for arcs
  	private String outputDir = "";		// where should we put them?
  	private File file = null;			// file handle
***************
*** 67,95 ****
  		CrawlOrder order = controller.getOrder();
  
! 		// retrieve any nodes we think we need from the dom(s)
! 		Node filePrefix = order.getNodeAt("/crawl-order/arc-file/@prefix");
! 		Node maxSize = getNodeAt("./arc-files/@max-size-bytes");
! 		Node path = order.getNodeAt("//disk/@path");
! 		Node compression = getNodeAt("./compression/@use");
! 
! 		setUseCompression(
! 			( (compression==null) ? true : compression.getNodeValue().equals("true"))
! 		);
! 
! 		setArcPrefix(
! 			( (filePrefix==null) ? arcPrefix : filePrefix.getNodeValue() )
! 		);
! 
! 		setArcMaxSize(
! 			( (maxSize==null) ? arcMaxSize : (new Integer(maxSize.getNodeValue())).intValue() )
! 		);
! 
! 		setOutputDir(
! 			( (path==null) ? outputDir : path.getNodeValue() )
! 		);
  	}
! 
! 
  	/**
  	 * Takes a CrawlURI and generates an arc record, writing it
  	 * to disk. Currently
--- 67,78 ----
  		CrawlOrder order = controller.getOrder();
  
! 		setUseCompression(getBooleanAt("@compression",false));
! 		setArcPrefix(getStringAt("@prefix",arcPrefix));
! 		setArcMaxSize(getIntAt("@max-size-bytes",arcMaxSize));
! 		setOutputDir(getStringAt("@path",outputDir));
  	}
! 
! 
  	/**
  	 * Takes a CrawlURI and generates an arc record, writing it
  	 * to disk. Currently

--- SimpleFrontier.java DELETED ---

--- FetcherDNS.java DELETED ---

--- SimplePreconditionEnforcer.java DELETED ---

--- SimplePostselector.java DELETED ---

--- SimplePreselector.java DELETED ---

--- FetcherHTTPSimple.java DELETED ---

--- BasicScope.java DELETED ---

--- StatisticsTracker.java DELETED ---
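[Editor's example, not part of the commit above.] The politeness logic in Frontier.updateScheduling() reduces to a small formula: snooze the host's queue for delay-factor times the fetch duration, floored at the minimum delay. A minimal standalone sketch of that arithmetic, with invented timestamps (only the two defaults are taken from the commit):

public class PolitenessMathDemo {
    public static void main(String[] args) {
        long delayFactor = 5;     // DEFAULT_DELAY_FACTOR in the commit above
        long minimumDelay = 2000; // DEFAULT_MINIMUM_DELAY (ms)

        long fetchBeganTime = 1000000L;     // invented timestamp (ms)
        long fetchCompletedTime = 1000300L; // a 300 ms fetch, for illustration

        long durationToWait = delayFactor * (fetchCompletedTime - fetchBeganTime);
        if (minimumDelay > durationToWait) {
            durationToWait = minimumDelay;
        }
        // 5 * 300 = 1500 ms, raised to the 2000 ms floor
        System.out.println("snooze host queue for " + durationToWait + " ms");
    }
}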
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter
In directory sc8-pr-cvs1:/tmp/cvs-serv7488/src/org/archive/crawler/filter

Added Files:
      Tag: gjm-refactor
	NullFilter.java SeedExtensionFilter.java HopsFilter.java TransclusionFilter.java
	URIRegExpFilter.java
Log Message:
renaming, repackaging, streamlining

--- NEW FILE: NullFilter.java ---

/*
 * NullFilter.java
 * Created on Oct 2, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter/Attic/NullFilter.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.filter;

import org.archive.crawler.framework.Filter;

/**
 * @author gojomo
 */
public class NullFilter extends Filter {
    protected boolean innerAccepts(Object o) {
        return true;
    }
}

--- NEW FILE: SeedExtensionFilter.java ---

/*
 * SeedExtensionFilter.java
 * Created on Sep 15, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter/Attic/SeedExtensionFilter.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.filter;

import java.util.Iterator;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.UURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Filter;

/**
 * Accepts a new UURI if it is an 'extension' of one of the URIs in the
 * seed set. Most loosely, this could be any other URI under the same
 * domain (as 'calendar.yahoo.com' is to 'www.yahoo.com'). In other
 * cases, only URIs on the exact same host sharing the same path prefix
 * (as 'www.geocities.com/foouser/about' is to
 * 'www.geocities.com/foouser/') are accepted.
 *
 * Configuration options determine how expansive the extension
 * definition is. By default, it is very strict: same host and
 * identical path up to the last '/' given in the seed.
 *
 * @author gojomo
 */
public class SeedExtensionFilter extends Filter {
    private CrawlController controller;
    static private int PATH = 0;   // only accept same host, path-extensions
    static private int HOST = 1;   // accept any URIs from same host
    static private int DOMAIN = 2; // accept any URIs from same domain
    private int extensionMode = PATH;

    protected boolean innerAccepts(Object o) {
        UURI u = null;
        if (o instanceof UURI) {
            u = (UURI) o;
        } else if (o instanceof CrawlURI) {
            u = ((CrawlURI) o).getUURI();
        }
        if (u == null) {
            return false;
        }
        Iterator iter = controller.getScope().getSeeds().iterator();
        while (iter.hasNext()) {
            UURI s = (UURI) iter.next();
            if (s.getUri().getHost().equals(u.getUri().getHost())) {
                // hosts match
                if (extensionMode == PATH) {
                    if (s.getUri().getPath().regionMatches(0,
                            u.getUri().getPath(), 0,
                            s.getUri().getPath().lastIndexOf('/'))) {
                        // matches up to last '/'
                        return true;
                    } else {
                        // no match; try next seed
                        continue;
                    }
                }
                // else extensionMode == HOST or DOMAIN, match is good enough
                return true;
            }
            if (extensionMode == DOMAIN) {
                // might be a close-enough match
                String seedDomain = s.getUri().getHost();
                // strip www[#]
                seedDomain = seedDomain.replaceFirst("^www\\d*", "");
                String candidateDomain = u.getUri().getHost();
                if (candidateDomain == null) {
                    // either an opaque, unfetchable, or unparseable URI
                    continue;
                }
                if (seedDomain.regionMatches(0, candidateDomain,
                        candidateDomain.length() - seedDomain.length(),
                        seedDomain.length())) {
                    // domain suffix congruence
                    return true;
                }
                // else keep trying other seeds
            }
        }
        // if none found, fail
        return false;
    }

    public void initialize(CrawlController c) {
        // TODO Auto-generated method stub
        super.initialize(c);
        controller = c;
        String mode = getStringAt("@mode");
        if (mode == null || "path".equals(mode)) {
            // default
            return;
        }
        if ("host".equals(mode)) {
            extensionMode = HOST;
        }
        if ("domain".equals(mode)) {
            extensionMode = DOMAIN;
        }
    }
}

--- NEW FILE: HopsFilter.java ---

/*
 * HopsFilter.java
 * Created on Oct 3, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter/Attic/HopsFilter.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.filter;

import org.archive.crawler.framework.Filter;

/**
 * @author gojomo
 */
public class HopsFilter extends Filter {
    protected boolean innerAccepts(Object o) {
        // TODO fix this
        return false;
    }
}

--- NEW FILE: TransclusionFilter.java ---

/*
 * TransclusionFilter.java
 * Created on Oct 3, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter/Attic/TransclusionFilter.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.filter;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.framework.Filter;

/**
 * Filter which accepts CandidateURI/CrawlURI instances whose
 * path-from-seed ends in transclusion
 * (precondition/referral/embed) hops.
 *
 * @author Gordon Mohr
 */
public class TransclusionFilter extends Filter {
    // 1-3 trailing P(recondition)/R(eferral)/E(mbed) hops
    private static final String TRANSCLUSION_PATH = ".*[PRE][PRE]?[PRE]?$";

    protected boolean innerAccepts(Object o) {
        if (o instanceof CandidateURI) {
            return ((CandidateURI) o).getPathFromSeed().matches(TRANSCLUSION_PATH);
        }
        return false;
    }
}

--- NEW FILE: URIRegExpFilter.java ---

/*
 * RegExpFilter.java
 * Created on Apr 16, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter/Attic/URIRegExpFilter.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.filter;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.UURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Filter;

/**
 * Compares a passed object -- a CrawlURI, UURI, or String --
 * against a regular expression, accepting matches.
 *
 * @author Gordon Mohr
 */
public class URIRegExpFilter extends Filter {
    Pattern pattern;

    protected boolean innerAccepts(Object o) {
        String input = null;
        // TODO: consider changing this to ask o for its matchString
        if (o instanceof CrawlURI) {
            input = ((CrawlURI) o).getURIString();
        } else if (o instanceof UURI) {
            input = ((UURI) o).getUri().toString();
        } else {
            // TODO: handle other inputs
        }
        Matcher m = pattern.matcher(input);
        return m.matches();
    }

    public void initialize(CrawlController c) {
        // TODO Auto-generated method stub
        super.initialize(c);
        String regexp = getStringAt("@regexp");
        pattern = Pattern.compile(regexp);
    }
}
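[Editor's example, not part of the commit above.] The TRANSCLUSION_PATH pattern in TransclusionFilter is easiest to read against concrete inputs: because the leading ".*" backtracks, it accepts any path-from-seed whose trailing hop(s) are P/R/E transclusion hops. The sample strings below are invented; 'P' is the prerequisite hop appended by Postselector in the previous commit, while 'L' as an ordinary-link hop code is an assumption for illustration:

import java.util.regex.Pattern;

public class TransclusionPathDemo {
    public static void main(String[] args) {
        Pattern p = Pattern.compile(".*[PRE][PRE]?[PRE]?$");
        System.out.println(p.matcher("LLE").matches());  // true: trailing embed hop
        System.out.println(p.matcher("LPE").matches());  // true: prerequisite, then embed
        System.out.println(p.matcher("LEEE").matches()); // true: three trailing embed hops
        System.out.println(p.matcher("LLLL").matches()); // false: ends in a plain link hop
        System.out.println(p.matcher("ELL").matches());  // false: transclusion hops not trailing
    }
}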
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/exceptions
In directory sc8-pr-cvs1:/tmp/cvs-serv7488/src/org/archive/crawler/framework/exceptions

Added Files:
      Tag: gjm-refactor
	InitializationException.java ConfigurationException.java FatalConfigurationException.java
Log Message:
renaming, repackaging, streamlining

--- NEW FILE: InitializationException.java ---

/*
 * Created on Jul 29, 2003
 */
package org.archive.crawler.framework.exceptions;

/**
 * InitializationExceptions should be thrown when there is a problem with
 * the crawl's initialization, such as file creation problems, etc. In the
 * event that a more specific exception can be thrown (such as a
 * ConfigurationException in the event that there is a
 * configuration-specific problem), it should be.
 *
 * @author Parker Thompson
 */
public class InitializationException extends Exception {

    public InitializationException() {
        super();
    }

    /**
     * @param message
     */
    public InitializationException(String message) {
        super(message);
    }

    /**
     * @param message
     * @param cause
     */
    public InitializationException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param cause
     */
    public InitializationException(Throwable cause) {
        super(cause);
    }
}

--- NEW FILE: ConfigurationException.java ---

/*
 * Created on Jul 29, 2003
 */
package org.archive.crawler.framework.exceptions;

import org.archive.crawler.framework.exceptions.InitializationException;

/**
 * ConfigurationExceptions should be thrown when a configuration file
 * is missing data, or contains uninterpretable data, at runtime. Fatal
 * errors (that should cause the program to exit) should be thrown as
 * FatalConfigurationExceptions.
 *
 * You may optionally note the configuration file and element involved,
 * so the catcher can report or act on that information.
 *
 * @author Parker Thompson
 */
public class ConfigurationException extends InitializationException {

    // optionally store the file name and element so the catcher
    // can report the information and/or take other actions based on it
    protected String file = null;
    protected String element = null;

    public ConfigurationException() {
        super();
    }

    /**
     * Create a ConfigurationException
     * @param message
     */
    public ConfigurationException(String message) {
        super(message);
    }

    /**
     * @param message
     * @param cause
     */
    public ConfigurationException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Create a ConfigurationException
     * @param cause
     */
    public ConfigurationException(Throwable cause) {
        super(cause);
    }

    /**
     * Create a ConfigurationException
     * @param message
     * @param filename
     * @param elementname
     */
    public ConfigurationException(String message, String filename, String elementname) {
        super(message);
        file = filename;
        element = elementname;
    }

    /**
     * Create a ConfigurationException
     * @param message
     * @param cause
     * @param filename
     * @param elementname
     */
    public ConfigurationException(String message, Throwable cause,
            String filename, String elementname) {
        super(message, cause);
        file = filename;
        element = elementname;
    }

    /**
     * Create a ConfigurationException
     * @param cause
     * @param filename
     * @param elementname
     */
    public ConfigurationException(Throwable cause, String filename, String elementname) {
        super(cause);
        file = filename;
        element = elementname;
    }

    /**
     * Store the name of the configuration file that was being parsed
     * when this error occurred.
     * @param name
     */
    public void setFile(String name) {
        file = name;
    }

    public String getFile() {
        return file;
    }

    /**
     * Set the name of the element that was being parsed
     * when this error occurred.
     * @param target
     */
    public void setElement(String target) {
        element = target;
    }

    public String getElement() {
        return element;
    }
}

--- NEW FILE: FatalConfigurationException.java ---

/*
 * Created on Jul 29, 2003
 */
package org.archive.crawler.framework.exceptions;

import org.archive.crawler.framework.exceptions.ConfigurationException;

/**
 * @author Parker Thompson
 */
public class FatalConfigurationException extends ConfigurationException {

    public FatalConfigurationException(String explanation) {
        super(explanation);
    }

    public FatalConfigurationException() {
        super();
    }

    public FatalConfigurationException(String message, String file, String element) {
        super(message, file, element);
    }
}
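[Editor's example, not part of the commit above.] The file/element bookkeeping these constructors add is meant to let a catcher report where parsing failed. A minimal hypothetical sketch of that call pattern; the "order.xml" filename, "processor" element, and "missing @class" message are invented for illustration:

import org.archive.crawler.framework.exceptions.ConfigurationException;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;

public class ConfigErrorDemo {
    // a thrower records the file and element where parsing failed
    static void parse() throws FatalConfigurationException {
        throw new FatalConfigurationException("missing @class", "order.xml", "processor");
    }

    public static void main(String[] args) {
        try {
            parse();
        } catch (ConfigurationException e) {
            // a catcher reports the recorded location along with the message
            System.err.println("bad configuration in " + e.getFile()
                + " at <" + e.getElement() + ">: " + e.getMessage());
        }
    }
}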
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util
In directory sc8-pr-cvs1:/tmp/cvs-serv7488/src/org/archive/crawler/util

Removed Files:
      Tag: gjm-refactor
	NullFilter.java TransclusionFilter.java SeedExtensionFilter.java URIRegExpFilter.java
Log Message:
renaming, repackaging, streamlining

--- NullFilter.java DELETED ---

--- TransclusionFilter.java DELETED ---

--- SeedExtensionFilter.java DELETED ---

--- URIRegExpFilter.java DELETED ---
From: <go...@us...> - 2003-10-04 00:49:33
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io
In directory sc8-pr-cvs1:/tmp/cvs-serv7488/src/org/archive/crawler/io

Removed Files:
      Tag: gjm-refactor
	MemoryArea.java MemPoolManager.java SpreadInputStream.java SpreadOutputStream.java
	VirtualBuffer.java SeekableInputSubstream.java SeekableInputStream.java
	DiskedVirtualBuffer.java
Log Message:
renaming, repackaging, streamlining

--- MemoryArea.java DELETED ---

--- MemPoolManager.java DELETED ---

--- SpreadInputStream.java DELETED ---

--- SpreadOutputStream.java DELETED ---

--- VirtualBuffer.java DELETED ---

--- SeekableInputSubstream.java DELETED ---

--- SeekableInputStream.java DELETED ---

--- DiskedVirtualBuffer.java DELETED ---
From: <go...@us...> - 2003-10-04 00:49:24
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter
In directory sc8-pr-cvs1:/tmp/cvs-serv7460/src/org/archive/crawler/filter

Log Message:
Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter added to the repository
--> Using per-directory sticky tag `gjm-refactor'
From: <go...@us...> - 2003-10-04 00:49:24
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/exceptions
In directory sc8-pr-cvs1:/tmp/cvs-serv7460/src/org/archive/crawler/framework/exceptions

Log Message:
Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/exceptions added to the repository
--> Using per-directory sticky tag `gjm-refactor'
From: <go...@us...> - 2003-10-04 00:49:24
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/fetcher
In directory sc8-pr-cvs1:/tmp/cvs-serv7460/src/org/archive/crawler/fetcher

Log Message:
Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/fetcher added to the repository
--> Using per-directory sticky tag `gjm-refactor'
From: <go...@us...> - 2003-10-03 17:12:04
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/dev-crawl
In directory sc8-pr-cvs1:/tmp/cvs-serv30377/dev-crawl

Modified Files:
      Tag: gjm-refactor
	order.xml
Log Message:
continuing reorg

Index: order.xml
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/dev-crawl/Attic/order.xml,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** order.xml	3 Oct 2003 01:53:02 -0000	1.1.2.1
--- order.xml	3 Oct 2003 17:11:29 -0000	1.1.2.2
***************
*** 50,54 ****
  		name="Preprocessor"
  		class="org.archive.crawler.basic.SimplePreconditionEnforcer"
! 		next="DNS">
  		/>
--- 50,54 ----
  		name="Preprocessor"
  		class="org.archive.crawler.basic.SimplePreconditionEnforcer"
! 		next="DNS"
  		/>
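[Editor's note, not part of the commit above.] Read as XML rather than as a diff, this hunk just removes a stray ">" that left the processor element malformed. After the change, the element presumably reads as follows (reconstructed only from the hunk's context lines; any attributes outside the hunk are not shown):

<processor
	name="Preprocessor"
	class="org.archive.crawler.basic.SimplePreconditionEnforcer"
	next="DNS"
	/>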
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util In directory sc8-pr-cvs1:/tmp/cvs-serv30377/src/org/archive/crawler/util Modified Files: Tag: gjm-refactor SeedExtensionFilter.java Added Files: Tag: gjm-refactor ThreadKicker.java TransclusionFilter.java Log Message: continuing reorg --- NEW FILE: ThreadKicker.java --- /* * ThreadKicker.java * Created on Jun 24, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util/Attic/ThreadKicker.java,v 1.1.2.1 2003/10/03 17:11:30 gojomo Exp $ */ package org.archive.crawler.util; import java.util.HashMap; import java.util.SortedSet; import java.util.TreeSet; import java.util.logging.Logger; /** * Interrupts threads at the requested time. Useful * for aborting hung network IO. * * @author gojomo * */ public class ThreadKicker extends Thread { private static Logger logger = Logger.getLogger("org.archive.crawler.framework.ThreadKicker"); SortedSet scheduledKicks = new TreeSet(); // of ScheduledKicks HashMap pendingKicks = new HashMap(); // of Thread -> ScheduledKick /** * */ public ThreadKicker() { super(); setName("ThreadKicker"); } /** * Arrange to be kicked (interrupted) at the specified time. * * @param target * @param time */ public synchronized void kickMeAt(Thread target, long time) { removeKicks(target); if(time==0) { return; } ScheduledKick kick = new ScheduledKick(target, time); scheduledKicks.add(kick); pendingKicks.put(target,kick); if(scheduledKicks.first()==kick) { notify(); } } private void removeKicks(Thread target) { ScheduledKick kick = (ScheduledKick)pendingKicks.remove(target); if(kick!=null) { scheduledKicks.remove(kick); } } /* (non-Javadoc) * @see java.lang.Runnable#run() */ public synchronized void run() { while (true) { try { long now = System.currentTimeMillis(); ScheduledKick top = scheduledKicks.isEmpty() ? null : (ScheduledKick)scheduledKicks.first(); while (top!=null && top.getWhen()<now) { scheduledKicks.remove(top); pendingKicks.remove(top); logger.warning("kicking "+top.getTarget()); top.getTarget().interrupt(); top = scheduledKicks.isEmpty() ? null : (ScheduledKick)scheduledKicks.first(); } if (top == null) { wait(); } else { wait(top.getWhen()-now); } } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /** * Cancel all scheduled kicks for the given thread. * * @param thread */ public synchronized void cancelKick(Thread thread) { removeKicks(thread); } } class ScheduledKick implements Comparable { private Thread target; private long when; /** * */ public ScheduledKick(Thread th, long time) { target = th; when = time; } /* (non-Javadoc) * @see java.lang.Comparable#compareTo(java.lang.Object) */ public int compareTo(Object o) { ScheduledKick other = (ScheduledKick)o; if (this==other) { return 0; } if(when < other.getWhen()) { return -1; } if(when > other.getWhen()) { return 1; } // equal times; arbitrary ordering ok if(target.hashCode()<other.getTarget().hashCode()) { return -1; } else { // TODOSOMEDAY: fix tiny chance of bug here // if two ScheduledKicks of same time and // same target hashcode are compared, // answer is order-dependent. 
return 1; } } /** * */ Thread getTarget() { return target; } /** * @return */ long getWhen() { return when; } } --- NEW FILE: TransclusionFilter.java --- /* * TransclusionFilter.java * Created on Oct 3, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util/Attic/TransclusionFilter.java,v 1.1.2.1 2003/10/03 17:11:30 gojomo Exp $ */ package org.archive.crawler.util; import org.archive.crawler.framework.Filter; /** * Filter which accepts CandidateURI/CrawlURI instances which * * @author Gordon Mohr */ public class TransclusionFilter extends Filter { /* (non-Javadoc) * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object) */ protected boolean innerAccepts(Object o) { // TODO Auto-generated method stub return false; } } Index: SeedExtensionFilter.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util/SeedExtensionFilter.java,v retrieving revision 1.1 retrieving revision 1.1.2.1 diff -C2 -d -r1.1 -r1.1.2.1 *** SeedExtensionFilter.java 19 Sep 2003 01:37:19 -0000 1.1 --- SeedExtensionFilter.java 3 Oct 2003 17:11:30 -0000 1.1.2.1 *************** *** 52,56 **** return false; } ! Iterator iter = controller.getStore().getSeeds().iterator(); while(iter.hasNext()) { UURI s = (UURI)iter.next(); --- 52,56 ---- return false; } ! Iterator iter = controller.getScope().getSeeds().iterator(); while(iter.hasNext()) { UURI s = (UURI)iter.next(); |
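To make the ThreadKicker contract concrete, here is a minimal, hypothetical usage sketch. Only kickMeAt() and cancelKick() come from the class itself; the endpoint and the 30-second bound are stand-ins, and Thread.interrupt() only unblocks some kinds of IO, so this matches the class's "aborting hung network IO" intent rather than guaranteeing it. (One caveat in the source above: pendingKicks is keyed by Thread, so the pendingKicks.remove(top) call in run() appears to need top.getTarget() rather than top.)

import java.io.InputStream;
import java.net.Socket;

import org.archive.crawler.util.ThreadKicker;

public class KickerSketch {
    public static void main(String[] args) throws Exception {
        ThreadKicker kicker = new ThreadKicker();
        kicker.start(); // the scheduling loop runs in its own thread

        Socket socket = new Socket("example.com", 80); // hypothetical endpoint
        InputStream in = socket.getInputStream();
        // ask to be interrupted if still blocked 30 seconds from now
        kicker.kickMeAt(Thread.currentThread(), System.currentTimeMillis() + 30000);
        try {
            int first = in.read(); // may hang; a kick delivers interrupt()
            System.out.println("first byte: " + first);
        } finally {
            // always clear the pending kick once past the risky section
            kicker.cancelKick(Thread.currentThread());
            socket.close();
        }
    }
}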
From: <go...@us...> - 2003-10-03 17:11:36
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/lib/binaries
In directory sc8-pr-cvs1:/tmp/cvs-serv30377/lib/binaries

Removed Files:
      Tag: gjm-refactor
	commons-httpclient.jar
Log Message:
continuing reorg

--- commons-httpclient.jar DELETED ---
From: <go...@us...> - 2003-10-03 17:11:36
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv30377/src/org/archive/crawler/datamodel

Modified Files:
      Tag: gjm-refactor
	CrawlOrder.java
Log Message:
continuing reorg

Index: CrawlOrder.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlOrder.java,v
retrieving revision 1.12.2.2
retrieving revision 1.12.2.3
diff -C2 -d -r1.12.2.2 -r1.12.2.3
*** CrawlOrder.java	3 Oct 2003 01:54:36 -0000	1.12.2.2
--- CrawlOrder.java	3 Oct 2003 17:11:31 -0000	1.12.2.3
***************
*** 16,19 ****
--- 16,20 ----
   */
  public class CrawlOrder extends XMLConfig {
+ 	private static final String XP_CRAWL_ORDER_NAME = "//crawl-order/@name";
  	private static final String XP_HTTP_USER_AGENT = "//http-headers/@User-Agent";
  	private static final String XP_HTTP_FROM = "//http-headers/@From";
***************
*** 77,81 ****
  	}
! 	loadParents(pathToDoc);
  	initialize();
--- 78,82 ----
  	}
! 	//loadParents(pathToDoc);
  	initialize();
***************
*** 86,90 ****
  	 */
  	public void initialize(){
! 		name = getStringAt("//crawl-order/@name");
  	}
--- 87,91 ----
  	 */
  	public void initialize(){
! 		name = getStringAt(XP_CRAWL_ORDER_NAME);
  	}
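The CrawlOrder change continues a pattern that runs through this refactor: inline XPath literals are hoisted into named constants, so a typo becomes a compile-time unresolved symbol instead of a silent null at runtime. A minimal sketch of the idiom, assuming only XMLConfig's existing getStringAt() accessor (the ExampleConfig class itself is hypothetical):

import org.archive.crawler.framework.XMLConfig;

public class ExampleConfig extends XMLConfig {
    // one named constant per configuration location, instead of inline literals
    private static final String XP_CRAWL_ORDER_NAME = "//crawl-order/@name";

    private String name;

    public void initialize() {
        // misspelling the constant name breaks the build; misspelling a
        // string literal would just make getStringAt() return null
        name = getStringAt(XP_CRAWL_ORDER_NAME);
    }
}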
From: <go...@us...> - 2003-10-03 17:11:35
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework In directory sc8-pr-cvs1:/tmp/cvs-serv30377/src/org/archive/crawler/framework Modified Files: Tag: gjm-refactor XMLConfig.java CrawlController.java CrawlScope.java URIFrontier.java Removed Files: Tag: gjm-refactor URIScheduler.java URIStore.java URISelector.java Sorter.java ThreadKicker.java Log Message: continuing reorg Index: XMLConfig.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/XMLConfig.java,v retrieving revision 1.12.2.1 retrieving revision 1.12.2.2 diff -C2 -d -r1.12.2.1 -r1.12.2.2 *** XMLConfig.java 3 Oct 2003 01:54:35 -0000 1.12.2.1 --- XMLConfig.java 3 Oct 2003 17:11:30 -0000 1.12.2.2 *************** *** 419,423 **** return parentConfigurationFile.instantiate(xpath); } ! return instantiate(node); } catch (DOMException e) { --- 419,425 ---- return parentConfigurationFile.instantiate(xpath); } ! if(node == null){ ! return null; ! } return instantiate(node); } catch (DOMException e) { Index: CrawlController.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/CrawlController.java,v retrieving revision 1.28.2.2 retrieving revision 1.28.2.3 diff -C2 -d -r1.28.2.2 -r1.28.2.3 *** CrawlController.java 3 Oct 2003 01:54:36 -0000 1.28.2.2 --- CrawlController.java 3 Oct 2003 17:11:30 -0000 1.28.2.3 *************** *** 45,54 **** private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors"; private static final String LOGNAME_CRAWL = "crawl"; ! private static final String XP_STATS_LEVEL = "/loggers/crawl-statistics/@level"; ! private static final String XP_STATS_INTERVAL = "/loggers/crawl-statistics/@interval"; ! private static final String XP_DISK_PATH = "/behavior/@disk"; ! private static final String XP_PROCESSORS = "/behavior/processors/processor"; ! private static final String XP_FRONTIER = "/behavior/frontier"; ! private static final String XP_CRAWL_SCOPE = "/scope"; private int timeout = 1000; // to wait for CrawlURI from frontier before spinning private ToePool toePool; --- 45,54 ---- private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors"; private static final String LOGNAME_CRAWL = "crawl"; ! private static final String XP_STATS_LEVEL = "//loggers/crawl-statistics/@level"; ! private static final String XP_STATS_INTERVAL = "//loggers/crawl-statistics/@interval"; ! private static final String XP_DISK_PATH = "//behavior/@disk-path"; ! private static final String XP_PROCESSORS = "//behavior/processors/processor"; ! private static final String XP_FRONTIER = "//behavior/frontier"; ! 
private static final String XP_CRAWL_SCOPE = "//scope"; private int timeout = 1000; // to wait for CrawlURI from frontier before spinning private ToePool toePool; *************** *** 69,76 **** CrawlOrder order; CrawlScope scope; - - URIScheduler scheduler; - URIStore store; - URISelector selector; Processor firstProcessor; --- 69,72 ---- *************** *** 244,255 **** /** - * - */ - public URIScheduler getScheduler() { - return scheduler; - - } - - /** * @param thread */ --- 240,243 ---- *************** *** 257,268 **** // for now do nothing } - - /** - * - */ - public URISelector getSelector() { - return selector; - } - /** Return the object this controller is using to track crawl statistics --- 245,248 ---- *************** *** 272,276 **** } - /** * --- 252,255 ---- *************** *** 316,326 **** public CrawlOrder getOrder() { return order; - } - - /** - * @return - */ - public URIStore getStore() { - return store; } --- 295,298 ---- Index: CrawlScope.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/Attic/CrawlScope.java,v retrieving revision 1.1.2.2 retrieving revision 1.1.2.3 diff -C2 -d -r1.1.2.2 -r1.1.2.3 *** CrawlScope.java 3 Oct 2003 01:54:36 -0000 1.1.2.2 --- CrawlScope.java 3 Oct 2003 17:11:30 -0000 1.1.2.3 *************** *** 14,18 **** import org.archive.crawler.datamodel.CandidateURI; - import org.archive.crawler.datamodel.FatalConfigurationException; import org.archive.crawler.datamodel.UURI; --- 14,17 ---- *************** *** 64,68 **** } ! public List getSeeds() throws FatalConfigurationException { if (seeds != null) { return seeds; --- 63,67 ---- } ! public List getSeeds() { if (seeds != null) { return seeds; *************** *** 92,97 **** } } catch (IOException e) { ! throw new FatalConfigurationException( ! "Unable to locate seeds file: " + e.toString()); } return seeds; --- 91,96 ---- } } catch (IOException e) { ! e.printStackTrace(); ! // TODO throw runtime error? log something? } return seeds; Index: URIFrontier.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/Attic/URIFrontier.java,v retrieving revision 1.4.2.1 retrieving revision 1.4.2.2 diff -C2 -d -r1.4.2.1 -r1.4.2.2 *** URIFrontier.java 2 Oct 2003 01:53:51 -0000 1.4.2.1 --- URIFrontier.java 3 Oct 2003 17:11:30 -0000 1.4.2.2 *************** *** 24,26 **** --- 24,41 ---- boolean isEmpty(); long size(); + + /** + * @return + */ + int discoveredUriCount(); + + /** + * + */ + int successfullyFetchedCount(); + + /** + * + */ + int failedFetchCount(); } --- URIScheduler.java DELETED --- --- URIStore.java DELETED --- --- URISelector.java DELETED --- --- Sorter.java DELETED --- --- ThreadKicker.java DELETED --- |
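Read together, the URIFrontier additions and the URIStore/URISelector/URIScheduler deletions make the frontier the single owner of both work hand-out and crawl-wide counting. A consolidated sketch of the interface as this commit leaves it; the method set is taken from the diff, the initialize() signature is assumed from SimpleFrontier, and the comments are editorial:

package org.archive.crawler.framework;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FatalConfigurationException;

public interface URIFrontier {
    void initialize(CrawlController c) throws FatalConfigurationException; // assumed from SimpleFrontier
    void schedule(CandidateURI caUri);  // accept newly discovered or seed work
    CrawlURI next(int timeout);         // hand out work; null on timeout or exhaustion
    boolean isEmpty();
    long size();
    // counters formerly reached through the now-deleted URIStore/URISelector
    int discoveredUriCount();
    int successfullyFetchedCount();
    int failedFetchCount();
}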
From: <go...@us...> - 2003-10-03 17:11:35
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic In directory sc8-pr-cvs1:/tmp/cvs-serv30377/src/org/archive/crawler/basic Modified Files: Tag: gjm-refactor SimpleSelector.java BasicScope.java SimpleFrontier.java StatisticsTracker.java SimpleScheduler.java SimpleStore.java Log Message: continuing reorg Index: SimpleSelector.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleSelector.java,v retrieving revision 1.27 retrieving revision 1.27.2.1 diff -C2 -d -r1.27 -r1.27.2.1 *** SimpleSelector.java 23 Sep 2003 01:16:34 -0000 1.27 --- SimpleSelector.java 3 Oct 2003 17:11:30 -0000 1.27.2.1 *************** *** 21,25 **** import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.Filter; - import org.archive.crawler.framework.URISelector; import org.archive.crawler.framework.XMLConfig; --- 21,24 ---- *************** *** 28,32 **** * */ ! public class SimpleSelector extends XMLConfig implements URISelector, CoreAttributeConstants, FetchStatusCodes { /** * XPath to any specified filters --- 27,31 ---- * */ ! public class SimpleSelector extends XMLConfig implements CoreAttributeConstants, FetchStatusCodes { /** * XPath to any specified filters *************** *** 466,470 **** public void initialize(CrawlController c) { controller = c; ! store = (SimpleStore)c.getStore(); //maxLinkDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-link-depth/@value", maxLinkDepth); //maxEmbedDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-embed-depth/@value", maxEmbedDepth); --- 465,469 ---- public void initialize(CrawlController c) { controller = c; ! // store = (SimpleStore)c.getStore(); //maxLinkDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-link-depth/@value", maxLinkDepth); //maxEmbedDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-embed-depth/@value", maxEmbedDepth); Index: BasicScope.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/BasicScope.java,v retrieving revision 1.1.2.2 retrieving revision 1.1.2.3 diff -C2 -d -r1.1.2.2 -r1.1.2.3 *** BasicScope.java 3 Oct 2003 01:54:35 -0000 1.1.2.2 --- BasicScope.java 3 Oct 2003 17:11:30 -0000 1.1.2.3 *************** *** 7,14 **** package org.archive.crawler.basic; import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.CrawlScope; import org.archive.crawler.framework.Filter; ! import org.archive.crawler.util.NullFilter; /** --- 7,16 ---- package org.archive.crawler.basic; + import org.archive.crawler.datamodel.CandidateURI; import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.CrawlScope; import org.archive.crawler.framework.Filter; ! import org.archive.crawler.util.SeedExtensionFilter; ! import org.archive.crawler.util.TransclusionFilter; /** *************** *** 22,30 **** * && ! excludeFilter.accepts(uri) * * @author gojomo * */ public class BasicScope extends CrawlScope { - private static final Filter NULL_FILTER = new NullFilter(); Filter focusFilter; Filter transitiveFilter; --- 24,50 ---- * && ! 
excludeFilter.accepts(uri) * + * The focusFilter may be specified by either: + * - adding a 'mode' attribute to the + * <scope> element, in which case a SeedExtensionFilter + * will be used, with the <scope> element + * providing its configuration + * - adding a <focus> subelement + * If unspecified, the focusFilter will default to + * an accepts-all filter. + * + * The transitiveFilter may be specified by supploying + * a <transitive> subelement. If unspecified, a + * TransclusionFilter will be used, with the <scope> + * element providing its configuration. + * + * The excludeFilter may be specified by supplying + * a <exclude> subelement. If unspecified, a + * accepts-none filter will be used -- meaning that + * no URIs will pass the filter and thus be excluded. + * * @author gojomo * */ public class BasicScope extends CrawlScope { Filter focusFilter; Filter transitiveFilter; *************** *** 36,51 **** public void initialize(CrawlController controller) { super.initialize(controller); ! focusFilter = (Filter) instantiate("/focus"); ! if (focusFilter == null) { ! focusFilter = NULL_FILTER; } } ! /* (non-Javadoc) * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object) */ protected boolean innerAccepts(Object o) { ! // TODO Auto-generated method stub ! return false; } --- 56,126 ---- public void initialize(CrawlController controller) { super.initialize(controller); ! // setup focusFilter ! if(getNodeAt("@mode")!=null) { ! // SeedExtensionFilter implied ! focusFilter = new SeedExtensionFilter(); ! focusFilter.setNode(xNode); ! } else { ! focusFilter = (Filter) instantiate("focus"); ! } ! if(focusFilter != null) { ! focusFilter.initialize(controller); ! // only set up transitiveFilter if focusFilter set ! transitiveFilter = (Filter) instantiate("transitive"); ! if(transitiveFilter == null) { ! transitiveFilter = new TransclusionFilter(); ! transitiveFilter.setNode(xNode); ! transitiveFilter.initialize(controller); ! } ! } ! // setup exclude filter ! excludeFilter = (Filter) instantiate("exclude"); ! if(excludeFilter!=null) { ! excludeFilter.initialize(controller); } } ! /** * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object) */ protected boolean innerAccepts(Object o) { ! return ((isSeed(o)||focusAccepts(o))||transitiveAccepts(o))&&!excludeAccepts(o); ! } ! ! /** ! * @param o ! * @return ! */ ! private boolean excludeAccepts(Object o) { ! if (excludeFilter == null) { ! return false; ! } ! return excludeFilter.accepts(o); ! } ! ! /** ! * @param o ! * @return ! */ ! private boolean transitiveAccepts(Object o) { ! if (transitiveFilter == null) { ! return true; ! } ! return transitiveFilter.accepts(o); ! } ! ! /** ! * @param o ! * @return ! */ ! private boolean focusAccepts(Object o) { ! if (focusFilter == null) { ! return true; ! } ! return focusFilter.accepts(o); ! } ! ! private boolean isSeed(Object o) { ! 
return o instanceof CandidateURI && ((CandidateURI)o).getIsSeed(); } Index: SimpleFrontier.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/SimpleFrontier.java,v retrieving revision 1.1.2.2 retrieving revision 1.1.2.3 diff -C2 -d -r1.1.2.2 -r1.1.2.3 *** SimpleFrontier.java 3 Oct 2003 01:54:35 -0000 1.1.2.2 --- SimpleFrontier.java 3 Oct 2003 17:11:30 -0000 1.1.2.3 *************** *** 342,344 **** --- 342,368 ---- } + /* (non-Javadoc) + * @see org.archive.crawler.framework.URIFrontier#discoveredUriCount() + */ + public int discoveredUriCount() { + // TODO Auto-generated method stub + return 0; + } + + /* (non-Javadoc) + * @see org.archive.crawler.framework.URIFrontier#successfullyFetchedCount() + */ + public int successfullyFetchedCount() { + // TODO Auto-generated method stub + return 0; + } + + /* (non-Javadoc) + * @see org.archive.crawler.framework.URIFrontier#failedFetchCount() + */ + public int failedFetchCount() { + // TODO Auto-generated method stub + return 0; + } + } Index: StatisticsTracker.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/StatisticsTracker.java,v retrieving revision 1.8 retrieving revision 1.8.2.1 diff -C2 -d -r1.8 -r1.8.2.1 *** StatisticsTracker.java 1 Aug 2003 00:08:38 -0000 1.8 --- StatisticsTracker.java 3 Oct 2003 17:11:30 -0000 1.8.2.1 *************** *** 512,516 **** */ public int uriFetchSuccessCount() { ! return controller.getSelector().successfullyFetchedCount(); } --- 512,516 ---- */ public int uriFetchSuccessCount() { ! return controller.getFrontier().successfullyFetchedCount(); } *************** *** 534,538 **** */ public int urisEncounteredCount() { ! return controller.getStore().discoveredUriCount(); } --- 534,538 ---- */ public int urisEncounteredCount() { ! return controller.getFrontier().discoveredUriCount(); } *************** *** 541,546 **** */ public int totalFetchAttempts() { ! return controller.getSelector().successfullyFetchedCount() ! + controller.getSelector().failedFetchCount(); } --- 541,546 ---- */ public int totalFetchAttempts() { ! return controller.getFrontier().successfullyFetchedCount() ! + controller.getFrontier().failedFetchCount(); } *************** *** 549,553 **** */ public int failedFetchAttempts() { ! return controller.getSelector().failedFetchCount(); } --- 549,553 ---- */ public int failedFetchAttempts() { ! return controller.getFrontier().failedFetchCount(); } *************** *** 557,561 **** */ public int successfulFetchAttempts() { ! return controller.getSelector().successfullyFetchedCount(); } --- 557,561 ---- */ public int successfulFetchAttempts() { ! return controller.getFrontier().successfullyFetchedCount(); } Index: SimpleScheduler.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleScheduler.java,v retrieving revision 1.13.2.1 retrieving revision 1.13.2.2 diff -C2 -d -r1.13.2.1 -r1.13.2.2 *** SimpleScheduler.java 3 Oct 2003 01:54:35 -0000 1.13.2.1 --- SimpleScheduler.java 3 Oct 2003 17:11:30 -0000 1.13.2.2 *************** *** 15,19 **** import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.ToeThread; - import org.archive.crawler.framework.URIScheduler; /** --- 15,18 ---- *************** *** 21,25 **** * */ ! 
public class SimpleScheduler implements URIScheduler { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleScheduler"); --- 20,24 ---- * */ ! public class SimpleScheduler { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleScheduler"); *************** *** 91,95 **** public void initialize(CrawlController c) throws FatalConfigurationException { controller = c; ! store = (SimpleStore) c.getStore(); // load seeds // Iterator iter = c.getOrder().getBehavior().getSeeds().iterator(); --- 90,94 ---- public void initialize(CrawlController c) throws FatalConfigurationException { controller = c; ! // store = (SimpleStore) c.getStore(); // load seeds // Iterator iter = c.getOrder().getBehavior().getSeeds().iterator(); Index: SimpleStore.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v retrieving revision 1.32 retrieving revision 1.32.2.1 diff -C2 -d -r1.32 -r1.32.2.1 *** SimpleStore.java 24 Sep 2003 01:45:26 -0000 1.32 --- SimpleStore.java 3 Oct 2003 17:11:30 -0000 1.32.2.1 *************** *** 22,26 **** import org.archive.crawler.datamodel.UURISet; import org.archive.crawler.framework.CrawlController; - import org.archive.crawler.framework.URIStore; /** --- 22,25 ---- *************** *** 31,35 **** * */ ! public class SimpleStore implements URIStore, FetchStatusCodes, CoreAttributeConstants { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleStore"); --- 30,34 ---- * */ ! public class SimpleStore implements FetchStatusCodes, CoreAttributeConstants { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleStore"); |
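The null-handling in BasicScope's small helper methods gives every unconfigured filter a permissive default, which is easy to lose in the diff noise. The self-contained sketch below restates innerAccepts() with Booleans standing in for configured filters (null meaning "not configured"); only the predicate shape and the null defaults mirror the diff:

public class ScopeLogicDemo {
    // accepted = ((isSeed || focus) || transitive) && !exclude,
    // with the same defaults BasicScope applies for absent filters
    static boolean accepts(boolean isSeed,
                           Boolean focus,        // null filter -> accept-all
                           Boolean transitive,   // null filter -> accept-all
                           Boolean exclude) {    // null filter -> exclude nothing
        boolean focusOk = (focus == null) || focus;
        boolean transitiveOk = (transitive == null) || transitive;
        boolean excluded = (exclude != null) && exclude;
        return ((isSeed || focusOk) || transitiveOk) && !excluded;
    }

    public static void main(String[] args) {
        System.out.println(accepts(false, null, null, null));  // true: no filters, all in scope
        System.out.println(accepts(true, false, false, null)); // true: seeds bypass focus/transitive
        System.out.println(accepts(true, false, false, true)); // false: exclude can veto even a seed
    }
}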
From: <go...@us...> - 2003-10-03 01:55:01
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler
In directory sc8-pr-cvs1:/tmp/cvs-serv20254

Modified Files:
      Tag: gjm-refactor
	.classpath
Log Message:
updated classpath

Index: .classpath
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/.classpath,v
retrieving revision 1.6
retrieving revision 1.6.2.1
diff -C2 -d -r1.6 -r1.6.2.1
*** .classpath	13 Jun 2003 02:10:11 -0000	1.6
--- .classpath	3 Oct 2003 01:54:57 -0000	1.6.2.1
***************
*** 2,5 ****
--- 2,6 ----
  <classpath>
      <classpathentry kind="src" path="src"/>
+     <classpathentry kind="src" path="oversrc"/>
      <classpathentry kind="lib" path="lib/binaries/commons-logging.jar"/>
      <classpathentry kind="lib" path="lib/binaries/dnsjava.jar"/>
***************
*** 9,12 ****
--- 10,17 ----
      <classpathentry kind="src" path="/junit"/>
      <classpathentry kind="src" path="/dnsjava"/>
+     <classpathentry kind="lib" path="lib/binaries/itext.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/javaswf.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/org.mortbay.jetty.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/poi.jar"/>
      <classpathentry kind="output" path="bin"/>
  </classpath>
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework In directory sc8-pr-cvs1:/tmp/cvs-serv20111/src/org/archive/crawler/framework Modified Files: Tag: gjm-refactor XMLConfig.java CrawlController.java CrawlScope.java Log Message: big reorg in progress Index: XMLConfig.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/XMLConfig.java,v retrieving revision 1.12 retrieving revision 1.12.2.1 diff -C2 -d -r1.12 -r1.12.2.1 *** XMLConfig.java 7 Aug 2003 01:47:12 -0000 1.12 --- XMLConfig.java 3 Oct 2003 01:54:35 -0000 1.12.2.1 *************** *** 274,277 **** --- 274,295 ---- } + /** + * Retrieve a (positive) integer value from the given xpath; + * return -1 if none found or other error occurs. + * + * @param xpath + * @return + */ + public boolean getBooleanAt(String xpath, boolean defaultValue) { + String value = getStringAt(xpath); + if(value==null) { + return defaultValue; + } + if(value.equalsIgnoreCase("yes")||value.equalsIgnoreCase("true")) { + return true; + } + return false; + } + /** Index: CrawlController.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/CrawlController.java,v retrieving revision 1.28.2.1 retrieving revision 1.28.2.2 diff -C2 -d -r1.28.2.1 -r1.28.2.2 *** CrawlController.java 2 Oct 2003 01:53:51 -0000 1.28.2.1 --- CrawlController.java 3 Oct 2003 01:54:36 -0000 1.28.2.2 *************** *** 30,37 **** --- 30,54 ---- /** + * CrawlController collects all the classes which cooperate to + * perform a crawl, provides a high-level interface to the + * running crawl, and executes the "master thread" which doles + * out URIs from the Frontier to the ToeThreads. + * + * As the "global context" for a crawl, subcomponents will + * usually reach each other through the CrawlController. * * @author Gordon Mohr */ public class CrawlController { + private static final String LOGNAME_PROGRESS_STATISTICS = "progress-statistics"; + private static final String LOGNAME_URI_ERRORS = "uri-errors"; + private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors"; + private static final String LOGNAME_CRAWL = "crawl"; + private static final String XP_STATS_LEVEL = "/loggers/crawl-statistics/@level"; + private static final String XP_STATS_INTERVAL = "/loggers/crawl-statistics/@interval"; + private static final String XP_DISK_PATH = "/behavior/@disk"; + private static final String XP_PROCESSORS = "/behavior/processors/processor"; + private static final String XP_FRONTIER = "/behavior/frontier"; + private static final String XP_CRAWL_SCOPE = "/scope"; private int timeout = 1000; // to wait for CrawlURI from frontier before spinning private ToePool toePool; *************** *** 42,49 **** private File disk; ! public Logger uriProcessing = Logger.getLogger("crawl"); ! public Logger crawlErrors = Logger.getLogger("runtime-errors"); ! public Logger uriErrors = Logger.getLogger("uri-errors"); ! public Logger progressStats = Logger.getLogger("progress-statistics"); // create a statistic tracking object and have it write to the log every --- 59,66 ---- private File disk; ! public Logger uriProcessing = Logger.getLogger(LOGNAME_CRAWL); ! public Logger crawlErrors = Logger.getLogger(LOGNAME_RUNTIME_ERRORS); ! public Logger uriErrors = Logger.getLogger(LOGNAME_URI_ERRORS); ! 
public Logger progressStats = Logger.getLogger(LOGNAME_PROGRESS_STATISTICS); // create a statistic tracking object and have it write to the log every *************** *** 75,78 **** --- 92,102 ---- + /** + * Starting from nothing, set up CrawlController and associated + * classes to be ready fro crawling. + * + * @param o CrawlOrder + * @throws InitializationException + */ public void initialize(CrawlOrder o) throws InitializationException { order = o; *************** *** 81,123 **** checkUserAgentAndFrom(); - String diskPath = order.getStringAt("//disk/@path"); - if(diskPath == null || diskPath.length() == 0){ - - throw new FatalConfigurationException("No output Directory specified", - order.crawlOrderFilename, - "//disk/@path" - ); - } - - // read from the configuration file try { ! if(! diskPath.endsWith(File.separator)){ ! diskPath = diskPath + File.separator; ! } ! disk = new File(diskPath); ! disk.mkdirs(); ! ! FileHandler up = new FileHandler(diskPath+"uri-processing.log"); ! up.setFormatter(new UriProcessingFormatter()); ! uriProcessing.addHandler(up); ! uriProcessing.setUseParentHandlers(false); ! ! FileHandler cerr = new FileHandler(diskPath+"crawl-errors.log"); ! cerr.setFormatter(new CrawlErrorFormatter()); ! crawlErrors.addHandler(cerr); ! crawlErrors.setUseParentHandlers(false); ! ! FileHandler uerr = new FileHandler(diskPath+"uri-errors.log"); ! uerr.setFormatter(new UriErrorFormatter()); ! uriErrors.addHandler(uerr); ! uriErrors.setUseParentHandlers(false); ! ! FileHandler stat = new FileHandler(diskPath+"progress-statistics.log"); ! stat.setFormatter(new StatisticsLogFormatter()); ! progressStats.addHandler(stat); ! progressStats.setUseParentHandlers(false); ! } catch (IOException e) { --- 105,113 ---- checkUserAgentAndFrom(); // read from the configuration file try { ! setupDisk(); ! setupLogs(); } catch (IOException e) { *************** *** 125,151 **** } ! // the statistics object must be created before modules that use it if those ! // modules retrieve the object from the controller during initialization ! // (which some do). So here we go with that. ! int interval = order.getIntAt("//crawl-statistics/interval", DEFAULT_STATISTICS_REPORT_INTERVAL); ! statistics = new StatisticsTracker(this, interval); ! ! // set the log level ! String logLevel = order.getStringAt("//loggers/crawl-statistics/level"); ! if(logLevel != null){ ! if(logLevel.toLowerCase().equals("mercator")){ ! statistics.setLogLevel(StatisticsTracker.MERCATOR_LOGGING); ! }else if(logLevel.toLowerCase().equals("human")){ ! statistics.setLogLevel(StatisticsTracker.HUMAN_LOGGING); ! }else if(logLevel.toLowerCase().equals("verbose")){ ! statistics.setLogLevel(StatisticsTracker.VERBOSE_LOGGING); ! } ! } ! //statistics.setLogLevel(StatisticsTracker.VERBOSE_LOGGING); ! ! scope = (CrawlScope) order.instantiate("//scope"); ! frontier = (URIFrontier) order.instantiate("//frontier"); ! firstProcessor = (Processor) order.getBehavior().instantiateAllInto("//processors/processor",processors); // try to initialize each scope and frontier from the config file --- 115,127 ---- } ! setupStatTracking(); ! setupCrawlModules(); ! } ! ! private void setupCrawlModules() throws FatalConfigurationException { ! scope = (CrawlScope) order.instantiate(XP_CRAWL_SCOPE); ! frontier = (URIFrontier) order.instantiate(XP_FRONTIER); ! 
firstProcessor = (Processor) order.instantiateAllInto(XP_PROCESSORS,processors); // try to initialize each scope and frontier from the config file *************** *** 155,160 **** throw new FatalConfigurationException( "Can't initialize scope, class specified in configuration file not found", ! order.crawlOrderFilename, ! "//scope"); } try { --- 131,136 ---- throw new FatalConfigurationException( "Can't initialize scope, class specified in configuration file not found", ! order.getCrawlOrderFilename(), ! XP_CRAWL_SCOPE); } try { *************** *** 163,169 **** throw new FatalConfigurationException( "Can't initialize frontier, class specified in configuration file not found", ! order.crawlOrderFilename, ! "//frontier"); ! } hostCache = new ServerCache(); --- 139,146 ---- throw new FatalConfigurationException( "Can't initialize frontier, class specified in configuration file not found", ! order.getCrawlOrderFilename(), ! XP_FRONTIER); ! } ! hostCache = new ServerCache(); *************** *** 175,182 **** --- 152,224 ---- p.initialize(this); } + } + + + private void setupDisk() throws FatalConfigurationException { + String diskPath = order.getStringAt(XP_DISK_PATH); + if(diskPath == null || diskPath.length() == 0){ + throw new FatalConfigurationException("No output Directory specified", + order.getCrawlOrderFilename(), + XP_DISK_PATH + ); + } + + if(! diskPath.endsWith(File.separator)){ + diskPath = diskPath + File.separator; + } + disk = new File(diskPath); + disk.mkdirs(); + } + + + private void setupStatTracking() { + // the statistics object must be created before modules that use it if those + // modules retrieve the object from the controller during initialization + // (which some do). So here we go with that. + int interval = order.getIntAt(XP_STATS_INTERVAL, DEFAULT_STATISTICS_REPORT_INTERVAL); + statistics = new StatisticsTracker(this, interval); + + // set the log level + String logLevel = order.getStringAt(XP_STATS_LEVEL); + if(logLevel != null){ + if(logLevel.toLowerCase().equals("mercator")){ + statistics.setLogLevel(StatisticsTracker.MERCATOR_LOGGING); + }else if(logLevel.toLowerCase().equals("human")){ + statistics.setLogLevel(StatisticsTracker.HUMAN_LOGGING); + }else if(logLevel.toLowerCase().equals("verbose")){ + statistics.setLogLevel(StatisticsTracker.VERBOSE_LOGGING); + } + } + //statistics.setLogLevel(StatisticsTracker.VERBOSE_LOGGING); // start periodic background logging of crawl statistics Thread statLogger = new Thread(statistics); statLogger.start(); + // TODO pause stat sampling when crawler paused + } + + + private void setupLogs() throws IOException { + String diskPath = disk.getAbsolutePath(); + + FileHandler up = new FileHandler(diskPath+LOGNAME_CRAWL+".log"); + up.setFormatter(new UriProcessingFormatter()); + uriProcessing.addHandler(up); + uriProcessing.setUseParentHandlers(false); + + FileHandler cerr = new FileHandler(diskPath+LOGNAME_RUNTIME_ERRORS+".log"); + cerr.setFormatter(new CrawlErrorFormatter()); + crawlErrors.addHandler(cerr); + crawlErrors.setUseParentHandlers(false); + + FileHandler uerr = new FileHandler(diskPath+LOGNAME_URI_ERRORS+".log"); + uerr.setFormatter(new UriErrorFormatter()); + uriErrors.addHandler(uerr); + uriErrors.setUseParentHandlers(false); + + FileHandler stat = new FileHandler(diskPath+LOGNAME_PROGRESS_STATISTICS+".log"); + stat.setFormatter(new StatisticsLogFormatter()); + progressStats.addHandler(stat); + progressStats.setUseParentHandlers(false); } *************** *** 190,195 **** private void checkUserAgentAndFrom() throws 
InitializationException { // don't start the crawl if they're using the default user-agent ! String userAgent = order.getBehavior().getUserAgent(); ! String from = order.getBehavior().getFrom(); if(!userAgent.matches(ACCEPTABLE_USER_AGENT)||!from.matches(ACCEPTABLE_FROM)) { throw new FatalConfigurationException( --- 232,237 ---- private void checkUserAgentAndFrom() throws InitializationException { // don't start the crawl if they're using the default user-agent ! String userAgent = order.getUserAgent(); ! String from = order.getFrom(); if(!userAgent.matches(ACCEPTABLE_USER_AGENT)||!from.matches(ACCEPTABLE_FROM)) { throw new FatalConfigurationException( *************** *** 213,221 **** */ public void toeFinished(ToeThread thread) { ! // TODO Auto-generated method stub ! } - /** * --- 255,261 ---- */ public void toeFinished(ToeThread thread) { ! // for now do nothing } /** * *************** *** 268,272 **** private void setupToePool() { ! toePool = new ToePool(this,order.getBehavior().getMaxToes()); } --- 308,312 ---- private void setupToePool() { ! toePool = new ToePool(this,order.getMaxToes()); } *************** *** 379,382 **** --- 419,430 ---- public URIFrontier getFrontier() { return frontier; + } + + + /** + * + */ + public CrawlScope getScope() { + return scope; } } Index: CrawlScope.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/Attic/CrawlScope.java,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** CrawlScope.java 2 Oct 2003 01:53:51 -0000 1.1.2.1 --- CrawlScope.java 3 Oct 2003 01:54:36 -0000 1.1.2.2 *************** *** 7,10 **** --- 7,20 ---- package org.archive.crawler.framework; + import java.io.BufferedReader; + import java.io.IOException; + import java.net.URISyntaxException; + import java.util.ArrayList; + import java.util.List; + + import org.archive.crawler.datamodel.CandidateURI; + import org.archive.crawler.datamodel.FatalConfigurationException; + import org.archive.crawler.datamodel.UURI; + /** * Filter which determines, looking at the totality of *************** *** 26,38 **** * */ ! public class CrawlScope extends Filter { int version; /* (non-Javadoc) ! * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object) */ ! protected boolean innerAccepts(Object o) { ! // TODO Auto-generated method stub ! return false; } --- 36,51 ---- * */ ! public abstract class CrawlScope extends Filter { ! private static final String XP_SEEDS = "/seeds"; int version; + List seeds; + CrawlController controller; /* (non-Javadoc) ! * @see org.archive.crawler.framework.Filter#initialize(org.archive.crawler.framework.CrawlController) */ ! public void initialize(CrawlController controller) { ! super.initialize(controller); ! 
this.controller = controller; } *************** *** 49,52 **** --- 62,119 ---- public String toString() { return "CrawlScope<"+name+">"; + } + + public List getSeeds() throws FatalConfigurationException { + if (seeds != null) { + return seeds; + } + seeds = new ArrayList(); + try { + BufferedReader reader = nodeValueOrSrcReader(XP_SEEDS); + String read; + while (reader != null) { + do { + read = reader.readLine(); + } while ( + (read != null) + && ((read = read.trim()).startsWith("#") + || read.length() == 0)); + + if (read == null) { + reader.close(); + reader = null; + } else { + try { + seeds.add(UURI.createUURI(read)); + } catch (URISyntaxException e1) { + e1.printStackTrace(); + } + } + } + } catch (IOException e) { + throw new FatalConfigurationException( + "Unable to locate seeds file: " + e.toString()); + } + return seeds; + + } + + + /** + * @param list + */ + public void clearSeeds() { + seeds = new ArrayList(); + } + + /** + * TODO determine if this is appropriate place for this + * @param u + */ + public void addSeed(UURI u){ + seeds.add(u); + CandidateURI caUri = new CandidateURI(u); + caUri.setIsSeed(true); + controller.getFrontier().schedule(caUri); } } |
From: <go...@us...> - 2003-10-03 01:54:48
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic In directory sc8-pr-cvs1:/tmp/cvs-serv20111/src/org/archive/crawler/basic Modified Files: Tag: gjm-refactor FetcherHTTPSimple.java BasicScope.java SimpleFrontier.java SimpleScheduler.java ARCWriter.java SimplePreconditionEnforcer.java SimplePreselector.java Added Files: Tag: gjm-refactor SimplePostselector.java Log Message: big reorg in progress --- NEW FILE: SimplePostselector.java --- /* * SimplePostselector.java * Created on Oct 2, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/SimplePostselector.java,v 1.1.2.1 2003/10/03 01:54:35 gojomo Exp $ */ package org.archive.crawler.basic; import org.archive.crawler.datamodel.FetchStatusCodes; import org.archive.crawler.framework.Processor; /** * Determine which links etc get fed back into Frontier, * if/when failures get retried, etc. * * * @author gojomo * */ public class SimplePostselector extends Processor implements FetchStatusCodes { } Index: FetcherHTTPSimple.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/FetcherHTTPSimple.java,v retrieving revision 1.10 retrieving revision 1.10.2.1 diff -C2 -d -r1.10 -r1.10.2.1 *** FetcherHTTPSimple.java 30 Sep 2003 18:07:52 -0000 1.10 --- FetcherHTTPSimple.java 3 Oct 2003 01:54:35 -0000 1.10.2.1 *************** *** 84,91 **** get.setRequestHeader( "User-Agent", ! controller.getOrder().getBehavior().getUserAgent()); get.setRequestHeader( "From", ! controller.getOrder().getBehavior().getFrom()); get.setHttpRecorder(((ToeThread)Thread.currentThread()).getHttpRecorder()); --- 84,91 ---- get.setRequestHeader( "User-Agent", ! controller.getOrder().getUserAgent()); get.setRequestHeader( "From", ! controller.getOrder().getFrom()); get.setHttpRecorder(((ToeThread)Thread.currentThread()).getHttpRecorder()); Index: BasicScope.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/BasicScope.java,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** BasicScope.java 2 Oct 2003 01:53:51 -0000 1.1.2.1 --- BasicScope.java 3 Oct 2003 01:54:35 -0000 1.1.2.2 *************** *** 7,11 **** --- 7,14 ---- package org.archive.crawler.basic; + import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.CrawlScope; + import org.archive.crawler.framework.Filter; + import org.archive.crawler.util.NullFilter; /** *************** *** 15,19 **** * Roughly, its logic is that a URI is included if: * ! * ( isSeed(uri) || focusFilter.accepts(uri) ) * && ! excludeFilter.accepts(uri) * --- 18,23 ---- * Roughly, its logic is that a URI is included if: * ! * (( isSeed(uri) || focusFilter.accepts(uri) ) ! * || transitiveFilter.accepts(uri) ) * && ! excludeFilter.accepts(uri) * *************** *** 22,25 **** */ public class BasicScope extends CrawlScope { ! } --- 26,52 ---- */ public class BasicScope extends CrawlScope { + private static final Filter NULL_FILTER = new NullFilter(); + Filter focusFilter; + Filter transitiveFilter; + Filter excludeFilter; ! /* (non-Javadoc) ! * @see org.archive.crawler.framework.Filter#initialize(org.archive.crawler.framework.CrawlController) ! */ ! public void initialize(CrawlController controller) { ! super.initialize(controller); ! focusFilter = (Filter) instantiate("/focus"); ! 
if (focusFilter == null) { ! focusFilter = NULL_FILTER; ! } ! } ! ! /* (non-Javadoc) ! * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object) ! */ ! protected boolean innerAccepts(Object o) { ! // TODO Auto-generated method stub ! return false; ! } ! ! } Index: SimpleFrontier.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/SimpleFrontier.java,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** SimpleFrontier.java 2 Oct 2003 01:53:51 -0000 1.1.2.1 --- SimpleFrontier.java 3 Oct 2003 01:54:35 -0000 1.1.2.2 *************** *** 10,19 **** import java.util.Iterator; import java.util.LinkedList; import java.util.logging.Logger; - import org.archive.crawler.datamodel.*; import org.archive.crawler.datamodel.CandidateURI; import org.archive.crawler.datamodel.CrawlURI; import org.archive.crawler.datamodel.FatalConfigurationException; import org.archive.crawler.datamodel.UURI; import org.archive.crawler.datamodel.UURISet; --- 10,21 ---- import java.util.Iterator; import java.util.LinkedList; + import java.util.SortedSet; + import java.util.TreeSet; import java.util.logging.Logger; import org.archive.crawler.datamodel.CandidateURI; import org.archive.crawler.datamodel.CrawlURI; import org.archive.crawler.datamodel.FatalConfigurationException; + import org.archive.crawler.datamodel.MemUURISet; import org.archive.crawler.datamodel.UURI; import org.archive.crawler.datamodel.UURISet; *************** *** 32,39 **** public class SimpleFrontier implements URIFrontier { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleFrontier"); ! ! HashMap allCuris = new HashMap(); // of UURI -> CrawlURI ! UURISet alreadyIncluded = new MemFPUURISet(); // every CandidateURI not yet in process or another queue; --- 34,43 ---- public class SimpleFrontier implements URIFrontier { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleFrontier"); ! CrawlController controller; ! // HashMap allCuris = new HashMap(); // of UURI -> CrawlURI ! ! // TODO update to use fingerprints only ! UURISet alreadyIncluded = new MemUURISet(); // every CandidateURI not yet in process or another queue; *************** *** 41,45 **** --- 45,68 ---- LinkedList pendingQueue = new LinkedList(); // of CandidateURIs + // every CrawlURI handed out for processing but not yet returned + HashMap inProcessMap = new HashMap(); // of String (classKey) -> CrawlURI + + // all active per-class queues + HashMap allClassQueuesMap = new HashMap(); // of String (classKey) -> KeyedQueue + // all per-class queues whose first item may be handed out (that is, no CrawlURI + // of the same class is currently in-process) + LinkedList readyClassQueues = new LinkedList(); // of String (queueKey) -> KeyedQueue + + // all per-class queues who are on hold because a CrawlURI of their class + // is already in process + LinkedList heldClassQueues = new LinkedList(); // of String (queueKey) -> KeyedQueue + + // all per-class queues who are on hold until a certain time + SortedSet snoozeQueues = new TreeSet(new SchedulingComparator()); // of KeyedQueue, sorted by wakeTime + + // CrawlURIs held until some specific other CrawlURI is emitted + HashMap heldCuris = new HashMap(); // of UURI -> CrawlURI + *************** *** 50,59 **** public void initialize(CrawlController c) throws FatalConfigurationException { ! ! 
Iterator iter = c.getOrder().getBehavior().getSeeds().iterator(); while (iter.hasNext()) { UURI u = (UURI) iter.next(); CandidateURI caUri = new CandidateURI(u); ! caUri.setSeed(true); schedule(caUri); } --- 73,83 ---- public void initialize(CrawlController c) throws FatalConfigurationException { ! ! this.controller = c; ! Iterator iter = c.getScope().getSeeds().iterator(); while (iter.hasNext()) { UURI u = (UURI) iter.next(); CandidateURI caUri = new CandidateURI(u); ! caUri.setIsSeed(true); schedule(caUri); } *************** *** 61,77 **** } ! /* (non-Javadoc) * @see org.archive.crawler.framework.URIFrontier#schedule(org.archive.crawler.datamodel.CandidateURI) */ ! public void schedule(CandidateURI caUri) { ! // TODO Auto-generated method stub ! } ! /* (non-Javadoc) * @see org.archive.crawler.framework.URIFrontier#next(int) */ public CrawlURI next(int timeout) { ! // TODO Auto-generated method stub return null; } --- 85,151 ---- } ! /** ! * * @see org.archive.crawler.framework.URIFrontier#schedule(org.archive.crawler.datamodel.CandidateURI) */ ! public synchronized void schedule(CandidateURI caUri) { ! pendingQueue.addLast(caUri); } ! /** ! * * @see org.archive.crawler.framework.URIFrontier#next(int) */ public CrawlURI next(int timeout) { ! ! long now = System.currentTimeMillis(); ! long waitMax = 0; ! CrawlURI curi = null; ! ! // if enough time has passed to wake any snoozing queues, do it ! wakeReadyQueues(now); ! ! // first, see if any holding queues are ready with a CrawlURI ! if (!readyClassQueues.isEmpty()) { ! curi = dequeueFromReady(); ! return emitCuri(curi); ! } ! ! // if that fails, check the pending queue ! CandidateURI caUri; ! while ((caUri = dequeueFromPending()) != null) { ! if(alreadyIncluded.contains(caUri)) { ! continue; ! } ! curi = new CrawlURI(caUri); ! if (!enqueueIfNecessary(curi)) { ! // OK to emit ! return emitCuri(curi); ! } ! } ! ! // consider if URIs exhausted ! if(isEmpty()) { ! // nothing left to crawl ! logger.info("nothing left to crawl"); ! // TODO halt/spread the word??? ! return null; ! } ! ! // nothing to return, but there are still URIs ! // held for the future ! ! // block until something changes, or timeout occurs ! waitMax = Math.min(earliestWakeTime()-now,timeout); ! try { ! if(waitMax<0) { ! logger.warning("negative wait "+waitMax+" ignored"); ! } else { ! wait(waitMax); ! } ! } catch (InterruptedException e) { ! // TODO Auto-generated catch block ! e.printStackTrace(); ! 
} return null; } *************** *** 99,102 **** --- 173,343 ---- // TODO Auto-generated method stub return 0; + } + + + + /** + * + */ + protected void wakeReadyQueues(long now) { + while(!snoozeQueues.isEmpty()&&((URIStoreable)snoozeQueues.first()).getWakeTime()<=now) { + URIStoreable awoken = (URIStoreable)snoozeQueues.first(); + if (!snoozeQueues.remove(awoken)) { + logger.severe("first() item couldn't be remove()d!"); + } + if (awoken instanceof KeyedQueue) { + assert inProcessMap.get(awoken.getClassKey()) == null : "false ready: class peer still in process"; + if(((KeyedQueue)awoken).isEmpty()) { + // just drop queue + discardQueue(awoken); + return; + } + readyClassQueues.add(awoken); + awoken.setStoreState(URIStoreable.READY); + } else if (awoken instanceof CrawlURI) { + // TODO think about whether this is right + pushToPending((CrawlURI)awoken); + } else { + assert false : "something evil has awoken!"; + } + } + } + + private void discardQueue(URIStoreable awoken) { + allClassQueuesMap.remove(((KeyedQueue)awoken).getClassKey()); + awoken.setStoreState(URIStoreable.FINISHED); + } + + /** + * @return + */ + private CrawlURI dequeueFromReady() { + KeyedQueue firstReadyQueue = (KeyedQueue)readyClassQueues.getFirst(); + CrawlURI readyCuri = (CrawlURI) firstReadyQueue.removeFirst(); + return readyCuri; + } + + /** + * @param crawlURI + * @return + */ + private CrawlURI emitCuri(CrawlURI curi) { + if(curi != null) { + if (curi.getStoreState() == URIStoreable.FINISHED) { + System.out.println("break here"); + } + assert curi.getStoreState() != URIStoreable.FINISHED : "state "+curi.getStoreState()+" instead of ready for "+ curi; + //assert curi.getAList() != null : "null alist in curi " + curi + " state "+ curi.getStoreState(); + noteInProcess(curi); + curi.setServer(controller.getHostCache().getServerFor(curi)); + } + return curi; + } + + /** + * @param curi + */ + protected void noteInProcess(CrawlURI curi) { + assert inProcessMap.get(curi.getClassKey()) == null : "two CrawlURIs with same classKey in process"; + + inProcessMap.put(curi.getClassKey(), curi); + curi.setStoreState(URIStoreable.IN_PROCESS); + + KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey()); + if (classQueue == null) { + releaseHeld(curi); + return; + } + assert classQueue.getStoreState() == URIStoreable.READY : "odd state "+ classQueue.getStoreState() + " for classQueue "+ classQueue + "of to-be-emitted CrawlURI"; + readyClassQueues.remove(classQueue); + enqueueToHeld(classQueue); + releaseHeld(curi); + } + + /** + * @param classQueue + */ + private void enqueueToHeld(KeyedQueue classQueue) { + heldClassQueues.add(classQueue); + classQueue.setStoreState(URIStoreable.HELD); + } + + /** + * @param curi + */ + private void releaseHeld(CrawlURI curi) { + CrawlURI released = (CrawlURI) heldCuris.get(curi.getUURI()); + if(released!=null) { + heldCuris.remove(curi.getUURI()); + reinsert(released); + } + } + + /** + * @param curi + */ + public void reinsert(CrawlURI curi) { + + if(enqueueIfNecessary(curi)) { + // added to classQueue + return; + } + // no classQueue + pushToPending(curi); + } + + /** + * + */ + protected synchronized CandidateURI dequeueFromPending() { + if (pendingQueue.isEmpty()) { + return null; + } + return (CandidateURI)pendingQueue.removeFirst(); + } + + /** + * + * @param curi + * @return true if enqueued + */ + public boolean enqueueIfNecessary(CrawlURI curi) { + KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey()); + if (classQueue != null) { + // 
must enqueue + classQueue.add(curi); + curi.setStoreState(classQueue.getStoreState()); + return true; + } + CrawlURI classmateInProgress = (CrawlURI) inProcessMap.get(curi.getClassKey()); + if (classmateInProgress != null) { + // must create queue, and enqueue + classQueue = new KeyedQueue(curi.getClassKey()); + allClassQueuesMap.put(classQueue.getClassKey(), classQueue); + enqueueToHeld(classQueue); + classQueue.add(curi); + curi.setStoreState(classQueue.getStoreState()); + return true; + } + + return false; + } + + /** + * @return + */ + public long earliestWakeTime() { + if (!snoozeQueues.isEmpty()) { + return ((URIStoreable)snoozeQueues.first()).getWakeTime(); + } + return Long.MAX_VALUE; + } + + /** + * @param curi + */ + private synchronized void pushToPending(CrawlURI curi) { + pendingQueue.addFirst(curi); + curi.setStoreState(URIStoreable.PENDING); } Index: SimpleScheduler.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleScheduler.java,v retrieving revision 1.13 retrieving revision 1.13.2.1 diff -C2 -d -r1.13 -r1.13.2.1 *** SimpleScheduler.java 6 Aug 2003 01:21:35 -0000 1.13 --- SimpleScheduler.java 3 Oct 2003 01:54:35 -0000 1.13.2.1 *************** *** 7,11 **** package org.archive.crawler.basic; ! import java.util.Iterator; import java.util.logging.Logger; --- 7,11 ---- package org.archive.crawler.basic; ! //import java.util.Iterator; import java.util.logging.Logger; *************** *** 93,100 **** store = (SimpleStore) c.getStore(); // load seeds ! Iterator iter = c.getOrder().getBehavior().getSeeds().iterator(); ! while (iter.hasNext()) { ! insertAsSeed((UURI) iter.next()); ! } } --- 93,100 ---- store = (SimpleStore) c.getStore(); // load seeds ! // Iterator iter = c.getOrder().getBehavior().getSeeds().iterator(); ! // while (iter.hasNext()) { ! // insertAsSeed((UURI) iter.next()); ! 
// } } Index: ARCWriter.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/ARCWriter.java,v retrieving revision 1.32 retrieving revision 1.32.2.1 diff -C2 -d -r1.32 -r1.32.2.1 *** ARCWriter.java 30 Sep 2003 18:07:52 -0000 1.32 --- ARCWriter.java 3 Oct 2003 01:54:35 -0000 1.32.2.1 *************** *** 13,23 **** import java.io.OutputStream; - // import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.methods.GetMethod; - import org.archive.crawler.basic.StatisticsTracker; import org.archive.crawler.datamodel.CoreAttributeConstants; import org.archive.crawler.datamodel.CrawlOrder; import org.archive.crawler.datamodel.CrawlURI; - import org.archive.crawler.datamodel.CrawlerBehavior; import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.Processor; --- 13,20 ---- *************** *** 69,73 **** // set up output directory CrawlOrder order = controller.getOrder(); - CrawlerBehavior behavior = order.getBehavior(); // retrieve any nodes we think we need from the dom(s) --- 66,69 ---- Index: SimplePreconditionEnforcer.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimplePreconditionEnforcer.java,v retrieving revision 1.11 retrieving revision 1.11.2.1 diff -C2 -d -r1.11 -r1.11.2.1 *** SimplePreconditionEnforcer.java 1 Oct 2003 01:40:43 -0000 1.11 --- SimplePreconditionEnforcer.java 3 Oct 2003 01:54:35 -0000 1.11.2.1 *************** *** 53,58 **** // for all curis that will in fact be fetched, set appropriate delays // TODOSOMEDAY: allow per-host, per-protocol, etc. factors ! curi.setDelayFactor(getDelayFactorFor(curi)); ! curi.setMinimumDelay(getMinimumDelayFor(curi)); return; --- 53,58 ---- // for all curis that will in fact be fetched, set appropriate delays // TODOSOMEDAY: allow per-host, per-protocol, etc. factors ! // curi.setDelayFactor(getDelayFactorFor(curi)); ! // curi.setMinimumDelay(getMinimumDelayFor(curi)); return; *************** *** 79,83 **** } // test against robots.txt if available ! String ua = controller.getOrder().getBehavior().getUserAgent(); if( curi.getServer().getRobots().disallows(curi.getUURI().getUri().getPath(),ua)) { // don't fetch --- 79,83 ---- } // test against robots.txt if available ! String ua = controller.getOrder().getUserAgent(); if( curi.getServer().getRobots().disallows(curi.getUURI().getUri().getPath(),ua)) { // don't fetch Index: SimplePreselector.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimplePreselector.java,v retrieving revision 1.3 retrieving revision 1.3.2.1 diff -C2 -d -r1.3 -r1.3.2.1 *** SimplePreselector.java 1 Oct 2003 01:40:43 -0000 1.3 --- SimplePreselector.java 3 Oct 2003 01:54:35 -0000 1.3.2.1 *************** *** 10,13 **** --- 10,14 ---- import org.archive.crawler.datamodel.FetchStatusCodes; import org.archive.crawler.framework.CrawlController; + import org.archive.crawler.framework.CrawlScope; import org.archive.crawler.framework.Processor; *************** *** 24,31 **** */ public class SimplePreselector extends Processor implements FetchStatusCodes { ! private static String XP_MAX_LINK_DEPTH="params/@max-link-depth"; ! private static String XP_MAX_EMBED_DEPTH="params/@max-embed-depth"; ! private int maxLinkDepth = -1; ! 
private int maxEmbedDepth = -1; /* (non-Javadoc) --- 25,36 ---- */ public class SimplePreselector extends Processor implements FetchStatusCodes { ! private boolean recheckScope; ! ! private static String XP_RECHECK_SCOPE="@scope"; ! ! // private static String XP_MAX_LINK_DEPTH="params/@max-link-depth"; ! // private static String XP_MAX_EMBED_DEPTH="params/@max-embed-depth"; ! // private int maxLinkDepth = -1; ! // private int maxEmbedDepth = -1; /* (non-Javadoc) *************** *** 33,64 **** */ protected void innerProcess(CrawlURI curi) { ! super.innerProcess(curi); ! ! // check for too-deep ! if(maxLinkDepth>=0 && curi.getLinkHopCount()>maxLinkDepth) { ! curi.setFetchStatus(S_TOO_MANY_LINK_HOPS); ! curi.cancelFurtherProcessing(); ! return; ! } ! if(maxEmbedDepth>=0 && curi.getEmbedHopCount()>maxEmbedDepth) { ! curi.setFetchStatus(S_TOO_MANY_EMBED_HOPS); ! curi.cancelFurtherProcessing(); ! return; ! } ! } ! ! /* (non-Javadoc) ! * @see org.archive.crawler.framework.Processor#innerRejectProcess(org.archive.crawler.datamodel.CrawlURI) ! */ ! protected void innerRejectProcess(CrawlURI curi) { ! super.innerRejectProcess(curi); ! // filter-rejection means out-of-scope for everything but embeds ! if (curi.getEmbedHopCount() < 1) { curi.setFetchStatus(S_OUT_OF_SCOPE); curi.cancelFurtherProcessing(); - } else { - // never mind; scope filters don't apply } } /* (non-Javadoc) --- 38,85 ---- */ protected void innerProcess(CrawlURI curi) { ! if (recheckScope) { ! CrawlScope scope = controller.getScope(); ! if (curi.getScopeVersion()==scope.getVersion()) { ! // already checked ! return; ! } ! if(scope.accepts(curi)) { ! curi.setScopeVersion(scope.getVersion()); ! return; ! } ! // scope rejected curi.setFetchStatus(S_OUT_OF_SCOPE); curi.cancelFurtherProcessing(); } + + + // super.innerProcess(curi); + // + // // check for too-deep + // if(maxLinkDepth>=0 && curi.getLinkHopCount()>maxLinkDepth) { + // curi.setFetchStatus(S_TOO_MANY_LINK_HOPS); + // curi.cancelFurtherProcessing(); + // return; + // } + // if(maxEmbedDepth>=0 && curi.getEmbedHopCount()>maxEmbedDepth) { + // curi.setFetchStatus(S_TOO_MANY_EMBED_HOPS); + // curi.cancelFurtherProcessing(); + // return; + // } } + + // /* (non-Javadoc) + // * @see org.archive.crawler.framework.Processor#innerRejectProcess(org.archive.crawler.datamodel.CrawlURI) + // */ + // protected void innerRejectProcess(CrawlURI curi) { + // super.innerRejectProcess(curi); + // // filter-rejection means out-of-scope for everything but embeds + // if (curi.getEmbedHopCount() < 1) { + // curi.setFetchStatus(S_OUT_OF_SCOPE); + // curi.cancelFurtherProcessing(); + // } else { + // // never mind; scope filters don't apply + // } + // } /* (non-Javadoc) *************** *** 67,72 **** public void initialize(CrawlController c) { super.initialize(c); ! maxLinkDepth = getIntAt(XP_MAX_LINK_DEPTH, maxLinkDepth); ! maxEmbedDepth = getIntAt(XP_MAX_EMBED_DEPTH, maxEmbedDepth); } --- 88,96 ---- public void initialize(CrawlController c) { super.initialize(c); ! recheckScope = getBooleanAt("@scope",false); ! ! ! //maxLinkDepth = getIntAt(XP_MAX_LINK_DEPTH, maxLinkDepth); ! //maxEmbedDepth = getIntAt(XP_MAX_EMBED_DEPTH, maxEmbedDepth); } |
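SimpleFrontier.next() now carries the real scheduling logic: wake snoozed queues, prefer ready per-class queues, fall back to the pending queue (skipping already-included URIs and parking same-class URIs behind the one in process), and otherwise wait out the shorter of the caller's timeout and the earliest wake time before returning null. (As the diff stands, next() calls wait() without being synchronized, which would throw IllegalMonitorStateException; presumably a follow-up fix.) From the caller's side the contract looks like this hypothetical worker loop; only the URIFrontier method signatures come from the diff:

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.URIFrontier;

public class FrontierLoopSketch {
    static void drive(URIFrontier frontier) {
        while (true) {
            CrawlURI curi = frontier.next(1000); // block up to ~1s for work
            if (curi == null) {
                if (frontier.isEmpty()) {
                    break;   // nothing left to crawl
                }
                continue;    // URIs exist but are snoozed or held; try again
            }
            process(curi);   // fetch, extract, etc. (hypothetical)
        }
    }

    static void process(CrawlURI curi) { /* processing chain goes here */ }
}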
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv20111/src/org/archive/crawler/datamodel

Modified Files:
      Tag: gjm-refactor
	CandidateURI.java CrawlOrder.java CrawlURI.java
Removed Files:
      Tag: gjm-refactor
	CrawlerBehavior.java
Log Message:
big reorg in progress

Index: CandidateURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CandidateURI.java,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** CandidateURI.java	2 Oct 2003 01:53:51 -0000	1.1.2.1
--- CandidateURI.java	3 Oct 2003 01:54:36 -0000	1.1.2.2
***************
*** 32,36 ****
  	String pathFromSeed;
  	/** Where this URI was (presently) discovered */
! 	UURI precursorUuri;
--- 32,38 ----
  	String pathFromSeed;
  	/** Where this URI was (presently) discovered */
! 	// mostly for debugging; will be a CrawlURI when memory is no object
! 	// just a string or null when memory is an object (configurable)
! 	Object via;
***************
*** 43,51 ****
  	/**
  	 * @param b
  	 */
! 	public void setSeed(boolean b) {
  		isSeed=b;
  	}
  }
--- 45,99 ----
  	/**
+ 	 * @param uriString
+ 	 */
+ 	public CandidateURI(String s){
+ 		try{
+ 			setUuri(UURI.createUURI(s));
+ 		}catch(Exception e){
+ 			setUuri(null);
+ 		}
+ 	}
+ 
+ 	/**
  	 * @param b
  	 */
! 	public void setIsSeed(boolean b) {
  		isSeed=b;
  	}
+ 
+ 	/**
+ 	 * 
+ 	 */
+ 	public UURI getUuri() {
+ 		return uuri;
+ 	}
+ 
+ 	/**
+ 	 * @param u
+ 	 */
+ 	private void setUuri(UURI u) {
+ 		uuri=u;
+ 	}
+ 
+ 	/**
+ 	 * @return
+ 	 */
+ 	public boolean getIsSeed() {
+ 		return isSeed;
+ 	}
+ 
+ 	/**
+ 	 * 
+ 	 */
+ 	public int getScopeVersion() {
+ 		return inScopeVersion;
+ 	}
+ 
+ 	/**
+ 	 * @param i
+ 	 */
+ 	public void setScopeVersion(int i) {
+ 		inScopeVersion = i;
+ 	}
  }

Index: CrawlOrder.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlOrder.java,v
retrieving revision 1.12.2.1
retrieving revision 1.12.2.2
diff -C2 -d -r1.12.2.1 -r1.12.2.2
*** CrawlOrder.java	2 Oct 2003 01:53:51 -0000	1.12.2.1
--- CrawlOrder.java	3 Oct 2003 01:54:36 -0000	1.12.2.2
***************
*** 10,16 ****
  import java.io.IOException;
  
- import javax.xml.transform.TransformerException;
- 
- import org.apache.xpath.XPathAPI;
  import org.archive.crawler.framework.XMLConfig;
  import org.w3c.dom.Document;
--- 10,13 ----
***************
*** 19,27 ****
   */
  public class CrawlOrder extends XMLConfig {
! 	protected String name;
! 	protected CrawlerBehavior behavior;
! 	protected String outputLocation;
! 	public String crawlOrderFilename;
! 	//protected CrawlOrder parentConfigurationFile;
  
  	/**
--- 16,26 ----
   */
  public class CrawlOrder extends XMLConfig {
! 	private static final String XP_HTTP_USER_AGENT = "//http-headers/@User-Agent";
! 	private static final String XP_HTTP_FROM = "//http-headers/@From";
! 	private static final String XP_MAX_TOE_THREADS = "//behavior/@max-toe-threads";
! 	String caseFlattenedUserAgent;
! 	String name;
! 	String outputLocation;
! 	String crawlOrderFilename;
  
  	/**
***************
*** 87,109 ****
  	 */
  	public void initialize(){
! 		try {
! 			name = getStringAt("//crawl-order/@name");
! 
! 			// ignore null pointers here, it may just mean this file inherited from
! 			// another and we can find the behavior there.
! 			try {
! 				behavior =
! 					new CrawlerBehavior(
! 						XPathAPI.selectSingleNode(xNode, "//crawler-behavior"));
! 				behavior.setDefaultFileLocation(this.defaultFilePath);
! 				behavior.setParentConfig(this.parentConfigurationFile);
! 			} catch (NullPointerException e) {
! 			}
! 
! 			//outputLocation = getStringAt("//disk/@path");
! 
! 		} catch (TransformerException e) {
! 			e.printStackTrace();
! 		}
  	}
--- 86,90 ----
  	 */
  	public void initialize(){
! 		name = getStringAt("//crawl-order/@name");
  	}
***************
*** 136,150 ****
  
  	/**
- 	 * 
- 	 */
- 	public CrawlerBehavior getBehavior() {
- 		// if this node doesn't have it but we have a parent conf file check that
- 		if(behavior == null && parentConfigurationFile != null){
- 			return ((CrawlOrder)parentConfigurationFile).getBehavior();
- 		}
- 		return behavior;
- 	}
- 
- 	/**
  	 * @return
  	 */
--- 117,120 ----
***************
*** 162,165 ****
--- 132,166 ----
  		}
  		return outputLocation;
+ 	}
+ 
+ 	/**
+ 	 * @return
+ 	 */
+ 	public String getUserAgent() {
+ 		if (caseFlattenedUserAgent==null) {
+ 			caseFlattenedUserAgent = getStringAt(XP_HTTP_USER_AGENT).toLowerCase();
+ 		}
+ 		return caseFlattenedUserAgent;
+ 	}
+ 
+ 	/**
+ 	 * @return
+ 	 */
+ 	public String getFrom() {
+ 		return getStringAt(XP_HTTP_FROM);
+ 	}
+ 
+ 	/**
+ 	 * @return
+ 	 */
+ 	public int getMaxToes() {
+ 		return getIntAt(XP_MAX_TOE_THREADS);
+ 	}
+ 
+ 	/**
+ 	 * 
+ 	 */
+ 	public String getCrawlOrderFilename() {
+ 		return crawlOrderFilename;
  	}
  }

Index: CrawlURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlURI.java,v
retrieving revision 1.40
retrieving revision 1.40.2.1
diff -C2 -d -r1.40 -r1.40.2.1
*** CrawlURI.java	24 Sep 2003 01:45:26 -0000	1.40
--- CrawlURI.java	3 Oct 2003 01:54:36 -0000	1.40.2.1
***************
*** 31,39 ****
   * @author Gordon Mohr
   */
! public class CrawlURI implements URIStoreable, CoreAttributeConstants, FetchStatusCodes {
! 	// core identity: the "usable URI" to be crawled
! 	private UURI uuri;
! 
  	// Scheduler lifecycle info
  	private Object state;   // state within scheduling/store/selector
--- 31,43 ----
   * @author Gordon Mohr
   */
! public class CrawlURI extends CandidateURI implements URIStoreable, CoreAttributeConstants, FetchStatusCodes {
! 	// INHERITED FROM CANDIDATEURI
! 	// uuri: core identity: the "usable URI" to be crawled
! 	// isSeed
! 	// inScopeVersion
! 	// pathFromSeed
! 	// via
! 
  	// Scheduler lifecycle info
  	private Object state;   // state within scheduling/store/selector
***************
*** 52,56 ****
  	// dynamic context
- 	private CrawlURI via;   // curi that led to this (lowest hops from seed)
  	private int linkHopCount = -1;  // from seeds
  	private int embedHopCount = -1; // from a sure link; reset upon any link traversal
--- 56,59 ----
***************
*** 59,82 ****
  	CrawlServer server;
  
- 	private int contentSize = -1;
- 
  	/**
  	 * @param uuri
  	 */
! 	public CrawlURI(UURI u) {
! 		setUuri(u);
  	}
! 
! 	/**
! 	 * @param u
  	 */
! 	private void setUuri(UURI u) {
! 		uuri=u;
  	}
  
- 
  	/**
  	 * Set the time this curi is considered expired (and thus must be refetched)
--- 62,82 ----
  	CrawlServer server;
  	private int contentSize = -1;
  	/**
  	 * @param uuri
  	 */
! 	public CrawlURI(UURI uuri) {
! 		super(uuri);
  	}
! 
! 	/**
! 	 * @param caUri
  	 */
! 	public CrawlURI(CandidateURI caUri) {
! 		super(caUri.getUuri());
! 		setIsSeed(caUri.getIsSeed());
  	}
  
  	/**
  	 * Set the time this curi is considered expired (and thus must be refetched)
***************
*** 124,139 ****
  		return fetchAttempts++;
  	}
! 
! 	/**
! 	 * @param uriString
! 	 */
! 	public CrawlURI(String s){
! 		try{
! 			setUuri(UURI.createUURI(s));
! 		}catch(Exception e){
! 			setUuri(null);
! 		}
! 	}
! 
  	/**
  	 * @return
--- 124,128 ----
  		return fetchAttempts++;
  	}
! 
  	/**
  	 * @return
***************
*** 242,259 ****
  
  	/**
- 	 * @param object
- 	 */
- 	public void setDelayFactor(int f) {
- 		alist.putInt(A_DELAY_FACTOR,f);
- 	}
- 
- 	/**
- 	 * @param object
- 	 */
- 	public void setMinimumDelay(int m) {
- 		alist.putInt(A_MINIMUM_DELAY,m);
- 	}
- 
- 	/**
  	 * 
  	 */
--- 231,234 ----
***************
*** 454,457 ****
  		embedHopCount = 0;
  	}
- 
  }
--- 429,431 ----

--- CrawlerBehavior.java DELETED ---
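Two small patterns in this commit are worth calling out: CrawlURI now extends CandidateURI and gains a copy-style constructor that promotes a candidate once it is actually due to be fetched, and CrawlOrder lazily caches its case-flattened User-Agent instead of delegating to the deleted CrawlerBehavior. A rough, self-contained sketch of both follows; the types are simplified stand-ins and a map fakes the XPath lookup, so none of this is the project's real API.

import java.util.HashMap;
import java.util.Map;

public class DatamodelSketch {

	/** Stand-in for CandidateURI: identity plus discovery bookkeeping. */
	static class Candidate {
		final String uuri;
		boolean isSeed;
		Candidate(String uuri) { this.uuri = uuri; }
	}

	/** Stand-in for CrawlURI: promotes a Candidate when it is scheduled to fetch. */
	static class CrawlUri extends Candidate {
		int fetchAttempts;
		CrawlUri(Candidate c) {
			super(c.uuri);
			this.isSeed = c.isSeed; // carry over candidate state, like the real copy ctor
		}
	}

	/** Stand-in for CrawlOrder: lazily caches the lower-cased User-Agent. */
	static class Order {
		private final Map<String, String> attrs = new HashMap<>(); // fake XPath store
		private String caseFlattenedUserAgent;

		Order() { attrs.put("//http-headers/@User-Agent", "Heritrix-AOC/alpha"); }

		String getUserAgent() {
			if (caseFlattenedUserAgent == null) {          // compute once...
				caseFlattenedUserAgent =
					attrs.get("//http-headers/@User-Agent").toLowerCase();
			}
			return caseFlattenedUserAgent;                 // ...reuse thereafter
		}
	}

	public static void main(String[] args) {
		Candidate c = new Candidate("http://example.org/");
		c.isSeed = true;
		CrawlUri curi = new CrawlUri(c);
		System.out.println(curi.isSeed);                 // true: state survived promotion
		System.out.println(new Order().getUserAgent()); // "heritrix-aoc/alpha"
	}
}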
From: <go...@us...> - 2003-10-03 01:54:47
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler
In directory sc8-pr-cvs1:/tmp/cvs-serv20111/src/org/archive/crawler

Modified Files:
      Tag: gjm-refactor
	Heritrix.java
Log Message:
big reorg in progress

Index: Heritrix.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/Heritrix.java,v
retrieving revision 1.9
retrieving revision 1.9.2.1
diff -C2 -d -r1.9 -r1.9.2.1
*** Heritrix.java	1 Aug 2003 22:41:53 -0000	1.9
--- Heritrix.java	3 Oct 2003 01:54:36 -0000	1.9.2.1
***************
*** 17,26 ****
   * Main class for Heritrix crawler.
   * 
!  * Currently takes a single command-line argument, which
   * should be an XML crawl-order file describing the crawl to
   * undertake, and begins that crawl.
   * 
!  * (Eventually, will start web UI and await further
!  * instructions.)
   * 
   * @author gojomo
--- 17,26 ----
   * Main class for Heritrix crawler.
   * 
!  * Initially takes a single command-line argument, which
   * should be an XML crawl-order file describing the crawl to
   * undertake, and begins that crawl.
   * 
!  * Alternatively, can start a web UI and await further
!  * instructions.
   * 
   * @author gojomo
From: <go...@us...> - 2003-10-03 01:54:47
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/admin
In directory sc8-pr-cvs1:/tmp/cvs-serv20111/src/org/archive/crawler/admin

Modified Files:
      Tag: gjm-refactor
	CrawlerHandler.java
Log Message:
big reorg in progress

Index: CrawlerHandler.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/admin/CrawlerHandler.java,v
retrieving revision 1.7
retrieving revision 1.7.2.1
diff -C2 -d -r1.7 -r1.7.2.1
*** CrawlerHandler.java	1 Aug 2003 18:12:12 -0000	1.7
--- CrawlerHandler.java	3 Oct 2003 01:54:37 -0000	1.7.2.1
***************
*** 203,222 ****
  				"//http-headers/User-Agent",
  				"USER AGENT",
! 				o.getBehavior().getUserAgent()));
  		sb.append(
  			genHtmlTextField(
  				"//http-headers/From",
  				" FROM",
! 				o.getBehavior().getFrom()));
  		sb.append(
  			genHtmlTextField(
  				"//limits/max-toe-threads/@value",
  				"MAX NUMBER OF THREADS",
! 				String.valueOf(o.getBehavior().getMaxToes())));
! 		sb.append(
! 			genHtmlTextField(
! 				"//limits/max-link-depth/@value",
! 				"MAX LINK DEPTH",
! 				String.valueOf(o.getBehavior().getMaxLinkDepth())));
  		sb.append(
  			genHtmlTextField(
--- 203,222 ----
  				"//http-headers/User-Agent",
  				"USER AGENT",
! 				o.getUserAgent()));
  		sb.append(
  			genHtmlTextField(
  				"//http-headers/From",
  				" FROM",
! 				o.getFrom()));
  		sb.append(
  			genHtmlTextField(
  				"//limits/max-toe-threads/@value",
  				"MAX NUMBER OF THREADS",
! 				String.valueOf(o.getMaxToes())));
! 		// sb.append(
! 		// 	genHtmlTextField(
! 		// 		"//limits/max-link-depth/@value",
! 		// 		"MAX LINK DEPTH",
! 		// 		String.valueOf(o.getMaxLinkDepth())));
  		sb.append(
  			genHtmlTextField(
***************
*** 244,254 ****
  				.getNodeValue()));
  
! 		sb.append(
! 			genHtmlTextField(
! 				"//selector/seeds/@src",
! 				"SEEDS FILE",
! 				o.getBehavior().getStringAt("//selector/seeds/@src")));
! 
! 		sb.append(genHtmlTextArea("seed-urls", "", o.getBehavior().getSeeds()));
  		sb.append(
  			"<INPUT type=hidden name=CrawlerAction value=2>\n<br><INPUT TYPE=submit VALUE=\"UpdateOrder\">\n</FORM>");
--- 244,254 ----
  				.getNodeValue()));
  
! 		// sb.append(
! 		// 	genHtmlTextField(
! 		// 		"//selector/seeds/@src",
! 		// 		"SEEDS FILE",
! 		// 		o.getStringAt("//selector/seeds/@src")));
! 		// 
! 		// sb.append(genHtmlTextArea("seed-urls", "", o.getSeeds()));
  		sb.append(
  			"<INPUT type=hidden name=CrawlerAction value=2>\n<br><INPUT TYPE=submit VALUE=\"UpdateOrder\">\n</FORM>");
***************
*** 298,302 ****
  		String name;
  		_controller.getOrder().clearCaches();
! 		_controller.getOrder().getBehavior().clearCaches();
  
  		while (it.hasNext()) {
--- 298,302 ----
  		String name;
  		_controller.getOrder().clearCaches();
! 		_controller.getOrder().clearCaches();
  
  		while (it.hasNext()) {
***************
*** 305,311 ****
  			if (name.equals("seed-urls")) {
  				String[] urls = value.split("\n");
! 				_controller.getOrder().getBehavior().clearSeeds();
! 				for (int i = 0; i < urls.length; i++)
! 					_controller.getOrder().getBehavior().addSeed(urls[i]);
  			} else {
  				if (_controller.getOrder().getNodeAt(name) != null) {
--- 305,311 ----
  			if (name.equals("seed-urls")) {
  				String[] urls = value.split("\n");
! 				// _controller.getOrder().clearSeeds();
! 				// for (int i = 0; i < urls.length; i++)
! 				// 	_controller.getOrder().getBehavior().addSeed(urls[i]);
  			} else {
  				if (_controller.getOrder().getNodeAt(name) != null) {
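The admin form keys each input by the XPath of the crawl-order node it edits, which is what lets the POST handler above write posted values straight back into the order DOM. The diff never shows genHtmlTextField itself, so the following sketch is only a guess at its shape, inferred from the three-string-argument call sites; the real body may differ.

public class FormFieldSketch {

	/**
	 * Hypothetical reconstruction of a genHtmlTextField-style helper:
	 * the field's name is the XPath of the node it edits, so a submit
	 * handler can evaluate that name to write the value back into the
	 * configuration DOM. Not the project's actual implementation.
	 */
	static String genHtmlTextField(String xpath, String label, String value) {
		return label + ": <INPUT type=text name=\"" + xpath
			+ "\" value=\"" + value + "\"><br>\n";
	}

	public static void main(String[] args) {
		// Mirrors a call site above, with a made-up value for illustration.
		System.out.print(genHtmlTextField(
			"//http-headers/@User-Agent", "USER AGENT", "heritrix-aoc/alpha"));
	}
}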
From: <go...@us...> - 2003-10-03 01:54:47
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util
In directory sc8-pr-cvs1:/tmp/cvs-serv20111/src/org/archive/crawler/util

Added Files:
      Tag: gjm-refactor
	NullFilter.java
Log Message:
big reorg in progress

--- NEW FILE: NullFilter.java ---
/*
 * NullFilter.java
 * Created on Oct 2, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util/Attic/NullFilter.java,v 1.1.2.1 2003/10/03 01:54:35 gojomo Exp $
 */
package org.archive.crawler.util;

import org.archive.crawler.framework.Filter;

/**
 * @author gojomo
 *
 */
public class NullFilter extends Filter {

	/* (non-Javadoc)
	 * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object)
	 */
	protected boolean innerAccepts(Object o) {
		return true;
	}
}
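NullFilter is a classic null object: it satisfies the Filter contract while letting everything through, so a holder can keep a non-null Filter reference and skip null checks wherever no real test has been configured. A self-contained sketch of that payoff follows; the Filter base and the holder class are simplified stand-ins, and the accepts wrapper is an assumption, since only the innerAccepts hook appears in the commit above.

public class NullFilterSketch {

	/** Stand-in for org.archive.crawler.framework.Filter; the accepts
	 *  wrapper is assumed, as only innerAccepts is visible above. */
	static abstract class Filter {
		boolean accepts(Object o) { return innerAccepts(o); }
		protected abstract boolean innerAccepts(Object o);
	}

	/** Mirrors the committed NullFilter: lets everything through. */
	static class NullFilter extends Filter {
		protected boolean innerAccepts(Object o) { return true; }
	}

	/** Hypothetical holder showing the null-object payoff: the reference
	 *  is never null, so callers test URIs unconditionally. */
	static class Component {
		Filter filter = new NullFilter(); // default until one is configured

		boolean wants(Object uri) {
			return filter.accepts(uri);   // safe even with nothing configured
		}
	}

	public static void main(String[] args) {
		System.out.println(new Component().wants("http://example.org/")); // true
	}
}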
From: <go...@us...> - 2003-10-03 01:53:10
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/dev-crawl
In directory sc8-pr-cvs1:/tmp/cvs-serv19699/dev-crawl

Added Files:
      Tag: gjm-refactor
	order.xml logging.properties
Log Message:
working crawl info

--- NEW FILE: order.xml ---
<crawl-order name="dev-crawl"
  comment="A simple crawl for development/example purposes.">

  <scope class="org.archive.crawler.basic.BasicScope"
    max-link-depth="100"
    max-transitive-depth="5">
    <seeds>
      http://www.archive.org/movies/movies.php
      http://dmoz.org
    </seeds>
    <filter name="domain-focus" class="SeedExtensionFilter" mode="domain" />
  </scope>

  <behavior disk-path="disk" max-toe-threads="2">
    <http-headers
      User-Agent="heritrix-aoc/alpha (+http://crawler.archive.org)"
      From="arc...@li..." />
    <frontier class="org.archive.crawler.basic.SimpleFrontier"
      delay-factor="1" minimum-delay="0" />
    <processors>
      <processor name="Preselector" class="org.archive.crawler.basic.SimplePreselector"
        next="Preprocessor" scope="yes" />
      <processor name="Preprocessor" class="org.archive.crawler.basic.SimplePreconditionEnforcer"
        next="DNS" />
      <processor name="DNS" class="org.archive.crawler.basic.FetcherDNS"
        next="HTTP" />
      <processor name="HTTP" class="org.archive.crawler.basic.FetcherHTTPSimple"
        next="ExtractorHTTP" timeout-seconds="10" />
      <processor name="ExtractorHTTP" class="org.archive.crawler.extractor.ExtractorHTTP"
        next="ExtractorHTML" />
      <processor name="ExtractorHTML" class="org.archive.crawler.extractor.ExtractorHTML"
        next="ExtractorDOC" />
      <processor name="ExtractorDOC" class="org.archive.crawler.extractor.ExtractorDOC"
        next="ExtractorSWF" />
      <processor name="ExtractorSWF" class="org.archive.crawler.extractor.ExtractorSWF"
        next="ExtractorPDF" />
      <processor name="ExtractorPDF" class="org.archive.crawler.extractor.ExtractorPDF"
        next="Archiver" />
      <processor name="Archiver" class="org.archive.crawler.basic.ARCWriter"
        next="Updater" compress="yes" max-arc-size="20000000">
        <!-- <filter name="http-only" class="org.archive.crawler.util.URIRegExpFilter"
          regexp="^http://.*" /> -->
      </processor>
      <processor name="Updater" class="org.archive.crawler.basic.CrawlStateUpdater"
        next="Postselector" />
      <processor name="Postselector" class="org.archive.crawler.basic.SimplePostselector" />
    </processors>
    <loggers>
      <crawl-statistics interval="10" />
    </loggers>
  </behavior>
</crawl-order>

--- NEW FILE: logging.properties ---
#
handlers= java.util.logging.ConsoleHandler
java.util.logging.ConsoleHandler.level= ALL

# Default global logging level.
.level= WARNING

# view selector progress
# org.archive.crawler.basic.SimpleSelector.level= FINEST

# view processor filterings
# org.archive.crawler.framework.Processor.level= INFO

# crawl.level= INFO
runtime-errors.level= INFO
uri-errors.level= INFO
progress-statistics.level= INFO

#
# org.archive.crawler.basic.ExtractorHTTP.level= FINEST
org.archive.crawler.basic.SimplePreconditionEnforcer.level= FINEST
org.apache.commons.httpclient.level= SEVERE
org.archive.crawler.basic.FetcherHTTPSimple.level= SEVERE
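The order file wires the processor chain by name (each processor's next attribute) and leaves processor-specific attributes, such as the Preselector's scope="yes", to be read by the processor itself at initialize time (recheckScope = getBooleanAt("@scope",false) in the SimplePreselector diff above). Below is a self-contained sketch of reading that attribute with plain JDK XPath; the relative path to order.xml and the yes/true convention are assumptions for illustration, not the project's XMLConfig helper.

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;

public class OrderAttrSketch {
	public static void main(String[] args) throws Exception {
		// Parse the crawl order (path is an assumption for illustration).
		Document doc = DocumentBuilderFactory.newInstance()
			.newDocumentBuilder()
			.parse("dev-crawl/order.xml");
		XPath xp = XPathFactory.newInstance().newXPath();

		// Locate the Preselector element and read its scope attribute;
		// evaluate returns "" when the attribute is absent, so the default
		// is false -- roughly what getBooleanAt("@scope", false) would do
		// relative to that node.
		String scope = xp.evaluate(
			"//processor[@name='Preselector']/@scope", doc);
		boolean recheckScope = scope.equalsIgnoreCase("yes")
			|| scope.equalsIgnoreCase("true");
		System.out.println("recheckScope=" + recheckScope); // true for scope="yes"
	}
}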
From: <go...@us...> - 2003-10-03 01:53:02
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/dev-crawl
In directory sc8-pr-cvs1:/tmp/cvs-serv19659/dev-crawl

Log Message:
Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/dev-crawl added to the repository
--> Using per-directory sticky tag `gjm-refactor'
From: <go...@us...> - 2003-10-02 18:35:03
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler
In directory sc8-pr-cvs1:/tmp/cvs-serv26394

Modified Files:
	.classpath
Log Message:
classpath, httpclient library cleanup

Index: .classpath
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/.classpath,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** .classpath	17 Jun 2003 19:58:28 -0000	1.7
--- .classpath	2 Oct 2003 18:34:58 -0000	1.8
***************
*** 2,11 ****
  <classpath>
      <classpathentry kind="src" path="src"/>
      <classpathentry kind="lib" path="lib/binaries/commons-logging.jar"/>
      <classpathentry kind="lib" path="lib/binaries/dnsjava.jar"/>
      <classpathentry kind="lib" path="lib/binaries/stataclasses.jar"/>
      <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
-     <classpathentry kind="lib" path="lib/binaries/commons-httpclient.jar"/>
      <classpathentry kind="lib" path="lib/binaries/junit.jar"/>
      <classpathentry kind="output" path="bin"/>
  </classpath>
--- 2,16 ----
  <classpath>
      <classpathentry kind="src" path="src"/>
+     <classpathentry kind="src" path="oversrc"/>
      <classpathentry kind="lib" path="lib/binaries/commons-logging.jar"/>
      <classpathentry kind="lib" path="lib/binaries/dnsjava.jar"/>
      <classpathentry kind="lib" path="lib/binaries/stataclasses.jar"/>
      <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
      <classpathentry kind="lib" path="lib/binaries/junit.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/itext.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/org.mortbay.jetty.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/poi.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/javaswf.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/httpclient-cvs-20031002.jar"/>
      <classpathentry kind="output" path="bin"/>
  </classpath>
From: <go...@us...> - 2003-10-02 18:35:03
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/lib/binaries
In directory sc8-pr-cvs1:/tmp/cvs-serv26394/lib/binaries

Added Files:
	httpclient-cvs-20031002.jar
Removed Files:
	commons-httpclient.jar
Log Message:
classpath, httpclient library cleanup

--- NEW FILE: httpclient-cvs-20031002.jar ---
(This appears to be a binary file; contents omitted.)

--- commons-httpclient.jar DELETED ---