Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv7488/src/org/archive/crawler/basic

Modified Files:
      Tag: gjm-refactor
	SimpleSelector.java CrawlStateUpdater.java SimpleScheduler.java ARCWriter.java
Added Files:
      Tag: gjm-refactor
	Scope.java PreconditionEnforcer.java Preselector.java Postselector.java Frontier.java
Removed Files:
      Tag: gjm-refactor
	SimpleFrontier.java FetcherDNS.java SimplePreconditionEnforcer.java SimplePostselector.java
	SimplePreselector.java FetcherHTTPSimple.java BasicScope.java StatisticsTracker.java
Log Message:
renaming, repackaging, streamlining

--- NEW FILE: Scope.java ---

/*
 * BasicScope.java
 * Created on Oct 1, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/Scope.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.basic;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.filter.HopsFilter;
import org.archive.crawler.filter.SeedExtensionFilter;
import org.archive.crawler.filter.TransclusionFilter;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Filter;

/**
 * A core CrawlScope suitable for the most common crawl needs.
 *
 * Roughly, its logic is that a URI is included if:
 *
 *    (( isSeed(uri) || focusFilter.accepts(uri) )
 *      || transitiveFilter.accepts(uri) )
 *     && ! excludeFilter.accepts(uri)
 *
 * The focusFilter may be specified by either:
 *  - adding a 'mode' attribute to the <scope> element, in which case a
 *    SeedExtensionFilter will be used, with the <scope> element
 *    providing its configuration
 *  - adding a <focus> subelement
 * If unspecified, the focusFilter defaults to an accepts-all filter.
 *
 * The transitiveFilter may be specified by supplying a <transitive>
 * subelement. If unspecified, a TransclusionFilter will be used, with
 * the <scope> element providing its configuration.
 *
 * The excludeFilter may be specified by supplying an <exclude>
 * subelement. If unspecified, an accepts-none filter will be used,
 * meaning that no URIs will pass the filter, and thus none will be
 * excluded.
 *
 * @author gojomo
 */
public class Scope extends CrawlScope {
    Filter focusFilter;
    Filter transitiveFilter;
    Filter excludeFilter;

    public void initialize(CrawlController controller) {
        super.initialize(controller);
        // set up focusFilter
        if (getNodeAt("@mode") != null) {
            // SeedExtensionFilter implied
            focusFilter = new SeedExtensionFilter();
            focusFilter.setNode(xNode);
        } else {
            focusFilter = (Filter) instantiate("focus");
        }
        if (focusFilter != null) {
            focusFilter.initialize(controller);
            // only set up transitiveFilter if focusFilter set
            transitiveFilter = (Filter) instantiate("transitive");
            if (transitiveFilter == null) {
                transitiveFilter = new TransclusionFilter();
                transitiveFilter.setNode(xNode);
                transitiveFilter.initialize(controller);
            }
        }
        // set up exclude filter
        if (getNodeAt("@max-link-hops") != null) {
            // HopsFilter implied
            excludeFilter = new HopsFilter();
            excludeFilter.setNode(xNode);
        } else {
            excludeFilter = (Filter) instantiate("exclude");
        }
    }

    protected boolean innerAccepts(Object o) {
        return ((isSeed(o) || focusAccepts(o)) || transitiveAccepts(o))
                && !excludeAccepts(o);
    }

    private boolean excludeAccepts(Object o) {
        if (excludeFilter == null) {
            return false;
        }
        return excludeFilter.accepts(o);
    }

    private boolean transitiveAccepts(Object o) {
        if (transitiveFilter == null) {
            return true;
        }
        return transitiveFilter.accepts(o);
    }

    private boolean focusAccepts(Object o) {
        if (focusFilter == null) {
            return true;
        }
        return focusFilter.accepts(o);
    }

    private boolean isSeed(Object o) {
        return o instanceof CandidateURI && ((CandidateURI) o).getIsSeed();
    }
}

--- NEW FILE: PreconditionEnforcer.java ---

/*
 * SimplePolitenessEnforcer.java
 * Created on May 22, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/PreconditionEnforcer.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.basic;

import java.util.logging.Logger;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.framework.Processor;

/**
 * Ensures the preconditions for a fetch -- such as a DNS lookup or
 * acquiring a robots.txt policy -- are satisfied before a URI is
 * passed to subsequent stages.
 *
 * @author gojomo
 */
public class PreconditionEnforcer extends Processor implements FetchStatusCodes {
    private static String XP_DELAY_FACTOR = "params/@delay-factor";
    private static String XP_MINIMUM_DELAY = "params/@minimum-delay";
    private static int DEFAULT_DELAY_FACTOR = 10;
    private static int DEFAULT_MINIMUM_DELAY = 2000;
    private static Logger logger =
        Logger.getLogger("org.archive.crawler.basic.SimplePolitenessEnforcer");

    protected void innerProcess(CrawlURI curi) {
        if (considerDnsPreconditions(curi)) {
            return;
        }
        // make sure we only process schemes we understand (i.e. not dns)
        if (!curi.getUURI().getUri().getScheme().equals("http")) {
            logger.fine("PolitenessEnforcer doesn't understand uri's of type "
                + curi.getUURI().getUri().getScheme() + " (ignoring)");
            return;
        }
        if (considerRobotsPreconditions(curi)) {
            return;
        }
        // OK, it's allowed
        // for all curis that will in fact be fetched, set appropriate delays
        // TODOSOMEDAY: allow per-host, per-protocol, etc. factors
        // curi.setDelayFactor(getDelayFactorFor(curi));
        // curi.setMinimumDelay(getMinimumDelayFor(curi));
        return;
    }

    private boolean considerRobotsPreconditions(CrawlURI curi) {
        // treat /robots.txt fetches specially
        if (curi.getUURI().getUri().getPath().equals("/robots.txt")) {
            // allow processing to continue
            return false;
        }
        // require /robots.txt if not present
        if (curi.getServer().getRobotsExpires() < 0 // "cheap" test of default
                || curi.getServer().getRobotsExpires() < System.currentTimeMillis()) {
            logger.fine("No valid robots for " + curi.getServer() + "; deferring " + curi);
            curi.setPrerequisiteUri("/robots.txt");
            curi.incrementDeferrals();
            curi.skipToProcessor(controller.getPostprocessor());
            return true;
        }
        // test against robots.txt if available
        String ua = controller.getOrder().getUserAgent();
        if (curi.getServer().getRobots().disallows(curi.getUURI().getUri().getPath(), ua)) {
            // don't fetch
            curi.skipToProcessor(controller.getPostprocessor()); // turn off later stages
            curi.setFetchStatus(S_ROBOTS_PRECLUDED);
            curi.getAList().putString("error", "robots.txt exclusion");
            logger.fine("robots.txt precluded " + curi);
            return true;
        }
        return false;
    }

    /**
     * @param curi
     * @return true if no further processing in this module should occur
     */
    private boolean considerDnsPreconditions(CrawlURI curi) {
        if (curi.getServer() == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            curi.skipToProcessor(controller.getPostprocessor());
            return true;
        }
        // if we haven't done a dns lookup and this isn't a dns uri,
        // shoot that off and defer further processing
        if (!curi.getServer().getHost().hasBeenLookedUp()
                && !curi.getUURI().getUri().getScheme().equals("dns")) {
            logger.fine("deferring processing of " + curi.toString() + " for dns lookup.");
            String hostname = curi.getServer().getHostname();
            curi.setPrerequisiteUri("dns:" + hostname);
            curi.incrementDeferrals();
            curi.skipToProcessor(controller.getPostprocessor());
            return true;
        }
        // if we've done a dns lookup and it didn't resolve a host,
        // cancel all processing of this URI
        if (curi.getServer().getHost().hasBeenLookedUp()
                && curi.getServer().getHost().getIP() == null) {
            logger.fine("no dns for " + curi.getServer().toString()
                + " cancelling processing for " + curi.toString());
            // TODO: currently we're using FetchAttempts to denote both fetch
            // attempts and the choice to not attempt (here). Eventually these
            // will probably have to be treated separately, to allow us to
            // treat dns failures and connection failures (downed hosts,
            // route failures, etc.) separately.
            curi.setFetchStatus(S_DOMAIN_UNRESOLVABLE);
            curi.incrementFetchAttempts();
            curi.skipToProcessor(controller.getPostprocessor());
            return true;
        }
        return false;
    }

    private int getMinimumDelayFor(CrawlURI curi) {
        return getIntAt(XP_MINIMUM_DELAY, DEFAULT_MINIMUM_DELAY);
    }

    private int getDelayFactorFor(CrawlURI curi) {
        return getIntAt(XP_DELAY_FACTOR, DEFAULT_DELAY_FACTOR);
    }
}

--- NEW FILE: Preselector.java ---

/*
 * SimplePreselector.java
 * Created on Sep 22, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/Preselector.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.basic;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlScope;
import org.archive.crawler.framework.Processor;

/**
 * Gives a yes/no on whether a CrawlURI should be processed at all.
 *
 * Usually, failing a processor filter causes that processor to be
 * skipped. Failing this processor's filter means a CrawlURI will be
 * marked OUT_OF_SCOPE.
 *
 * @author gojomo
 */
public class Preselector extends Processor implements FetchStatusCodes {
    private boolean recheckScope;
    private static String XP_RECHECK_SCOPE = "@scope";
//  private static String XP_MAX_LINK_DEPTH = "params/@max-link-depth";
//  private static String XP_MAX_EMBED_DEPTH = "params/@max-embed-depth";
//  private int maxLinkDepth = -1;
//  private int maxEmbedDepth = -1;

    protected void innerProcess(CrawlURI curi) {
        if (recheckScope) {
            CrawlScope scope = controller.getScope();
            if (curi.getScopeVersion() == scope.getVersion()) {
                // already checked
                return;
            }
            if (scope.accepts(curi)) {
                curi.setScopeVersion(scope.getVersion());
                return;
            }
            // scope rejected
            curi.setFetchStatus(S_OUT_OF_SCOPE);
            curi.skipToProcessor(controller.getPostprocessor());
        }
//      super.innerProcess(curi);
//
//      // check for too-deep
//      if (maxLinkDepth >= 0 && curi.getLinkHopCount() > maxLinkDepth) {
//          curi.setFetchStatus(S_TOO_MANY_LINK_HOPS);
//          curi.cancelFurtherProcessing();
//          return;
//      }
//      if (maxEmbedDepth >= 0 && curi.getEmbedHopCount() > maxEmbedDepth) {
//          curi.setFetchStatus(S_TOO_MANY_EMBED_HOPS);
//          curi.cancelFurtherProcessing();
//          return;
//      }
    }

//  protected void innerRejectProcess(CrawlURI curi) {
//      super.innerRejectProcess(curi);
//      // filter-rejection means out-of-scope for everything but embeds
//      if (curi.getEmbedHopCount() < 1) {
//          curi.setFetchStatus(S_OUT_OF_SCOPE);
//          curi.cancelFurtherProcessing();
//      } else {
//          // never mind; scope filters don't apply
//      }
//  }

    public void initialize(CrawlController c) {
        super.initialize(c);
        recheckScope = getBooleanAt("@scope", false);
        // maxLinkDepth = getIntAt(XP_MAX_LINK_DEPTH, maxLinkDepth);
        // maxEmbedDepth = getIntAt(XP_MAX_EMBED_DEPTH, maxEmbedDepth);
    }
}

--- NEW FILE: Postselector.java ---

/*
 * SimplePostselector.java
 * Created on Oct 2, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/Postselector.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.basic;

import java.net.URISyntaxException;
import java.util.logging.Level;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.UURI;
import org.archive.crawler.framework.Processor;

/**
 * Determines which links etc. get fed back into the Frontier,
 * and if/when failures get retried, etc.
 *
 * @author gojomo
 */
public class Postselector extends Processor
        implements CoreAttributeConstants, FetchStatusCodes {
    // limits on retries TODO: separate into retryPolicy?
    private int maxDeferrals = 10; // should be at least max-retries plus 3 or so

    protected void innerProcess(CrawlURI curi) {
        // handle any prerequisites
        if (curi.getAList().containsKey(A_PREREQUISITE_URI)) {
            handlePrerequisites(curi);
            return;
        }
    }

    protected void handlePrerequisites(CrawlURI curi) {
        try {
            UURI prereq = UURI.createUURI(curi.getPrerequisiteUri(),
                curi.getUURI().getUri());
            CandidateURI caUri = new CandidateURI(prereq);
            caUri.setVia(curi);
            caUri.setPathFromSeed(curi.getPathFromSeed() + "P");
            if (curi.getDeferrals() > maxDeferrals) {
                // too many deferrals, equals failure
                curi.setFetchStatus(S_PREREQUISITE_FAILURE);
                // failureDisposition(curi);
                return;
            }
            if (!scheduleHigh(caUri)) {
                // prerequisite cannot be scheduled (perhaps excluded by
                // scope); must give up on curi
                curi.setFetchStatus(S_PREREQUISITE_FAILURE);
                // failureDisposition(curi);
                return;
            }
        } catch (URISyntaxException ex) {
            Object[] array = { curi, curi.getPrerequisiteUri() };
            controller.uriErrors.log(Level.INFO, ex.getMessage(), array);
        }
    }

    private boolean scheduleHigh(CandidateURI caUri) {
        if (controller.getScope().accepts(caUri)) {
            controller.getFrontier().scheduleHigh(caUri);
            return true;
        }
        return false;
    }
}

--- NEW FILE: Frontier.java ---

/*
 * SimpleFrontier.java
 * Created on Oct 1, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/Frontier.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.basic;

import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CoreAttributeConstants;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.MemUURISet;
import org.archive.crawler.datamodel.UURI;
import org.archive.crawler.datamodel.UURISet;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.URIFrontier;
import org.archive.crawler.framework.XMLConfig;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;

/**
 * A basic in-memory, mostly breadth-first frontier, which refrains
 * from emitting more than one CrawlURI of the same 'key' (host) at
 * once, and respects minimum-delay and delay-factor specifications
 * for politeness.
 *
 * @author gojomo
 */
public class Frontier extends XMLConfig
        implements URIFrontier, FetchStatusCodes, CoreAttributeConstants {
    private static String XP_DELAY_FACTOR = "@delay-factor";
    private static String XP_MINIMUM_DELAY = "@minimum-delay";
    private static int DEFAULT_DELAY_FACTOR = 5;
    private static int DEFAULT_MINIMUM_DELAY = 2000;
    private static Logger logger =
        Logger.getLogger("org.archive.crawler.basic.SimpleFrontier");

    CrawlController controller;

    // HashMap allCuris = new HashMap(); // of UURI -> CrawlURI

    // TODO: update to use fingerprints only
    UURISet alreadyIncluded = new MemUURISet();

    // every CandidateURI not yet in process or another queue;
    // all seeds start here; may contain duplicates
    LinkedList pendingQueue = new LinkedList(); // of CandidateURIs

    // every CandidateURI not yet in process or another queue;
    // all seeds start here; may contain duplicates
    LinkedList pendingHighQueue = new LinkedList(); // of CandidateURIs

    // every CrawlURI handed out for processing but not yet returned
    HashMap inProcessMap = new HashMap(); // of String (classKey) -> CrawlURI

    // all active per-class queues
    HashMap allClassQueuesMap = new HashMap(); // of String (classKey) -> KeyedQueue

    // all per-class queues whose first item may be handed out (that is,
    // no CrawlURI of the same class is currently in-process)
    LinkedList readyClassQueues = new LinkedList(); // of String (queueKey) -> KeyedQueue

    // all per-class queues that are on hold because a CrawlURI of their
    // class is already in process
    LinkedList heldClassQueues = new LinkedList(); // of String (queueKey) -> KeyedQueue

    // all per-class queues that are on hold until a certain time
    SortedSet snoozeQueues = new TreeSet(new SchedulingComparator()); // of KeyedQueue, sorted by wakeTime

    // CrawlURIs held until some specific other CrawlURI is emitted
    HashMap heldCuris = new HashMap(); // of UURI -> CrawlURI

    // limits on retries TODO: separate into retryPolicy?
    private int maxRetries = 3;
    private int retryDelay = 15000;

    private long minimumDelay;
    private long delayFactor;

    // top-level stats
    int completionCount = 0;
    int failedCount = 0;

    public void initialize(CrawlController c) throws FatalConfigurationException {
        delayFactor = getIntAt(XP_DELAY_FACTOR, DEFAULT_DELAY_FACTOR);
        minimumDelay = getIntAt(XP_MINIMUM_DELAY, DEFAULT_MINIMUM_DELAY);
        this.controller = c;
        Iterator iter = c.getScope().getSeeds().iterator();
        while (iter.hasNext()) {
            UURI u = (UURI) iter.next();
            CandidateURI caUri = new CandidateURI(u);
            caUri.setIsSeed(true);
            schedule(caUri);
        }
    }

    public synchronized void schedule(CandidateURI caUri) {
        pendingQueue.addLast(caUri);
    }

    public void scheduleHigh(CandidateURI caUri) {
        pendingHighQueue.addLast(caUri);
    }

    public CrawlURI next(int timeout) {
        long now = System.currentTimeMillis();
        long waitMax = 0;
        CrawlURI curi = null;

        // first, empty the high-priority queue
        CandidateURI caUri;
        while ((caUri = dequeueFromPendingHigh()) != null) {
            if (alreadyIncluded.contains(caUri)) {
                continue;
            }
            curi = new CrawlURI(caUri);
            if (!enqueueIfNecessary(curi)) {
                // OK to emit
                return emitCuri(curi);
            }
        }
        // if reached, the pendingHighQueue is empty

        // if enough time has passed to wake any snoozing queues, do it
        wakeReadyQueues(now);

        // now, see if any holding queues are ready with a CrawlURI
        if (!readyClassQueues.isEmpty()) {
            curi = dequeueFromReady();
            return emitCuri(curi);
        }

        // if that fails, check the pending queue
        while ((caUri = dequeueFromPending()) != null) {
            if (alreadyIncluded.contains(caUri)) {
                continue;
            }
            curi = new CrawlURI(caUri);
            if (!enqueueIfNecessary(curi)) {
                // OK to emit
                return emitCuri(curi);
            }
        }

        // consider if URIs are exhausted
        if (isEmpty()) {
            // nothing left to crawl
            logger.info("nothing left to crawl");
            // TODO: halt/spread the word???
            return null;
        }

        // nothing to return, but there are still URIs held for the future;
        // block until something changes, or timeout occurs
        waitMax = Math.min(earliestWakeTime() - now, timeout);
        try {
            if (waitMax < 0) {
                logger.warning("negative wait " + waitMax + " ignored");
            } else {
                synchronized (this) {
                    wait(waitMax);
                }
            }
        } catch (InterruptedException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        return null;
    }

    public synchronized void finished(CrawlURI curi) {
        logger.fine(this + ".finished(" + curi + ")");
        try {
            noteProcessingDone(curi);
            // snooze queues as necessary
            updateScheduling(curi);
            notify(); // new items might be available

            // consider errors which halt further processing
            if (isDispositiveFailure(curi)) {
                failureDisposition(curi);
                return;
            }

            // NOW HANDLED BY POSTSELECTOR
            // // handle any prerequisites
            // if (curi.getAList().containsKey(A_PREREQUISITE_URI)) {
            //     handlePrerequisites(curi);
            //     return;
            // }

            // consider errors which can be retried
            if (needsRetrying(curi)) {
                scheduleForRetry(curi);
                return;
            }

            // NOW HANDLED BY POSTSELECTOR
            // URI baseUri = getBaseURI(curi);
            // // handle http headers
            // if (curi.getAList().containsKey(A_HTTP_HEADER_URIS)) {
            //     handleHttpHeaders(curi, baseUri);
            // }
            // // handle embeds
            // if (curi.getAList().containsKey(A_HTML_EMBEDS)) {
            //     handleEmbeds(curi, baseUri);
            // }
            // // handle links
            // if (curi.getAList().containsKey(A_HTML_LINKS)) {
            //     handleLinks(curi, baseUri);
            // }

            // SUCCESS: note & log
            successDisposition(curi);
        } catch (RuntimeException e) {
            curi.setFetchStatus(S_INTERNAL_ERROR);
            // store exception temporarily for logging
            curi.getAList().putObject(A_RUNTIME_EXCEPTION, (Object) e);
            failureDisposition(curi);
        }
    }

    /**
     * The CrawlURI has been successfully crawled, and will be
     * attempted no more.
     *
     * @param curi
     */
    protected void successDisposition(CrawlURI curi) {
        completionCount++;
        if ((completionCount % 500) == 0) {
            logger.info("==========> " + completionCount + " <========== HTTP URIs completed");
        }
        Object array[] = { curi };
        controller.uriProcessing.log(Level.INFO, curi.getUURI().getUri().toString(), array);
        // note that CURI has passed out of scheduling
        curi.setStoreState(URIStoreable.FINISHED);
        if (curi.getDontRetryBefore() < 0) {
            // if not otherwise set, retire this URI forever
            curi.setDontRetryBefore(Long.MAX_VALUE);
        }
        curi.stripToMinimal();
    }

    /**
     * The store is empty only if all queues are empty and
     * no URIs are in-process.
     */
    public boolean isEmpty() {
        return pendingQueue.isEmpty()
            && readyClassQueues.isEmpty()
            && heldClassQueues.isEmpty()
            && snoozeQueues.isEmpty()
            && inProcessMap.isEmpty();
    }

    public long size() {
        // TODO Auto-generated method stub
        return 0;
    }

    protected void wakeReadyQueues(long now) {
        while (!snoozeQueues.isEmpty()
                && ((URIStoreable) snoozeQueues.first()).getWakeTime() <= now) {
            URIStoreable awoken = (URIStoreable) snoozeQueues.first();
            if (!snoozeQueues.remove(awoken)) {
                logger.severe("first() item couldn't be remove()d!");
            }
            if (awoken instanceof KeyedQueue) {
                assert inProcessMap.get(awoken.getClassKey()) == null
                    : "false ready: class peer still in process";
                if (((KeyedQueue) awoken).isEmpty()) {
                    // just drop queue
                    discardQueue(awoken);
                    return;
                }
                readyClassQueues.add(awoken);
                awoken.setStoreState(URIStoreable.READY);
            } else if (awoken instanceof CrawlURI) {
                // TODO: think about whether this is right
                pushToPending((CrawlURI) awoken);
            } else {
                assert false : "something evil has awoken!";
            }
        }
    }

    private void discardQueue(URIStoreable awoken) {
        allClassQueuesMap.remove(((KeyedQueue) awoken).getClassKey());
        awoken.setStoreState(URIStoreable.FINISHED);
    }

    private CrawlURI dequeueFromReady() {
        KeyedQueue firstReadyQueue = (KeyedQueue) readyClassQueues.getFirst();
        CrawlURI readyCuri = (CrawlURI) firstReadyQueue.removeFirst();
        return readyCuri;
    }

    private CrawlURI emitCuri(CrawlURI curi) {
        if (curi != null) {
            if (curi.getStoreState() == URIStoreable.FINISHED) {
                System.out.println("break here");
            }
            assert curi.getStoreState() != URIStoreable.FINISHED
                : "state " + curi.getStoreState() + " instead of ready for " + curi;
            //assert curi.getAList() != null : "null alist in curi " + curi + " state " + curi.getStoreState();
            noteInProcess(curi);
            curi.setServer(controller.getHostCache().getServerFor(curi));
        }
        logger.fine(this + ".emitCuri(" + curi + ")");
        return curi;
    }

    protected void noteInProcess(CrawlURI curi) {
        assert inProcessMap.get(curi.getClassKey()) == null
            : "two CrawlURIs with same classKey in process";
        inProcessMap.put(curi.getClassKey(), curi);
        curi.setStoreState(URIStoreable.IN_PROCESS);
        KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey());
        if (classQueue == null) {
            releaseHeld(curi);
            return;
        }
        assert classQueue.getStoreState() == URIStoreable.READY
            : "odd state " + classQueue.getStoreState() + " for classQueue "
              + classQueue + " of to-be-emitted CrawlURI";
        readyClassQueues.remove(classQueue);
        enqueueToHeld(classQueue);
        releaseHeld(curi);
    }

    private void enqueueToHeld(KeyedQueue classQueue) {
        heldClassQueues.add(classQueue);
        classQueue.setStoreState(URIStoreable.HELD);
    }

    private void releaseHeld(CrawlURI curi) {
        CrawlURI released = (CrawlURI) heldCuris.get(curi.getUURI());
        if (released != null) {
            heldCuris.remove(curi.getUURI());
            reinsert(released);
        }
    }

    public void reinsert(CrawlURI curi) {
        if (enqueueIfNecessary(curi)) {
            // added to classQueue
            return;
        }
        // no classQueue
        pushToPending(curi);
    }

    protected CandidateURI dequeueFromPendingHigh() {
        if (pendingHighQueue.isEmpty()) {
            return null;
        }
        return (CandidateURI) pendingHighQueue.removeFirst();
    }

    protected CandidateURI dequeueFromPending() {
        if (pendingQueue.isEmpty()) {
            return null;
        }
        return (CandidateURI) pendingQueue.removeFirst();
    }

    /**
     * @param curi
     * @return true if enqueued
     */
    public boolean enqueueIfNecessary(CrawlURI curi) {
        KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey());
        if (classQueue != null) {
            // must enqueue
            classQueue.add(curi);
            curi.setStoreState(classQueue.getStoreState());
            return true;
        }
        CrawlURI classmateInProgress = (CrawlURI) inProcessMap.get(curi.getClassKey());
        if (classmateInProgress != null) {
            // must create queue, and enqueue
            classQueue = new KeyedQueue(curi.getClassKey());
            allClassQueuesMap.put(classQueue.getClassKey(), classQueue);
            enqueueToHeld(classQueue);
            classQueue.add(curi);
            curi.setStoreState(classQueue.getStoreState());
            return true;
        }
        return false;
    }

    public long earliestWakeTime() {
        if (!snoozeQueues.isEmpty()) {
            return ((URIStoreable) snoozeQueues.first()).getWakeTime();
        }
        return Long.MAX_VALUE;
    }

    private synchronized void pushToPending(CrawlURI curi) {
        pendingQueue.addFirst(curi);
        curi.setStoreState(URIStoreable.PENDING);
    }

    public int discoveredUriCount() {
        // TODO Auto-generated method stub
        return 0;
    }

    public int successfullyFetchedCount() {
        // TODO Auto-generated method stub
        return 0;
    }

    public int failedFetchCount() {
        // TODO Auto-generated method stub
        return 0;
    }

    public void noteProcessingDone(CrawlURI curi) {
        assert inProcessMap.get(curi.getClassKey()) == curi
            : "CrawlURI returned not in process";
        inProcessMap.remove(curi.getClassKey());
        KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey());
        if (classQueue == null) {
            return;
        }
        assert classQueue.getStoreState() == URIStoreable.HELD
            : "odd state for classQueue of remitted CrawlURI";
        heldClassQueues.remove(classQueue);
        if (classQueue.isEmpty()) {
            // just drop it
            discardQueue(classQueue);
            return;
        }
        readyClassQueues.add(classQueue);
        classQueue.setStoreState(URIStoreable.READY);
        // TODO: since usually the queue will be snoozed, this juggling is often superfluous
    }

    /**
     * Update any scheduling structures with the new information in this
     * CrawlURI. Chiefly this means making the necessary arrangements for
     * no other URIs at the same host to be visited within the appropriate
     * politeness window.
     *
     * @param curi
     */
    protected void updateScheduling(CrawlURI curi) {
        long durationToWait = 0;
        if (curi.getAList().containsKey(A_FETCH_BEGAN_TIME)
                && curi.getAList().containsKey(A_FETCH_COMPLETED_TIME)) {
            long completeTime = curi.getAList().getLong(A_FETCH_COMPLETED_TIME);
            durationToWait = delayFactor
                * (completeTime - curi.getAList().getLong(A_FETCH_BEGAN_TIME));
            if (minimumDelay > durationToWait) {
                durationToWait = minimumDelay;
            }
            // TODO: maximum delay?
            if (durationToWait > 0) {
                snoozeQueueUntil(curi.getClassKey(), completeTime + durationToWait);
            }
        }
    }

    /**
     * The CrawlURI has encountered a problem, and will not be retried.
     *
     * @param curi
     */
    protected void failureDisposition(CrawlURI curi) {
        failedCount++;
        // send to basic log
        Object array[] = { curi };
        controller.uriProcessing.log(Level.INFO, curi.getUURI().getUri().toString(), array);
        // if exception, also send to crawlErrors
        if (curi.getFetchStatus() == S_INTERNAL_ERROR) {
            controller.crawlErrors.log(Level.INFO, curi.getUURI().getUri().toString(), array);
        }
        if (shouldBeForgotten(curi)) {
            // curi is dismissed without prejudice: it can be reconstituted
            forget(curi);
        } else {
            curi.setStoreState(URIStoreable.FINISHED);
            if (curi.getDontRetryBefore() < 0) {
                // if not otherwise set, retire this URI forever
                curi.setDontRetryBefore(Long.MAX_VALUE);
            }
            curi.stripToMinimal();
        }
    }

    /**
     * Has the CrawlURI suffered a failure which completes its processing?
     */
    private boolean isDispositiveFailure(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
            case S_DOMAIN_UNRESOLVABLE :
                // network errors; perhaps some of these should be
                // scheduled for retries
            case S_ROBOTS_PRECLUDED :
                // they don't want us to have it
            case S_INTERNAL_ERROR :
                // something unexpectedly bad happened
            case S_UNFETCHABLE_URI :
                // no chance to fetch
            case S_OUT_OF_SCOPE :
                // filtered out
            case S_TOO_MANY_EMBED_HOPS :
                // too far from last true link
            case S_TOO_MANY_LINK_HOPS :
                // too far from seeds
                return true;
            case S_UNATTEMPTED :
                // this uri is virgin, let it carry on
            default :
                return false;
        }
    }

    private boolean needsRetrying(CrawlURI curi) {
        // if (curi.getFetchAttempts() >= maxRetries) { return false; }
        switch (curi.getFetchStatus()) {
            case S_CONNECT_FAILED :
            case S_CONNECT_LOST :
            case S_UNATTEMPTED :
            case S_TIMEOUT :
                // these are all worth a retry
                return true;
            default :
                return false;
        }
    }

    private void scheduleForRetry(CrawlURI curi) {
        logger.fine("inserting snoozed " + curi + " for " + retryDelay);
        insertSnoozed(curi, retryDelay);
    }

    public void snoozeQueueUntil(Object classKey, long wake) {
        KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(classKey);
        if (classQueue == null) {
            classQueue = new KeyedQueue(classKey);
            allClassQueuesMap.put(classQueue.getClassKey(), classQueue);
        } else {
            assert classQueue.getStoreState() == URIStoreable.READY
                : "snoozing queue should have been READY";
            readyClassQueues.remove(classQueue);
        }
        classQueue.setWakeTime(wake);
        snoozeQueues.add(classQueue);
        classQueue.setStoreState(URIStoreable.SNOOZED);
    }

    private boolean shouldBeForgotten(CrawlURI curi) {
        switch (curi.getFetchStatus()) {
            case S_TOO_MANY_EMBED_HOPS :
            case S_TOO_MANY_LINK_HOPS :
                return true;
            default :
                return false;
        }
    }

    /**
     * Forget the given CrawlURI. This allows a new instance to be created
     * in the future, if it is reencountered under different circumstances.
     *
     * @param curi
     */
    public void forget(CrawlURI curi) {
        alreadyIncluded.remove(curi.getUURI());
        curi.setStoreState(URIStoreable.FORGOTTEN);
    }

    /**
     * Revisit the CrawlURI -- but not before the delay time has passed.
     *
     * @param curi
     * @param retryDelay
     */
    public void insertSnoozed(CrawlURI curi, long retryDelay) {
        curi.setWakeTime(System.currentTimeMillis() + retryDelay);
        curi.setStoreState(URIStoreable.SNOOZED);
        snoozeQueues.add(curi);
    }
}

Index: SimpleSelector.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleSelector.java,v
retrieving revision 1.27.2.1
retrieving revision 1.27.2.2
diff -C2 -d -r1.27.2.1 -r1.27.2.2
*** SimpleSelector.java	3 Oct 2003 17:11:30 -0000	1.27.2.1
--- SimpleSelector.java	4 Oct 2003 00:49:24 -0000	1.27.2.2
***************
*** 444,449 ****
  	}
  
- 
- 
  	/**
  	 * @param curi
--- 444,447 ----
***************
*** 460,463 ****
--- 458,463 ----
  	}
  
+ 
+ 
  	/* (non-Javadoc)
  	 * @see org.archive.crawler.framework.URISelector#initialize(org.archive.crawler.framework.CrawlController)
***************
*** 498,502 ****
  		return true;
  	}
- 
- 
  }
--- 498,500 ----

Index: CrawlStateUpdater.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/CrawlStateUpdater.java,v
retrieving revision 1.5
retrieving revision 1.5.2.1
diff -C2 -d -r1.5 -r1.5.2.1
*** CrawlStateUpdater.java	6 Aug 2003 01:20:56 -0000	1.5
--- CrawlStateUpdater.java	4 Oct 2003 00:49:24 -0000	1.5.2.1
***************
*** 8,11 ****
--- 8,12 ----
  import org.apache.commons.httpclient.methods.GetMethod;
+ import org.archive.crawler.datamodel.*;
  import org.archive.crawler.datamodel.CoreAttributeConstants;
  import org.archive.crawler.datamodel.CrawlURI;

Index: SimpleScheduler.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleScheduler.java,v
retrieving revision 1.13.2.2
retrieving revision 1.13.2.3
diff -C2 -d -r1.13.2.2 -r1.13.2.3
*** SimpleScheduler.java	3 Oct 2003 17:11:30 -0000	1.13.2.2
--- SimpleScheduler.java	4 Oct 2003 00:49:24 -0000	1.13.2.3
***************
*** 11,18 ****
  import org.archive.crawler.datamodel.CrawlURI;
- import org.archive.crawler.datamodel.FatalConfigurationException;
  import org.archive.crawler.datamodel.UURI;
  import org.archive.crawler.framework.CrawlController;
  import org.archive.crawler.framework.ToeThread;
  
  /**
--- 11,18 ----
  import org.archive.crawler.datamodel.CrawlURI;
  import org.archive.crawler.datamodel.UURI;
  import org.archive.crawler.framework.CrawlController;
  import org.archive.crawler.framework.ToeThread;
+ import org.archive.crawler.framework.exceptions.FatalConfigurationException;
  
  /**

Index: ARCWriter.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/ARCWriter.java,v
retrieving revision 1.32.2.1
retrieving revision 1.32.2.2
diff -C2 -d -r1.32.2.1 -r1.32.2.2
*** ARCWriter.java	3 Oct 2003 01:54:35 -0000	1.32.2.1
--- ARCWriter.java	4 Oct 2003 00:49:24 -0000	1.32.2.2
***************
*** 17,25 ****
  import org.archive.crawler.datamodel.CrawlOrder;
  import org.archive.crawler.datamodel.CrawlURI;
  import org.archive.crawler.framework.CrawlController;
  import org.archive.crawler.framework.Processor;
  import org.archive.util.ArchiveUtils;
  import org.archive.util.IAGZIPOutputStream;
- import org.w3c.dom.Node;
  import org.xbill.DNS.Record;
--- 17,25 ----
  import org.archive.crawler.datamodel.CrawlOrder;
  import org.archive.crawler.datamodel.CrawlURI;
+ import org.archive.crawler.datamodel.StatisticsTracker;
  import org.archive.crawler.framework.CrawlController;
  import org.archive.crawler.framework.Processor;
  import org.archive.util.ArchiveUtils;
  import org.archive.util.IAGZIPOutputStream;
  import org.xbill.DNS.Record;
***************
*** 36,40 ****
  	private int arcMaxSize = 100000000;	// max size we want arc files to be (bytes)
! 	private String arcPrefix = "archive";	// file prefix for arcs
  	private String outputDir = "";		// where should we put them?
  	private File file = null;			// file handle
--- 36,40 ----
  	private int arcMaxSize = 100000000;	// max size we want arc files to be (bytes)
! 	private String arcPrefix = "IAH";	// file prefix for arcs
  	private String outputDir = "";		// where should we put them?
  	private File file = null;			// file handle
***************
*** 67,95 ****
  		CrawlOrder order = controller.getOrder();
  
! 		// retrieve any nodes we think we need from the dom(s)
! 		Node filePrefix = order.getNodeAt("/crawl-order/arc-file/@prefix");
! 		Node maxSize = getNodeAt("./arc-files/@max-size-bytes");
! 		Node path = order.getNodeAt("//disk/@path");
! 		Node compression = getNodeAt("./compression/@use");
! 
! 		setUseCompression(
! 			( (compression==null) ? true : compression.getNodeValue().equals("true"))
! 		);
! 
! 		setArcPrefix(
! 			( (filePrefix==null) ? arcPrefix : filePrefix.getNodeValue() )
! 		);
! 
! 		setArcMaxSize(
! 			( (maxSize==null) ? arcMaxSize : (new Integer(maxSize.getNodeValue())).intValue() )
! 		);
! 
! 		setOutputDir(
! 			( (path==null) ? outputDir : path.getNodeValue() )
! 		);
  	}
! 
! 
  	/**
  	 * Takes a CrawlURI and generates an arc record, writing it
  	 * to disk. Currently
--- 67,78 ----
  		CrawlOrder order = controller.getOrder();
  
! 		setUseCompression(getBooleanAt("@compression",false));
! 		setArcPrefix(getStringAt("@prefix",arcPrefix));
! 		setArcMaxSize(getIntAt("@max-size-bytes",arcMaxSize));
! 		setOutputDir(getStringAt("@path",outputDir));
  	}
! 
! 
  	/**
  	 * Takes a CrawlURI and generates an arc record, writing it
  	 * to disk. Currently

--- SimpleFrontier.java DELETED ---

--- FetcherDNS.java DELETED ---

--- SimplePreconditionEnforcer.java DELETED ---

--- SimplePostselector.java DELETED ---

--- SimplePreselector.java DELETED ---

--- FetcherHTTPSimple.java DELETED ---

--- BasicScope.java DELETED ---

--- StatisticsTracker.java DELETED ---
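[Editor's example, not part of the commit above.] The politeness logic in Frontier.updateScheduling() reduces to a small formula: snooze the host's queue for delay-factor times the fetch duration, floored at the minimum delay. A minimal standalone sketch of that arithmetic, with invented timestamps (only the two defaults are taken from the commit):

public class PolitenessMathDemo {
    public static void main(String[] args) {
        long delayFactor = 5;     // DEFAULT_DELAY_FACTOR in the commit above
        long minimumDelay = 2000; // DEFAULT_MINIMUM_DELAY (ms)

        long fetchBeganTime = 1000000L;     // invented timestamp (ms)
        long fetchCompletedTime = 1000300L; // a 300 ms fetch, for illustration

        long durationToWait = delayFactor * (fetchCompletedTime - fetchBeganTime);
        if (minimumDelay > durationToWait) {
            durationToWait = minimumDelay;
        }
        // 5 * 300 = 1500 ms, raised to the 2000 ms floor
        System.out.println("snooze host queue for " + durationToWait + " ms");
    }
}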
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter
In directory sc8-pr-cvs1:/tmp/cvs-serv7488/src/org/archive/crawler/filter

Added Files:
      Tag: gjm-refactor
	NullFilter.java SeedExtensionFilter.java HopsFilter.java TransclusionFilter.java
	URIRegExpFilter.java
Log Message:
renaming, repackaging, streamlining

--- NEW FILE: NullFilter.java ---

/*
 * NullFilter.java
 * Created on Oct 2, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter/Attic/NullFilter.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.filter;

import org.archive.crawler.framework.Filter;

/**
 * @author gojomo
 */
public class NullFilter extends Filter {
    protected boolean innerAccepts(Object o) {
        return true;
    }
}

--- NEW FILE: SeedExtensionFilter.java ---

/*
 * SeedExtensionFilter.java
 * Created on Sep 15, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter/Attic/SeedExtensionFilter.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.filter;

import java.util.Iterator;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.UURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Filter;

/**
 * Accepts a new UURI if it is an 'extension' of one of the URIs in the
 * seed set. Most loosely, this could be any other URI under the same
 * domain (as 'calendar.yahoo.com' is to 'www.yahoo.com'). In other
 * cases, only URIs on the exact same host sharing the same path prefix
 * (as 'www.geocities.com/foouser/about' is to
 * 'www.geocities.com/foouser/') are accepted.
 *
 * Configuration options determine how expansive the extension
 * definition is. By default, it is very strict: same host and
 * identical path up to the last '/' given in the seed.
 *
 * @author gojomo
 */
public class SeedExtensionFilter extends Filter {
    private CrawlController controller;
    static private int PATH = 0;   // only accept same host, path-extensions
    static private int HOST = 1;   // accept any URIs from same host
    static private int DOMAIN = 2; // accept any URIs from same domain
    private int extensionMode = PATH;

    protected boolean innerAccepts(Object o) {
        UURI u = null;
        if (o instanceof UURI) {
            u = (UURI) o;
        } else if (o instanceof CrawlURI) {
            u = ((CrawlURI) o).getUURI();
        }
        if (u == null) {
            return false;
        }
        Iterator iter = controller.getScope().getSeeds().iterator();
        while (iter.hasNext()) {
            UURI s = (UURI) iter.next();
            if (s.getUri().getHost().equals(u.getUri().getHost())) {
                // hosts match
                if (extensionMode == PATH) {
                    if (s.getUri().getPath().regionMatches(0,
                            u.getUri().getPath(), 0,
                            s.getUri().getPath().lastIndexOf('/'))) {
                        // matches up to last '/'
                        return true;
                    } else {
                        // no match; try next seed
                        continue;
                    }
                }
                // else extensionMode == HOST or DOMAIN, match is good enough
                return true;
            }
            if (extensionMode == DOMAIN) {
                // might be a close-enough match
                String seedDomain = s.getUri().getHost();
                // strip www[#]
                seedDomain = seedDomain.replaceFirst("^www\\d*", "");
                String candidateDomain = u.getUri().getHost();
                if (candidateDomain == null) {
                    // either an opaque, unfetchable, or unparseable URI
                    continue;
                }
                if (seedDomain.regionMatches(0, candidateDomain,
                        candidateDomain.length() - seedDomain.length(),
                        seedDomain.length())) {
                    // domain suffix congruence
                    return true;
                }
                // else keep trying other seeds
            }
        }
        // if none found, fail
        return false;
    }

    public void initialize(CrawlController c) {
        // TODO Auto-generated method stub
        super.initialize(c);
        controller = c;
        String mode = getStringAt("@mode");
        if (mode == null || "path".equals(mode)) {
            // default
            return;
        }
        if ("host".equals(mode)) {
            extensionMode = HOST;
        }
        if ("domain".equals(mode)) {
            extensionMode = DOMAIN;
        }
    }
}

--- NEW FILE: HopsFilter.java ---

/*
 * HopsFilter.java
 * Created on Oct 3, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter/Attic/HopsFilter.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.filter;

import org.archive.crawler.framework.Filter;

/**
 * @author gojomo
 */
public class HopsFilter extends Filter {
    protected boolean innerAccepts(Object o) {
        // TODO fix this
        return false;
    }
}

--- NEW FILE: TransclusionFilter.java ---

/*
 * TransclusionFilter.java
 * Created on Oct 3, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter/Attic/TransclusionFilter.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.filter;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.framework.Filter;

/**
 * Filter which accepts CandidateURI/CrawlURI instances whose
 * path-from-seed ends in transclusion
 * (precondition/referral/embed) hops.
 *
 * @author Gordon Mohr
 */
public class TransclusionFilter extends Filter {
    // 1-3 trailing P(recondition)/R(eferral)/E(mbed) hops
    private static final String TRANSCLUSION_PATH = ".*[PRE][PRE]?[PRE]?$";

    protected boolean innerAccepts(Object o) {
        if (o instanceof CandidateURI) {
            return ((CandidateURI) o).getPathFromSeed().matches(TRANSCLUSION_PATH);
        }
        return false;
    }
}

--- NEW FILE: URIRegExpFilter.java ---

/*
 * RegExpFilter.java
 * Created on Apr 16, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter/Attic/URIRegExpFilter.java,v 1.1.2.1 2003/10/04 00:49:24 gojomo Exp $
 */
package org.archive.crawler.filter;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.UURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Filter;

/**
 * Compares a passed object -- a CrawlURI, UURI, or String --
 * against a regular expression, accepting matches.
 *
 * @author Gordon Mohr
 */
public class URIRegExpFilter extends Filter {
    Pattern pattern;

    protected boolean innerAccepts(Object o) {
        String input = null;
        // TODO: consider changing this to ask o for its matchString
        if (o instanceof CrawlURI) {
            input = ((CrawlURI) o).getURIString();
        } else if (o instanceof UURI) {
            input = ((UURI) o).getUri().toString();
        } else {
            // TODO: handle other inputs
        }
        Matcher m = pattern.matcher(input);
        return m.matches();
    }

    public void initialize(CrawlController c) {
        // TODO Auto-generated method stub
        super.initialize(c);
        String regexp = getStringAt("@regexp");
        pattern = Pattern.compile(regexp);
    }
}
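[Editor's example, not part of the commit above.] The TRANSCLUSION_PATH pattern in TransclusionFilter is easiest to read against concrete inputs: because the leading ".*" backtracks, it accepts any path-from-seed whose trailing hop(s) are P/R/E transclusion hops. The sample strings below are invented; 'P' is the prerequisite hop appended by Postselector in the previous commit, while 'L' as an ordinary-link hop code is an assumption for illustration:

import java.util.regex.Pattern;

public class TransclusionPathDemo {
    public static void main(String[] args) {
        Pattern p = Pattern.compile(".*[PRE][PRE]?[PRE]?$");
        System.out.println(p.matcher("LLE").matches());  // true: trailing embed hop
        System.out.println(p.matcher("LPE").matches());  // true: prerequisite, then embed
        System.out.println(p.matcher("LEEE").matches()); // true: three trailing embed hops
        System.out.println(p.matcher("LLLL").matches()); // false: ends in a plain link hop
        System.out.println(p.matcher("ELL").matches());  // false: transclusion hops not trailing
    }
}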
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/exceptions
In directory sc8-pr-cvs1:/tmp/cvs-serv7488/src/org/archive/crawler/framework/exceptions

Added Files:
      Tag: gjm-refactor
	InitializationException.java ConfigurationException.java FatalConfigurationException.java
Log Message:
renaming, repackaging, streamlining

--- NEW FILE: InitializationException.java ---

/*
 * Created on Jul 29, 2003
 */
package org.archive.crawler.framework.exceptions;

/**
 * InitializationExceptions should be thrown when there is a problem with
 * the crawl's initialization, such as file creation problems, etc. In the
 * event that a more specific exception can be thrown (such as a
 * ConfigurationException in the event that there is a
 * configuration-specific problem), it should be.
 *
 * @author Parker Thompson
 */
public class InitializationException extends Exception {

    public InitializationException() {
        super();
    }

    /**
     * @param message
     */
    public InitializationException(String message) {
        super(message);
    }

    /**
     * @param message
     * @param cause
     */
    public InitializationException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * @param cause
     */
    public InitializationException(Throwable cause) {
        super(cause);
    }
}

--- NEW FILE: ConfigurationException.java ---

/*
 * Created on Jul 29, 2003
 */
package org.archive.crawler.framework.exceptions;

import org.archive.crawler.framework.exceptions.InitializationException;

/**
 * ConfigurationExceptions should be thrown when a configuration file
 * is missing data, or contains uninterpretable data, at runtime. Fatal
 * errors (that should cause the program to exit) should be thrown as
 * FatalConfigurationExceptions.
 *
 * You may optionally note the configuration file and element involved,
 * so the catcher can report or act on that information.
 *
 * @author Parker Thompson
 */
public class ConfigurationException extends InitializationException {

    // optionally store the file name and element so the catcher
    // can report the information and/or take other actions based on it
    protected String file = null;
    protected String element = null;

    public ConfigurationException() {
        super();
    }

    /**
     * Create a ConfigurationException
     * @param message
     */
    public ConfigurationException(String message) {
        super(message);
    }

    /**
     * @param message
     * @param cause
     */
    public ConfigurationException(String message, Throwable cause) {
        super(message, cause);
    }

    /**
     * Create a ConfigurationException
     * @param cause
     */
    public ConfigurationException(Throwable cause) {
        super(cause);
    }

    /**
     * Create a ConfigurationException
     * @param message
     * @param filename
     * @param elementname
     */
    public ConfigurationException(String message, String filename, String elementname) {
        super(message);
        file = filename;
        element = elementname;
    }

    /**
     * Create a ConfigurationException
     * @param message
     * @param cause
     * @param filename
     * @param elementname
     */
    public ConfigurationException(String message, Throwable cause,
            String filename, String elementname) {
        super(message, cause);
        file = filename;
        element = elementname;
    }

    /**
     * Create a ConfigurationException
     * @param cause
     * @param filename
     * @param elementname
     */
    public ConfigurationException(Throwable cause, String filename, String elementname) {
        super(cause);
        file = filename;
        element = elementname;
    }

    /**
     * Store the name of the configuration file that was being parsed
     * when this error occurred.
     * @param name
     */
    public void setFile(String name) {
        file = name;
    }

    public String getFile() {
        return file;
    }

    /**
     * Set the name of the element that was being parsed
     * when this error occurred.
     * @param target
     */
    public void setElement(String target) {
        element = target;
    }

    public String getElement() {
        return element;
    }
}

--- NEW FILE: FatalConfigurationException.java ---

/*
 * Created on Jul 29, 2003
 */
package org.archive.crawler.framework.exceptions;

import org.archive.crawler.framework.exceptions.ConfigurationException;

/**
 * @author Parker Thompson
 */
public class FatalConfigurationException extends ConfigurationException {

    public FatalConfigurationException(String explanation) {
        super(explanation);
    }

    public FatalConfigurationException() {
        super();
    }

    public FatalConfigurationException(String message, String file, String element) {
        super(message, file, element);
    }
}
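[Editor's example, not part of the commit above.] The file/element bookkeeping these constructors add is meant to let a catcher report where parsing failed. A minimal hypothetical sketch of that call pattern; the "order.xml" filename, "processor" element, and "missing @class" message are invented for illustration:

import org.archive.crawler.framework.exceptions.ConfigurationException;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;

public class ConfigErrorDemo {
    // a thrower records the file and element where parsing failed
    static void parse() throws FatalConfigurationException {
        throw new FatalConfigurationException("missing @class", "order.xml", "processor");
    }

    public static void main(String[] args) {
        try {
            parse();
        } catch (ConfigurationException e) {
            // a catcher reports the recorded location along with the message
            System.err.println("bad configuration in " + e.getFile()
                + " at <" + e.getElement() + ">: " + e.getMessage());
        }
    }
}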
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util
In directory sc8-pr-cvs1:/tmp/cvs-serv7488/src/org/archive/crawler/util

Removed Files:
      Tag: gjm-refactor
	NullFilter.java TransclusionFilter.java SeedExtensionFilter.java URIRegExpFilter.java
Log Message:
renaming, repackaging, streamlining

--- NullFilter.java DELETED ---

--- TransclusionFilter.java DELETED ---

--- SeedExtensionFilter.java DELETED ---

--- URIRegExpFilter.java DELETED ---
From: <go...@us...> - 2003-10-04 00:49:33
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io
In directory sc8-pr-cvs1:/tmp/cvs-serv7488/src/org/archive/crawler/io

Removed Files:
      Tag: gjm-refactor
	MemoryArea.java MemPoolManager.java SpreadInputStream.java SpreadOutputStream.java
	VirtualBuffer.java SeekableInputSubstream.java SeekableInputStream.java
	DiskedVirtualBuffer.java
Log Message:
renaming, repackaging, streamlining

--- MemoryArea.java DELETED ---

--- MemPoolManager.java DELETED ---

--- SpreadInputStream.java DELETED ---

--- SpreadOutputStream.java DELETED ---

--- VirtualBuffer.java DELETED ---

--- SeekableInputSubstream.java DELETED ---

--- SeekableInputStream.java DELETED ---

--- DiskedVirtualBuffer.java DELETED ---
From: <go...@us...> - 2003-10-04 00:49:24
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter
In directory sc8-pr-cvs1:/tmp/cvs-serv7460/src/org/archive/crawler/filter

Log Message:
Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/filter added to the repository
--> Using per-directory sticky tag `gjm-refactor'
From: <go...@us...> - 2003-10-04 00:49:24
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/exceptions
In directory sc8-pr-cvs1:/tmp/cvs-serv7460/src/org/archive/crawler/framework/exceptions

Log Message:
Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/exceptions added to the repository
--> Using per-directory sticky tag `gjm-refactor'
From: <go...@us...> - 2003-10-04 00:49:24
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/fetcher
In directory sc8-pr-cvs1:/tmp/cvs-serv7460/src/org/archive/crawler/fetcher

Log Message:
Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/fetcher added to the repository
--> Using per-directory sticky tag `gjm-refactor'
From: <go...@us...> - 2003-10-03 17:12:04
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/dev-crawl
In directory sc8-pr-cvs1:/tmp/cvs-serv30377/dev-crawl

Modified Files:
      Tag: gjm-refactor
	order.xml
Log Message:
continuing reorg

Index: order.xml
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/dev-crawl/Attic/order.xml,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** order.xml	3 Oct 2003 01:53:02 -0000	1.1.2.1
--- order.xml	3 Oct 2003 17:11:29 -0000	1.1.2.2
***************
*** 50,54 ****
  		name="Preprocessor"
  		class="org.archive.crawler.basic.SimplePreconditionEnforcer"
! 		next="DNS">
  		/>
--- 50,54 ----
  		name="Preprocessor"
  		class="org.archive.crawler.basic.SimplePreconditionEnforcer"
! 		next="DNS"
  		/>
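[Editor's note, not part of the commit above.] Read as XML rather than as a diff, this hunk just removes a stray ">" that left the processor element malformed. After the change, the element presumably reads as follows (reconstructed only from the hunk's context lines; any attributes outside the hunk are not shown):

<processor
	name="Preprocessor"
	class="org.archive.crawler.basic.SimplePreconditionEnforcer"
	next="DNS"
	/>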
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util In directory sc8-pr-cvs1:/tmp/cvs-serv30377/src/org/archive/crawler/util Modified Files: Tag: gjm-refactor SeedExtensionFilter.java Added Files: Tag: gjm-refactor ThreadKicker.java TransclusionFilter.java Log Message: continuing reorg --- NEW FILE: ThreadKicker.java --- /* * ThreadKicker.java * Created on Jun 24, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util/Attic/ThreadKicker.java,v 1.1.2.1 2003/10/03 17:11:30 gojomo Exp $ */ package org.archive.crawler.util; import java.util.HashMap; import java.util.SortedSet; import java.util.TreeSet; import java.util.logging.Logger; /** * Interrupts threads at the requested time. Useful * for aborting hung network IO. * * @author gojomo * */ public class ThreadKicker extends Thread { private static Logger logger = Logger.getLogger("org.archive.crawler.framework.ThreadKicker"); SortedSet scheduledKicks = new TreeSet(); // of ScheduledKicks HashMap pendingKicks = new HashMap(); // of Thread -> ScheduledKick /** * */ public ThreadKicker() { super(); setName("ThreadKicker"); } /** * Arrange to be kicked (interrupted) at the specified time. * * @param target * @param time */ public synchronized void kickMeAt(Thread target, long time) { removeKicks(target); if(time==0) { return; } ScheduledKick kick = new ScheduledKick(target, time); scheduledKicks.add(kick); pendingKicks.put(target,kick); if(scheduledKicks.first()==kick) { notify(); } } private void removeKicks(Thread target) { ScheduledKick kick = (ScheduledKick)pendingKicks.remove(target); if(kick!=null) { scheduledKicks.remove(kick); } } /* (non-Javadoc) * @see java.lang.Runnable#run() */ public synchronized void run() { while (true) { try { long now = System.currentTimeMillis(); ScheduledKick top = scheduledKicks.isEmpty() ? null : (ScheduledKick)scheduledKicks.first(); while (top!=null && top.getWhen()<now) { scheduledKicks.remove(top); pendingKicks.remove(top); logger.warning("kicking "+top.getTarget()); top.getTarget().interrupt(); top = scheduledKicks.isEmpty() ? null : (ScheduledKick)scheduledKicks.first(); } if (top == null) { wait(); } else { wait(top.getWhen()-now); } } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /** * Cancel all scheduled kicks for the given thread. * * @param thread */ public synchronized void cancelKick(Thread thread) { removeKicks(thread); } } class ScheduledKick implements Comparable { private Thread target; private long when; /** * */ public ScheduledKick(Thread th, long time) { target = th; when = time; } /* (non-Javadoc) * @see java.lang.Comparable#compareTo(java.lang.Object) */ public int compareTo(Object o) { ScheduledKick other = (ScheduledKick)o; if (this==other) { return 0; } if(when < other.getWhen()) { return -1; } if(when > other.getWhen()) { return 1; } // equal times; arbitrary ordering ok if(target.hashCode()<other.getTarget().hashCode()) { return -1; } else { // TODOSOMEDAY: fix tiny chance of bug here // if two ScheduledKicks of same time and // same target hashcode are compared, // answer is order-dependent. 
return 1; } } /** * */ Thread getTarget() { return target; } /** * @return */ long getWhen() { return when; } } --- NEW FILE: TransclusionFilter.java --- /* * TransclusionFilter.java * Created on Oct 3, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util/Attic/TransclusionFilter.java,v 1.1.2.1 2003/10/03 17:11:30 gojomo Exp $ */ package org.archive.crawler.util; import org.archive.crawler.framework.Filter; /** * Filter which accepts CandidateURI/CrawlURI instances which * * @author Gordon Mohr */ public class TransclusionFilter extends Filter { /* (non-Javadoc) * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object) */ protected boolean innerAccepts(Object o) { // TODO Auto-generated method stub return false; } } Index: SeedExtensionFilter.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util/SeedExtensionFilter.java,v retrieving revision 1.1 retrieving revision 1.1.2.1 diff -C2 -d -r1.1 -r1.1.2.1 *** SeedExtensionFilter.java 19 Sep 2003 01:37:19 -0000 1.1 --- SeedExtensionFilter.java 3 Oct 2003 17:11:30 -0000 1.1.2.1 *************** *** 52,56 **** return false; } ! Iterator iter = controller.getStore().getSeeds().iterator(); while(iter.hasNext()) { UURI s = (UURI)iter.next(); --- 52,56 ---- return false; } ! Iterator iter = controller.getScope().getSeeds().iterator(); while(iter.hasNext()) { UURI s = (UURI)iter.next(); |
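To make the ThreadKicker contract concrete, here is a minimal, hypothetical usage sketch. Only kickMeAt() and cancelKick() come from the class itself; the endpoint and the 30-second bound are stand-ins, and Thread.interrupt() only unblocks some kinds of IO, so this matches the class's "aborting hung network IO" intent rather than guaranteeing it. (One caveat in the source above: pendingKicks is keyed by Thread, so the pendingKicks.remove(top) call in run() appears to need top.getTarget() rather than top.)

import java.io.InputStream;
import java.net.Socket;

import org.archive.crawler.util.ThreadKicker;

public class KickerSketch {
    public static void main(String[] args) throws Exception {
        ThreadKicker kicker = new ThreadKicker();
        kicker.start(); // the scheduling loop runs in its own thread

        Socket socket = new Socket("example.com", 80); // hypothetical endpoint
        InputStream in = socket.getInputStream();
        // ask to be interrupted if still blocked 30 seconds from now
        kicker.kickMeAt(Thread.currentThread(), System.currentTimeMillis() + 30000);
        try {
            int first = in.read(); // may hang; a kick delivers interrupt()
            System.out.println("first byte: " + first);
        } finally {
            // always clear the pending kick once past the risky section
            kicker.cancelKick(Thread.currentThread());
            socket.close();
        }
    }
}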
From: <go...@us...> - 2003-10-03 17:11:36
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/lib/binaries
In directory sc8-pr-cvs1:/tmp/cvs-serv30377/lib/binaries

Removed Files:
      Tag: gjm-refactor
	commons-httpclient.jar
Log Message:
continuing reorg

--- commons-httpclient.jar DELETED ---
From: <go...@us...> - 2003-10-03 17:11:36
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv30377/src/org/archive/crawler/datamodel

Modified Files:
      Tag: gjm-refactor
	CrawlOrder.java
Log Message:
continuing reorg

Index: CrawlOrder.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlOrder.java,v
retrieving revision 1.12.2.2
retrieving revision 1.12.2.3
diff -C2 -d -r1.12.2.2 -r1.12.2.3
*** CrawlOrder.java	3 Oct 2003 01:54:36 -0000	1.12.2.2
--- CrawlOrder.java	3 Oct 2003 17:11:31 -0000	1.12.2.3
***************
*** 16,19 ****
--- 16,20 ----
   */
  public class CrawlOrder extends XMLConfig {
+ 	private static final String XP_CRAWL_ORDER_NAME = "//crawl-order/@name";
  	private static final String XP_HTTP_USER_AGENT = "//http-headers/@User-Agent";
  	private static final String XP_HTTP_FROM = "//http-headers/@From";
***************
*** 77,81 ****
  	}
! 	loadParents(pathToDoc);
  	initialize();
--- 78,82 ----
  	}
! 	//loadParents(pathToDoc);
  	initialize();
***************
*** 86,90 ****
  	 */
  	public void initialize(){
! 		name = getStringAt("//crawl-order/@name");
  	}
--- 87,91 ----
  	 */
  	public void initialize(){
! 		name = getStringAt(XP_CRAWL_ORDER_NAME);
  	}
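The CrawlOrder change continues a pattern that runs through this refactor: inline XPath literals are hoisted into named constants, so a typo becomes a compile-time unresolved symbol instead of a silent null at runtime. A minimal sketch of the idiom, assuming only XMLConfig's existing getStringAt() accessor (the ExampleConfig class itself is hypothetical):

import org.archive.crawler.framework.XMLConfig;

public class ExampleConfig extends XMLConfig {
    // one named constant per configuration location, instead of inline literals
    private static final String XP_CRAWL_ORDER_NAME = "//crawl-order/@name";

    private String name;

    public void initialize() {
        // misspelling the constant name breaks the build; misspelling a
        // string literal would just make getStringAt() return null
        name = getStringAt(XP_CRAWL_ORDER_NAME);
    }
}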
From: <go...@us...> - 2003-10-03 17:11:35
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework In directory sc8-pr-cvs1:/tmp/cvs-serv30377/src/org/archive/crawler/framework Modified Files: Tag: gjm-refactor XMLConfig.java CrawlController.java CrawlScope.java URIFrontier.java Removed Files: Tag: gjm-refactor URIScheduler.java URIStore.java URISelector.java Sorter.java ThreadKicker.java Log Message: continuing reorg Index: XMLConfig.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/XMLConfig.java,v retrieving revision 1.12.2.1 retrieving revision 1.12.2.2 diff -C2 -d -r1.12.2.1 -r1.12.2.2 *** XMLConfig.java 3 Oct 2003 01:54:35 -0000 1.12.2.1 --- XMLConfig.java 3 Oct 2003 17:11:30 -0000 1.12.2.2 *************** *** 419,423 **** return parentConfigurationFile.instantiate(xpath); } ! return instantiate(node); } catch (DOMException e) { --- 419,425 ---- return parentConfigurationFile.instantiate(xpath); } ! if(node == null){ ! return null; ! } return instantiate(node); } catch (DOMException e) { Index: CrawlController.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/CrawlController.java,v retrieving revision 1.28.2.2 retrieving revision 1.28.2.3 diff -C2 -d -r1.28.2.2 -r1.28.2.3 *** CrawlController.java 3 Oct 2003 01:54:36 -0000 1.28.2.2 --- CrawlController.java 3 Oct 2003 17:11:30 -0000 1.28.2.3 *************** *** 45,54 **** private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors"; private static final String LOGNAME_CRAWL = "crawl"; ! private static final String XP_STATS_LEVEL = "/loggers/crawl-statistics/@level"; ! private static final String XP_STATS_INTERVAL = "/loggers/crawl-statistics/@interval"; ! private static final String XP_DISK_PATH = "/behavior/@disk"; ! private static final String XP_PROCESSORS = "/behavior/processors/processor"; ! private static final String XP_FRONTIER = "/behavior/frontier"; ! private static final String XP_CRAWL_SCOPE = "/scope"; private int timeout = 1000; // to wait for CrawlURI from frontier before spinning private ToePool toePool; --- 45,54 ---- private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors"; private static final String LOGNAME_CRAWL = "crawl"; ! private static final String XP_STATS_LEVEL = "//loggers/crawl-statistics/@level"; ! private static final String XP_STATS_INTERVAL = "//loggers/crawl-statistics/@interval"; ! private static final String XP_DISK_PATH = "//behavior/@disk-path"; ! private static final String XP_PROCESSORS = "//behavior/processors/processor"; ! private static final String XP_FRONTIER = "//behavior/frontier"; ! 
private static final String XP_CRAWL_SCOPE = "//scope"; private int timeout = 1000; // to wait for CrawlURI from frontier before spinning private ToePool toePool; *************** *** 69,76 **** CrawlOrder order; CrawlScope scope; - - URIScheduler scheduler; - URIStore store; - URISelector selector; Processor firstProcessor; --- 69,72 ---- *************** *** 244,255 **** /** - * - */ - public URIScheduler getScheduler() { - return scheduler; - - } - - /** * @param thread */ --- 240,243 ---- *************** *** 257,268 **** // for now do nothing } - - /** - * - */ - public URISelector getSelector() { - return selector; - } - /** Return the object this controller is using to track crawl statistics --- 245,248 ---- *************** *** 272,276 **** } - /** * --- 252,255 ---- *************** *** 316,326 **** public CrawlOrder getOrder() { return order; - } - - /** - * @return - */ - public URIStore getStore() { - return store; } --- 295,298 ---- Index: CrawlScope.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/Attic/CrawlScope.java,v retrieving revision 1.1.2.2 retrieving revision 1.1.2.3 diff -C2 -d -r1.1.2.2 -r1.1.2.3 *** CrawlScope.java 3 Oct 2003 01:54:36 -0000 1.1.2.2 --- CrawlScope.java 3 Oct 2003 17:11:30 -0000 1.1.2.3 *************** *** 14,18 **** import org.archive.crawler.datamodel.CandidateURI; - import org.archive.crawler.datamodel.FatalConfigurationException; import org.archive.crawler.datamodel.UURI; --- 14,17 ---- *************** *** 64,68 **** } ! public List getSeeds() throws FatalConfigurationException { if (seeds != null) { return seeds; --- 63,67 ---- } ! public List getSeeds() { if (seeds != null) { return seeds; *************** *** 92,97 **** } } catch (IOException e) { ! throw new FatalConfigurationException( ! "Unable to locate seeds file: " + e.toString()); } return seeds; --- 91,96 ---- } } catch (IOException e) { ! e.printStackTrace(); ! // TODO throw runtime error? log something? } return seeds; Index: URIFrontier.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/Attic/URIFrontier.java,v retrieving revision 1.4.2.1 retrieving revision 1.4.2.2 diff -C2 -d -r1.4.2.1 -r1.4.2.2 *** URIFrontier.java 2 Oct 2003 01:53:51 -0000 1.4.2.1 --- URIFrontier.java 3 Oct 2003 17:11:30 -0000 1.4.2.2 *************** *** 24,26 **** --- 24,41 ---- boolean isEmpty(); long size(); + + /** + * @return + */ + int discoveredUriCount(); + + /** + * + */ + int successfullyFetchedCount(); + + /** + * + */ + int failedFetchCount(); } --- URIScheduler.java DELETED --- --- URIStore.java DELETED --- --- URISelector.java DELETED --- --- Sorter.java DELETED --- --- ThreadKicker.java DELETED --- |
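Read together, the URIFrontier additions and the URIStore/URISelector/URIScheduler deletions make the frontier the single owner of both work hand-out and crawl-wide counting. A consolidated sketch of the interface as this commit leaves it; the method set is taken from the diff, the initialize() signature is assumed from SimpleFrontier, and the comments are editorial:

package org.archive.crawler.framework;

import org.archive.crawler.datamodel.CandidateURI;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FatalConfigurationException;

public interface URIFrontier {
    void initialize(CrawlController c) throws FatalConfigurationException; // assumed from SimpleFrontier
    void schedule(CandidateURI caUri);  // accept newly discovered or seed work
    CrawlURI next(int timeout);         // hand out work; null on timeout or exhaustion
    boolean isEmpty();
    long size();
    // counters formerly reached through the now-deleted URIStore/URISelector
    int discoveredUriCount();
    int successfullyFetchedCount();
    int failedFetchCount();
}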
From: <go...@us...> - 2003-10-03 17:11:35
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic In directory sc8-pr-cvs1:/tmp/cvs-serv30377/src/org/archive/crawler/basic Modified Files: Tag: gjm-refactor SimpleSelector.java BasicScope.java SimpleFrontier.java StatisticsTracker.java SimpleScheduler.java SimpleStore.java Log Message: continuing reorg Index: SimpleSelector.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleSelector.java,v retrieving revision 1.27 retrieving revision 1.27.2.1 diff -C2 -d -r1.27 -r1.27.2.1 *** SimpleSelector.java 23 Sep 2003 01:16:34 -0000 1.27 --- SimpleSelector.java 3 Oct 2003 17:11:30 -0000 1.27.2.1 *************** *** 21,25 **** import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.Filter; - import org.archive.crawler.framework.URISelector; import org.archive.crawler.framework.XMLConfig; --- 21,24 ---- *************** *** 28,32 **** * */ ! public class SimpleSelector extends XMLConfig implements URISelector, CoreAttributeConstants, FetchStatusCodes { /** * XPath to any specified filters --- 27,31 ---- * */ ! public class SimpleSelector extends XMLConfig implements CoreAttributeConstants, FetchStatusCodes { /** * XPath to any specified filters *************** *** 466,470 **** public void initialize(CrawlController c) { controller = c; ! store = (SimpleStore)c.getStore(); //maxLinkDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-link-depth/@value", maxLinkDepth); //maxEmbedDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-embed-depth/@value", maxEmbedDepth); --- 465,469 ---- public void initialize(CrawlController c) { controller = c; ! // store = (SimpleStore)c.getStore(); //maxLinkDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-link-depth/@value", maxLinkDepth); //maxEmbedDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-embed-depth/@value", maxEmbedDepth); Index: BasicScope.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/BasicScope.java,v retrieving revision 1.1.2.2 retrieving revision 1.1.2.3 diff -C2 -d -r1.1.2.2 -r1.1.2.3 *** BasicScope.java 3 Oct 2003 01:54:35 -0000 1.1.2.2 --- BasicScope.java 3 Oct 2003 17:11:30 -0000 1.1.2.3 *************** *** 7,14 **** package org.archive.crawler.basic; import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.CrawlScope; import org.archive.crawler.framework.Filter; ! import org.archive.crawler.util.NullFilter; /** --- 7,16 ---- package org.archive.crawler.basic; + import org.archive.crawler.datamodel.CandidateURI; import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.CrawlScope; import org.archive.crawler.framework.Filter; ! import org.archive.crawler.util.SeedExtensionFilter; ! import org.archive.crawler.util.TransclusionFilter; /** *************** *** 22,30 **** * && ! excludeFilter.accepts(uri) * * @author gojomo * */ public class BasicScope extends CrawlScope { - private static final Filter NULL_FILTER = new NullFilter(); Filter focusFilter; Filter transitiveFilter; --- 24,50 ---- * && ! 
excludeFilter.accepts(uri) * + * The focusFilter may be specified by either: + * - adding a 'mode' attribute to the + * <scope> element, in which case a SeedExtensionFilter + * will be used, with the <scope> element + * providing its configuration + * - adding a <focus> subelement + * If unspecified, the focusFilter will default to + * an accepts-all filter. + * + * The transitiveFilter may be specified by supploying + * a <transitive> subelement. If unspecified, a + * TransclusionFilter will be used, with the <scope> + * element providing its configuration. + * + * The excludeFilter may be specified by supplying + * a <exclude> subelement. If unspecified, a + * accepts-none filter will be used -- meaning that + * no URIs will pass the filter and thus be excluded. + * * @author gojomo * */ public class BasicScope extends CrawlScope { Filter focusFilter; Filter transitiveFilter; *************** *** 36,51 **** public void initialize(CrawlController controller) { super.initialize(controller); ! focusFilter = (Filter) instantiate("/focus"); ! if (focusFilter == null) { ! focusFilter = NULL_FILTER; } } ! /* (non-Javadoc) * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object) */ protected boolean innerAccepts(Object o) { ! // TODO Auto-generated method stub ! return false; } --- 56,126 ---- public void initialize(CrawlController controller) { super.initialize(controller); ! // setup focusFilter ! if(getNodeAt("@mode")!=null) { ! // SeedExtensionFilter implied ! focusFilter = new SeedExtensionFilter(); ! focusFilter.setNode(xNode); ! } else { ! focusFilter = (Filter) instantiate("focus"); ! } ! if(focusFilter != null) { ! focusFilter.initialize(controller); ! // only set up transitiveFilter if focusFilter set ! transitiveFilter = (Filter) instantiate("transitive"); ! if(transitiveFilter == null) { ! transitiveFilter = new TransclusionFilter(); ! transitiveFilter.setNode(xNode); ! transitiveFilter.initialize(controller); ! } ! } ! // setup exclude filter ! excludeFilter = (Filter) instantiate("exclude"); ! if(excludeFilter!=null) { ! excludeFilter.initialize(controller); } } ! /** * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object) */ protected boolean innerAccepts(Object o) { ! return ((isSeed(o)||focusAccepts(o))||transitiveAccepts(o))&&!excludeAccepts(o); ! } ! ! /** ! * @param o ! * @return ! */ ! private boolean excludeAccepts(Object o) { ! if (excludeFilter == null) { ! return false; ! } ! return excludeFilter.accepts(o); ! } ! ! /** ! * @param o ! * @return ! */ ! private boolean transitiveAccepts(Object o) { ! if (transitiveFilter == null) { ! return true; ! } ! return transitiveFilter.accepts(o); ! } ! ! /** ! * @param o ! * @return ! */ ! private boolean focusAccepts(Object o) { ! if (focusFilter == null) { ! return true; ! } ! return focusFilter.accepts(o); ! } ! ! private boolean isSeed(Object o) { ! 
return o instanceof CandidateURI && ((CandidateURI)o).getIsSeed(); } Index: SimpleFrontier.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/SimpleFrontier.java,v retrieving revision 1.1.2.2 retrieving revision 1.1.2.3 diff -C2 -d -r1.1.2.2 -r1.1.2.3 *** SimpleFrontier.java 3 Oct 2003 01:54:35 -0000 1.1.2.2 --- SimpleFrontier.java 3 Oct 2003 17:11:30 -0000 1.1.2.3 *************** *** 342,344 **** --- 342,368 ---- } + /* (non-Javadoc) + * @see org.archive.crawler.framework.URIFrontier#discoveredUriCount() + */ + public int discoveredUriCount() { + // TODO Auto-generated method stub + return 0; + } + + /* (non-Javadoc) + * @see org.archive.crawler.framework.URIFrontier#successfullyFetchedCount() + */ + public int successfullyFetchedCount() { + // TODO Auto-generated method stub + return 0; + } + + /* (non-Javadoc) + * @see org.archive.crawler.framework.URIFrontier#failedFetchCount() + */ + public int failedFetchCount() { + // TODO Auto-generated method stub + return 0; + } + } Index: StatisticsTracker.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/StatisticsTracker.java,v retrieving revision 1.8 retrieving revision 1.8.2.1 diff -C2 -d -r1.8 -r1.8.2.1 *** StatisticsTracker.java 1 Aug 2003 00:08:38 -0000 1.8 --- StatisticsTracker.java 3 Oct 2003 17:11:30 -0000 1.8.2.1 *************** *** 512,516 **** */ public int uriFetchSuccessCount() { ! return controller.getSelector().successfullyFetchedCount(); } --- 512,516 ---- */ public int uriFetchSuccessCount() { ! return controller.getFrontier().successfullyFetchedCount(); } *************** *** 534,538 **** */ public int urisEncounteredCount() { ! return controller.getStore().discoveredUriCount(); } --- 534,538 ---- */ public int urisEncounteredCount() { ! return controller.getFrontier().discoveredUriCount(); } *************** *** 541,546 **** */ public int totalFetchAttempts() { ! return controller.getSelector().successfullyFetchedCount() ! + controller.getSelector().failedFetchCount(); } --- 541,546 ---- */ public int totalFetchAttempts() { ! return controller.getFrontier().successfullyFetchedCount() ! + controller.getFrontier().failedFetchCount(); } *************** *** 549,553 **** */ public int failedFetchAttempts() { ! return controller.getSelector().failedFetchCount(); } --- 549,553 ---- */ public int failedFetchAttempts() { ! return controller.getFrontier().failedFetchCount(); } *************** *** 557,561 **** */ public int successfulFetchAttempts() { ! return controller.getSelector().successfullyFetchedCount(); } --- 557,561 ---- */ public int successfulFetchAttempts() { ! return controller.getFrontier().successfullyFetchedCount(); } Index: SimpleScheduler.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleScheduler.java,v retrieving revision 1.13.2.1 retrieving revision 1.13.2.2 diff -C2 -d -r1.13.2.1 -r1.13.2.2 *** SimpleScheduler.java 3 Oct 2003 01:54:35 -0000 1.13.2.1 --- SimpleScheduler.java 3 Oct 2003 17:11:30 -0000 1.13.2.2 *************** *** 15,19 **** import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.ToeThread; - import org.archive.crawler.framework.URIScheduler; /** --- 15,18 ---- *************** *** 21,25 **** * */ ! 
public class SimpleScheduler implements URIScheduler { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleScheduler"); --- 20,24 ---- * */ ! public class SimpleScheduler { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleScheduler"); *************** *** 91,95 **** public void initialize(CrawlController c) throws FatalConfigurationException { controller = c; ! store = (SimpleStore) c.getStore(); // load seeds // Iterator iter = c.getOrder().getBehavior().getSeeds().iterator(); --- 90,94 ---- public void initialize(CrawlController c) throws FatalConfigurationException { controller = c; ! // store = (SimpleStore) c.getStore(); // load seeds // Iterator iter = c.getOrder().getBehavior().getSeeds().iterator(); Index: SimpleStore.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v retrieving revision 1.32 retrieving revision 1.32.2.1 diff -C2 -d -r1.32 -r1.32.2.1 *** SimpleStore.java 24 Sep 2003 01:45:26 -0000 1.32 --- SimpleStore.java 3 Oct 2003 17:11:30 -0000 1.32.2.1 *************** *** 22,26 **** import org.archive.crawler.datamodel.UURISet; import org.archive.crawler.framework.CrawlController; - import org.archive.crawler.framework.URIStore; /** --- 22,25 ---- *************** *** 31,35 **** * */ ! public class SimpleStore implements URIStore, FetchStatusCodes, CoreAttributeConstants { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleStore"); --- 30,34 ---- * */ ! public class SimpleStore implements FetchStatusCodes, CoreAttributeConstants { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleStore"); |
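The null-handling in BasicScope's small helper methods gives every unconfigured filter a permissive default, which is easy to lose in the diff noise. The self-contained sketch below restates innerAccepts() with Booleans standing in for configured filters (null meaning "not configured"); only the predicate shape and the null defaults mirror the diff:

public class ScopeLogicDemo {
    // accepted = ((isSeed || focus) || transitive) && !exclude,
    // with the same defaults BasicScope applies for absent filters
    static boolean accepts(boolean isSeed,
                           Boolean focus,        // null filter -> accept-all
                           Boolean transitive,   // null filter -> accept-all
                           Boolean exclude) {    // null filter -> exclude nothing
        boolean focusOk = (focus == null) || focus;
        boolean transitiveOk = (transitive == null) || transitive;
        boolean excluded = (exclude != null) && exclude;
        return ((isSeed || focusOk) || transitiveOk) && !excluded;
    }

    public static void main(String[] args) {
        System.out.println(accepts(false, null, null, null));  // true: no filters, all in scope
        System.out.println(accepts(true, false, false, null)); // true: seeds bypass focus/transitive
        System.out.println(accepts(true, false, false, true)); // false: exclude can veto even a seed
    }
}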
From: <go...@us...> - 2003-10-03 01:55:01
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler
In directory sc8-pr-cvs1:/tmp/cvs-serv20254

Modified Files:
      Tag: gjm-refactor
	.classpath
Log Message:
updated classpath

Index: .classpath
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/.classpath,v
retrieving revision 1.6
retrieving revision 1.6.2.1
diff -C2 -d -r1.6 -r1.6.2.1
*** .classpath	13 Jun 2003 02:10:11 -0000	1.6
--- .classpath	3 Oct 2003 01:54:57 -0000	1.6.2.1
***************
*** 2,5 ****
--- 2,6 ----
  <classpath>
      <classpathentry kind="src" path="src"/>
+     <classpathentry kind="src" path="oversrc"/>
      <classpathentry kind="lib" path="lib/binaries/commons-logging.jar"/>
      <classpathentry kind="lib" path="lib/binaries/dnsjava.jar"/>
***************
*** 9,12 ****
--- 10,17 ----
      <classpathentry kind="src" path="/junit"/>
      <classpathentry kind="src" path="/dnsjava"/>
+     <classpathentry kind="lib" path="lib/binaries/itext.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/javaswf.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/org.mortbay.jetty.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/poi.jar"/>
      <classpathentry kind="output" path="bin"/>
  </classpath>
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework In directory sc8-pr-cvs1:/tmp/cvs-serv20111/src/org/archive/crawler/framework Modified Files: Tag: gjm-refactor XMLConfig.java CrawlController.java CrawlScope.java Log Message: big reorg in progress Index: XMLConfig.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/XMLConfig.java,v retrieving revision 1.12 retrieving revision 1.12.2.1 diff -C2 -d -r1.12 -r1.12.2.1 *** XMLConfig.java 7 Aug 2003 01:47:12 -0000 1.12 --- XMLConfig.java 3 Oct 2003 01:54:35 -0000 1.12.2.1 *************** *** 274,277 **** --- 274,295 ---- } + /** + * Retrieve a (positive) integer value from the given xpath; + * return -1 if none found or other error occurs. + * + * @param xpath + * @return + */ + public boolean getBooleanAt(String xpath, boolean defaultValue) { + String value = getStringAt(xpath); + if(value==null) { + return defaultValue; + } + if(value.equalsIgnoreCase("yes")||value.equalsIgnoreCase("true")) { + return true; + } + return false; + } + /** Index: CrawlController.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/CrawlController.java,v retrieving revision 1.28.2.1 retrieving revision 1.28.2.2 diff -C2 -d -r1.28.2.1 -r1.28.2.2 *** CrawlController.java 2 Oct 2003 01:53:51 -0000 1.28.2.1 --- CrawlController.java 3 Oct 2003 01:54:36 -0000 1.28.2.2 *************** *** 30,37 **** --- 30,54 ---- /** + * CrawlController collects all the classes which cooperate to + * perform a crawl, provides a high-level interface to the + * running crawl, and executes the "master thread" which doles + * out URIs from the Frontier to the ToeThreads. + * + * As the "global context" for a crawl, subcomponents will + * usually reach each other through the CrawlController. * * @author Gordon Mohr */ public class CrawlController { + private static final String LOGNAME_PROGRESS_STATISTICS = "progress-statistics"; + private static final String LOGNAME_URI_ERRORS = "uri-errors"; + private static final String LOGNAME_RUNTIME_ERRORS = "runtime-errors"; + private static final String LOGNAME_CRAWL = "crawl"; + private static final String XP_STATS_LEVEL = "/loggers/crawl-statistics/@level"; + private static final String XP_STATS_INTERVAL = "/loggers/crawl-statistics/@interval"; + private static final String XP_DISK_PATH = "/behavior/@disk"; + private static final String XP_PROCESSORS = "/behavior/processors/processor"; + private static final String XP_FRONTIER = "/behavior/frontier"; + private static final String XP_CRAWL_SCOPE = "/scope"; private int timeout = 1000; // to wait for CrawlURI from frontier before spinning private ToePool toePool; *************** *** 42,49 **** private File disk; ! public Logger uriProcessing = Logger.getLogger("crawl"); ! public Logger crawlErrors = Logger.getLogger("runtime-errors"); ! public Logger uriErrors = Logger.getLogger("uri-errors"); ! public Logger progressStats = Logger.getLogger("progress-statistics"); // create a statistic tracking object and have it write to the log every --- 59,66 ---- private File disk; ! public Logger uriProcessing = Logger.getLogger(LOGNAME_CRAWL); ! public Logger crawlErrors = Logger.getLogger(LOGNAME_RUNTIME_ERRORS); ! public Logger uriErrors = Logger.getLogger(LOGNAME_URI_ERRORS); ! 
public Logger progressStats = Logger.getLogger(LOGNAME_PROGRESS_STATISTICS); // create a statistic tracking object and have it write to the log every *************** *** 75,78 **** --- 92,102 ---- + /** + * Starting from nothing, set up CrawlController and associated + * classes to be ready fro crawling. + * + * @param o CrawlOrder + * @throws InitializationException + */ public void initialize(CrawlOrder o) throws InitializationException { order = o; *************** *** 81,123 **** checkUserAgentAndFrom(); - String diskPath = order.getStringAt("//disk/@path"); - if(diskPath == null || diskPath.length() == 0){ - - throw new FatalConfigurationException("No output Directory specified", - order.crawlOrderFilename, - "//disk/@path" - ); - } - - // read from the configuration file try { ! if(! diskPath.endsWith(File.separator)){ ! diskPath = diskPath + File.separator; ! } ! disk = new File(diskPath); ! disk.mkdirs(); ! ! FileHandler up = new FileHandler(diskPath+"uri-processing.log"); ! up.setFormatter(new UriProcessingFormatter()); ! uriProcessing.addHandler(up); ! uriProcessing.setUseParentHandlers(false); ! ! FileHandler cerr = new FileHandler(diskPath+"crawl-errors.log"); ! cerr.setFormatter(new CrawlErrorFormatter()); ! crawlErrors.addHandler(cerr); ! crawlErrors.setUseParentHandlers(false); ! ! FileHandler uerr = new FileHandler(diskPath+"uri-errors.log"); ! uerr.setFormatter(new UriErrorFormatter()); ! uriErrors.addHandler(uerr); ! uriErrors.setUseParentHandlers(false); ! ! FileHandler stat = new FileHandler(diskPath+"progress-statistics.log"); ! stat.setFormatter(new StatisticsLogFormatter()); ! progressStats.addHandler(stat); ! progressStats.setUseParentHandlers(false); ! } catch (IOException e) { --- 105,113 ---- checkUserAgentAndFrom(); // read from the configuration file try { ! setupDisk(); ! setupLogs(); } catch (IOException e) { *************** *** 125,151 **** } ! // the statistics object must be created before modules that use it if those ! // modules retrieve the object from the controller during initialization ! // (which some do). So here we go with that. ! int interval = order.getIntAt("//crawl-statistics/interval", DEFAULT_STATISTICS_REPORT_INTERVAL); ! statistics = new StatisticsTracker(this, interval); ! ! // set the log level ! String logLevel = order.getStringAt("//loggers/crawl-statistics/level"); ! if(logLevel != null){ ! if(logLevel.toLowerCase().equals("mercator")){ ! statistics.setLogLevel(StatisticsTracker.MERCATOR_LOGGING); ! }else if(logLevel.toLowerCase().equals("human")){ ! statistics.setLogLevel(StatisticsTracker.HUMAN_LOGGING); ! }else if(logLevel.toLowerCase().equals("verbose")){ ! statistics.setLogLevel(StatisticsTracker.VERBOSE_LOGGING); ! } ! } ! //statistics.setLogLevel(StatisticsTracker.VERBOSE_LOGGING); ! ! scope = (CrawlScope) order.instantiate("//scope"); ! frontier = (URIFrontier) order.instantiate("//frontier"); ! firstProcessor = (Processor) order.getBehavior().instantiateAllInto("//processors/processor",processors); // try to initialize each scope and frontier from the config file --- 115,127 ---- } ! setupStatTracking(); ! setupCrawlModules(); ! } ! ! private void setupCrawlModules() throws FatalConfigurationException { ! scope = (CrawlScope) order.instantiate(XP_CRAWL_SCOPE); ! frontier = (URIFrontier) order.instantiate(XP_FRONTIER); ! 
firstProcessor = (Processor) order.instantiateAllInto(XP_PROCESSORS,processors); // try to initialize each scope and frontier from the config file *************** *** 155,160 **** throw new FatalConfigurationException( "Can't initialize scope, class specified in configuration file not found", ! order.crawlOrderFilename, ! "//scope"); } try { --- 131,136 ---- throw new FatalConfigurationException( "Can't initialize scope, class specified in configuration file not found", ! order.getCrawlOrderFilename(), ! XP_CRAWL_SCOPE); } try { *************** *** 163,169 **** throw new FatalConfigurationException( "Can't initialize frontier, class specified in configuration file not found", ! order.crawlOrderFilename, ! "//frontier"); ! } hostCache = new ServerCache(); --- 139,146 ---- throw new FatalConfigurationException( "Can't initialize frontier, class specified in configuration file not found", ! order.getCrawlOrderFilename(), ! XP_FRONTIER); ! } ! hostCache = new ServerCache(); *************** *** 175,182 **** --- 152,224 ---- p.initialize(this); } + } + + + private void setupDisk() throws FatalConfigurationException { + String diskPath = order.getStringAt(XP_DISK_PATH); + if(diskPath == null || diskPath.length() == 0){ + throw new FatalConfigurationException("No output Directory specified", + order.getCrawlOrderFilename(), + XP_DISK_PATH + ); + } + + if(! diskPath.endsWith(File.separator)){ + diskPath = diskPath + File.separator; + } + disk = new File(diskPath); + disk.mkdirs(); + } + + + private void setupStatTracking() { + // the statistics object must be created before modules that use it if those + // modules retrieve the object from the controller during initialization + // (which some do). So here we go with that. + int interval = order.getIntAt(XP_STATS_INTERVAL, DEFAULT_STATISTICS_REPORT_INTERVAL); + statistics = new StatisticsTracker(this, interval); + + // set the log level + String logLevel = order.getStringAt(XP_STATS_LEVEL); + if(logLevel != null){ + if(logLevel.toLowerCase().equals("mercator")){ + statistics.setLogLevel(StatisticsTracker.MERCATOR_LOGGING); + }else if(logLevel.toLowerCase().equals("human")){ + statistics.setLogLevel(StatisticsTracker.HUMAN_LOGGING); + }else if(logLevel.toLowerCase().equals("verbose")){ + statistics.setLogLevel(StatisticsTracker.VERBOSE_LOGGING); + } + } + //statistics.setLogLevel(StatisticsTracker.VERBOSE_LOGGING); // start periodic background logging of crawl statistics Thread statLogger = new Thread(statistics); statLogger.start(); + // TODO pause stat sampling when crawler paused + } + + + private void setupLogs() throws IOException { + String diskPath = disk.getAbsolutePath(); + + FileHandler up = new FileHandler(diskPath+LOGNAME_CRAWL+".log"); + up.setFormatter(new UriProcessingFormatter()); + uriProcessing.addHandler(up); + uriProcessing.setUseParentHandlers(false); + + FileHandler cerr = new FileHandler(diskPath+LOGNAME_RUNTIME_ERRORS+".log"); + cerr.setFormatter(new CrawlErrorFormatter()); + crawlErrors.addHandler(cerr); + crawlErrors.setUseParentHandlers(false); + + FileHandler uerr = new FileHandler(diskPath+LOGNAME_URI_ERRORS+".log"); + uerr.setFormatter(new UriErrorFormatter()); + uriErrors.addHandler(uerr); + uriErrors.setUseParentHandlers(false); + + FileHandler stat = new FileHandler(diskPath+LOGNAME_PROGRESS_STATISTICS+".log"); + stat.setFormatter(new StatisticsLogFormatter()); + progressStats.addHandler(stat); + progressStats.setUseParentHandlers(false); } *************** *** 190,195 **** private void checkUserAgentAndFrom() throws 
InitializationException { // don't start the crawl if they're using the default user-agent ! String userAgent = order.getBehavior().getUserAgent(); ! String from = order.getBehavior().getFrom(); if(!userAgent.matches(ACCEPTABLE_USER_AGENT)||!from.matches(ACCEPTABLE_FROM)) { throw new FatalConfigurationException( --- 232,237 ---- private void checkUserAgentAndFrom() throws InitializationException { // don't start the crawl if they're using the default user-agent ! String userAgent = order.getUserAgent(); ! String from = order.getFrom(); if(!userAgent.matches(ACCEPTABLE_USER_AGENT)||!from.matches(ACCEPTABLE_FROM)) { throw new FatalConfigurationException( *************** *** 213,221 **** */ public void toeFinished(ToeThread thread) { ! // TODO Auto-generated method stub ! } - /** * --- 255,261 ---- */ public void toeFinished(ToeThread thread) { ! // for now do nothing } /** * *************** *** 268,272 **** private void setupToePool() { ! toePool = new ToePool(this,order.getBehavior().getMaxToes()); } --- 308,312 ---- private void setupToePool() { ! toePool = new ToePool(this,order.getMaxToes()); } *************** *** 379,382 **** --- 419,430 ---- public URIFrontier getFrontier() { return frontier; + } + + + /** + * + */ + public CrawlScope getScope() { + return scope; } } Index: CrawlScope.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/Attic/CrawlScope.java,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** CrawlScope.java 2 Oct 2003 01:53:51 -0000 1.1.2.1 --- CrawlScope.java 3 Oct 2003 01:54:36 -0000 1.1.2.2 *************** *** 7,10 **** --- 7,20 ---- package org.archive.crawler.framework; + import java.io.BufferedReader; + import java.io.IOException; + import java.net.URISyntaxException; + import java.util.ArrayList; + import java.util.List; + + import org.archive.crawler.datamodel.CandidateURI; + import org.archive.crawler.datamodel.FatalConfigurationException; + import org.archive.crawler.datamodel.UURI; + /** * Filter which determines, looking at the totality of *************** *** 26,38 **** * */ ! public class CrawlScope extends Filter { int version; /* (non-Javadoc) ! * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object) */ ! protected boolean innerAccepts(Object o) { ! // TODO Auto-generated method stub ! return false; } --- 36,51 ---- * */ ! public abstract class CrawlScope extends Filter { ! private static final String XP_SEEDS = "/seeds"; int version; + List seeds; + CrawlController controller; /* (non-Javadoc) ! * @see org.archive.crawler.framework.Filter#initialize(org.archive.crawler.framework.CrawlController) */ ! public void initialize(CrawlController controller) { ! super.initialize(controller); ! 
this.controller = controller; } *************** *** 49,52 **** --- 62,119 ---- public String toString() { return "CrawlScope<"+name+">"; + } + + public List getSeeds() throws FatalConfigurationException { + if (seeds != null) { + return seeds; + } + seeds = new ArrayList(); + try { + BufferedReader reader = nodeValueOrSrcReader(XP_SEEDS); + String read; + while (reader != null) { + do { + read = reader.readLine(); + } while ( + (read != null) + && ((read = read.trim()).startsWith("#") + || read.length() == 0)); + + if (read == null) { + reader.close(); + reader = null; + } else { + try { + seeds.add(UURI.createUURI(read)); + } catch (URISyntaxException e1) { + e1.printStackTrace(); + } + } + } + } catch (IOException e) { + throw new FatalConfigurationException( + "Unable to locate seeds file: " + e.toString()); + } + return seeds; + + } + + + /** + * @param list + */ + public void clearSeeds() { + seeds = new ArrayList(); + } + + /** + * TODO determine if this is appropriate place for this + * @param u + */ + public void addSeed(UURI u){ + seeds.add(u); + CandidateURI caUri = new CandidateURI(u); + caUri.setIsSeed(true); + controller.getFrontier().schedule(caUri); } } |
From: <go...@us...> - 2003-10-03 01:54:48
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic In directory sc8-pr-cvs1:/tmp/cvs-serv20111/src/org/archive/crawler/basic Modified Files: Tag: gjm-refactor FetcherHTTPSimple.java BasicScope.java SimpleFrontier.java SimpleScheduler.java ARCWriter.java SimplePreconditionEnforcer.java SimplePreselector.java Added Files: Tag: gjm-refactor SimplePostselector.java Log Message: big reorg in progress --- NEW FILE: SimplePostselector.java --- /* * SimplePostselector.java * Created on Oct 2, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/SimplePostselector.java,v 1.1.2.1 2003/10/03 01:54:35 gojomo Exp $ */ package org.archive.crawler.basic; import org.archive.crawler.datamodel.FetchStatusCodes; import org.archive.crawler.framework.Processor; /** * Determine which links etc get fed back into Frontier, * if/when failures get retried, etc. * * * @author gojomo * */ public class SimplePostselector extends Processor implements FetchStatusCodes { } Index: FetcherHTTPSimple.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/FetcherHTTPSimple.java,v retrieving revision 1.10 retrieving revision 1.10.2.1 diff -C2 -d -r1.10 -r1.10.2.1 *** FetcherHTTPSimple.java 30 Sep 2003 18:07:52 -0000 1.10 --- FetcherHTTPSimple.java 3 Oct 2003 01:54:35 -0000 1.10.2.1 *************** *** 84,91 **** get.setRequestHeader( "User-Agent", ! controller.getOrder().getBehavior().getUserAgent()); get.setRequestHeader( "From", ! controller.getOrder().getBehavior().getFrom()); get.setHttpRecorder(((ToeThread)Thread.currentThread()).getHttpRecorder()); --- 84,91 ---- get.setRequestHeader( "User-Agent", ! controller.getOrder().getUserAgent()); get.setRequestHeader( "From", ! controller.getOrder().getFrom()); get.setHttpRecorder(((ToeThread)Thread.currentThread()).getHttpRecorder()); Index: BasicScope.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/BasicScope.java,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** BasicScope.java 2 Oct 2003 01:53:51 -0000 1.1.2.1 --- BasicScope.java 3 Oct 2003 01:54:35 -0000 1.1.2.2 *************** *** 7,11 **** --- 7,14 ---- package org.archive.crawler.basic; + import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.CrawlScope; + import org.archive.crawler.framework.Filter; + import org.archive.crawler.util.NullFilter; /** *************** *** 15,19 **** * Roughly, its logic is that a URI is included if: * ! * ( isSeed(uri) || focusFilter.accepts(uri) ) * && ! excludeFilter.accepts(uri) * --- 18,23 ---- * Roughly, its logic is that a URI is included if: * ! * (( isSeed(uri) || focusFilter.accepts(uri) ) ! * || transitiveFilter.accepts(uri) ) * && ! excludeFilter.accepts(uri) * *************** *** 22,25 **** */ public class BasicScope extends CrawlScope { ! } --- 26,52 ---- */ public class BasicScope extends CrawlScope { + private static final Filter NULL_FILTER = new NullFilter(); + Filter focusFilter; + Filter transitiveFilter; + Filter excludeFilter; ! /* (non-Javadoc) ! * @see org.archive.crawler.framework.Filter#initialize(org.archive.crawler.framework.CrawlController) ! */ ! public void initialize(CrawlController controller) { ! super.initialize(controller); ! focusFilter = (Filter) instantiate("/focus"); ! 
if (focusFilter == null) { ! focusFilter = NULL_FILTER; ! } ! } ! ! /* (non-Javadoc) ! * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object) ! */ ! protected boolean innerAccepts(Object o) { ! // TODO Auto-generated method stub ! return false; ! } ! ! } Index: SimpleFrontier.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/SimpleFrontier.java,v retrieving revision 1.1.2.1 retrieving revision 1.1.2.2 diff -C2 -d -r1.1.2.1 -r1.1.2.2 *** SimpleFrontier.java 2 Oct 2003 01:53:51 -0000 1.1.2.1 --- SimpleFrontier.java 3 Oct 2003 01:54:35 -0000 1.1.2.2 *************** *** 10,19 **** import java.util.Iterator; import java.util.LinkedList; import java.util.logging.Logger; - import org.archive.crawler.datamodel.*; import org.archive.crawler.datamodel.CandidateURI; import org.archive.crawler.datamodel.CrawlURI; import org.archive.crawler.datamodel.FatalConfigurationException; import org.archive.crawler.datamodel.UURI; import org.archive.crawler.datamodel.UURISet; --- 10,21 ---- import java.util.Iterator; import java.util.LinkedList; + import java.util.SortedSet; + import java.util.TreeSet; import java.util.logging.Logger; import org.archive.crawler.datamodel.CandidateURI; import org.archive.crawler.datamodel.CrawlURI; import org.archive.crawler.datamodel.FatalConfigurationException; + import org.archive.crawler.datamodel.MemUURISet; import org.archive.crawler.datamodel.UURI; import org.archive.crawler.datamodel.UURISet; *************** *** 32,39 **** public class SimpleFrontier implements URIFrontier { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleFrontier"); ! ! HashMap allCuris = new HashMap(); // of UURI -> CrawlURI ! UURISet alreadyIncluded = new MemFPUURISet(); // every CandidateURI not yet in process or another queue; --- 34,43 ---- public class SimpleFrontier implements URIFrontier { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleFrontier"); ! CrawlController controller; ! // HashMap allCuris = new HashMap(); // of UURI -> CrawlURI ! ! // TODO update to use fingerprints only ! UURISet alreadyIncluded = new MemUURISet(); // every CandidateURI not yet in process or another queue; *************** *** 41,45 **** --- 45,68 ---- LinkedList pendingQueue = new LinkedList(); // of CandidateURIs + // every CrawlURI handed out for processing but not yet returned + HashMap inProcessMap = new HashMap(); // of String (classKey) -> CrawlURI + + // all active per-class queues + HashMap allClassQueuesMap = new HashMap(); // of String (classKey) -> KeyedQueue + // all per-class queues whose first item may be handed out (that is, no CrawlURI + // of the same class is currently in-process) + LinkedList readyClassQueues = new LinkedList(); // of String (queueKey) -> KeyedQueue + + // all per-class queues who are on hold because a CrawlURI of their class + // is already in process + LinkedList heldClassQueues = new LinkedList(); // of String (queueKey) -> KeyedQueue + + // all per-class queues who are on hold until a certain time + SortedSet snoozeQueues = new TreeSet(new SchedulingComparator()); // of KeyedQueue, sorted by wakeTime + + // CrawlURIs held until some specific other CrawlURI is emitted + HashMap heldCuris = new HashMap(); // of UURI -> CrawlURI + *************** *** 50,59 **** public void initialize(CrawlController c) throws FatalConfigurationException { ! ! 
Iterator iter = c.getOrder().getBehavior().getSeeds().iterator(); while (iter.hasNext()) { UURI u = (UURI) iter.next(); CandidateURI caUri = new CandidateURI(u); ! caUri.setSeed(true); schedule(caUri); } --- 73,83 ---- public void initialize(CrawlController c) throws FatalConfigurationException { ! ! this.controller = c; ! Iterator iter = c.getScope().getSeeds().iterator(); while (iter.hasNext()) { UURI u = (UURI) iter.next(); CandidateURI caUri = new CandidateURI(u); ! caUri.setIsSeed(true); schedule(caUri); } *************** *** 61,77 **** } ! /* (non-Javadoc) * @see org.archive.crawler.framework.URIFrontier#schedule(org.archive.crawler.datamodel.CandidateURI) */ ! public void schedule(CandidateURI caUri) { ! // TODO Auto-generated method stub ! } ! /* (non-Javadoc) * @see org.archive.crawler.framework.URIFrontier#next(int) */ public CrawlURI next(int timeout) { ! // TODO Auto-generated method stub return null; } --- 85,151 ---- } ! /** ! * * @see org.archive.crawler.framework.URIFrontier#schedule(org.archive.crawler.datamodel.CandidateURI) */ ! public synchronized void schedule(CandidateURI caUri) { ! pendingQueue.addLast(caUri); } ! /** ! * * @see org.archive.crawler.framework.URIFrontier#next(int) */ public CrawlURI next(int timeout) { ! ! long now = System.currentTimeMillis(); ! long waitMax = 0; ! CrawlURI curi = null; ! ! // if enough time has passed to wake any snoozing queues, do it ! wakeReadyQueues(now); ! ! // first, see if any holding queues are ready with a CrawlURI ! if (!readyClassQueues.isEmpty()) { ! curi = dequeueFromReady(); ! return emitCuri(curi); ! } ! ! // if that fails, check the pending queue ! CandidateURI caUri; ! while ((caUri = dequeueFromPending()) != null) { ! if(alreadyIncluded.contains(caUri)) { ! continue; ! } ! curi = new CrawlURI(caUri); ! if (!enqueueIfNecessary(curi)) { ! // OK to emit ! return emitCuri(curi); ! } ! } ! ! // consider if URIs exhausted ! if(isEmpty()) { ! // nothing left to crawl ! logger.info("nothing left to crawl"); ! // TODO halt/spread the word??? ! return null; ! } ! ! // nothing to return, but there are still URIs ! // held for the future ! ! // block until something changes, or timeout occurs ! waitMax = Math.min(earliestWakeTime()-now,timeout); ! try { ! if(waitMax<0) { ! logger.warning("negative wait "+waitMax+" ignored"); ! } else { ! wait(waitMax); ! } ! } catch (InterruptedException e) { ! // TODO Auto-generated catch block ! e.printStackTrace(); ! 
} return null; } *************** *** 99,102 **** --- 173,343 ---- // TODO Auto-generated method stub return 0; + } + + + + /** + * + */ + protected void wakeReadyQueues(long now) { + while(!snoozeQueues.isEmpty()&&((URIStoreable)snoozeQueues.first()).getWakeTime()<=now) { + URIStoreable awoken = (URIStoreable)snoozeQueues.first(); + if (!snoozeQueues.remove(awoken)) { + logger.severe("first() item couldn't be remove()d!"); + } + if (awoken instanceof KeyedQueue) { + assert inProcessMap.get(awoken.getClassKey()) == null : "false ready: class peer still in process"; + if(((KeyedQueue)awoken).isEmpty()) { + // just drop queue + discardQueue(awoken); + return; + } + readyClassQueues.add(awoken); + awoken.setStoreState(URIStoreable.READY); + } else if (awoken instanceof CrawlURI) { + // TODO think about whether this is right + pushToPending((CrawlURI)awoken); + } else { + assert false : "something evil has awoken!"; + } + } + } + + private void discardQueue(URIStoreable awoken) { + allClassQueuesMap.remove(((KeyedQueue)awoken).getClassKey()); + awoken.setStoreState(URIStoreable.FINISHED); + } + + /** + * @return + */ + private CrawlURI dequeueFromReady() { + KeyedQueue firstReadyQueue = (KeyedQueue)readyClassQueues.getFirst(); + CrawlURI readyCuri = (CrawlURI) firstReadyQueue.removeFirst(); + return readyCuri; + } + + /** + * @param crawlURI + * @return + */ + private CrawlURI emitCuri(CrawlURI curi) { + if(curi != null) { + if (curi.getStoreState() == URIStoreable.FINISHED) { + System.out.println("break here"); + } + assert curi.getStoreState() != URIStoreable.FINISHED : "state "+curi.getStoreState()+" instead of ready for "+ curi; + //assert curi.getAList() != null : "null alist in curi " + curi + " state "+ curi.getStoreState(); + noteInProcess(curi); + curi.setServer(controller.getHostCache().getServerFor(curi)); + } + return curi; + } + + /** + * @param curi + */ + protected void noteInProcess(CrawlURI curi) { + assert inProcessMap.get(curi.getClassKey()) == null : "two CrawlURIs with same classKey in process"; + + inProcessMap.put(curi.getClassKey(), curi); + curi.setStoreState(URIStoreable.IN_PROCESS); + + KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey()); + if (classQueue == null) { + releaseHeld(curi); + return; + } + assert classQueue.getStoreState() == URIStoreable.READY : "odd state "+ classQueue.getStoreState() + " for classQueue "+ classQueue + "of to-be-emitted CrawlURI"; + readyClassQueues.remove(classQueue); + enqueueToHeld(classQueue); + releaseHeld(curi); + } + + /** + * @param classQueue + */ + private void enqueueToHeld(KeyedQueue classQueue) { + heldClassQueues.add(classQueue); + classQueue.setStoreState(URIStoreable.HELD); + } + + /** + * @param curi + */ + private void releaseHeld(CrawlURI curi) { + CrawlURI released = (CrawlURI) heldCuris.get(curi.getUURI()); + if(released!=null) { + heldCuris.remove(curi.getUURI()); + reinsert(released); + } + } + + /** + * @param curi + */ + public void reinsert(CrawlURI curi) { + + if(enqueueIfNecessary(curi)) { + // added to classQueue + return; + } + // no classQueue + pushToPending(curi); + } + + /** + * + */ + protected synchronized CandidateURI dequeueFromPending() { + if (pendingQueue.isEmpty()) { + return null; + } + return (CandidateURI)pendingQueue.removeFirst(); + } + + /** + * + * @param curi + * @return true if enqueued + */ + public boolean enqueueIfNecessary(CrawlURI curi) { + KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey()); + if (classQueue != null) { + // 
must enqueue + classQueue.add(curi); + curi.setStoreState(classQueue.getStoreState()); + return true; + } + CrawlURI classmateInProgress = (CrawlURI) inProcessMap.get(curi.getClassKey()); + if (classmateInProgress != null) { + // must create queue, and enqueue + classQueue = new KeyedQueue(curi.getClassKey()); + allClassQueuesMap.put(classQueue.getClassKey(), classQueue); + enqueueToHeld(classQueue); + classQueue.add(curi); + curi.setStoreState(classQueue.getStoreState()); + return true; + } + + return false; + } + + /** + * @return + */ + public long earliestWakeTime() { + if (!snoozeQueues.isEmpty()) { + return ((URIStoreable)snoozeQueues.first()).getWakeTime(); + } + return Long.MAX_VALUE; + } + + /** + * @param curi + */ + private synchronized void pushToPending(CrawlURI curi) { + pendingQueue.addFirst(curi); + curi.setStoreState(URIStoreable.PENDING); } Index: SimpleScheduler.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleScheduler.java,v retrieving revision 1.13 retrieving revision 1.13.2.1 diff -C2 -d -r1.13 -r1.13.2.1 *** SimpleScheduler.java 6 Aug 2003 01:21:35 -0000 1.13 --- SimpleScheduler.java 3 Oct 2003 01:54:35 -0000 1.13.2.1 *************** *** 7,11 **** package org.archive.crawler.basic; ! import java.util.Iterator; import java.util.logging.Logger; --- 7,11 ---- package org.archive.crawler.basic; ! //import java.util.Iterator; import java.util.logging.Logger; *************** *** 93,100 **** store = (SimpleStore) c.getStore(); // load seeds ! Iterator iter = c.getOrder().getBehavior().getSeeds().iterator(); ! while (iter.hasNext()) { ! insertAsSeed((UURI) iter.next()); ! } } --- 93,100 ---- store = (SimpleStore) c.getStore(); // load seeds ! // Iterator iter = c.getOrder().getBehavior().getSeeds().iterator(); ! // while (iter.hasNext()) { ! // insertAsSeed((UURI) iter.next()); ! 
// } } Index: ARCWriter.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/ARCWriter.java,v retrieving revision 1.32 retrieving revision 1.32.2.1 diff -C2 -d -r1.32 -r1.32.2.1 *** ARCWriter.java 30 Sep 2003 18:07:52 -0000 1.32 --- ARCWriter.java 3 Oct 2003 01:54:35 -0000 1.32.2.1 *************** *** 13,23 **** import java.io.OutputStream; - // import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.methods.GetMethod; - import org.archive.crawler.basic.StatisticsTracker; import org.archive.crawler.datamodel.CoreAttributeConstants; import org.archive.crawler.datamodel.CrawlOrder; import org.archive.crawler.datamodel.CrawlURI; - import org.archive.crawler.datamodel.CrawlerBehavior; import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.Processor; --- 13,20 ---- *************** *** 69,73 **** // set up output directory CrawlOrder order = controller.getOrder(); - CrawlerBehavior behavior = order.getBehavior(); // retrieve any nodes we think we need from the dom(s) --- 66,69 ---- Index: SimplePreconditionEnforcer.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimplePreconditionEnforcer.java,v retrieving revision 1.11 retrieving revision 1.11.2.1 diff -C2 -d -r1.11 -r1.11.2.1 *** SimplePreconditionEnforcer.java 1 Oct 2003 01:40:43 -0000 1.11 --- SimplePreconditionEnforcer.java 3 Oct 2003 01:54:35 -0000 1.11.2.1 *************** *** 53,58 **** // for all curis that will in fact be fetched, set appropriate delays // TODOSOMEDAY: allow per-host, per-protocol, etc. factors ! curi.setDelayFactor(getDelayFactorFor(curi)); ! curi.setMinimumDelay(getMinimumDelayFor(curi)); return; --- 53,58 ---- // for all curis that will in fact be fetched, set appropriate delays // TODOSOMEDAY: allow per-host, per-protocol, etc. factors ! // curi.setDelayFactor(getDelayFactorFor(curi)); ! // curi.setMinimumDelay(getMinimumDelayFor(curi)); return; *************** *** 79,83 **** } // test against robots.txt if available ! String ua = controller.getOrder().getBehavior().getUserAgent(); if( curi.getServer().getRobots().disallows(curi.getUURI().getUri().getPath(),ua)) { // don't fetch --- 79,83 ---- } // test against robots.txt if available ! String ua = controller.getOrder().getUserAgent(); if( curi.getServer().getRobots().disallows(curi.getUURI().getUri().getPath(),ua)) { // don't fetch Index: SimplePreselector.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimplePreselector.java,v retrieving revision 1.3 retrieving revision 1.3.2.1 diff -C2 -d -r1.3 -r1.3.2.1 *** SimplePreselector.java 1 Oct 2003 01:40:43 -0000 1.3 --- SimplePreselector.java 3 Oct 2003 01:54:35 -0000 1.3.2.1 *************** *** 10,13 **** --- 10,14 ---- import org.archive.crawler.datamodel.FetchStatusCodes; import org.archive.crawler.framework.CrawlController; + import org.archive.crawler.framework.CrawlScope; import org.archive.crawler.framework.Processor; *************** *** 24,31 **** */ public class SimplePreselector extends Processor implements FetchStatusCodes { ! private static String XP_MAX_LINK_DEPTH="params/@max-link-depth"; ! private static String XP_MAX_EMBED_DEPTH="params/@max-embed-depth"; ! private int maxLinkDepth = -1; ! 
private int maxEmbedDepth = -1; /* (non-Javadoc) --- 25,36 ---- */ public class SimplePreselector extends Processor implements FetchStatusCodes { ! private boolean recheckScope; ! ! private static String XP_RECHECK_SCOPE="@scope"; ! ! // private static String XP_MAX_LINK_DEPTH="params/@max-link-depth"; ! // private static String XP_MAX_EMBED_DEPTH="params/@max-embed-depth"; ! // private int maxLinkDepth = -1; ! // private int maxEmbedDepth = -1; /* (non-Javadoc) *************** *** 33,64 **** */ protected void innerProcess(CrawlURI curi) { ! super.innerProcess(curi); ! ! // check for too-deep ! if(maxLinkDepth>=0 && curi.getLinkHopCount()>maxLinkDepth) { ! curi.setFetchStatus(S_TOO_MANY_LINK_HOPS); ! curi.cancelFurtherProcessing(); ! return; ! } ! if(maxEmbedDepth>=0 && curi.getEmbedHopCount()>maxEmbedDepth) { ! curi.setFetchStatus(S_TOO_MANY_EMBED_HOPS); ! curi.cancelFurtherProcessing(); ! return; ! } ! } ! ! /* (non-Javadoc) ! * @see org.archive.crawler.framework.Processor#innerRejectProcess(org.archive.crawler.datamodel.CrawlURI) ! */ ! protected void innerRejectProcess(CrawlURI curi) { ! super.innerRejectProcess(curi); ! // filter-rejection means out-of-scope for everything but embeds ! if (curi.getEmbedHopCount() < 1) { curi.setFetchStatus(S_OUT_OF_SCOPE); curi.cancelFurtherProcessing(); - } else { - // never mind; scope filters don't apply } } /* (non-Javadoc) --- 38,85 ---- */ protected void innerProcess(CrawlURI curi) { ! if (recheckScope) { ! CrawlScope scope = controller.getScope(); ! if (curi.getScopeVersion()==scope.getVersion()) { ! // already checked ! return; ! } ! if(scope.accepts(curi)) { ! curi.setScopeVersion(scope.getVersion()); ! return; ! } ! // scope rejected curi.setFetchStatus(S_OUT_OF_SCOPE); curi.cancelFurtherProcessing(); } + + + // super.innerProcess(curi); + // + // // check for too-deep + // if(maxLinkDepth>=0 && curi.getLinkHopCount()>maxLinkDepth) { + // curi.setFetchStatus(S_TOO_MANY_LINK_HOPS); + // curi.cancelFurtherProcessing(); + // return; + // } + // if(maxEmbedDepth>=0 && curi.getEmbedHopCount()>maxEmbedDepth) { + // curi.setFetchStatus(S_TOO_MANY_EMBED_HOPS); + // curi.cancelFurtherProcessing(); + // return; + // } } + + // /* (non-Javadoc) + // * @see org.archive.crawler.framework.Processor#innerRejectProcess(org.archive.crawler.datamodel.CrawlURI) + // */ + // protected void innerRejectProcess(CrawlURI curi) { + // super.innerRejectProcess(curi); + // // filter-rejection means out-of-scope for everything but embeds + // if (curi.getEmbedHopCount() < 1) { + // curi.setFetchStatus(S_OUT_OF_SCOPE); + // curi.cancelFurtherProcessing(); + // } else { + // // never mind; scope filters don't apply + // } + // } /* (non-Javadoc) *************** *** 67,72 **** public void initialize(CrawlController c) { super.initialize(c); ! maxLinkDepth = getIntAt(XP_MAX_LINK_DEPTH, maxLinkDepth); ! maxEmbedDepth = getIntAt(XP_MAX_EMBED_DEPTH, maxEmbedDepth); } --- 88,96 ---- public void initialize(CrawlController c) { super.initialize(c); ! recheckScope = getBooleanAt("@scope",false); ! ! ! //maxLinkDepth = getIntAt(XP_MAX_LINK_DEPTH, maxLinkDepth); ! //maxEmbedDepth = getIntAt(XP_MAX_EMBED_DEPTH, maxEmbedDepth); } |
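SimpleFrontier.next() now carries the real scheduling logic: wake snoozed queues, prefer ready per-class queues, fall back to the pending queue (skipping already-included URIs and parking same-class URIs behind the one in process), and otherwise wait out the shorter of the caller's timeout and the earliest wake time before returning null. (As the diff stands, next() calls wait() without being synchronized, which would throw IllegalMonitorStateException; presumably a follow-up fix.) From the caller's side the contract looks like this hypothetical worker loop; only the URIFrontier method signatures come from the diff:

import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.framework.URIFrontier;

public class FrontierLoopSketch {
    static void drive(URIFrontier frontier) {
        while (true) {
            CrawlURI curi = frontier.next(1000); // block up to ~1s for work
            if (curi == null) {
                if (frontier.isEmpty()) {
                    break;   // nothing left to crawl
                }
                continue;    // URIs exist but are snoozed or held; try again
            }
            process(curi);   // fetch, extract, etc. (hypothetical)
        }
    }

    static void process(CrawlURI curi) { /* processing chain goes here */ }
}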
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv20111/src/org/archive/crawler/datamodel

Modified Files:
      Tag: gjm-refactor
	CandidateURI.java CrawlOrder.java CrawlURI.java
Removed Files:
      Tag: gjm-refactor
	CrawlerBehavior.java
Log Message:
big reorg in progress

Index: CandidateURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CandidateURI.java,v
retrieving revision 1.1.2.1
retrieving revision 1.1.2.2
diff -C2 -d -r1.1.2.1 -r1.1.2.2
*** CandidateURI.java	2 Oct 2003 01:53:51 -0000	1.1.2.1
--- CandidateURI.java	3 Oct 2003 01:54:36 -0000	1.1.2.2
***************
*** 32,36 ****
  	String pathFromSeed;
  	/** Where this URI was (presently) discovered */
! 	UURI precursorUuri;
--- 32,38 ----
  	String pathFromSeed;
  	/** Where this URI was (presently) discovered */
! 	// mostly for debugging; will be a CrawlURI when memory is no object
! 	// just a string or null when memory is an object (configurable)
! 	Object via;
***************
*** 43,51 ****
  	/**
  	 * @param b
  	 */
! 	public void setSeed(boolean b) {
  		isSeed=b;
  	}
  }
--- 45,99 ----
  	/**
+ 	 * @param uriString
+ 	 */
+ 	public CandidateURI(String s){
+ 		try{
+ 			setUuri(UURI.createUURI(s));
+ 		}catch(Exception e){
+ 			setUuri(null);
+ 		}
+ 	}
+ 
+ 	/**
  	 * @param b
  	 */
! 	public void setIsSeed(boolean b) {
  		isSeed=b;
  	}
+ 
+ 	/**
+ 	 * 
+ 	 */
+ 	public UURI getUuri() {
+ 		return uuri;
+ 	}
+ 
+ 	/**
+ 	 * @param u
+ 	 */
+ 	private void setUuri(UURI u) {
+ 		uuri=u;
+ 	}
+ 
+ 	/**
+ 	 * @return
+ 	 */
+ 	public boolean getIsSeed() {
+ 		return isSeed;
+ 	}
+ 
+ 	/**
+ 	 * 
+ 	 */
+ 	public int getScopeVersion() {
+ 		return inScopeVersion;
+ 	}
+ 
+ 	/**
+ 	 * @param i
+ 	 */
+ 	public void setScopeVersion(int i) {
+ 		inScopeVersion = i;
+ 	}
  }

Index: CrawlOrder.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlOrder.java,v
retrieving revision 1.12.2.1
retrieving revision 1.12.2.2
diff -C2 -d -r1.12.2.1 -r1.12.2.2
*** CrawlOrder.java	2 Oct 2003 01:53:51 -0000	1.12.2.1
--- CrawlOrder.java	3 Oct 2003 01:54:36 -0000	1.12.2.2
***************
*** 10,16 ****
  import java.io.IOException;
  
- import javax.xml.transform.TransformerException;
- 
- import org.apache.xpath.XPathAPI;
  import org.archive.crawler.framework.XMLConfig;
  import org.w3c.dom.Document;
--- 10,13 ----
***************
*** 19,27 ****
   */
  public class CrawlOrder extends XMLConfig {
! 	protected String name;
! 	protected CrawlerBehavior behavior;
! 	protected String outputLocation;
! 	public String crawlOrderFilename;
! 	//protected CrawlOrder parentConfigurationFile;
  
  	/**
--- 16,26 ----
   */
  public class CrawlOrder extends XMLConfig {
! 	private static final String XP_HTTP_USER_AGENT = "//http-headers/@User-Agent";
! 	private static final String XP_HTTP_FROM = "//http-headers/@From";
! 	private static final String XP_MAX_TOE_THREADS = "//behavior/@max-toe-threads";
! 	String caseFlattenedUserAgent;
! 	String name;
! 	String outputLocation;
! 	String crawlOrderFilename;
  
  	/**
***************
*** 87,109 ****
  	 */
  	public void initialize(){
! 		try {
! 			name = getStringAt("//crawl-order/@name");
! 
! 			// ignore null pointers here, it may just mean this file inherited from
! 			// another and we can find the behavior there.
! 			try {
! 				behavior =
! 					new CrawlerBehavior(
! 						XPathAPI.selectSingleNode(xNode, "//crawler-behavior"));
! 				behavior.setDefaultFileLocation(this.defaultFilePath);
! 				behavior.setParentConfig(this.parentConfigurationFile);
! 			} catch (NullPointerException e) {
! 			}
! 
! 			//outputLocation = getStringAt("//disk/@path");
! 
! 		} catch (TransformerException e) {
! 			e.printStackTrace();
! 		}
  	}
--- 86,90 ----
  	 */
  	public void initialize(){
! 		name = getStringAt("//crawl-order/@name");
  	}
***************
*** 136,150 ****
  
  	/**
- 	 * 
- 	 */
- 	public CrawlerBehavior getBehavior() {
- 		// if this node doesn't have it but we have a parent conf file check that
- 		if(behavior == null && parentConfigurationFile != null){
- 			return ((CrawlOrder)parentConfigurationFile).getBehavior();
- 		}
- 		return behavior;
- 	}
- 
- 	/**
  	 * @return
  	 */
--- 117,120 ----
***************
*** 162,165 ****
--- 132,166 ----
  		}
  		return outputLocation;
+ 	}
+ 
+ 	/**
+ 	 * @return
+ 	 */
+ 	public String getUserAgent() {
+ 		if (caseFlattenedUserAgent==null) {
+ 			caseFlattenedUserAgent = getStringAt(XP_HTTP_USER_AGENT).toLowerCase();
+ 		}
+ 		return caseFlattenedUserAgent;
+ 	}
+ 
+ 	/**
+ 	 * @return
+ 	 */
+ 	public String getFrom() {
+ 		return getStringAt(XP_HTTP_FROM);
+ 	}
+ 
+ 	/**
+ 	 * @return
+ 	 */
+ 	public int getMaxToes() {
+ 		return getIntAt(XP_MAX_TOE_THREADS);
+ 	}
+ 
+ 	/**
+ 	 * 
+ 	 */
+ 	public String getCrawlOrderFilename() {
+ 		return crawlOrderFilename;
  	}
  }

Index: CrawlURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlURI.java,v
retrieving revision 1.40
retrieving revision 1.40.2.1
diff -C2 -d -r1.40 -r1.40.2.1
*** CrawlURI.java	24 Sep 2003 01:45:26 -0000	1.40
--- CrawlURI.java	3 Oct 2003 01:54:36 -0000	1.40.2.1
***************
*** 31,39 ****
   * @author Gordon Mohr
   */
! public class CrawlURI implements URIStoreable, CoreAttributeConstants, FetchStatusCodes {
! 	// core identity: the "usable URI" to be crawled
! 	private UURI uuri;
! 
  	// Scheduler lifecycle info
  	private Object state;   // state within scheduling/store/selector
--- 31,43 ----
   * @author Gordon Mohr
   */
! public class CrawlURI extends CandidateURI implements URIStoreable, CoreAttributeConstants, FetchStatusCodes {
! 	// INHERITED FROM CANDIDATEURI
! 	// uuri: core identity: the "usable URI" to be crawled
! 	// isSeed
! 	// inScopeVersion
! 	// pathFromSeed
! 	// via
! 
  	// Scheduler lifecycle info
  	private Object state;   // state within scheduling/store/selector
***************
*** 52,56 ****
  	// dynamic context
- 	private CrawlURI via;   // curi that led to this (lowest hops from seed)
  	private int linkHopCount = -1;  // from seeds
  	private int embedHopCount = -1; // from a sure link; reset upon any link traversal
--- 56,59 ----
***************
*** 59,82 ****
  	CrawlServer server;
  
- 	private int contentSize = -1;
- 
  	/**
  	 * @param uuri
  	 */
! 	public CrawlURI(UURI u) {
! 		setUuri(u);
  	}
! 
! 	/**
! 	 * @param u
  	 */
! 	private void setUuri(UURI u) {
! 		uuri=u;
  	}
  
- 
  	/**
  	 * Set the time this curi is considered expired (and thus must be refetched)
--- 62,82 ----
  	CrawlServer server;
  	private int contentSize = -1;
  	/**
  	 * @param uuri
  	 */
! 	public CrawlURI(UURI uuri) {
! 		super(uuri);
  	}
! 
! 	/**
! 	 * @param caUri
  	 */
! 	public CrawlURI(CandidateURI caUri) {
! 		super(caUri.getUuri());
! 		setIsSeed(caUri.getIsSeed());
  	}
  
  	/**
  	 * Set the time this curi is considered expired (and thus must be refetched)
***************
*** 124,139 ****
  		return fetchAttempts++;
  	}
! 
! 	/**
! 	 * @param uriString
! 	 */
! 	public CrawlURI(String s){
! 		try{
! 			setUuri(UURI.createUURI(s));
! 		}catch(Exception e){
! 			setUuri(null);
! 		}
! 	}
! 
  	/**
  	 * @return
--- 124,128 ----
  		return fetchAttempts++;
  	}
! 
  	/**
  	 * @return
***************
*** 242,259 ****
  
  	/**
- 	 * @param object
- 	 */
- 	public void setDelayFactor(int f) {
- 		alist.putInt(A_DELAY_FACTOR,f);
- 	}
- 
- 	/**
- 	 * @param object
- 	 */
- 	public void setMinimumDelay(int m) {
- 		alist.putInt(A_MINIMUM_DELAY,m);
- 	}
- 
- 	/**
  	 * 
  	 */
--- 231,234 ----
***************
*** 454,457 ****
  		embedHopCount = 0;
  	}
- 
  }
--- 429,431 ----

--- CrawlerBehavior.java DELETED ---
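Two small patterns in this commit are worth calling out: CrawlURI now extends CandidateURI and gains a copy-style constructor that promotes a candidate once it is actually due to be fetched, and CrawlOrder lazily caches its case-flattened User-Agent instead of delegating to the deleted CrawlerBehavior. A rough, self-contained sketch of both follows; the types are simplified stand-ins and a map fakes the XPath lookup, so none of this is the project's real API.

import java.util.HashMap;
import java.util.Map;

public class DatamodelSketch {

	/** Stand-in for CandidateURI: identity plus discovery bookkeeping. */
	static class Candidate {
		final String uuri;
		boolean isSeed;
		Candidate(String uuri) { this.uuri = uuri; }
	}

	/** Stand-in for CrawlURI: promotes a Candidate when it is scheduled to fetch. */
	static class CrawlUri extends Candidate {
		int fetchAttempts;
		CrawlUri(Candidate c) {
			super(c.uuri);
			this.isSeed = c.isSeed; // carry over candidate state, like the real copy ctor
		}
	}

	/** Stand-in for CrawlOrder: lazily caches the lower-cased User-Agent. */
	static class Order {
		private final Map<String, String> attrs = new HashMap<>(); // fake XPath store
		private String caseFlattenedUserAgent;

		Order() { attrs.put("//http-headers/@User-Agent", "Heritrix-AOC/alpha"); }

		String getUserAgent() {
			if (caseFlattenedUserAgent == null) {          // compute once...
				caseFlattenedUserAgent =
					attrs.get("//http-headers/@User-Agent").toLowerCase();
			}
			return caseFlattenedUserAgent;                 // ...reuse thereafter
		}
	}

	public static void main(String[] args) {
		Candidate c = new Candidate("http://example.org/");
		c.isSeed = true;
		CrawlUri curi = new CrawlUri(c);
		System.out.println(curi.isSeed);                 // true: state survived promotion
		System.out.println(new Order().getUserAgent()); // "heritrix-aoc/alpha"
	}
}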
From: <go...@us...> - 2003-10-03 01:54:47
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler
In directory sc8-pr-cvs1:/tmp/cvs-serv20111/src/org/archive/crawler

Modified Files:
      Tag: gjm-refactor
	Heritrix.java
Log Message:
big reorg in progress

Index: Heritrix.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/Heritrix.java,v
retrieving revision 1.9
retrieving revision 1.9.2.1
diff -C2 -d -r1.9 -r1.9.2.1
*** Heritrix.java	1 Aug 2003 22:41:53 -0000	1.9
--- Heritrix.java	3 Oct 2003 01:54:36 -0000	1.9.2.1
***************
*** 17,26 ****
   * Main class for Heritrix crawler.
   * 
!  * Currently takes a single command-line argument, which
   * should be an XML crawl-order file describing the crawl to
   * undertake, and begins that crawl.
   * 
!  * (Eventually, will start web UI and await further
!  * instructions.)
   * 
   * @author gojomo
--- 17,26 ----
   * Main class for Heritrix crawler.
   * 
!  * Initially takes a single command-line argument, which
   * should be an XML crawl-order file describing the crawl to
   * undertake, and begins that crawl.
   * 
!  * Alternatively, can start a web UI and await further
!  * instructions.
   * 
   * @author gojomo
From: <go...@us...> - 2003-10-03 01:54:47
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/admin
In directory sc8-pr-cvs1:/tmp/cvs-serv20111/src/org/archive/crawler/admin

Modified Files:
      Tag: gjm-refactor
	CrawlerHandler.java
Log Message:
big reorg in progress

Index: CrawlerHandler.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/admin/CrawlerHandler.java,v
retrieving revision 1.7
retrieving revision 1.7.2.1
diff -C2 -d -r1.7 -r1.7.2.1
*** CrawlerHandler.java	1 Aug 2003 18:12:12 -0000	1.7
--- CrawlerHandler.java	3 Oct 2003 01:54:37 -0000	1.7.2.1
***************
*** 203,222 ****
  				"//http-headers/User-Agent",
  				"USER AGENT",
! 				o.getBehavior().getUserAgent()));
  		sb.append(
  			genHtmlTextField(
  				"//http-headers/From",
  				" FROM",
! 				o.getBehavior().getFrom()));
  		sb.append(
  			genHtmlTextField(
  				"//limits/max-toe-threads/@value",
  				"MAX NUMBER OF THREADS",
! 				String.valueOf(o.getBehavior().getMaxToes())));
! 		sb.append(
! 			genHtmlTextField(
! 				"//limits/max-link-depth/@value",
! 				"MAX LINK DEPTH",
! 				String.valueOf(o.getBehavior().getMaxLinkDepth())));
  		sb.append(
  			genHtmlTextField(
--- 203,222 ----
  				"//http-headers/User-Agent",
  				"USER AGENT",
! 				o.getUserAgent()));
  		sb.append(
  			genHtmlTextField(
  				"//http-headers/From",
  				" FROM",
! 				o.getFrom()));
  		sb.append(
  			genHtmlTextField(
  				"//limits/max-toe-threads/@value",
  				"MAX NUMBER OF THREADS",
! 				String.valueOf(o.getMaxToes())));
! 		// sb.append(
! 		// 	genHtmlTextField(
! 		// 		"//limits/max-link-depth/@value",
! 		// 		"MAX LINK DEPTH",
! 		// 		String.valueOf(o.getMaxLinkDepth())));
  		sb.append(
  			genHtmlTextField(
***************
*** 244,254 ****
  				.getNodeValue()));
  
! 		sb.append(
! 			genHtmlTextField(
! 				"//selector/seeds/@src",
! 				"SEEDS FILE",
! 				o.getBehavior().getStringAt("//selector/seeds/@src")));
! 
! 		sb.append(genHtmlTextArea("seed-urls", "", o.getBehavior().getSeeds()));
  		sb.append(
  			"<INPUT type=hidden name=CrawlerAction value=2>\n<br><INPUT TYPE=submit VALUE=\"UpdateOrder\">\n</FORM>");
--- 244,254 ----
  				.getNodeValue()));
  
! 		// sb.append(
! 		// 	genHtmlTextField(
! 		// 		"//selector/seeds/@src",
! 		// 		"SEEDS FILE",
! 		// 		o.getStringAt("//selector/seeds/@src")));
! 		// 
! 		// sb.append(genHtmlTextArea("seed-urls", "", o.getSeeds()));
  		sb.append(
  			"<INPUT type=hidden name=CrawlerAction value=2>\n<br><INPUT TYPE=submit VALUE=\"UpdateOrder\">\n</FORM>");
***************
*** 298,302 ****
  		String name;
  		_controller.getOrder().clearCaches();
! 		_controller.getOrder().getBehavior().clearCaches();
  
  		while (it.hasNext()) {
--- 298,302 ----
  		String name;
  		_controller.getOrder().clearCaches();
! 		_controller.getOrder().clearCaches();
  
  		while (it.hasNext()) {
***************
*** 305,311 ****
  			if (name.equals("seed-urls")) {
  				String[] urls = value.split("\n");
! 				_controller.getOrder().getBehavior().clearSeeds();
! 				for (int i = 0; i < urls.length; i++)
! 					_controller.getOrder().getBehavior().addSeed(urls[i]);
  			} else {
  				if (_controller.getOrder().getNodeAt(name) != null) {
--- 305,311 ----
  			if (name.equals("seed-urls")) {
  				String[] urls = value.split("\n");
! 				// _controller.getOrder().clearSeeds();
! 				// for (int i = 0; i < urls.length; i++)
! 				// 	_controller.getOrder().getBehavior().addSeed(urls[i]);
  			} else {
  				if (_controller.getOrder().getNodeAt(name) != null) {
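The admin form keys each input by the XPath of the crawl-order node it edits, which is what lets the POST handler above write posted values straight back into the order DOM. The diff never shows genHtmlTextField itself, so the following sketch is only a guess at its shape, inferred from the three-string-argument call sites; the real body may differ.

public class FormFieldSketch {

	/**
	 * Hypothetical reconstruction of a genHtmlTextField-style helper:
	 * the field's name is the XPath of the node it edits, so a submit
	 * handler can evaluate that name to write the value back into the
	 * configuration DOM. Not the project's actual implementation.
	 */
	static String genHtmlTextField(String xpath, String label, String value) {
		return label + ": <INPUT type=text name=\"" + xpath
			+ "\" value=\"" + value + "\"><br>\n";
	}

	public static void main(String[] args) {
		// Mirrors a call site above, with a made-up value for illustration.
		System.out.print(genHtmlTextField(
			"//http-headers/@User-Agent", "USER AGENT", "heritrix-aoc/alpha"));
	}
}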
From: <go...@us...> - 2003-10-03 01:54:47
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util
In directory sc8-pr-cvs1:/tmp/cvs-serv20111/src/org/archive/crawler/util

Added Files:
      Tag: gjm-refactor
	NullFilter.java
Log Message:
big reorg in progress

--- NEW FILE: NullFilter.java ---
/*
 * NullFilter.java
 * Created on Oct 2, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util/Attic/NullFilter.java,v 1.1.2.1 2003/10/03 01:54:35 gojomo Exp $
 */
package org.archive.crawler.util;

import org.archive.crawler.framework.Filter;

/**
 * @author gojomo
 *
 */
public class NullFilter extends Filter {

	/* (non-Javadoc)
	 * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object)
	 */
	protected boolean innerAccepts(Object o) {
		return true;
	}
}
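NullFilter is a classic null object: it satisfies the Filter contract while letting everything through, so a holder can keep a non-null Filter reference and skip null checks wherever no real test has been configured. A self-contained sketch of that payoff follows; the Filter base and the holder class are simplified stand-ins, and the accepts wrapper is an assumption, since only the innerAccepts hook appears in the commit above.

public class NullFilterSketch {

	/** Stand-in for org.archive.crawler.framework.Filter; the accepts
	 *  wrapper is assumed, as only innerAccepts is visible above. */
	static abstract class Filter {
		boolean accepts(Object o) { return innerAccepts(o); }
		protected abstract boolean innerAccepts(Object o);
	}

	/** Mirrors the committed NullFilter: lets everything through. */
	static class NullFilter extends Filter {
		protected boolean innerAccepts(Object o) { return true; }
	}

	/** Hypothetical holder showing the null-object payoff: the reference
	 *  is never null, so callers test URIs unconditionally. */
	static class Component {
		Filter filter = new NullFilter(); // default until one is configured

		boolean wants(Object uri) {
			return filter.accepts(uri);   // safe even with nothing configured
		}
	}

	public static void main(String[] args) {
		System.out.println(new Component().wants("http://example.org/")); // true
	}
}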
From: <go...@us...> - 2003-10-03 01:53:10
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/dev-crawl
In directory sc8-pr-cvs1:/tmp/cvs-serv19699/dev-crawl

Added Files:
      Tag: gjm-refactor
	order.xml logging.properties
Log Message:
working crawl info

--- NEW FILE: order.xml ---
<crawl-order name="dev-crawl"
  comment="A simple crawl for development/example purposes.">

  <scope class="org.archive.crawler.basic.BasicScope"
    max-link-depth="100"
    max-transitive-depth="5">
    <seeds>
      http://www.archive.org/movies/movies.php
      http://dmoz.org
    </seeds>
    <filter name="domain-focus" class="SeedExtensionFilter" mode="domain" />
  </scope>

  <behavior disk-path="disk" max-toe-threads="2">
    <http-headers
      User-Agent="heritrix-aoc/alpha (+http://crawler.archive.org)"
      From="arc...@li..." />
    <frontier class="org.archive.crawler.basic.SimpleFrontier"
      delay-factor="1" minimum-delay="0" />
    <processors>
      <processor name="Preselector" class="org.archive.crawler.basic.SimplePreselector"
        next="Preprocessor" scope="yes" />
      <processor name="Preprocessor" class="org.archive.crawler.basic.SimplePreconditionEnforcer"
        next="DNS" />
      <processor name="DNS" class="org.archive.crawler.basic.FetcherDNS"
        next="HTTP" />
      <processor name="HTTP" class="org.archive.crawler.basic.FetcherHTTPSimple"
        next="ExtractorHTTP" timeout-seconds="10" />
      <processor name="ExtractorHTTP" class="org.archive.crawler.extractor.ExtractorHTTP"
        next="ExtractorHTML" />
      <processor name="ExtractorHTML" class="org.archive.crawler.extractor.ExtractorHTML"
        next="ExtractorDOC" />
      <processor name="ExtractorDOC" class="org.archive.crawler.extractor.ExtractorDOC"
        next="ExtractorSWF" />
      <processor name="ExtractorSWF" class="org.archive.crawler.extractor.ExtractorSWF"
        next="ExtractorPDF" />
      <processor name="ExtractorPDF" class="org.archive.crawler.extractor.ExtractorPDF"
        next="Archiver" />
      <processor name="Archiver" class="org.archive.crawler.basic.ARCWriter"
        next="Updater" compress="yes" max-arc-size="20000000">
        <!-- <filter name="http-only" class="org.archive.crawler.util.URIRegExpFilter"
          regexp="^http://.*" /> -->
      </processor>
      <processor name="Updater" class="org.archive.crawler.basic.CrawlStateUpdater"
        next="Postselector" />
      <processor name="Postselector" class="org.archive.crawler.basic.SimplePostselector" />
    </processors>
    <loggers>
      <crawl-statistics interval="10" />
    </loggers>
  </behavior>
</crawl-order>

--- NEW FILE: logging.properties ---
#
handlers= java.util.logging.ConsoleHandler
java.util.logging.ConsoleHandler.level= ALL

# Default global logging level.
.level= WARNING

# view selector progress
# org.archive.crawler.basic.SimpleSelector.level= FINEST

# view processor filterings
# org.archive.crawler.framework.Processor.level= INFO

# crawl.level= INFO
runtime-errors.level= INFO
uri-errors.level= INFO
progress-statistics.level= INFO

#
# org.archive.crawler.basic.ExtractorHTTP.level= FINEST
org.archive.crawler.basic.SimplePreconditionEnforcer.level= FINEST
org.apache.commons.httpclient.level= SEVERE
org.archive.crawler.basic.FetcherHTTPSimple.level= SEVERE
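The order file wires the processor chain by name (each processor's next attribute) and leaves processor-specific attributes, such as the Preselector's scope="yes", to be read by the processor itself at initialize time (recheckScope = getBooleanAt("@scope",false) in the SimplePreselector diff above). Below is a self-contained sketch of reading that attribute with plain JDK XPath; the relative path to order.xml and the yes/true convention are assumptions for illustration, not the project's XMLConfig helper.

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;

public class OrderAttrSketch {
	public static void main(String[] args) throws Exception {
		// Parse the crawl order (path is an assumption for illustration).
		Document doc = DocumentBuilderFactory.newInstance()
			.newDocumentBuilder()
			.parse("dev-crawl/order.xml");
		XPath xp = XPathFactory.newInstance().newXPath();

		// Locate the Preselector element and read its scope attribute;
		// evaluate returns "" when the attribute is absent, so the default
		// is false -- roughly what getBooleanAt("@scope", false) would do
		// relative to that node.
		String scope = xp.evaluate(
			"//processor[@name='Preselector']/@scope", doc);
		boolean recheckScope = scope.equalsIgnoreCase("yes")
			|| scope.equalsIgnoreCase("true");
		System.out.println("recheckScope=" + recheckScope); // true for scope="yes"
	}
}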
From: <go...@us...> - 2003-10-03 01:53:02
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/dev-crawl
In directory sc8-pr-cvs1:/tmp/cvs-serv19659/dev-crawl

Log Message:
Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/dev-crawl added to the repository
--> Using per-directory sticky tag `gjm-refactor'
From: <go...@us...> - 2003-10-02 18:35:03
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler
In directory sc8-pr-cvs1:/tmp/cvs-serv26394

Modified Files:
	.classpath
Log Message:
classpath, httpclient library cleanup

Index: .classpath
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/.classpath,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** .classpath	17 Jun 2003 19:58:28 -0000	1.7
--- .classpath	2 Oct 2003 18:34:58 -0000	1.8
***************
*** 2,11 ****
  <classpath>
      <classpathentry kind="src" path="src"/>
      <classpathentry kind="lib" path="lib/binaries/commons-logging.jar"/>
      <classpathentry kind="lib" path="lib/binaries/dnsjava.jar"/>
      <classpathentry kind="lib" path="lib/binaries/stataclasses.jar"/>
      <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
-     <classpathentry kind="lib" path="lib/binaries/commons-httpclient.jar"/>
      <classpathentry kind="lib" path="lib/binaries/junit.jar"/>
      <classpathentry kind="output" path="bin"/>
  </classpath>
--- 2,16 ----
  <classpath>
      <classpathentry kind="src" path="src"/>
+     <classpathentry kind="src" path="oversrc"/>
      <classpathentry kind="lib" path="lib/binaries/commons-logging.jar"/>
      <classpathentry kind="lib" path="lib/binaries/dnsjava.jar"/>
      <classpathentry kind="lib" path="lib/binaries/stataclasses.jar"/>
      <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
      <classpathentry kind="lib" path="lib/binaries/junit.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/itext.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/org.mortbay.jetty.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/poi.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/javaswf.jar"/>
+     <classpathentry kind="lib" path="lib/binaries/httpclient-cvs-20031002.jar"/>
      <classpathentry kind="output" path="bin"/>
  </classpath>
From: <go...@us...> - 2003-10-02 18:35:03
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/lib/binaries
In directory sc8-pr-cvs1:/tmp/cvs-serv26394/lib/binaries

Added Files:
	httpclient-cvs-20031002.jar
Removed Files:
	commons-httpclient.jar
Log Message:
classpath, httpclient library cleanup

--- NEW FILE: httpclient-cvs-20031002.jar ---
(This appears to be a binary file; contents omitted.)

--- commons-httpclient.jar DELETED ---