Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic In directory sc8-pr-cvs1:/tmp/cvs-serv21672/src/org/archive/crawler/basic Modified Files: SimpleSelector.java URIStoreable.java SimplePreconditionEnforcer.java SimpleStore.java Added Files: SimplePreselector.java Log Message: refactorings(in progress) --- NEW FILE: SimplePreselector.java --- /* * SimplePreselector.java * Created on Sep 22, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimplePreselector.java,v 1.1 2003/09/23 01:16:34 gojomo Exp $ */ package org.archive.crawler.basic; import org.archive.crawler.datamodel.CrawlURI; import org.archive.crawler.datamodel.FetchStatusCodes; import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.Processor; /** * Gives a yes/no on whether a CrawlURI should be processed at all. * * Usually, failing a processor filter causes that processor * to be skipped. Failing this processor's filter means a * CrawlURI will be marked OUT_OF_SCOPE. 
*
* Independently of any filter, a CrawlURI whose link-hop or embed-hop
* count exceeds the configured limit is marked S_TOO_MANY_LINK_HOPS or
* S_TOO_MANY_EMBED_HOPS respectively and gets no further processing.
* A negative limit (the default, -1) disables the corresponding check.
*
* @author gojomo
*
*/
public class SimplePreselector extends Processor implements FetchStatusCodes {
    /** Configuration path of the maximum allowed link-hop count. */
    private static final String XP_MAX_LINK_DEPTH = "//limits/max-link-depth/@value";

    /** Configuration path of the maximum allowed embed-hop count. */
    private static final String XP_MAX_EMBED_DEPTH = "//limits/max-embed-depth/@value";

    /** Largest accepted link-hop count; any negative value means "no limit". */
    private int maxLinkDepth = -1;

    /** Largest accepted embed-hop count; any negative value means "no limit". */
    private int maxEmbedDepth = -1;

    /**
     * Rejects the given CrawlURI if it lies too many hops away.
     *
     * The link-hop limit is tested first; when it is exceeded the
     * embed-hop limit is not consulted, so a URI violating both limits
     * ends up with S_TOO_MANY_LINK_HOPS.
     *
     * @param curi the CrawlURI under consideration
     * @see org.archive.crawler.framework.Processor#innerProcess(org.archive.crawler.datamodel.CrawlURI)
     */
    protected void innerProcess(CrawlURI curi) {
        super.innerProcess(curi);

        // check for too-deep; a negative limit switches the check off
        if (maxLinkDepth >= 0 && curi.getLinkHopCount() > maxLinkDepth) {
            curi.setFetchStatus(S_TOO_MANY_LINK_HOPS);
            curi.cancelFurtherProcessing();
            // return now so the embed check cannot overwrite the status
            return;
        }
        if (maxEmbedDepth >= 0 && curi.getEmbedHopCount() > maxEmbedDepth) {
            curi.setFetchStatus(S_TOO_MANY_EMBED_HOPS);
            curi.cancelFurtherProcessing();
            return;
        }
    }

    /**
     * Handles a CrawlURI that failed this processor's filter: unlike
     * other processors, where a filter failure only skips the processor,
     * here it marks the URI S_OUT_OF_SCOPE and cancels all further
     * processing.
     *
     * @param curi the rejected CrawlURI
     * @see org.archive.crawler.framework.Processor#innerRejectProcess(org.archive.crawler.datamodel.CrawlURI)
     */
    protected void innerRejectProcess(CrawlURI curi) {
        super.innerRejectProcess(curi);
        // filter-rejection means out-of-scope
        curi.setFetchStatus(S_OUT_OF_SCOPE);
        curi.cancelFurtherProcessing();
    }

    /**
     * Initializes the processor and loads both depth limits from the
     * configuration. The current field values are handed to getIntAt as
     * its second argument (presumably the fallback used when a setting
     * is absent -- confirm against Processor#getIntAt).
     *
     * @param c the controller this processor belongs to
     * @see org.archive.crawler.framework.Processor#initialize(org.archive.crawler.framework.CrawlController)
     */
    public void initialize(CrawlController c) {
        super.initialize(c);

        maxLinkDepth = getIntAt(XP_MAX_LINK_DEPTH, maxLinkDepth);
        maxEmbedDepth = getIntAt(XP_MAX_EMBED_DEPTH, maxEmbedDepth);
    }
}

Index: SimpleSelector.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleSelector.java,v retrieving revision 1.26 retrieving revision 1.27 diff -C2 -d -r1.26 -r1.27 *** SimpleSelector.java 19 Sep 2003 01:37:19 -0000 1.26 --- SimpleSelector.java 23 Sep 2003 01:16:34 -0000 1.27 *************** *** 7,10 **** --- 7,11 ---- package org.archive.crawler.basic; + import java.net.URI; import java.net.URISyntaxException; import 
java.util.ArrayList; *************** *** 37,42 **** SimpleStore store; ArrayList filters = new ArrayList(); ! private int maxLinkDepth = -1; ! private int maxEmbedDepth = -1; private int maxDeferrals = 10; // should be at least max-retries plus 3 or so --- 38,45 ---- SimpleStore store; ArrayList filters = new ArrayList(); ! ! // MOVED TO PRESELECTOR PROCESSOR ! //private int maxLinkDepth = -1; ! //private int maxEmbedDepth = -1; private int maxDeferrals = 10; // should be at least max-retries plus 3 or so *************** *** 77,98 **** } // handle http headers if (curi.getAList().containsKey(A_HTTP_HEADER_URIS)) { ! handleHttpHeaders(curi); } // handle embeds ! if ((maxEmbedDepth >= 0) ! && (curi.getEmbedHopCount()<maxEmbedDepth) ! && curi.getAList().containsKey(A_HTML_EMBEDS)) { ! handleEmbeds(curi); } ! // handle links, if not too deep ! if (curi.getAList().containsKey(A_HTML_LINKS) ! && ((maxLinkDepth == -1) ! || (curi.getLinkHopCount() < maxLinkDepth))) { ! handleLinks(curi); } - // SUCCESS: note & log successDisposition(curi); --- 80,99 ---- } + + URI baseUri = getBaseURI(curi); + // handle http headers if (curi.getAList().containsKey(A_HTTP_HEADER_URIS)) { ! handleHttpHeaders(curi, baseUri); } // handle embeds ! if (curi.getAList().containsKey(A_HTML_EMBEDS)) { ! handleEmbeds(curi, baseUri); } ! // handle links ! if (curi.getAList().containsKey(A_HTML_LINKS)) { ! 
handleLinks(curi, baseUri); } // SUCCESS: note & log successDisposition(curi); *************** *** 110,113 **** --- 111,132 ---- * @param curi */ + private URI getBaseURI(CrawlURI curi) { + if (!curi.getAList().containsKey(A_HTML_BASE)) { + return curi.getUURI().getUri(); + } + String base = curi.getAList().getString(A_HTML_BASE); + try { + return UURI.createUURI(base).getUri(); + } catch (URISyntaxException e) { + Object[] array = { this, base }; + controller.uriErrors.log(Level.INFO,e.getMessage(), array ); + // next best thing: use self + return curi.getUURI().getUri(); + } + } + + /** + * @param curi + */ private void scheduleForRetry(CrawlURI curi) { logger.fine("inserting snoozed "+curi+" for "+retryDelay); *************** *** 133,136 **** --- 152,162 ---- // something unexpectedly bad happened case S_UNFETCHABLE_URI: + // no chance to fetch + case S_OUT_OF_SCOPE: + // filtered out + case S_TOO_MANY_EMBED_HOPS: + // too far from last true link + case S_TOO_MANY_LINK_HOPS: + // too far from seeds return true; *************** *** 167,171 **** * @param curi */ ! private void handleHttpHeaders(CrawlURI curi) { // treat roughly the same as embeds, with same distance-from-seed Collection uris = (Collection)curi.getAList().getObject(A_HTTP_HEADER_URIS); --- 193,197 ---- * @param curi */ ! private void handleHttpHeaders(CrawlURI curi, URI baseUri) { // treat roughly the same as embeds, with same distance-from-seed Collection uris = (Collection)curi.getAList().getObject(A_HTTP_HEADER_URIS); *************** *** 174,178 **** String e = (String)iter.next(); try { ! UURI u = UURI.createUURI(e,curi.getBaseUri()); //if(filtersAccept(u)) { logger.fine("inserting header at head "+u); --- 200,204 ---- String e = (String)iter.next(); try { ! UURI u = UURI.createUURI(e,baseUri); //if(filtersAccept(u)) { logger.fine("inserting header at head "+u); *************** *** 271,275 **** ! 
protected void handleLinks(CrawlURI curi) { if (curi.getFetchStatus() >= 400) { // do not follow links of error pages --- 297,301 ---- ! protected void handleLinks(CrawlURI curi, URI baseUri) { if (curi.getFetchStatus() >= 400) { // do not follow links of error pages *************** *** 281,285 **** String l = (String)iter.next(); try { ! UURI link = UURI.createUURI(l,curi.getBaseUri()); if(filtersAccept(link)) { logger.fine("inserting link "+link+" "+curi.getStoreState()); --- 307,311 ---- String l = (String)iter.next(); try { ! UURI link = UURI.createUURI(l,baseUri); if(filtersAccept(link)) { logger.fine("inserting link "+link+" "+curi.getStoreState()); *************** *** 294,298 **** ! protected void handleEmbeds(CrawlURI curi) { if (curi.getFetchStatus() >= 400) { // do not follow links of error pages --- 320,324 ---- ! protected void handleEmbeds(CrawlURI curi, URI baseUri) { if (curi.getFetchStatus() >= 400) { // do not follow links of error pages *************** *** 304,308 **** String e = (String)iter.next(); try { ! UURI embed = UURI.createUURI(e,curi.getBaseUri()); //if(filtersAccept(embed)) { logger.fine("inserting embed at head "+embed); --- 330,334 ---- String e = (String)iter.next(); try { ! UURI embed = UURI.createUURI(e,baseUri); //if(filtersAccept(embed)) { logger.fine("inserting embed at head "+embed); *************** *** 337,341 **** logger.fine("inserting prereq at head "+prereq); //CrawlURI prereqCuri = store.insertAtHead(prereq,curi.getAList().getInt("distance-from-seed")); ! CrawlURI prereqCuri = store.insert(prereq,curi,false); if (prereqCuri.getStoreState()==URIStoreable.FINISHED) { curi.setFetchStatus(S_PREREQUISITE_FAILURE); --- 363,367 ---- logger.fine("inserting prereq at head "+prereq); //CrawlURI prereqCuri = store.insertAtHead(prereq,curi.getAList().getInt("distance-from-seed")); ! 
CrawlURI prereqCuri = store.insert(prereq,curi,true); if (prereqCuri.getStoreState()==URIStoreable.FINISHED) { curi.setFetchStatus(S_PREREQUISITE_FAILURE); *************** *** 406,419 **** array); } ! curi.setStoreState(URIStoreable.FINISHED); ! if (curi.getDontRetryBefore()<0) { ! // if not otherwise set, retire this URI forever ! curi.setDontRetryBefore(Long.MAX_VALUE); } - curi.stripToMinimal(); } /* (non-Javadoc) * @see org.archive.crawler.framework.URISelector#initialize(org.archive.crawler.framework.CrawlController) --- 432,464 ---- array); } ! if(shouldBeForgotten(curi)) { ! // curi is dismissed without prejudice: it can be reconstituted ! store.forget(curi); ! } else { ! curi.setStoreState(URIStoreable.FINISHED); ! if (curi.getDontRetryBefore()<0) { ! // if not otherwise set, retire this URI forever ! curi.setDontRetryBefore(Long.MAX_VALUE); ! } ! curi.stripToMinimal(); } } + /** + * @param curi + * @return + */ + private boolean shouldBeForgotten(CrawlURI curi) { + switch(curi.getFetchStatus()) { + case S_TOO_MANY_EMBED_HOPS: + case S_TOO_MANY_LINK_HOPS: + return true; + default: + return false; + } + } + /* (non-Javadoc) * @see org.archive.crawler.framework.URISelector#initialize(org.archive.crawler.framework.CrawlController) *************** *** 422,427 **** controller = c; store = (SimpleStore)c.getStore(); ! maxLinkDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-link-depth/@value", maxLinkDepth); ! maxEmbedDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-embed-depth/@value", maxEmbedDepth); instantiateAllInto(XP_FILTERS,filters); --- 467,472 ---- controller = c; store = (SimpleStore)c.getStore(); ! //maxLinkDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-link-depth/@value", maxLinkDepth); ! 
//maxEmbedDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-embed-depth/@value", maxEmbedDepth); instantiateAllInto(XP_FILTERS,filters); Index: URIStoreable.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/URIStoreable.java,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** URIStoreable.java 17 Jul 2003 16:10:19 -0000 1.3 --- URIStoreable.java 23 Sep 2003 01:16:34 -0000 1.4 *************** *** 13,16 **** --- 13,17 ---- public interface URIStoreable { + public static final Object FORGOTTEN = "FORGOTTEN".intern(); public static final Object FINISHED = "FINISHED".intern();; public static final Object HELD = "HELD".intern(); Index: SimplePreconditionEnforcer.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimplePreconditionEnforcer.java,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** SimplePreconditionEnforcer.java 6 Sep 2003 01:44:05 -0000 1.9 --- SimplePreconditionEnforcer.java 23 Sep 2003 01:16:34 -0000 1.10 *************** *** 37,44 **** protected void innerProcess(CrawlURI curi) { - if (considerChaff(curi)) { - return; - } - if (considerDnsPreconditions(curi)) { return; --- 37,40 ---- *************** *** 63,81 **** return; - } - - /** - * @param curi - * @return - */ - private boolean considerChaff(CrawlURI curi) { - //if (curi.getChaffness()>1) { - // System.out.println(curi.getChaffness()+" "+curi.getUURI().toString()); - //} if(curi.getChaffness()>getIntAt(XP_CHAFF_THRESHOLD,DEFAULT_CHAFF_THRESHOLD)) { - curi.setFetchStatus(S_DEEMED_CHAFF); - curi.cancelFurtherProcessing(); - return true; - } - return false; } --- 59,62 ---- Index: SimpleStore.java =================================================================== RCS file: 
/cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v retrieving revision 1.30 retrieving revision 1.31 diff -C2 -d -r1.30 -r1.31 *** SimpleStore.java 19 Sep 2003 01:36:36 -0000 1.30 --- SimpleStore.java 23 Sep 2003 01:16:34 -0000 1.31 *************** *** 381,385 **** } ! applyCarryforwards(curi,sourceCuri, embed ); allCuris.put(uuri,curi); --- 381,385 ---- } ! applyCarryforwards(curi,sourceCuri, embed); allCuris.put(uuri,curi); *************** *** 500,503 **** --- 500,515 ---- public Collection getSeeds() { return seeds; + } + + /** + * Forget the given CrawlURI. This allows a new instance + * to be created in the future, if it is reencountered under + * different circumstances. + * + * @param curi + */ + public void forget(CrawlURI curi) { + allCuris.remove(curi.getUURI()); + curi.setStoreState(URIStoreable.FORGOTTEN); } |