From: <go...@us...> - 2003-09-19 19:46:14
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv14400/src/org/archive/crawler/basic

Modified Files:
    SimpleStore.java
Log Message:
remember seeds (to support seedlist-based filtering, for example)

Index: SimpleStore.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v
retrieving revision 1.29
retrieving revision 1.30
diff -C2 -d -r1.29 -r1.30
*** SimpleStore.java 15 Sep 2003 23:34:56 -0000 1.29
--- SimpleStore.java 19 Sep 2003 01:36:36 -0000 1.30
***************
*** 7,10 ****
--- 7,11 ----
  package org.archive.crawler.basic;

+ import java.util.Collection;
  import java.util.HashMap;
  import java.util.Iterator;
***************
*** 17,21 ****
--- 18,24 ----
  import org.archive.crawler.datamodel.CrawlURI;
  import org.archive.crawler.datamodel.FetchStatusCodes;
+ import org.archive.crawler.datamodel.MemUURISet;
  import org.archive.crawler.datamodel.UURI;
+ import org.archive.crawler.datamodel.UURISet;
  import org.archive.crawler.framework.CrawlController;
  import org.archive.crawler.framework.URIStore;
***************
*** 31,34 ****
--- 34,39 ----
      private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleStore");

+     UURISet seeds = new MemUURISet();
+
      HashMap allCuris = new HashMap(); // of UURI -> CrawlURI

***************
*** 94,97 ****
--- 99,103 ----
       */
      public void insertAsSeed(UURI uuri) {
+         seeds.add(uuri);
          if(allCuris.get(uuri)!=null) {
              // already inserted
***************
*** 99,103 ****
          }
          CrawlURI curi = new CrawlURI(uuri);
!         curi.getAList().putInt("distance-from-seed",0);
          allCuris.put(uuri,curi);
          pendingQueue.addLast(curi);
--- 105,109 ----
          }
          CrawlURI curi = new CrawlURI(uuri);
!         //curi.getAList().putInt("distance-from-seed",0);
          allCuris.put(uuri,curi);
          pendingQueue.addLast(curi);
***************
*** 487,490 ****
--- 493,503 ----
              reinsert(released);
          }
+     }
+     /* (non-Javadoc)
+      * @see org.archive.crawler.framework.URIStore#getSeeds()
+      */
+     public Collection getSeeds() {
+         return seeds;
      }

|