Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv15682/src/org/archive/crawler/basic
Modified Files:
SimpleStore.java
Log Message:
carryforward chaffness indicator
Index: SimpleStore.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v
retrieving revision 1.25
retrieving revision 1.26
diff -C2 -d -r1.25 -r1.26
*** SimpleStore.java 6 Aug 2003 01:23:10 -0000 1.25
--- SimpleStore.java 6 Sep 2003 01:48:41 -0000 1.26
***************
*** 7,10 ****
--- 7,11 ----
package org.archive.crawler.basic;
+ import java.util.BitSet;
import java.util.HashMap;
import java.util.LinkedList;
***************
*** 359,371 ****
* @param i
*/
! public void insert(UURI uuri, int dist) {
! if(filteredOut(uuri)) return;
CrawlURI curi = (CrawlURI)allCuris.get(uuri);
if(curi!=null) {
// already inserted
// TODO: perhaps yank to front?
// if curi is still locked out, ignore request to schedule
if(curi.getStoreState()!=URIStoreable.FINISHED || curi.dontFetchYet()){
! return;
}
// yank URI back into scheduling if necessary
--- 360,373 ----
* @param i
*/
! public CrawlURI insert(UURI uuri, CrawlURI sourceCuri, int extraHop) {
! if(filteredOut(uuri)) return null;
CrawlURI curi = (CrawlURI)allCuris.get(uuri);
if(curi!=null) {
// already inserted
// TODO: perhaps yank to front?
+ // TODO: increment inlink counter?
// if curi is still locked out, ignore request to schedule
if(curi.getStoreState()!=URIStoreable.FINISHED || curi.dontFetchYet()){
! return curi;
}
// yank URI back into scheduling if necessary
***************
*** 374,382 ****
curi = new CrawlURI(uuri);
}
! int newDist = dist;
! if(curi.getAList().containsKey(A_DISTANCE_FROM_SEED)) {
! newDist = Math.max(dist,curi.getAList().getInt(A_DISTANCE_FROM_SEED));
! }
! curi.getAList().putInt(A_DISTANCE_FROM_SEED,newDist);
allCuris.put(uuri,curi);
KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey());
--- 376,382 ----
curi = new CrawlURI(uuri);
}
!
! applyCarryforwards(curi,sourceCuri, extraHop );
!
allCuris.put(uuri,curi);
KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey());
***************
*** 385,392 ****
curi.setStoreState(URIStoreable.PENDING);
notify();
! return;
}
classQueue.addLast(curi);
curi.setStoreState(classQueue.getStoreState());
}
--- 385,430 ----
curi.setStoreState(URIStoreable.PENDING);
notify();
! return curi;
}
classQueue.addLast(curi);
curi.setStoreState(classQueue.getStoreState());
+ return curi;
+ }
+
+ /**
+ * Carries state forward from sourceCuri onto curi: the distance-from-seed
+ * value (together with the via link) and the chaffness score.
+ *
+ * @param curi CrawlURI being scheduled; its AList, via and chaffness may be updated
+ * @param sourceCuri CrawlURI whose distance and chaffness are carried forward
+ * @param extraHop amount added to sourceCuri's distance to get curi's candidate distance
+ */
+ private void applyCarryforwards(CrawlURI curi, CrawlURI sourceCuri, int extraHop) {
+ // read through the same key constant that is used for writing below
+ int newDist = sourceCuri.getAList().getInt(A_DISTANCE_FROM_SEED)+extraHop;
+ // store when no distance is recorded yet or the new one is smaller; otherwise leave alone
+ if(!curi.getAList().containsKey(A_DISTANCE_FROM_SEED)
+ || curi.getAList().getInt(A_DISTANCE_FROM_SEED)>newDist) {
+ curi.getAList().putInt(A_DISTANCE_FROM_SEED,newDist);
+ curi.setVia(sourceCuri);
+ }
+ // chaffness restarts at 0 on a host change; on the same host it goes up by one
+ // when the two fuzzy bit sets differ in fewer than 2 bits, else down by one
+ int newChaffness = sourceCuri.getChaffness();
+ // NOTE(review): sourceCuri and getHost() are dereferenced unchecked; confirm neither can be null
+ if(!sourceCuri.getUURI().getUri().getHost().equals(curi.getUURI().getUri().getHost())) {
+ newChaffness = 0;
+ } else {
+ BitSet scratch = (BitSet) sourceCuri.getFuzzy().clone();
+ scratch.xor(curi.getFuzzy());
+ int fuzzyDiff = scratch.cardinality();
+ if(fuzzyDiff<2) {
+ newChaffness += 1;
+ } else {
+ newChaffness -= 1;
+ }
+ }
+ // the stored score is never negative
+ curi.setChaffness(Math.max(0,newChaffness));
+ }
|