From: <go...@us...> - 2003-09-06 01:48:48
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv15682/src/org/archive/crawler/basic

Modified Files:
    SimpleStore.java
Log Message:
carryforward chaffness indicator

Index: SimpleStore.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v
retrieving revision 1.25
retrieving revision 1.26
diff -C2 -d -r1.25 -r1.26
*** SimpleStore.java 6 Aug 2003 01:23:10 -0000 1.25
--- SimpleStore.java 6 Sep 2003 01:48:41 -0000 1.26
***************
*** 7,10 ****
--- 7,11 ----
  package org.archive.crawler.basic;
  
+ import java.util.BitSet;
  import java.util.HashMap;
  import java.util.LinkedList;
***************
*** 359,371 ****
       * @param i
       */
!     public void insert(UURI uuri, int dist) {
!         if(filteredOut(uuri)) return;
          CrawlURI curi = (CrawlURI)allCuris.get(uuri);
          if(curi!=null) {
              // already inserted
              // TODO: perhaps yank to front?
              // if curi is still locked out, ignore request to schedule
              if(curi.getStoreState()!=URIStoreable.FINISHED || curi.dontFetchYet()){
!                 return;
              }
              // yank URI back into scheduling if necessary
--- 360,373 ----
       * @param i
       */
!     public CrawlURI insert(UURI uuri, CrawlURI sourceCuri, int extraHop) {
!         if(filteredOut(uuri)) return null;
          CrawlURI curi = (CrawlURI)allCuris.get(uuri);
          if(curi!=null) {
              // already inserted
              // TODO: perhaps yank to front?
+             // TODO: increment inlink counter?
              // if curi is still locked out, ignore request to schedule
              if(curi.getStoreState()!=URIStoreable.FINISHED || curi.dontFetchYet()){
!                 return curi;
              }
              // yank URI back into scheduling if necessary
***************
*** 374,382 ****
              curi = new CrawlURI(uuri);
          }
!         int newDist = dist;
!         if(curi.getAList().containsKey(A_DISTANCE_FROM_SEED)) {
!             newDist = Math.max(dist,curi.getAList().getInt(A_DISTANCE_FROM_SEED));
!         }
!         curi.getAList().putInt(A_DISTANCE_FROM_SEED,newDist);
          allCuris.put(uuri,curi);
          KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey());
--- 376,382 ----
              curi = new CrawlURI(uuri);
          }
! 
!         applyCarryforwards(curi,sourceCuri, extraHop );
! 
          allCuris.put(uuri,curi);
          KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey());
***************
*** 385,392 ****
              curi.setStoreState(URIStoreable.PENDING);
              notify();
!             return;
          }
          classQueue.addLast(curi);
          curi.setStoreState(classQueue.getStoreState());
      }
  
--- 385,430 ----
              curi.setStoreState(URIStoreable.PENDING);
              notify();
!             return curi;
          }
          classQueue.addLast(curi);
          curi.setStoreState(classQueue.getStoreState());
+         return curi;
+     }
+ 
+     /**
+      * @param curi
+      * @param sourceCuri
+      */
+     private void applyCarryforwards(CrawlURI curi, CrawlURI sourceCuri, int extraHop) {
+         int newDist = sourceCuri.getAList().getInt("distance-from-seed")+extraHop;
+         if(curi.getAList().containsKey(A_DISTANCE_FROM_SEED)) {
+             int oldDist = curi.getAList().getInt(A_DISTANCE_FROM_SEED);
+             if (oldDist>newDist) {
+                 curi.getAList().putInt(A_DISTANCE_FROM_SEED,newDist);
+                 curi.setVia(sourceCuri);
+             } // otherwise leave alone
+         } else {
+             curi.getAList().putInt(A_DISTANCE_FROM_SEED,newDist);
+             curi.setVia(sourceCuri);
+         }
+ 
+ 
+         int newChaffness = sourceCuri.getChaffness();
+         if(!sourceCuri.getUURI().getUri().getHost().equals(curi.getUURI().getUri().getHost())) {
+             newChaffness = 0;
+         } else {
+             BitSet scratch = (BitSet) sourceCuri.getFuzzy().clone();
+             scratch.xor(curi.getFuzzy());
+             int fuzzyDiff = scratch.cardinality();
+             if(fuzzyDiff<2) {
+                 newChaffness += 1;
+             } else {
+                 newChaffness -= 1;
+             }
+         }
+         if(newChaffness<0) {
+             newChaffness = 0;
+         }
+         curi.setChaffness(newChaffness);
      }
  
|