|
From: <go...@us...> - 2003-09-12 02:02:29
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv28250/src/org/archive/crawler/datamodel
Modified Files:
CrawlURI.java
Log Message:
improved link-hop, embed-hop handling: embeds get a separate leash distance from last link
Index: CrawlURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlURI.java,v
retrieving revision 1.37
retrieving revision 1.38
diff -C2 -d -r1.37 -r1.38
*** CrawlURI.java 6 Sep 2003 02:00:13 -0000 1.37
--- CrawlURI.java 12 Sep 2003 02:02:24 -0000 1.38
***************
*** 62,65 ****
--- 62,68 ----
private int chaffness = 0; // suspiciousness of being chaff
+ private int linkHopCount = -1; // link hops from seeds; -1 means not yet set
+ private int embedHopCount = -1; // embed hops from a sure link; -1 means not yet set
+
private int threadNumber;
***************
*** 461,470 ****
* @param sourceCuri
*/
! public void setVia(CrawlURI sourceCuri) {
via = sourceCuri;
}
/* public boolean isFubared(){
return ( fetchStatus < 0 && numberOfFetchAttempts >= 3);
}*/
}
--- 464,518 ----
* @param sourceCuri
*/
! public void setViaLinkFrom(CrawlURI sourceCuri) {
via = sourceCuri;
+ int candidateLinkHopCount = sourceCuri.getLinkHopCount()+1; // one link hop beyond the source
+ embedHopCount = 0; // arriving via a link restarts the embed-hop count at zero
+ if (linkHopCount == -1) { // -1: no link-hop count recorded yet, take the candidate as-is
+ linkHopCount = candidateLinkHopCount;
+ return;
+ }
+ if (linkHopCount > candidateLinkHopCount) { // otherwise keep the smaller of the two counts
+ linkHopCount = candidateLinkHopCount;
+ }
+ }
+
+ /**
+ * @param sourceCuri the CrawlURI stored as this URI's via; its link/embed hop counts are the basis for this URI's counts
+ */
+ public void setViaEmbedFrom(CrawlURI sourceCuri) {
via = sourceCuri;
+ int candidateLinkHopCount = sourceCuri.getLinkHopCount(); // no +1: an embed does not add a link hop
+ if (linkHopCount == -1) { // -1: no link-hop count recorded yet
+ linkHopCount = candidateLinkHopCount;
+ } else if (linkHopCount > candidateLinkHopCount) { // keep the smaller link-hop count
+ linkHopCount = candidateLinkHopCount;
+ }
+ int candidateEmbedHopCount = sourceCuri.getEmbedHopCount()+1; // one embed hop beyond the source
+ if (embedHopCount == -1) { // -1: no embed-hop count recorded yet
+ embedHopCount = candidateEmbedHopCount;
+ } else if (embedHopCount > candidateEmbedHopCount) { // keep the smaller embed-hop count
+ embedHopCount = candidateEmbedHopCount;
+ }
}
+
/* public boolean isFubared(){
return ( fetchStatus < 0 && numberOfFetchAttempts >= 3);
}*/
+
+
+ /**
+ * @return the current value of embedHopCount (the field starts at -1)
+ */
+ public int getEmbedHopCount() {
+ return embedHopCount;
+ }
+
+ /**
+ * @return the current value of linkHopCount (the field starts at -1)
+ */
+ public int getLinkHopCount() {
+ return linkHopCount;
+ }
+
}
|