|
From: <go...@us...> - 2003-09-06 02:00:27
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv17364/src/org/archive/crawler/datamodel
Modified Files:
FetchStatusCodes.java UURI.java CrawlURI.java
Log Message:
chaff detection support
Index: FetchStatusCodes.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/FetchStatusCodes.java,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** FetchStatusCodes.java 18 Jul 2003 18:27:59 -0000 1.8
--- FetchStatusCodes.java 6 Sep 2003 02:00:12 -0000 1.9
***************
*** 30,33 ****
--- 30,34 ----
public static int S_ROBOTS_PRECLUDED = -9998;
+ public static int S_DEEMED_CHAFF = -4000;
public static int S_DNS_SUCCESS = 1;
Index: UURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/UURI.java,v
retrieving revision 1.19
retrieving revision 1.20
diff -C2 -d -r1.19 -r1.20
*** UURI.java 21 Aug 2003 23:28:59 -0000 1.19
--- UURI.java 6 Sep 2003 02:00:12 -0000 1.20
***************
*** 39,45 ****
* @param u
*/
! public UURI(URI u) {
uri = u;
}
/**
--- 39,47 ----
* @param u
*/
! private UURI(URI u) {
uri = u;
}
+
+
/**
Index: CrawlURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlURI.java,v
retrieving revision 1.36
retrieving revision 1.37
diff -C2 -d -r1.36 -r1.37
*** CrawlURI.java 6 Aug 2003 01:19:02 -0000 1.36
--- CrawlURI.java 6 Sep 2003 02:00:13 -0000 1.37
***************
*** 10,15 ****
--- 10,18 ----
import java.net.URISyntaxException;
import java.util.ArrayList;
+ import java.util.BitSet;
import java.util.List;
import java.util.logging.Level;
+ import java.util.regex.Matcher;
+ import java.util.regex.Pattern;
import org.archive.crawler.basic.FetcherDNS;
***************
*** 37,47 ****
--- 40,55 ----
public class CrawlURI
implements URIStoreable, CoreAttributeConstants, FetchStatusCodes {
+ private Pattern FUZZY_TOKENS = Pattern.compile("\\w+");
+
private long wakeTime;
public static final String CONTENT_TYPE_LABEL = "content-type";
+ private static int FUZZY_WIDTH = 32;
private UURI baseUri;
private AList alist = new HashtableAList();
private UURI uuri;
+ private BitSet fuzzy; // uri token bitfield as sort of fuzzy checksum
+ private CrawlURI via; // curi that led to this (lowest hops from seed)
private Object state;
CrawlController controller;
***************
*** 52,56 ****
private int deferrals = 0;
private int fetchAttempts = 0; // the number of fetch attempts that have been made
!
private int threadNumber;
--- 60,65 ----
private int deferrals = 0;
private int fetchAttempts = 0; // the number of fetch attempts that have been made
! private int chaffness = 0; // suspiciousness of being of chaff
!
private int threadNumber;
***************
*** 63,70 ****
*/
public CrawlURI(UURI u) {
! uuri=u;
}
/**
* Set the time this curi is considered expired (and thus must be refetched)
* to 'expires'. This function will set the time to an arbitrary value.
--- 72,99 ----
*/
public CrawlURI(UURI u) {
! setUuri(u);
}
/**
+ * @param u
+ */
+ private void setUuri(UURI u) {
+ uuri=u;
+ setFuzzy();
+ }
+
+ /**
+ * set a fuzzy fingerprint for the corresponding URI based on its word-char segments
+ */
+ private void setFuzzy() {
+ fuzzy = new BitSet(FUZZY_WIDTH);
+ Matcher tokens = FUZZY_TOKENS.matcher(uuri.toString());
+ tokens.find(); // skip http
+ while(tokens.find()) {
+ fuzzy.set(Math.abs(tokens.group().hashCode() % FUZZY_WIDTH));
+ }
+ }
+
+ /**
* Set the time this curi is considered expired (and thus must be refetched)
* to 'expires'. This function will set the time to an arbitrary value.
***************
*** 93,103 ****
! /**
! * @param uri
! * @return
! */
! public CrawlURI(URI u){
! uuri = new UURI(u);
! }
--- 122,126 ----
!
***************
*** 123,129 ****
public CrawlURI(String s){
try{
! uuri = new UURI(new URI(s));
}catch(Exception e){
! uuri = null;
}
}
--- 146,152 ----
public CrawlURI(String s){
try{
! setUuri(UURI.createUURI(s));
}catch(Exception e){
! setUuri(null);
}
}
***************
*** 411,414 ****
--- 434,466 ----
// TODO implement
System.out.println("CrawlURI.addLocalizedError() says: \"Implement me!\"");
+ }
+
+ /**
+ * @return
+ */
+ public int getChaffness() {
+ return chaffness;
+ }
+
+ /**
+ * @return
+ */
+ public BitSet getFuzzy() {
+ // TODO Auto-generated method stub
+ return fuzzy;
+ }
+
+ /**
+ * @param i
+ */
+ public void setChaffness(int i) {
+ chaffness = i;
+ }
+
+ /**
+ * @param sourceCuri
+ */
+ public void setVia(CrawlURI sourceCuri) {
+ via = sourceCuri;
}
|