From: <go...@us...> - 2003-09-06 02:00:27
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel In directory sc8-pr-cvs1:/tmp/cvs-serv17364/src/org/archive/crawler/datamodel Modified Files: FetchStatusCodes.java UURI.java CrawlURI.java Log Message: chaff detection support Index: FetchStatusCodes.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/FetchStatusCodes.java,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** FetchStatusCodes.java 18 Jul 2003 18:27:59 -0000 1.8 --- FetchStatusCodes.java 6 Sep 2003 02:00:12 -0000 1.9 *************** *** 30,33 **** --- 30,34 ---- public static int S_ROBOTS_PRECLUDED = -9998; + public static int S_DEEMED_CHAFF = -4000; public static int S_DNS_SUCCESS = 1; Index: UURI.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/UURI.java,v retrieving revision 1.19 retrieving revision 1.20 diff -C2 -d -r1.19 -r1.20 *** UURI.java 21 Aug 2003 23:28:59 -0000 1.19 --- UURI.java 6 Sep 2003 02:00:12 -0000 1.20 *************** *** 39,45 **** * @param u */ ! public UURI(URI u) { uri = u; } /** --- 39,47 ---- * @param u */ ! private UURI(URI u) { uri = u; } + + /** Index: CrawlURI.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlURI.java,v retrieving revision 1.36 retrieving revision 1.37 diff -C2 -d -r1.36 -r1.37 *** CrawlURI.java 6 Aug 2003 01:19:02 -0000 1.36 --- CrawlURI.java 6 Sep 2003 02:00:13 -0000 1.37 *************** *** 10,15 **** --- 10,18 ---- import java.net.URISyntaxException; import java.util.ArrayList; + import java.util.BitSet; import java.util.List; import java.util.logging.Level; + import java.util.regex.Matcher; + import java.util.regex.Pattern; import org.archive.crawler.basic.FetcherDNS; *************** *** 37,47 **** --- 40,55 ---- public class CrawlURI implements URIStoreable, CoreAttributeConstants, FetchStatusCodes { + private Pattern FUZZY_TOKENS = Pattern.compile("\\w+"); + private long wakeTime; public static final String CONTENT_TYPE_LABEL = "content-type"; + private static int FUZZY_WIDTH = 32; private UURI baseUri; private AList alist = new HashtableAList(); private UURI uuri; + private BitSet fuzzy; // uri token bitfield as sort of fuzzy checksum + private CrawlURI via; // curi that led to this (lowest hops from seed) private Object state; CrawlController controller; *************** *** 52,56 **** private int deferrals = 0; private int fetchAttempts = 0; // the number of fetch attempts that have been made ! private int threadNumber; --- 60,65 ---- private int deferrals = 0; private int fetchAttempts = 0; // the number of fetch attempts that have been made ! private int chaffness = 0; // suspiciousness of being of chaff ! private int threadNumber; *************** *** 63,70 **** */ public CrawlURI(UURI u) { ! uuri=u; } /** * Set the time this curi is considered expired (and thus must be refetched) * to 'expires'. This function will set the time to an arbitrary value. --- 72,99 ---- */ public CrawlURI(UURI u) { ! setUuri(u); } /** + * @param u + */ + private void setUuri(UURI u) { + uuri=u; + setFuzzy(); + } + + /** + * set a fuzzy fingerprint for the correspoding URI based on its word-char segments + */ + private void setFuzzy() { + fuzzy = new BitSet(FUZZY_WIDTH); + Matcher tokens = FUZZY_TOKENS.matcher(uuri.toString()); + tokens.find(); // skip http + while(tokens.find()) { + fuzzy.set(Math.abs(tokens.group().hashCode() % FUZZY_WIDTH)); + } + } + + /** * Set the time this curi is considered expired (and thus must be refetched) * to 'expires'. This function will set the time to an arbitrary value. *************** *** 93,103 **** ! /** ! * @param uri ! * @return ! */ ! public CrawlURI(URI u){ ! uuri = new UURI(u); ! } --- 122,126 ---- ! *************** *** 123,129 **** public CrawlURI(String s){ try{ ! uuri = new UURI(new URI(s)); }catch(Exception e){ ! uuri = null; } } --- 146,152 ---- public CrawlURI(String s){ try{ ! setUuri(UURI.createUURI(s)); }catch(Exception e){ ! setUuri(null); } } *************** *** 411,414 **** --- 434,466 ---- // TODO implement System.out.println("CrawlURI.addLocalizedError() says: \"Implement me!\""); + } + + /** + * @return + */ + public int getChaffness() { + return chaffness; + } + + /** + * @return + */ + public BitSet getFuzzy() { + // TODO Auto-generated method stub + return fuzzy; + } + + /** + * @param i + */ + public void setChaffness(int i) { + chaffness = i; + } + + /** + * @param sourceCuri + */ + public void setVia(CrawlURI sourceCuri) { + via = sourceCuri; } |