From: <go...@us...> - 2003-09-23 01:16:40
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel In directory sc8-pr-cvs1:/tmp/cvs-serv21672/src/org/archive/crawler/datamodel Modified Files: FetchStatusCodes.java CrawlURI.java Log Message: refactorings(in progress) Index: FetchStatusCodes.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/FetchStatusCodes.java,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** FetchStatusCodes.java 6 Sep 2003 02:00:12 -0000 1.9 --- FetchStatusCodes.java 23 Sep 2003 01:16:34 -0000 1.10 *************** *** 31,34 **** --- 31,37 ---- public static int S_ROBOTS_PRECLUDED = -9998; public static int S_DEEMED_CHAFF = -4000; + public static int S_TOO_MANY_LINK_HOPS = -4001; + public static int S_TOO_MANY_EMBED_HOPS = -4002; + public static int S_OUT_OF_SCOPE = -5000; public static int S_DNS_SUCCESS = 1; Index: CrawlURI.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlURI.java,v retrieving revision 1.38 retrieving revision 1.39 diff -C2 -d -r1.38 -r1.39 *** CrawlURI.java 12 Sep 2003 02:02:24 -0000 1.38 --- CrawlURI.java 23 Sep 2003 01:16:35 -0000 1.39 *************** *** 7,22 **** package org.archive.crawler.datamodel; - import java.net.URI; - import java.net.URISyntaxException; import java.util.ArrayList; - import java.util.BitSet; import java.util.List; - import java.util.logging.Level; - import java.util.regex.Matcher; - import java.util.regex.Pattern; import org.archive.crawler.basic.FetcherDNS; import org.archive.crawler.basic.URIStoreable; - import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.Processor; --- 7,15 ---- *************** *** 26,34 **** /** ! * Represents a URI and the associated state it collects as ! * it is crawled. * ! * Except for a few special components, state is in a flexible ! * attibute list. * * Should only be instantiated via URIStore.getCrawlURI(...), --- 19,27 ---- /** ! * Represents a candidate URI and the associated state it ! * collects as it is crawled. * ! * Core state is in instance variables, but a flexible ! * attribute list is also available. * * Should only be instantiated via URIStore.getCrawlURI(...), *************** *** 40,73 **** public class CrawlURI implements URIStoreable, CoreAttributeConstants, FetchStatusCodes { ! private Pattern FUZZY_TOKENS = Pattern.compile("\\w+"); ! private long wakeTime; ! public static final String CONTENT_TYPE_LABEL = "content-type"; ! private static int FUZZY_WIDTH = 32; ! ! private UURI baseUri; ! private AList alist = new HashtableAList(); ! private UURI uuri; ! private BitSet fuzzy; // uri token bitfield as sort of fuzzy checksum ! private CrawlURI via; // curi that led to this (lowest hops from seed) ! private Object state; ! CrawlController controller; Processor nextProcessor; - CrawlServer server; - private int fetchStatus = 0; // default to unattempted ! private int deferrals = 0; ! private int fetchAttempts = 0; // the number of fetch attempts that have been made ! private int chaffness = 0; // suspiciousness of being of chaff ! private int linkHopCount = -1; // from seeds private int embedHopCount = -1; // from a sure link - private int threadNumber; private int contentSize = -1; - private long dontRetryBefore = -1; /** --- 33,66 ---- public class CrawlURI implements URIStoreable, CoreAttributeConstants, FetchStatusCodes { ! // core identity: the "usable URI" to be crawled ! private UURI uuri; ! // Scheduler lifecycle info ! private Object state; // state within scheduling/store/selector ! private long wakeTime; // if "snoozed", when this CrawlURI may awake ! private long dontRetryBefore = -1; ! private int threadNumber; ! // Processing progress Processor nextProcessor; private int fetchStatus = 0; // default to unattempted ! private int deferrals = 0; // count of postponements for prerequisites ! private int fetchAttempts = 0; // the number of fetch attempts that have been made ! ! // flexible dynamic attributes ! private AList alist = new HashtableAList(); ! ! // dynamic context ! private CrawlURI via; // curi that led to this (lowest hops from seed) private int linkHopCount = -1; // from seeds private int embedHopCount = -1; // from a sure link + + //////////////////////////////////////////////////////////////////// + CrawlServer server; + private int contentSize = -1; /** *************** *** 83,100 **** private void setUuri(UURI u) { uuri=u; - setFuzzy(); } - /** - * set a fuzzy fingerprint for the correspoding URI based on its word-char segments - */ - private void setFuzzy() { - fuzzy = new BitSet(FUZZY_WIDTH); - Matcher tokens = FUZZY_TOKENS.matcher(uuri.toString()); - tokens.find(); // skip http - while(tokens.find()) { - fuzzy.set(Math.abs(tokens.group().hashCode() % FUZZY_WIDTH)); - } - } /** --- 76,81 ---- *************** *** 289,314 **** /** - * - */ - public URI getBaseUri() { - if (baseUri != null) { - return baseUri.getUri(); - } - if (!getAList().containsKey("html-base-href")) { - return getUURI().getUri(); - } - String base = getAList().getString("html-base-href"); - try { - baseUri = UURI.createUURI(base); - } catch (URISyntaxException e) { - Object[] array = { this, base }; - controller.uriErrors.log(Level.INFO,e.getMessage(), array ); - // next best thing: use self - baseUri = getUURI(); - } - return getBaseUri(); - } - - /** * @ */ --- 270,273 ---- *************** *** 340,350 **** /** - * @param controller - */ - public void setController(CrawlController c) { - controller = c; - } - - /** * */ --- 299,302 ---- *************** *** 437,462 **** // TODO implement System.out.println("CrawlURI.addLocalizedError() says: \"Implement me!\""); - } - - /** - * @return - */ - public int getChaffness() { - return chaffness; - } - - /** - * @return - */ - public BitSet getFuzzy() { - // TODO Auto-generated method stub - return fuzzy; - } - - /** - * @param i - */ - public void setChaffness(int i) { - chaffness = i; } --- 389,392 ---- |