|
From: <go...@us...> - 2003-09-23 01:16:40
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv21672/src/org/archive/crawler/datamodel
Modified Files:
FetchStatusCodes.java CrawlURI.java
Log Message:
refactorings (in progress)
Index: FetchStatusCodes.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/FetchStatusCodes.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** FetchStatusCodes.java 6 Sep 2003 02:00:12 -0000 1.9
--- FetchStatusCodes.java 23 Sep 2003 01:16:34 -0000 1.10
***************
*** 31,34 ****
--- 31,37 ----
public static int S_ROBOTS_PRECLUDED = -9998;
public static int S_DEEMED_CHAFF = -4000;
+ public static int S_TOO_MANY_LINK_HOPS = -4001;
+ public static int S_TOO_MANY_EMBED_HOPS = -4002;
+ public static int S_OUT_OF_SCOPE = -5000;
public static int S_DNS_SUCCESS = 1;
Index: CrawlURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlURI.java,v
retrieving revision 1.38
retrieving revision 1.39
diff -C2 -d -r1.38 -r1.39
*** CrawlURI.java 12 Sep 2003 02:02:24 -0000 1.38
--- CrawlURI.java 23 Sep 2003 01:16:35 -0000 1.39
***************
*** 7,22 ****
package org.archive.crawler.datamodel;
- import java.net.URI;
- import java.net.URISyntaxException;
import java.util.ArrayList;
- import java.util.BitSet;
import java.util.List;
- import java.util.logging.Level;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
import org.archive.crawler.basic.FetcherDNS;
import org.archive.crawler.basic.URIStoreable;
- import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Processor;
--- 7,15 ----
***************
*** 26,34 ****
/**
! * Represents a URI and the associated state it collects as
! * it is crawled.
*
! * Except for a few special components, state is in a flexible
! * attibute list.
*
* Should only be instantiated via URIStore.getCrawlURI(...),
--- 19,27 ----
/**
! * Represents a candidate URI and the associated state it
! * collects as it is crawled.
*
! * Core state is in instance variables, but a flexible
! * attribute list is also available.
*
* Should only be instantiated via URIStore.getCrawlURI(...),
***************
*** 40,73 ****
public class CrawlURI
implements URIStoreable, CoreAttributeConstants, FetchStatusCodes {
! private Pattern FUZZY_TOKENS = Pattern.compile("\\w+");
! private long wakeTime;
! public static final String CONTENT_TYPE_LABEL = "content-type";
! private static int FUZZY_WIDTH = 32;
!
! private UURI baseUri;
! private AList alist = new HashtableAList();
! private UURI uuri;
! private BitSet fuzzy; // uri token bitfield as sort of fuzzy checksum
! private CrawlURI via; // curi that led to this (lowest hops from seed)
! private Object state;
! CrawlController controller;
Processor nextProcessor;
- CrawlServer server;
-
private int fetchStatus = 0; // default to unattempted
! private int deferrals = 0;
! private int fetchAttempts = 0; // the number of fetch attempts that have been made
! private int chaffness = 0; // suspiciousness of being of chaff
!
private int linkHopCount = -1; // from seeds
private int embedHopCount = -1; // from a sure link
- private int threadNumber;
private int contentSize = -1;
- private long dontRetryBefore = -1;
/**
--- 33,66 ----
public class CrawlURI
implements URIStoreable, CoreAttributeConstants, FetchStatusCodes {
! // core identity: the "usable URI" to be crawled
! private UURI uuri;
! // Scheduler lifecycle info
! private Object state; // state within scheduling/store/selector
! private long wakeTime; // if "snoozed", when this CrawlURI may awake
! private long dontRetryBefore = -1;
! private int threadNumber;
! // Processing progress
Processor nextProcessor;
private int fetchStatus = 0; // default to unattempted
! private int deferrals = 0; // count of postponements for prerequisites
! private int fetchAttempts = 0; // the number of fetch attempts that have been made
!
! // flexible dynamic attributes
! private AList alist = new HashtableAList();
!
! // dynamic context
! private CrawlURI via; // curi that led to this (lowest hops from seed)
private int linkHopCount = -1; // from seeds
private int embedHopCount = -1; // from a sure link
+
+ ////////////////////////////////////////////////////////////////////
+ CrawlServer server;
+
private int contentSize = -1;
/**
***************
*** 83,100 ****
private void setUuri(UURI u) {
uuri=u;
- setFuzzy();
}
- /**
- * set a fuzzy fingerprint for the correspoding URI based on its word-char segments
- */
- private void setFuzzy() {
- fuzzy = new BitSet(FUZZY_WIDTH);
- Matcher tokens = FUZZY_TOKENS.matcher(uuri.toString());
- tokens.find(); // skip http
- while(tokens.find()) {
- fuzzy.set(Math.abs(tokens.group().hashCode() % FUZZY_WIDTH));
- }
- }
/**
--- 76,81 ----
***************
*** 289,314 ****
/**
- *
- */
- public URI getBaseUri() {
- if (baseUri != null) {
- return baseUri.getUri();
- }
- if (!getAList().containsKey("html-base-href")) {
- return getUURI().getUri();
- }
- String base = getAList().getString("html-base-href");
- try {
- baseUri = UURI.createUURI(base);
- } catch (URISyntaxException e) {
- Object[] array = { this, base };
- controller.uriErrors.log(Level.INFO,e.getMessage(), array );
- // next best thing: use self
- baseUri = getUURI();
- }
- return getBaseUri();
- }
-
- /**
* @
*/
--- 270,273 ----
***************
*** 340,350 ****
/**
- * @param controller
- */
- public void setController(CrawlController c) {
- controller = c;
- }
-
- /**
*
*/
--- 299,302 ----
***************
*** 437,462 ****
// TODO implement
System.out.println("CrawlURI.addLocalizedError() says: \"Implement me!\"");
- }
-
- /**
- * @return
- */
- public int getChaffness() {
- return chaffness;
- }
-
- /**
- * @return
- */
- public BitSet getFuzzy() {
- // TODO Auto-generated method stub
- return fuzzy;
- }
-
- /**
- * @param i
- */
- public void setChaffness(int i) {
- chaffness = i;
}
--- 389,392 ----
|