Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv21672/src/org/archive/crawler/basic
Modified Files:
SimpleSelector.java URIStoreable.java
SimplePreconditionEnforcer.java SimpleStore.java
Added Files:
SimplePreselector.java
Log Message:
refactorings(in progress)
--- NEW FILE: SimplePreselector.java ---
/*
* SimplePreselector.java
* Created on Sep 22, 2003
*
* $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimplePreselector.java,v 1.1 2003/09/23 01:16:34 gojomo Exp $
*/
package org.archive.crawler.basic;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Processor;
/**
* Gives a yes/no on whether a CrawlURI should be processed at all.
*
* Usually, failing a processor filter causes that processor
* to be skipped. Failing this processor's filter means a
* CrawlURI will be marked OUT_OF_SCOPE.
*
*
* @author gojomo
*
*/
public class SimplePreselector extends Processor implements FetchStatusCodes {
private static String XP_MAX_LINK_DEPTH="//limits/max-link-depth/@value";
private static String XP_MAX_EMBED_DEPTH="//limits/max-embed-depth/@value";
private int maxLinkDepth = -1;
private int maxEmbedDepth = -1;
/* (non-Javadoc)
* @see org.archive.crawler.framework.Processor#innerProcess(org.archive.crawler.datamodel.CrawlURI)
*/
protected void innerProcess(CrawlURI curi) {
super.innerProcess(curi);
// check for too-deep
if(maxLinkDepth>=0 && curi.getLinkHopCount()>maxLinkDepth) {
curi.setFetchStatus(S_TOO_MANY_LINK_HOPS);
curi.cancelFurtherProcessing();
return;
}
if(maxEmbedDepth>=0 && curi.getEmbedHopCount()>maxEmbedDepth) {
curi.setFetchStatus(S_TOO_MANY_EMBED_HOPS);
curi.cancelFurtherProcessing();
return;
}
}
/* (non-Javadoc)
* @see org.archive.crawler.framework.Processor#innerRejectProcess(org.archive.crawler.datamodel.CrawlURI)
*/
protected void innerRejectProcess(CrawlURI curi) {
super.innerRejectProcess(curi);
// filter-rejection means out-of-scope
curi.setFetchStatus(S_OUT_OF_SCOPE);
curi.cancelFurtherProcessing();
}
/* (non-Javadoc)
* @see org.archive.crawler.framework.Processor#initialize(org.archive.crawler.framework.CrawlController)
*/
public void initialize(CrawlController c) {
super.initialize(c);
maxLinkDepth = getIntAt(XP_MAX_LINK_DEPTH, maxLinkDepth);
maxEmbedDepth = getIntAt(XP_MAX_EMBED_DEPTH, maxEmbedDepth);
}
}
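For reference, here is a minimal self-contained sketch (not part of the commit) of the decision this processor makes. The class, method, and S_OK placeholder are hypothetical illustration; the -4001/-4002 values come from the FetchStatusCodes change later in this batch, and the sample limits mirror the max-link-depth=100 / max-embed-depth=5 settings in example-order.xml.

public class PreselectionSketch {
    static final int S_TOO_MANY_LINK_HOPS = -4001;   // from FetchStatusCodes diff below
    static final int S_TOO_MANY_EMBED_HOPS = -4002;  // from FetchStatusCodes diff below
    static final int S_OK = 0;                       // placeholder for "continue processing"

    /** Mirrors the depth test in SimplePreselector.innerProcess(). */
    static int preselect(int linkHops, int embedHops, int maxLinkDepth, int maxEmbedDepth) {
        if (maxLinkDepth >= 0 && linkHops > maxLinkDepth) {
            return S_TOO_MANY_LINK_HOPS;   // too far from the seeds
        }
        if (maxEmbedDepth >= 0 && embedHops > maxEmbedDepth) {
            return S_TOO_MANY_EMBED_HOPS;  // too far from the last true link
        }
        return S_OK;
    }

    public static void main(String[] args) {
        // with max-link-depth=100 and max-embed-depth=5:
        System.out.println(preselect(3, 0, 100, 5));   // 0: within both limits
        System.out.println(preselect(101, 0, 100, 5)); // -4001: too many link hops
        System.out.println(preselect(10, 6, 100, 5));  // -4002: too many embed hops
    }
}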
Index: SimpleSelector.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleSelector.java,v
retrieving revision 1.26
retrieving revision 1.27
diff -C2 -d -r1.26 -r1.27
*** SimpleSelector.java 19 Sep 2003 01:37:19 -0000 1.26
--- SimpleSelector.java 23 Sep 2003 01:16:34 -0000 1.27
***************
*** 7,10 ****
--- 7,11 ----
package org.archive.crawler.basic;
+ import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
***************
*** 37,42 ****
SimpleStore store;
ArrayList filters = new ArrayList();
! private int maxLinkDepth = -1;
! private int maxEmbedDepth = -1;
private int maxDeferrals = 10; // should be at least max-retries plus 3 or so
--- 38,45 ----
SimpleStore store;
ArrayList filters = new ArrayList();
!
! // MOVED TO PRESELECTOR PROCESSOR
! //private int maxLinkDepth = -1;
! //private int maxEmbedDepth = -1;
private int maxDeferrals = 10; // should be at least max-retries plus 3 or so
***************
*** 77,98 ****
}
// handle http headers
if (curi.getAList().containsKey(A_HTTP_HEADER_URIS)) {
! handleHttpHeaders(curi);
}
// handle embeds
! if ((maxEmbedDepth >= 0)
! && (curi.getEmbedHopCount()<maxEmbedDepth)
! && curi.getAList().containsKey(A_HTML_EMBEDS)) {
! handleEmbeds(curi);
}
! // handle links, if not too deep
! if (curi.getAList().containsKey(A_HTML_LINKS)
! && ((maxLinkDepth == -1)
! || (curi.getLinkHopCount() < maxLinkDepth))) {
! handleLinks(curi);
}
-
// SUCCESS: note & log
successDisposition(curi);
--- 80,99 ----
}
+
+ URI baseUri = getBaseURI(curi);
+
// handle http headers
if (curi.getAList().containsKey(A_HTTP_HEADER_URIS)) {
! handleHttpHeaders(curi, baseUri);
}
// handle embeds
! if (curi.getAList().containsKey(A_HTML_EMBEDS)) {
! handleEmbeds(curi, baseUri);
}
! // handle links
! if (curi.getAList().containsKey(A_HTML_LINKS)) {
! handleLinks(curi, baseUri);
}
// SUCCESS: note & log
successDisposition(curi);
***************
*** 110,113 ****
--- 111,132 ----
* @param curi
*/
+ private URI getBaseURI(CrawlURI curi) {
+ if (!curi.getAList().containsKey(A_HTML_BASE)) {
+ return curi.getUURI().getUri();
+ }
+ String base = curi.getAList().getString(A_HTML_BASE);
+ try {
+ return UURI.createUURI(base).getUri();
+ } catch (URISyntaxException e) {
+ Object[] array = { this, base };
+ controller.uriErrors.log(Level.INFO,e.getMessage(), array );
+ // next best thing: use self
+ return curi.getUURI().getUri();
+ }
+ }
+
+ /**
+ * @param curi
+ */
private void scheduleForRetry(CrawlURI curi) {
logger.fine("inserting snoozed "+curi+" for "+retryDelay);
***************
*** 133,136 ****
--- 152,162 ----
// something unexpectedly bad happened
case S_UNFETCHABLE_URI:
+ // no chance to fetch
+ case S_OUT_OF_SCOPE:
+ // filtered out
+ case S_TOO_MANY_EMBED_HOPS:
+ // too far from last true link
+ case S_TOO_MANY_LINK_HOPS:
+ // too far from seeds
return true;
***************
*** 167,171 ****
* @param curi
*/
! private void handleHttpHeaders(CrawlURI curi) {
// treat roughly the same as embeds, with same distance-from-seed
Collection uris = (Collection)curi.getAList().getObject(A_HTTP_HEADER_URIS);
--- 193,197 ----
* @param curi
*/
! private void handleHttpHeaders(CrawlURI curi, URI baseUri) {
// treat roughly the same as embeds, with same distance-from-seed
Collection uris = (Collection)curi.getAList().getObject(A_HTTP_HEADER_URIS);
***************
*** 174,178 ****
String e = (String)iter.next();
try {
! UURI u = UURI.createUURI(e,curi.getBaseUri());
//if(filtersAccept(u)) {
logger.fine("inserting header at head "+u);
--- 200,204 ----
String e = (String)iter.next();
try {
! UURI u = UURI.createUURI(e,baseUri);
//if(filtersAccept(u)) {
logger.fine("inserting header at head "+u);
***************
*** 271,275 ****
! protected void handleLinks(CrawlURI curi) {
if (curi.getFetchStatus() >= 400) {
// do not follow links of error pages
--- 297,301 ----
! protected void handleLinks(CrawlURI curi, URI baseUri) {
if (curi.getFetchStatus() >= 400) {
// do not follow links of error pages
***************
*** 281,285 ****
String l = (String)iter.next();
try {
! UURI link = UURI.createUURI(l,curi.getBaseUri());
if(filtersAccept(link)) {
logger.fine("inserting link "+link+" "+curi.getStoreState());
--- 307,311 ----
String l = (String)iter.next();
try {
! UURI link = UURI.createUURI(l,baseUri);
if(filtersAccept(link)) {
logger.fine("inserting link "+link+" "+curi.getStoreState());
***************
*** 294,298 ****
! protected void handleEmbeds(CrawlURI curi) {
if (curi.getFetchStatus() >= 400) {
// do not follow links of error pages
--- 320,324 ----
! protected void handleEmbeds(CrawlURI curi, URI baseUri) {
if (curi.getFetchStatus() >= 400) {
// do not follow links of error pages
***************
*** 304,308 ****
String e = (String)iter.next();
try {
! UURI embed = UURI.createUURI(e,curi.getBaseUri());
//if(filtersAccept(embed)) {
logger.fine("inserting embed at head "+embed);
--- 330,334 ----
String e = (String)iter.next();
try {
! UURI embed = UURI.createUURI(e,baseUri);
//if(filtersAccept(embed)) {
logger.fine("inserting embed at head "+embed);
***************
*** 337,341 ****
logger.fine("inserting prereq at head "+prereq);
//CrawlURI prereqCuri = store.insertAtHead(prereq,curi.getAList().getInt("distance-from-seed"));
! CrawlURI prereqCuri = store.insert(prereq,curi,false);
if (prereqCuri.getStoreState()==URIStoreable.FINISHED) {
curi.setFetchStatus(S_PREREQUISITE_FAILURE);
--- 363,367 ----
logger.fine("inserting prereq at head "+prereq);
//CrawlURI prereqCuri = store.insertAtHead(prereq,curi.getAList().getInt("distance-from-seed"));
! CrawlURI prereqCuri = store.insert(prereq,curi,true);
if (prereqCuri.getStoreState()==URIStoreable.FINISHED) {
curi.setFetchStatus(S_PREREQUISITE_FAILURE);
***************
*** 406,419 ****
array);
}
! curi.setStoreState(URIStoreable.FINISHED);
! if (curi.getDontRetryBefore()<0) {
! // if not otherwise set, retire this URI forever
! curi.setDontRetryBefore(Long.MAX_VALUE);
}
- curi.stripToMinimal();
}
/* (non-Javadoc)
* @see org.archive.crawler.framework.URISelector#initialize(org.archive.crawler.framework.CrawlController)
--- 432,464 ----
array);
}
! if(shouldBeForgotten(curi)) {
! // curi is dismissed without prejudice: it can be reconstituted
! store.forget(curi);
! } else {
! curi.setStoreState(URIStoreable.FINISHED);
! if (curi.getDontRetryBefore()<0) {
! // if not otherwise set, retire this URI forever
! curi.setDontRetryBefore(Long.MAX_VALUE);
! }
! curi.stripToMinimal();
}
}
+ /**
+ * @param curi
+ * @return
+ */
+ private boolean shouldBeForgotten(CrawlURI curi) {
+ switch(curi.getFetchStatus()) {
+ case S_TOO_MANY_EMBED_HOPS:
+ case S_TOO_MANY_LINK_HOPS:
+ return true;
+ default:
+ return false;
+ }
+ }
+
/* (non-Javadoc)
* @see org.archive.crawler.framework.URISelector#initialize(org.archive.crawler.framework.CrawlController)
***************
*** 422,427 ****
controller = c;
store = (SimpleStore)c.getStore();
! maxLinkDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-link-depth/@value", maxLinkDepth);
! maxEmbedDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-embed-depth/@value", maxEmbedDepth);
instantiateAllInto(XP_FILTERS,filters);
--- 467,472 ----
controller = c;
store = (SimpleStore)c.getStore();
! //maxLinkDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-link-depth/@value", maxLinkDepth);
! //maxEmbedDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-embed-depth/@value", maxEmbedDepth);
instantiateAllInto(XP_FILTERS,filters);
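A standalone sketch (example URLs are assumptions, not taken from the crawler) of why the new getBaseURI() matters when the selector resolves relative links: if the page declared a base href, links resolve against it rather than against the page's own URI.

import java.net.URI;

public class BaseUriSketch {
    public static void main(String[] args) throws Exception {
        URI pageUri = new URI("http://www.archive.org/news/index.html");
        URI baseUri = new URI("http://www.archive.org/press/");  // as if from a base href
        String link = "release.html";
        System.out.println(pageUri.resolve(link)); // http://www.archive.org/news/release.html
        System.out.println(baseUri.resolve(link)); // http://www.archive.org/press/release.html
    }
}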
Index: URIStoreable.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/URIStoreable.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** URIStoreable.java 17 Jul 2003 16:10:19 -0000 1.3
--- URIStoreable.java 23 Sep 2003 01:16:34 -0000 1.4
***************
*** 13,16 ****
--- 13,17 ----
public interface URIStoreable {
+ public static final Object FORGOTTEN = "FORGOTTEN".intern();
public static final Object FINISHED = "FINISHED".intern();;
public static final Object HELD = "HELD".intern();
Index: SimplePreconditionEnforcer.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimplePreconditionEnforcer.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** SimplePreconditionEnforcer.java 6 Sep 2003 01:44:05 -0000 1.9
--- SimplePreconditionEnforcer.java 23 Sep 2003 01:16:34 -0000 1.10
***************
*** 37,44 ****
protected void innerProcess(CrawlURI curi) {
- if (considerChaff(curi)) {
- return;
- }
-
if (considerDnsPreconditions(curi)) {
return;
--- 37,40 ----
***************
*** 63,81 ****
return;
- }
-
- /**
- * @param curi
- * @return
- */
- private boolean considerChaff(CrawlURI curi) {
- //if (curi.getChaffness()>1) {
- // System.out.println(curi.getChaffness()+" "+curi.getUURI().toString());
- //}
if(curi.getChaffness()>getIntAt(XP_CHAFF_THRESHOLD,DEFAULT_CHAFF_THRESHOLD)) {
- curi.setFetchStatus(S_DEEMED_CHAFF);
- curi.cancelFurtherProcessing();
- return true;
- }
- return false;
}
--- 59,62 ----
Index: SimpleStore.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v
retrieving revision 1.30
retrieving revision 1.31
diff -C2 -d -r1.30 -r1.31
*** SimpleStore.java 19 Sep 2003 01:36:36 -0000 1.30
--- SimpleStore.java 23 Sep 2003 01:16:34 -0000 1.31
***************
*** 381,385 ****
}
! applyCarryforwards(curi,sourceCuri, embed );
allCuris.put(uuri,curi);
--- 381,385 ----
}
! applyCarryforwards(curi,sourceCuri, embed);
allCuris.put(uuri,curi);
***************
*** 500,503 ****
--- 500,515 ----
public Collection getSeeds() {
return seeds;
+ }
+
+ /**
+ * Forget the given CrawlURI. This allows a new instance
+ * to be created in the future, if it is reencountered under
+ * different circumstances.
+ *
+ * @param curi
+ */
+ public void forget(CrawlURI curi) {
+ allCuris.remove(curi.getUURI());
+ curi.setStoreState(URIStoreable.FORGOTTEN);
}
|
|
From: <go...@us...> - 2003-09-23 01:16:40
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv21672/src/org/archive/crawler/datamodel
Modified Files:
FetchStatusCodes.java CrawlURI.java
Log Message:
refactorings(in progress)
Index: FetchStatusCodes.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/FetchStatusCodes.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** FetchStatusCodes.java 6 Sep 2003 02:00:12 -0000 1.9
--- FetchStatusCodes.java 23 Sep 2003 01:16:34 -0000 1.10
***************
*** 31,34 ****
--- 31,37 ----
public static int S_ROBOTS_PRECLUDED = -9998;
public static int S_DEEMED_CHAFF = -4000;
+ public static int S_TOO_MANY_LINK_HOPS = -4001;
+ public static int S_TOO_MANY_EMBED_HOPS = -4002;
+ public static int S_OUT_OF_SCOPE = -5000;
public static int S_DNS_SUCCESS = 1;
Index: CrawlURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlURI.java,v
retrieving revision 1.38
retrieving revision 1.39
diff -C2 -d -r1.38 -r1.39
*** CrawlURI.java 12 Sep 2003 02:02:24 -0000 1.38
--- CrawlURI.java 23 Sep 2003 01:16:35 -0000 1.39
***************
*** 7,22 ****
package org.archive.crawler.datamodel;
- import java.net.URI;
- import java.net.URISyntaxException;
import java.util.ArrayList;
- import java.util.BitSet;
import java.util.List;
- import java.util.logging.Level;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
import org.archive.crawler.basic.FetcherDNS;
import org.archive.crawler.basic.URIStoreable;
- import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Processor;
--- 7,15 ----
***************
*** 26,34 ****
/**
! * Represents a URI and the associated state it collects as
! * it is crawled.
*
! * Except for a few special components, state is in a flexible
! * attibute list.
*
* Should only be instantiated via URIStore.getCrawlURI(...),
--- 19,27 ----
/**
! * Represents a candidate URI and the associated state it
! * collects as it is crawled.
*
! * Core state is in instance variables, but a flexible
! * attribute list is also available.
*
* Should only be instantiated via URIStore.getCrawlURI(...),
***************
*** 40,73 ****
public class CrawlURI
implements URIStoreable, CoreAttributeConstants, FetchStatusCodes {
! private Pattern FUZZY_TOKENS = Pattern.compile("\\w+");
! private long wakeTime;
! public static final String CONTENT_TYPE_LABEL = "content-type";
! private static int FUZZY_WIDTH = 32;
!
! private UURI baseUri;
! private AList alist = new HashtableAList();
! private UURI uuri;
! private BitSet fuzzy; // uri token bitfield as sort of fuzzy checksum
! private CrawlURI via; // curi that led to this (lowest hops from seed)
! private Object state;
! CrawlController controller;
Processor nextProcessor;
- CrawlServer server;
-
private int fetchStatus = 0; // default to unattempted
! private int deferrals = 0;
! private int fetchAttempts = 0; // the number of fetch attempts that have been made
! private int chaffness = 0; // suspiciousness of being of chaff
!
private int linkHopCount = -1; // from seeds
private int embedHopCount = -1; // from a sure link
- private int threadNumber;
private int contentSize = -1;
- private long dontRetryBefore = -1;
/**
--- 33,66 ----
public class CrawlURI
implements URIStoreable, CoreAttributeConstants, FetchStatusCodes {
! // core identity: the "usable URI" to be crawled
! private UURI uuri;
! // Scheduler lifecycle info
! private Object state; // state within scheduling/store/selector
! private long wakeTime; // if "snoozed", when this CrawlURI may awake
! private long dontRetryBefore = -1;
! private int threadNumber;
! // Processing progress
Processor nextProcessor;
private int fetchStatus = 0; // default to unattempted
! private int deferrals = 0; // count of postponements for prerequisites
! private int fetchAttempts = 0; // the number of fetch attempts that have been made
!
! // flexible dynamic attributes
! private AList alist = new HashtableAList();
!
! // dynamic context
! private CrawlURI via; // curi that led to this (lowest hops from seed)
private int linkHopCount = -1; // from seeds
private int embedHopCount = -1; // from a sure link
+
+ ////////////////////////////////////////////////////////////////////
+ CrawlServer server;
+
private int contentSize = -1;
/**
***************
*** 83,100 ****
private void setUuri(UURI u) {
uuri=u;
- setFuzzy();
}
- /**
- * set a fuzzy fingerprint for the correspoding URI based on its word-char segments
- */
- private void setFuzzy() {
- fuzzy = new BitSet(FUZZY_WIDTH);
- Matcher tokens = FUZZY_TOKENS.matcher(uuri.toString());
- tokens.find(); // skip http
- while(tokens.find()) {
- fuzzy.set(Math.abs(tokens.group().hashCode() % FUZZY_WIDTH));
- }
- }
/**
--- 76,81 ----
***************
*** 289,314 ****
/**
- *
- */
- public URI getBaseUri() {
- if (baseUri != null) {
- return baseUri.getUri();
- }
- if (!getAList().containsKey("html-base-href")) {
- return getUURI().getUri();
- }
- String base = getAList().getString("html-base-href");
- try {
- baseUri = UURI.createUURI(base);
- } catch (URISyntaxException e) {
- Object[] array = { this, base };
- controller.uriErrors.log(Level.INFO,e.getMessage(), array );
- // next best thing: use self
- baseUri = getUURI();
- }
- return getBaseUri();
- }
-
- /**
* @
*/
--- 270,273 ----
***************
*** 340,350 ****
/**
- * @param controller
- */
- public void setController(CrawlController c) {
- controller = c;
- }
-
- /**
*
*/
--- 299,302 ----
***************
*** 437,462 ****
// TODO implement
System.out.println("CrawlURI.addLocalizedError() says: \"Implement me!\"");
- }
-
- /**
- * @return
- */
- public int getChaffness() {
- return chaffness;
- }
-
- /**
- * @return
- */
- public BitSet getFuzzy() {
- // TODO Auto-generated method stub
- return fuzzy;
- }
-
- /**
- * @param i
- */
- public void setChaffness(int i) {
- chaffness = i;
}
--- 389,392 ----
|
|
From: <go...@us...> - 2003-09-23 01:15:27
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv21453/src/org/archive/crawler/basic
Modified Files:
FetcherHTTPSimple.java
Log Message:
revise deprecated
Index: FetcherHTTPSimple.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/FetcherHTTPSimple.java,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** FetcherHTTPSimple.java 6 Sep 2003 01:43:07 -0000 1.7
--- FetcherHTTPSimple.java 23 Sep 2003 01:15:19 -0000 1.8
***************
*** 13,16 ****
--- 13,17 ----
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
+ import org.apache.commons.httpclient.HttpVersion;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.cookie.CookiePolicy;
***************
*** 77,81 ****
GetMethod get = new GetMethod(curi.getUURI().getUri().toASCIIString());
get.setFollowRedirects(false); // don't auto-follow redirects
! get.setHttp11(false); // use only HTTP/1.0 (to avoid receiving chunked responses)
get.setRequestHeader(
--- 78,82 ----
GetMethod get = new GetMethod(curi.getUURI().getUri().toASCIIString());
get.setFollowRedirects(false); // don't auto-follow redirects
! get.getParams().setVersion(HttpVersion.HTTP_1_0); // use only HTTP/1.0 (to avoid receiving chunked responses)
get.setRequestHeader(
|
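A minimal sketch (not Heritrix code; the target URL is an arbitrary example) of forcing HTTP/1.0 with the non-deprecated Commons HttpClient call that this commit switches to, in place of the removed setHttp11(false).

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpVersion;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;

public class Http10FetchSketch {
    public static void main(String[] args) throws Exception {
        HttpClient client = new HttpClient(new MultiThreadedHttpConnectionManager());
        GetMethod get = new GetMethod("http://www.archive.org/");
        get.setFollowRedirects(false);                     // don't auto-follow redirects
        get.getParams().setVersion(HttpVersion.HTTP_1_0);  // avoid chunked responses
        try {
            int status = client.executeMethod(get);
            System.out.println("status: " + status);
        } finally {
            get.releaseConnection();
        }
    }
}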
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework
In directory sc8-pr-cvs1:/tmp/cvs-serv14551/src/org/archive/crawler/framework
Modified Files:
CrawlController.java Filter.java URIStore.java Processor.java
Log Message:
SeedExtensionFiltering and related changes
Index: CrawlController.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/CrawlController.java,v
retrieving revision 1.26
retrieving revision 1.27
diff -C2 -d -r1.26 -r1.27
*** CrawlController.java 6 Aug 2003 01:18:43 -0000 1.26
--- CrawlController.java 19 Sep 2003 01:37:20 -0000 1.27
***************
*** 36,41 ****
public class CrawlController implements CrawlerConfigurationConstants {
-
-
private File disk;
public Logger uriProcessing = Logger.getLogger("uri-processing");
--- 36,39 ----
***************
*** 52,56 ****
URIStore store;
URISelector selector;
!
Processor firstProcessor;
LinkedHashMap processors = new LinkedHashMap();
--- 50,54 ----
URIStore store;
URISelector selector;
!
Processor firstProcessor;
LinkedHashMap processors = new LinkedHashMap();
Index: Filter.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/Filter.java,v
retrieving revision 1.7
retrieving revision 1.8
diff -C2 -d -r1.7 -r1.8
*** Filter.java 2 Jul 2003 01:59:42 -0000 1.7
--- Filter.java 19 Sep 2003 01:37:20 -0000 1.8
***************
*** 32,36 ****
protected abstract boolean innerAccepts(Object o);
! public void initialize() {
setName(getStringAt("@name"));
if("not".equals(getStringAt("@modifier"))) {
--- 32,36 ----
protected abstract boolean innerAccepts(Object o);
! public void initialize(CrawlController controller) {
setName(getStringAt("@name"));
if("not".equals(getStringAt("@modifier"))) {
Index: URIStore.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/URIStore.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -C2 -d -r1.4 -r1.5
*** URIStore.java 17 Jul 2003 22:21:06 -0000 1.4
--- URIStore.java 19 Sep 2003 01:37:20 -0000 1.5
***************
*** 7,10 ****
--- 7,12 ----
package org.archive.crawler.framework;
+ import java.util.Collection;
+
/**
* Handles all persistence for Scheduler and Selector, allowing
***************
*** 25,27 ****
--- 27,34 ----
public int discoveredUriCount();
+
+ /**
+ *
+ */
+ Collection getSeeds();
}
Index: Processor.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/Processor.java,v
retrieving revision 1.12
retrieving revision 1.13
diff -C2 -d -r1.12 -r1.13
*** Processor.java 30 Jul 2003 01:30:26 -0000 1.12
--- Processor.java 19 Sep 2003 01:37:20 -0000 1.13
***************
*** 94,98 ****
Object o = iter.next();
Filter f = (Filter)o;
! f.initialize();
}
}
--- 94,98 ----
Object o = iter.next();
Filter f = (Filter)o;
! f.initialize(controller);
}
}
|
|
From: <go...@us...> - 2003-09-19 20:07:57
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util
In directory sc8-pr-cvs1:/tmp/cvs-serv14551/src/org/archive/crawler/util
Modified Files:
URIRegExpFilter.java
Added Files:
SeedExtensionFilter.java
Log Message:
SeedExtensionFiltering and related changes
--- NEW FILE: SeedExtensionFilter.java ---
/*
* SeedExtensionFilter.java
* Created on Sep 15, 2003
*
* $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util/SeedExtensionFilter.java,v 1.1 2003/09/19 01:37:19 gojomo Exp $
*/
package org.archive.crawler.util;
import java.util.Iterator;
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.UURI;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Filter;
/**
* Accepts a new UURI if it is an "extension' of one of the URIs
* in the seed set. Most loosely, this could be any other URI
* under the same domain (as "calendar.yahoo.com' is to 'www.yahoo.com').
* In other cases, only URIs on the exact same host sharing the
* same path prefix (as "www.geocities.com/foouser/about" is to
* "www.geocities.com/foouser/").
*
* Configuration options determine how expansive the extension
* definition is. By default, it is very strict: same host and
* identical path up to the last '/' given in the seed.
*
* @author gojomo
*
*/
public class SeedExtensionFilter extends Filter {
private CrawlController controller;
static private int PATH = 0; // only accept same host, path-extensions
static private int HOST = 1; // accept any URIs from same host
static private int DOMAIN = 2; // accept any URIs from same domain
private int extensionMode = PATH;
/* (non-Javadoc)
* @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object)
*/
protected boolean innerAccepts(Object o) {
UURI u = null;
if(o instanceof UURI) {
u = (UURI)o;
} else if ( o instanceof CrawlURI) {
u = ((CrawlURI)o).getUURI();
}
if(u==null) {
return false;
}
Iterator iter = controller.getStore().getSeeds().iterator();
while(iter.hasNext()) {
UURI s = (UURI)iter.next();
if(s.getUri().getHost().equals(u.getUri().getHost())) {
// hosts match
if (extensionMode == PATH) {
if(s.getUri().getPath().regionMatches(0,u.getUri().getPath(),0,s.getUri().getPath().lastIndexOf('/'))) {
// matches up to last '/'
return true;
} else {
// no match; try next seed
continue;
}
}
// else extensionMode == HOST or DOMAIN, match is good enough
return true;
}
if (extensionMode == DOMAIN) {
// might be a close-enough match
String seedDomain = s.getUri().getHost();
// strip www[#]
seedDomain = seedDomain.replaceFirst("^www\\d*","");
String candidateDomain = u.getUri().getHost();
if (candidateDomain==null) {
// either an opaque, unfetchable, or unparseable URI
continue;
}
if(seedDomain.regionMatches(0,candidateDomain,candidateDomain.length()-seedDomain.length(),seedDomain.length())) {
// domain suffix congruence
return true;
}
// else keep trying other seeds
}
}
// if none found, fail
return false;
}
/* (non-Javadoc)
* @see org.archive.crawler.framework.Filter#initialize()
*/
public void initialize(CrawlController c) {
// TODO Auto-generated method stub
super.initialize(c);
controller = c;
String mode = getStringAt("@mode");
if(mode==null || "path".equals(mode)) {
// default
return;
}
if("host".equals(mode)) {
extensionMode = HOST;
}
if("domain".equals(mode)) {
extensionMode = DOMAIN;
}
}
}
Index: URIRegExpFilter.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/util/URIRegExpFilter.java,v
retrieving revision 1.3
retrieving revision 1.4
diff -C2 -d -r1.3 -r1.4
*** URIRegExpFilter.java 9 Jul 2003 01:17:23 -0000 1.3
--- URIRegExpFilter.java 19 Sep 2003 01:37:20 -0000 1.4
***************
*** 12,15 ****
--- 12,16 ----
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.UURI;
+ import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Filter;
***************
*** 46,52 ****
* @see org.archive.crawler.framework.Filter#initialize()
*/
! public void initialize() {
// TODO Auto-generated method stub
! super.initialize();
String regexp = getStringAt("@regexp");
pattern = Pattern.compile(regexp);
--- 47,53 ----
* @see org.archive.crawler.framework.Filter#initialize()
*/
! public void initialize(CrawlController c) {
// TODO Auto-generated method stub
! super.initialize(c);
String regexp = getStringAt("@regexp");
pattern = Pattern.compile(regexp);
|
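A self-contained sketch (hypothetical class and method names; only the hostnames from the Javadoc above) of the "domain" extension mode: strip a leading "www[0-9]*" from the seed host, then require the candidate host to end with the remaining domain suffix.

public class DomainExtensionSketch {
    static boolean sameDomain(String seedHost, String candidateHost) {
        String seedDomain = seedHost.replaceFirst("^www\\d*", "");   // strip www[#]
        if (candidateHost == null || candidateHost.length() < seedDomain.length()) {
            return false;
        }
        // domain suffix congruence, as in SeedExtensionFilter.innerAccepts()
        return seedDomain.regionMatches(0, candidateHost,
                candidateHost.length() - seedDomain.length(), seedDomain.length());
    }

    public static void main(String[] args) {
        System.out.println(sameDomain("www.yahoo.com", "calendar.yahoo.com")); // true
        System.out.println(sameDomain("www.yahoo.com", "www.archive.org"));    // false
    }
}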
|
From: <go...@us...> - 2003-09-19 20:07:39
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv14551/src/org/archive/crawler/datamodel
Modified Files:
UURISet.java
Added Files:
MemUURISet.java
Log Message:
SeedExtensionFiltering and related changes
--- NEW FILE: MemUURISet.java ---
/*
* MemUURISet.java
* Created on Sep 15, 2003
*
* $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/MemUURISet.java,v 1.1 2003/09/19 01:37:19 gojomo Exp $
*/
package org.archive.crawler.datamodel;
import java.util.HashSet;
/**
* @author gojomo
*
*/
public class MemUURISet extends HashSet implements UURISet {
/* (non-Javadoc)
* @see org.archive.crawler.datamodel.UURISet#size()
*/
public long count() {
return size();
}
/* (non-Javadoc)
* @see org.archive.crawler.datamodel.UURISet#contains(org.archive.crawler.datamodel.UURI)
*/
public boolean contains(UURI u) {
return contains((Object)u);
}
/* (non-Javadoc)
* @see org.archive.crawler.datamodel.UURISet#contains(org.archive.crawler.datamodel.CrawlURI)
*/
public boolean contains(CrawlURI curi) {
return contains((Object)curi);
}
/* (non-Javadoc)
* @see org.archive.crawler.datamodel.UURISet#add(org.archive.crawler.datamodel.UURI)
*/
public void add(UURI u) {
add((Object)u);
}
/* (non-Javadoc)
* @see org.archive.crawler.datamodel.UURISet#remove(org.archive.crawler.datamodel.UURI)
*/
public void remove(UURI u) {
remove((Object)u);
}
/* (non-Javadoc)
* @see org.archive.crawler.datamodel.UURISet#add(org.archive.crawler.datamodel.CrawlURI)
*/
public void add(CrawlURI curi) {
add((Object)curi);
}
/* (non-Javadoc)
* @see org.archive.crawler.datamodel.UURISet#remove(org.archive.crawler.datamodel.CrawlURI)
*/
public void remove(CrawlURI curi) {
remove((Object)curi);
}
}
Index: UURISet.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/UURISet.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** UURISet.java 16 May 2003 01:55:52 -0000 1.1
--- UURISet.java 19 Sep 2003 01:37:19 -0000 1.2
***************
*** 7,10 ****
--- 7,12 ----
package org.archive.crawler.datamodel;
+ import java.util.Set;
+
/**
***************
*** 19,24 ****
*
*/
! public interface UURISet /* extends Set ??? */ {
! public long size();
public boolean contains(UURI u);
public boolean contains(CrawlURI curi);
--- 21,26 ----
*
*/
! public interface UURISet extends Set {
! public long count();
public boolean contains(UURI u);
public boolean contains(CrawlURI curi);
|
|
From: <go...@us...> - 2003-09-19 19:55:33
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/example-crawl
In directory sc8-pr-cvs1:/tmp/cvs-serv14623/example-crawl
Modified Files:
example-order.xml
Log Message:
example of seed-extension filtering
Index: example-order.xml
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/example-crawl/example-order.xml,v
retrieving revision 1.12
retrieving revision 1.13
diff -C2 -d -r1.12 -r1.13
*** example-order.xml 6 Aug 2003 01:16:28 -0000 1.12
--- example-order.xml 19 Sep 2003 01:37:41 -0000 1.13
***************
*** 12,21 ****
<selector class="org.archive.crawler.basic.SimpleSelector">
! <seeds src="example-seeds.txt" />
! <filter
! name="www.loc.gov-only"
class="org.archive.crawler.util.URIRegExpFilter"
! modifier="not"
! regexp="http://www\.loc\.gov/.*" />
<filter
name="pathological-path"
--- 12,46 ----
<selector class="org.archive.crawler.basic.SimpleSelector">
! <seeds>
! # http://my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/&.confirm=1&.done=http:/my.yahoo.com/p/ldep
! # http://dmoz.org
! # http://www.yahoo.com
! # http://www.msnbc.com
! # http://www.lycos.com
! # http://www.drudgereport.com
! # http://www.army.mod.uk
! # http://www.dfid.gov.uk
! # http://www.fco.gov.uk
! # http://www.mod.uk
! # http://www.odpm.gov.uk
! # http://www.pm.gov.uk
! # http://www.raf.mod.uk
! # http://www.royal-navy.mod.uk
! # http://www.sabre.mod.uk
! # http://www.archive.org/..
! # http://www.yahoo.com/../../movies
! # http://www.creativecommons.org/../
! http://www.royal-navy.mod.uk/rn/form/form.html?page=1
! http://www.dfid.gov.uk/../../aboutdfid/files/glossary_l.htm
! #http://directory.google.com/Top/Games/
! # http://www3.google.com/help/customize.html
!
! </seeds>
! <!--
! <filter
! name="yahoo"
class="org.archive.crawler.util.URIRegExpFilter"
! regexp=".*yahoo\.com.*" />
! -->
<filter
name="pathological-path"
***************
*** 28,31 ****
--- 53,77 ----
modifier="not"
regexp="[^/]*?//[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?" />
+
+ <!--
+ <filter
+ name="problemarea"
+ class="org.archive.crawler.util.URIRegExpFilter"
+ regexp="http://www\.royal-navy\.mod\.uk/rn/form/form\.html.*" />
+ <filter
+ name="within-8hosts"
+ class="org.archive.crawler.util.URIRegExpFilter"
+ regexp="http://www\.((army\.mod\.uk)|(dfid\.gov\.uk)|(fco\.gov\.uk)|(mod\.uk
+ )|(odpm\.gov\.uk)|(pm\.gov\.uk)|(raf\.mod\.uk)|(royal-navy\.mod\.uk)|(sabre\.mod
+ \.uk)).*" />
+ -->
+
+ <filter
+ name="focus"
+ class="org.archive.crawler.util.SeedExtensionFilter"
+ mode="domain"
+ />
+
+
</selector>
***************
*** 39,44 ****
class="org.archive.crawler.basic.SimplePreconditionEnforcer"
next="DNS">
! <params delay-factor="5" />
! <params minimum-delay="100" />
</processor>
<processor
--- 85,89 ----
class="org.archive.crawler.basic.SimplePreconditionEnforcer"
next="DNS">
! <params delay-factor="3" minimum-delay="100" />
</processor>
<processor
***************
*** 84,91 ****
--- 129,138 ----
<compression use="true"/>
<arc-files max-size-bytes="20000000"/>
+ <!--
<filter
name="http-only"
class="org.archive.crawler.util.URIRegExpFilter"
regexp="^http://.*" />
+ -->
</processor>
<processor
***************
*** 98,106 ****
<!-- actual enforcement of these limits may depend on choice
of SSS/processor instances that read and respect these limits -->
! <max-link-depth value="0" /> <!-- zero means crawl seeds only -->
! <max-pages value="1000" />
! <max-duration value="1h" />
! <max-resources-per-site value="1000" />
! <max-toe-threads value="3" />
</limits>
--- 145,151 ----
<!-- actual enforcement of these limits may depend on choice
of SSS/processor instances that read and respect these limits -->
! <max-link-depth value="100" /> <!-- zero means crawl seeds only -->
! <max-embed-depth value="5" /> <!-- extra hops that can be taken for embeds -->
! <max-toe-threads value="20" />
</limits>
|
|
From: <go...@us...> - 2003-09-19 19:46:14
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv14400/src/org/archive/crawler/basic
Modified Files:
SimpleStore.java
Log Message:
remember seeds (to support seedlist-based filtering, for example)
Index: SimpleStore.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v
retrieving revision 1.29
retrieving revision 1.30
diff -C2 -d -r1.29 -r1.30
*** SimpleStore.java 15 Sep 2003 23:34:56 -0000 1.29
--- SimpleStore.java 19 Sep 2003 01:36:36 -0000 1.30
***************
*** 7,10 ****
--- 7,11 ----
package org.archive.crawler.basic;
+ import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
***************
*** 17,21 ****
--- 18,24 ----
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
+ import org.archive.crawler.datamodel.MemUURISet;
import org.archive.crawler.datamodel.UURI;
+ import org.archive.crawler.datamodel.UURISet;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.URIStore;
***************
*** 31,34 ****
--- 34,39 ----
private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleStore");
+ UURISet seeds = new MemUURISet();
+
HashMap allCuris = new HashMap(); // of UURI -> CrawlURI
***************
*** 94,97 ****
--- 99,103 ----
*/
public void insertAsSeed(UURI uuri) {
+ seeds.add(uuri);
if(allCuris.get(uuri)!=null) {
// already inserted
***************
*** 99,103 ****
}
CrawlURI curi = new CrawlURI(uuri);
! curi.getAList().putInt("distance-from-seed",0);
allCuris.put(uuri,curi);
pendingQueue.addLast(curi);
--- 105,109 ----
}
CrawlURI curi = new CrawlURI(uuri);
! //curi.getAList().putInt("distance-from-seed",0);
allCuris.put(uuri,curi);
pendingQueue.addLast(curi);
***************
*** 487,490 ****
--- 493,503 ----
reinsert(released);
}
+ }
+
+ /* (non-Javadoc)
+ * @see org.archive.crawler.framework.URIStore#getSeeds()
+ */
+ public Collection getSeeds() {
+ return seeds;
}
|
|
From: <go...@us...> - 2003-09-19 19:44:52
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv14551/src/org/archive/crawler/basic
Modified Files:
SimpleSelector.java
Log Message:
SeedExtensionFiltering and related changes
Index: SimpleSelector.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleSelector.java,v
retrieving revision 1.25
retrieving revision 1.26
diff -C2 -d -r1.25 -r1.26
*** SimpleSelector.java 12 Sep 2003 02:02:24 -0000 1.25
--- SimpleSelector.java 19 Sep 2003 01:37:19 -0000 1.26
***************
*** 430,434 ****
Object o = iter.next();
Filter f = (Filter)o;
! f.initialize();
}
}
--- 430,434 ----
Object o = iter.next();
Filter f = (Filter)o;
! f.initialize(controller);
}
}
|
|
From: <go...@us...> - 2003-09-15 23:35:00
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv4527/src/org/archive/crawler/basic
Modified Files:
SimpleStore.java
Log Message:
organize imports
Index: SimpleStore.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v
retrieving revision 1.28
retrieving revision 1.29
diff -C2 -d -r1.28 -r1.29
*** SimpleStore.java 12 Sep 2003 02:02:24 -0000 1.28
--- SimpleStore.java 15 Sep 2003 23:34:56 -0000 1.29
***************
*** 7,16 ****
package org.archive.crawler.basic;
- import java.util.BitSet;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.SortedSet;
import java.util.TreeSet;
! import java.util.Iterator;
import org.archive.crawler.datamodel.CoreAttributeConstants;
--- 7,16 ----
package org.archive.crawler.basic;
import java.util.HashMap;
+ import java.util.Iterator;
import java.util.LinkedList;
import java.util.SortedSet;
import java.util.TreeSet;
! import java.util.logging.Logger;
import org.archive.crawler.datamodel.CoreAttributeConstants;
***************
*** 20,25 ****
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.URIStore;
-
- import java.util.logging.Logger;
/**
--- 20,23 ----
|
|
From: <go...@us...> - 2003-09-12 02:04:04
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv28250/src/org/archive/crawler/basic
Modified Files:
SimpleSelector.java SimpleStore.java
Log Message:
improved link-hop, embed-hop handling: embeds get a separate leash distance from last link
Index: SimpleSelector.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleSelector.java,v
retrieving revision 1.24
retrieving revision 1.25
diff -C2 -d -r1.24 -r1.25
*** SimpleSelector.java 6 Sep 2003 01:46:38 -0000 1.24
--- SimpleSelector.java 12 Sep 2003 02:02:24 -0000 1.25
***************
*** 38,41 ****
--- 38,43 ----
ArrayList filters = new ArrayList();
private int maxLinkDepth = -1;
+ private int maxEmbedDepth = -1;
+
private int maxDeferrals = 10; // should be at least max-retries plus 3 or so
private int maxRetries = 3;
***************
*** 80,90 ****
}
// handle embeds
! if (curi.getAList().containsKey(A_HTML_EMBEDS)) {
handleEmbeds(curi);
}
// handle links, if not too deep
! if ((maxLinkDepth >= 0)
! && (curi.getAList().getInt(A_DISTANCE_FROM_SEED) < maxLinkDepth)
! && curi.getAList().containsKey(A_HTML_LINKS)) {
handleLinks(curi);
}
--- 82,94 ----
}
// handle embeds
! if ((maxEmbedDepth >= 0)
! && (curi.getEmbedHopCount()<maxEmbedDepth)
! && curi.getAList().containsKey(A_HTML_EMBEDS)) {
handleEmbeds(curi);
}
// handle links, if not too deep
! if (curi.getAList().containsKey(A_HTML_LINKS)
! && ((maxLinkDepth == -1)
! || (curi.getLinkHopCount() < maxLinkDepth))) {
handleLinks(curi);
}
***************
*** 174,178 ****
logger.fine("inserting header at head "+u);
//store.insertAtHead(u,curi.getAList().getInt("distance-from-seed"));
! store.insert(u,curi,0);
//}
} catch (URISyntaxException ex) {
--- 178,182 ----
logger.fine("inserting header at head "+u);
//store.insertAtHead(u,curi.getAList().getInt("distance-from-seed"));
! store.insert(u,curi,true);
//}
} catch (URISyntaxException ex) {
***************
*** 280,284 ****
if(filtersAccept(link)) {
logger.fine("inserting link "+link+" "+curi.getStoreState());
! store.insert(link,curi,1);
}
} catch (URISyntaxException ex) {
--- 284,288 ----
if(filtersAccept(link)) {
logger.fine("inserting link "+link+" "+curi.getStoreState());
! store.insert(link,curi,false);
}
} catch (URISyntaxException ex) {
***************
*** 304,308 ****
logger.fine("inserting embed at head "+embed);
// For now, insert at tail instead of head
! store.insert(embed,curi,0);
/*
store.insertAtHead(embed,curi.getAList().getInt("distance-from-seed"));
--- 308,312 ----
logger.fine("inserting embed at head "+embed);
// For now, insert at tail instead of head
! store.insert(embed,curi,true);
/*
store.insertAtHead(embed,curi.getAList().getInt("distance-from-seed"));
***************
*** 333,337 ****
logger.fine("inserting prereq at head "+prereq);
//CrawlURI prereqCuri = store.insertAtHead(prereq,curi.getAList().getInt("distance-from-seed"));
! CrawlURI prereqCuri = store.insert(prereq,curi,0);
if (prereqCuri.getStoreState()==URIStoreable.FINISHED) {
curi.setFetchStatus(S_PREREQUISITE_FAILURE);
--- 337,341 ----
logger.fine("inserting prereq at head "+prereq);
//CrawlURI prereqCuri = store.insertAtHead(prereq,curi.getAList().getInt("distance-from-seed"));
! CrawlURI prereqCuri = store.insert(prereq,curi,false);
if (prereqCuri.getStoreState()==URIStoreable.FINISHED) {
curi.setFetchStatus(S_PREREQUISITE_FAILURE);
***************
*** 418,422 ****
controller = c;
store = (SimpleStore)c.getStore();
! maxLinkDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-link-depth/@value");
instantiateAllInto(XP_FILTERS,filters);
--- 422,427 ----
controller = c;
store = (SimpleStore)c.getStore();
! maxLinkDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-link-depth/@value", maxLinkDepth);
! maxEmbedDepth = controller.getOrder().getBehavior().getIntAt("//limits/max-embed-depth/@value", maxEmbedDepth);
instantiateAllInto(XP_FILTERS,filters);
Index: SimpleStore.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v
retrieving revision 1.27
retrieving revision 1.28
diff -C2 -d -r1.27 -r1.28
*** SimpleStore.java 6 Sep 2003 01:52:01 -0000 1.27
--- SimpleStore.java 12 Sep 2003 02:02:24 -0000 1.28
***************
*** 360,364 ****
* @param i
*/
! public CrawlURI insert(UURI uuri, CrawlURI sourceCuri, int extraHop) {
if(filteredOut(uuri)) return null;
CrawlURI curi = (CrawlURI)allCuris.get(uuri);
--- 360,364 ----
* @param i
*/
! public CrawlURI insert(UURI uuri, CrawlURI sourceCuri, boolean embed) {
if(filteredOut(uuri)) return null;
CrawlURI curi = (CrawlURI)allCuris.get(uuri);
***************
*** 377,381 ****
}
! applyCarryforwards(curi,sourceCuri, extraHop );
allCuris.put(uuri,curi);
--- 377,381 ----
}
! applyCarryforwards(curi,sourceCuri, embed );
allCuris.put(uuri,curi);
***************
*** 396,431 ****
* @param sourceCuri
*/
! private void applyCarryforwards(CrawlURI curi, CrawlURI sourceCuri, int extraHop) {
! int newDist = sourceCuri.getAList().getInt("distance-from-seed")+extraHop;
! if(curi.getAList().containsKey(A_DISTANCE_FROM_SEED)) {
! int oldDist = curi.getAList().getInt(A_DISTANCE_FROM_SEED);
! if (oldDist>newDist) {
! curi.getAList().putInt(A_DISTANCE_FROM_SEED,newDist);
! curi.setVia(sourceCuri);
! } // otherwise leave alone
} else {
! curi.getAList().putInt(A_DISTANCE_FROM_SEED,newDist);
! curi.setVia(sourceCuri);
}
!
! int newChaffness = sourceCuri.getChaffness();
! if(sourceCuri.getUURI().getUri().getHost()==null ||
! sourceCuri.getUURI().getUri().getHost().equals(curi.getUURI().getUri().getHost())) {
! newChaffness = 0;
! } else {
! BitSet scratch = (BitSet) sourceCuri.getFuzzy().clone();
! scratch.xor(curi.getFuzzy());
! int fuzzyDiff = scratch.cardinality();
! if(fuzzyDiff<2) {
! newChaffness += 1;
! } else {
! newChaffness -= 1;
! }
! }
! if(newChaffness<0) {
! newChaffness = 0;
! }
! curi.setChaffness(newChaffness);
}
--- 396,424 ----
* @param sourceCuri
*/
! private void applyCarryforwards(CrawlURI curi, CrawlURI sourceCuri, boolean embed) {
! if (embed) {
! curi.setViaEmbedFrom(sourceCuri);
} else {
! curi.setViaLinkFrom(sourceCuri);
}
! // int newChaffness = sourceCuri.getChaffness();
! // if(sourceCuri.getUURI().getUri().getHost()==null ||
! // sourceCuri.getUURI().getUri().getHost().equals(curi.getUURI().getUri().getHost())) {
! // newChaffness = 0;
! // } else {
! // BitSet scratch = (BitSet) sourceCuri.getFuzzy().clone();
! // scratch.xor(curi.getFuzzy());
! // int fuzzyDiff = scratch.cardinality();
! // if(fuzzyDiff<2) {
! // newChaffness += 1;
! // } else {
! // newChaffness -= 1;
! // }
! // }
! // if(newChaffness<0) {
! // newChaffness = 0;
! // }
! // curi.setChaffness(newChaffness);
}
|
|
From: <go...@us...> - 2003-09-12 02:03:27
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor
In directory sc8-pr-cvs1:/tmp/cvs-serv28436/src/org/archive/crawler/extractor
Modified Files:
ExtractorHTML.java
Log Message:
stray thought TODO
Index: ExtractorHTML.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v
retrieving revision 1.14
retrieving revision 1.15
diff -C2 -d -r1.14 -r1.15
*** ExtractorHTML.java 9 Sep 2003 23:12:20 -0000 1.14
--- ExtractorHTML.java 12 Sep 2003 02:03:24 -0000 1.15
***************
*** 201,204 ****
--- 201,205 ----
while (candidates.find()) {
curi.addEmbed(candidates.group(2));
+ // TODO: treat "looks like" html URIs as links?
}
}
|
|
From: <go...@us...> - 2003-09-12 02:03:04
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv28332/src/org/archive/crawler/datamodel
Modified Files:
UURI.java
Log Message:
strip leading (/..)+
Index: UURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/UURI.java,v
retrieving revision 1.20
retrieving revision 1.21
diff -C2 -d -r1.20 -r1.21
*** UURI.java 6 Sep 2003 02:00:12 -0000 1.20
--- UURI.java 12 Sep 2003 02:03:01 -0000 1.21
***************
*** 89,96 ****
if (u.getSchemeSpecificPart().startsWith("/")) {
// hierarchical URI
! if ("".equals(u.getPath())) {
! u = u.resolve("/"); // ensure root URLs end with '/'
}
- u = u.normalize(); // factor out path cruft
String canonizedAuthority = u.getAuthority();
if(canonizedAuthority==null) {
--- 89,99 ----
if (u.getSchemeSpecificPart().startsWith("/")) {
// hierarchical URI
! u = u.normalize(); // factor out path cruft, according to official spec
! // now, go further and eliminate extra '..' segments
! String fixedPath = u.getPath().replaceFirst("^(/\\.\\.)+","");
! if ("".equals(fixedPath)) {
! // ensure root URLs end with '/'
! fixedPath = "/";
}
String canonizedAuthority = u.getAuthority();
if(canonizedAuthority==null) {
***************
*** 137,141 ****
u = new URI(u.getScheme().toLowerCase(), // case-flatten scheme
canonizedAuthority, // case and port flatten
! u.getPath(), // leave alone
u.getQuery(), // leave alone
null); // drop fragment
--- 140,144 ----
u = new URI(u.getScheme().toLowerCase(), // case-flatten scheme
canonizedAuthority, // case and port flatten
! fixedPath, // leave alone
u.getQuery(), // leave alone
null); // drop fragment
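A standalone sketch of the path fix this commit makes, using the dfid.gov.uk seed URL from example-order.xml: java.net.URI.normalize() leaves unmatchable leading ".." segments in place, so the crawler strips them explicitly before rebuilding the URI. The class name is hypothetical.

import java.net.URI;

public class LeadingDotDotSketch {
    public static void main(String[] args) throws Exception {
        URI u = new URI("http://www.dfid.gov.uk/../../aboutdfid/files/glossary_l.htm");
        u = u.normalize();                                 // official spec normalization
        String fixedPath = u.getPath().replaceFirst("^(/\\.\\.)+", "");
        if ("".equals(fixedPath)) {
            fixedPath = "/";                               // ensure root URLs end with '/'
        }
        System.out.println(u.getPath());   // leading /../.. survives normalize()
        System.out.println(fixedPath);     // /aboutdfid/files/glossary_l.htm
    }
}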
|
|
From: <go...@us...> - 2003-09-12 02:02:29
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv28250/src/org/archive/crawler/datamodel
Modified Files:
CrawlURI.java
Log Message:
improved link-hop, embed-hop handling: embeds get a separate leash distance from last link
Index: CrawlURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlURI.java,v
retrieving revision 1.37
retrieving revision 1.38
diff -C2 -d -r1.37 -r1.38
*** CrawlURI.java 6 Sep 2003 02:00:13 -0000 1.37
--- CrawlURI.java 12 Sep 2003 02:02:24 -0000 1.38
***************
*** 62,65 ****
--- 62,68 ----
private int chaffness = 0; // suspiciousness of being of chaff
+ private int linkHopCount = -1; // from seeds
+ private int embedHopCount = -1; // from a sure link
+
private int threadNumber;
***************
*** 461,470 ****
* @param sourceCuri
*/
! public void setVia(CrawlURI sourceCuri) {
via = sourceCuri;
}
/* public boolean isFubared(){
return ( fetchStatus < 0 && numberOfFetchAttempts >= 3);
}*/
}
--- 464,518 ----
* @param sourceCuri
*/
! public void setViaLinkFrom(CrawlURI sourceCuri) {
via = sourceCuri;
+ int candidateLinkHopCount = sourceCuri.getLinkHopCount()+1;
+ embedHopCount = 0;
+ if (linkHopCount == -1) {
+ linkHopCount = candidateLinkHopCount;
+ return;
+ }
+ if (linkHopCount > candidateLinkHopCount) {
+ linkHopCount = candidateLinkHopCount;
+ }
+ }
+
+ /**
+ * @param sourceCuri
+ */
+ public void setViaEmbedFrom(CrawlURI sourceCuri) {
+ via = sourceCuri;
+ int candidateLinkHopCount = sourceCuri.getLinkHopCount();
+ if (linkHopCount == -1) {
+ linkHopCount = candidateLinkHopCount;
+ } else if (linkHopCount > candidateLinkHopCount) {
+ linkHopCount = candidateLinkHopCount;
+ }
+ int candidateEmbedHopCount = sourceCuri.getEmbedHopCount()+1;
+ if (embedHopCount == -1) {
+ embedHopCount = candidateEmbedHopCount;
+ } else if (embedHopCount > candidateEmbedHopCount) {
+ embedHopCount = candidateEmbedHopCount;
+ }
}
+
/* public boolean isFubared(){
return ( fetchStatus < 0 && numberOfFetchAttempts >= 3);
}*/
+
+
+ /**
+ * @return
+ */
+ public int getEmbedHopCount() {
+ return embedHopCount;
+ }
+
+ /**
+ * @return
+ */
+ public int getLinkHopCount() {
+ return linkHopCount;
+ }
+
}
|
|
From: <go...@us...> - 2003-09-09 23:12:23
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor
In directory sc8-pr-cvs1:/tmp/cvs-serv10481/src/org/archive/crawler/extractor
Modified Files:
ExtractorHTML.java
Log Message:
handle &amp; in codebase, resource, onEvent attributes
Index: ExtractorHTML.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -d -r1.13 -r1.14
*** ExtractorHTML.java 6 Sep 2003 02:01:07 -0000 1.13
--- ExtractorHTML.java 9 Sep 2003 23:12:20 -0000 1.14
***************
*** 103,107 ****
// Just in case it's an OBJECT tag
! CharSequence codebase = null;
ArrayList resources = null;
--- 103,107 ----
// Just in case it's an OBJECT tag
! String codebase = null;
ArrayList resources = null;
***************
*** 127,132 ****
} else if (attr.start(6)>-1) {
// CODEBASE
! codebase = value;
! processEmbed(curi,codebase.toString());
} else if (attr.start(7)>-1) {
// CLASSID,DATA
--- 127,133 ----
} else if (attr.start(6)>-1) {
// CODEBASE
! codebase = value.toString();
! codebase = codebase.replaceAll("&","&"); // TODO: more HTML deescaping?
! processEmbed(curi,codebase);
} else if (attr.start(7)>-1) {
// CLASSID,DATA
***************
*** 166,169 ****
--- 167,171 ----
while(iter.hasNext()) {
String res = iter.next().toString();
+ res = res.replaceAll("&amp;","&"); // TODO: more HTML deescaping?
if (codebaseURI != null) {
res = codebaseURI.resolve(res).toString();
***************
*** 194,198 ****
*/
private void processScriptCode(CrawlURI curi, CharSequence cs) {
! Matcher candidates = JAVASCRIPT_LIKELY_URI_EXTRACTOR.matcher(cs);
while (candidates.find()) {
curi.addEmbed(candidates.group(2));
--- 196,202 ----
*/
private void processScriptCode(CrawlURI curi, CharSequence cs) {
! String code = cs.toString();
! code = code.replaceAll("&amp;","&"); // TODO: more HTML deescaping?
! Matcher candidates = JAVASCRIPT_LIKELY_URI_EXTRACTOR.matcher(code);
while (candidates.find()) {
curi.addEmbed(candidates.group(2));
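Since the de-escaping above is easy to misread, a tiny invented example of what the single replaceAll accomplishes (fuller entity handling is left to the TODOs):

    String value = "search.cgi?q=crawler&amp;page=2";   // as it appears in the HTML attribute
    String link = value.replaceAll("&amp;", "&");       // -> "search.cgi?q=crawler&page=2"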
|
|
From: <go...@us...> - 2003-09-09 04:13:49
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv6933/src/org/archive/crawler/datamodel
Modified Files:
CrawlHost.java
Log Message:
instantiate InetAddress for dotted-numeric IPs
Index: CrawlHost.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlHost.java,v
retrieving revision 1.16
retrieving revision 1.17
diff -C2 -d -r1.16 -r1.17
*** CrawlHost.java 6 Aug 2003 01:18:43 -0000 1.16
--- CrawlHost.java 8 Sep 2003 23:35:00 -0000 1.17
***************
*** 8,11 ****
--- 8,12 ----
import java.net.InetAddress;
+ import java.net.UnknownHostException;
/**
***************
*** 25,29 ****
public CrawlHost(String hostname) {
name = hostname;
! // TODO: immediately handle numeric hosts
}
--- 26,49 ----
public CrawlHost(String hostname) {
name = hostname;
! if (hostname.matches("[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}")) {
! try {
! String[] octets = hostname.split("\\.");
!
! setIP(
! InetAddress.getByAddress(
! hostname,
! new byte[] {
! (byte) (new Integer(octets[0])).intValue(),
! (byte) (new Integer(octets[1])).intValue(),
! (byte) (new Integer(octets[2])).intValue(),
! (byte) (new Integer(octets[3])).intValue()})
! );
! } catch (UnknownHostException e) {
! // this should never happen as a dns lookup is not made
! e.printStackTrace();
! }
! // never expire numeric IPs
! setIpExpires(Long.MAX_VALUE);
! }
}
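A self-contained sketch of the dotted-quad branch above; the class and method names here are invented, while InetAddress.getByAddress and the octet-to-byte cast come from the commit. Octet values above 127 wrap to negative bytes, which getByAddress accepts, and no DNS lookup is performed:

    import java.net.InetAddress;
    import java.net.UnknownHostException;

    class NumericHostSketch {
        /** Build an InetAddress for a dotted-quad hostname without a DNS lookup. */
        static InetAddress forDottedQuad(String hostname) throws UnknownHostException {
            String[] octets = hostname.split("\\.");
            byte[] addr = new byte[4];
            for (int i = 0; i < 4; i++) {
                addr[i] = (byte) Integer.parseInt(octets[i]);  // e.g. 192 -> (byte) -64
            }
            return InetAddress.getByAddress(hostname, addr);
        }
    }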
|
|
From: <go...@us...> - 2003-09-06 02:01:10
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor
In directory sc8-pr-cvs1:/tmp/cvs-serv17531/src/org/archive/crawler/extractor
Modified Files:
ExtractorHTML.java
Log Message:
in-attribute '&' handling
Index: ExtractorHTML.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v
retrieving revision 1.12
retrieving revision 1.13
diff -C2 -d -r1.12 -r1.13
*** ExtractorHTML.java 3 Sep 2003 01:51:05 -0000 1.12
--- ExtractorHTML.java 6 Sep 2003 02:01:07 -0000 1.13
***************
*** 205,212 ****
*/
private void processLink(CrawlURI curi, CharSequence value) {
! if(value.toString().matches("(?i)^javascript:.*")) {
processScriptCode(curi,value.subSequence(11,value.length()));
} else {
! curi.addLink(value.toString());
}
}
--- 205,214 ----
*/
private void processLink(CrawlURI curi, CharSequence value) {
! String link = value.toString();
! link = link.replaceAll("&amp;","&"); // TODO: more HTML deescaping?
! if(link.matches("(?i)^javascript:.*")) {
processScriptCode(curi,value.subSequence(11,value.length()));
} else {
! curi.addLink(link);
}
}
***************
*** 219,223 ****
*/
private void processEmbed(CrawlURI curi, CharSequence value) {
! curi.addEmbed(value.toString());
}
--- 221,227 ----
*/
private void processEmbed(CrawlURI curi, CharSequence value) {
! String embed = value.toString();
! embed = embed.replaceAll("&amp;","&"); // TODO: more HTML deescaping?
! curi.addEmbed(embed);
}
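As an aside on the javascript: branch kept above, a made-up attribute value showing why the subSequence starts at index 11, the length of "javascript:":

    String value = "javascript:openWin('help.html')";
    value.subSequence(11, value.length());   // -> "openWin('help.html')", handed to processScriptCode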
|
|
From: <go...@us...> - 2003-09-06 02:00:27
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv17364/src/org/archive/crawler/datamodel
Modified Files:
FetchStatusCodes.java UURI.java CrawlURI.java
Log Message:
chaff detection support
Index: FetchStatusCodes.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/FetchStatusCodes.java,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** FetchStatusCodes.java 18 Jul 2003 18:27:59 -0000 1.8
--- FetchStatusCodes.java 6 Sep 2003 02:00:12 -0000 1.9
***************
*** 30,33 ****
--- 30,34 ----
public static int S_ROBOTS_PRECLUDED = -9998;
+ public static int S_DEEMED_CHAFF = -4000;
public static int S_DNS_SUCCESS = 1;
Index: UURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/UURI.java,v
retrieving revision 1.19
retrieving revision 1.20
diff -C2 -d -r1.19 -r1.20
*** UURI.java 21 Aug 2003 23:28:59 -0000 1.19
--- UURI.java 6 Sep 2003 02:00:12 -0000 1.20
***************
*** 39,45 ****
* @param u
*/
! public UURI(URI u) {
uri = u;
}
/**
--- 39,47 ----
* @param u
*/
! private UURI(URI u) {
uri = u;
}
+
+
/**
Index: CrawlURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlURI.java,v
retrieving revision 1.36
retrieving revision 1.37
diff -C2 -d -r1.36 -r1.37
*** CrawlURI.java 6 Aug 2003 01:19:02 -0000 1.36
--- CrawlURI.java 6 Sep 2003 02:00:13 -0000 1.37
***************
*** 10,15 ****
--- 10,18 ----
import java.net.URISyntaxException;
import java.util.ArrayList;
+ import java.util.BitSet;
import java.util.List;
import java.util.logging.Level;
+ import java.util.regex.Matcher;
+ import java.util.regex.Pattern;
import org.archive.crawler.basic.FetcherDNS;
***************
*** 37,47 ****
--- 40,55 ----
public class CrawlURI
implements URIStoreable, CoreAttributeConstants, FetchStatusCodes {
+ private Pattern FUZZY_TOKENS = Pattern.compile("\\w+");
+
private long wakeTime;
public static final String CONTENT_TYPE_LABEL = "content-type";
+ private static int FUZZY_WIDTH = 32;
private UURI baseUri;
private AList alist = new HashtableAList();
private UURI uuri;
+ private BitSet fuzzy; // uri token bitfield as sort of fuzzy checksum
+ private CrawlURI via; // curi that led to this (lowest hops from seed)
private Object state;
CrawlController controller;
***************
*** 52,56 ****
private int deferrals = 0;
private int fetchAttempts = 0; // the number of fetch attempts that have been made
!
private int threadNumber;
--- 60,65 ----
private int deferrals = 0;
private int fetchAttempts = 0; // the number of fetch attempts that have been made
! private int chaffness = 0; // suspiciousness of being chaff
!
private int threadNumber;
***************
*** 63,70 ****
*/
public CrawlURI(UURI u) {
! uuri=u;
}
/**
* Set the time this curi is considered expired (and thus must be refetched)
* to 'expires'. This function will set the time to an arbitrary value.
--- 72,99 ----
*/
public CrawlURI(UURI u) {
! setUuri(u);
}
/**
+ * @param u
+ */
+ private void setUuri(UURI u) {
+ uuri=u;
+ setFuzzy();
+ }
+
+ /**
+ * set a fuzzy fingerprint for the corresponding URI based on its word-char segments
+ */
+ private void setFuzzy() {
+ fuzzy = new BitSet(FUZZY_WIDTH);
+ Matcher tokens = FUZZY_TOKENS.matcher(uuri.toString());
+ tokens.find(); // skip http
+ while(tokens.find()) {
+ fuzzy.set(Math.abs(tokens.group().hashCode() % FUZZY_WIDTH));
+ }
+ }
+
+ /**
* Set the time this curi is considered expired (and thus must be refetched)
* to 'expires'. This function will set the time to an arbitrary value.
***************
*** 93,103 ****
! /**
! * @param uri
! * @return
! */
! public CrawlURI(URI u){
! uuri = new UURI(u);
! }
--- 122,126 ----
!
***************
*** 123,129 ****
public CrawlURI(String s){
try{
! uuri = new UURI(new URI(s));
}catch(Exception e){
! uuri = null;
}
}
--- 146,152 ----
public CrawlURI(String s){
try{
! setUuri(UURI.createUURI(s));
}catch(Exception e){
! setUuri(null);
}
}
***************
*** 411,414 ****
--- 434,466 ----
// TODO implement
System.out.println("CrawlURI.addLocalizedError() says: \"Implement me!\"");
+ }
+
+ /**
+ * @return
+ */
+ public int getChaffness() {
+ return chaffness;
+ }
+
+ /**
+ * @return
+ */
+ public BitSet getFuzzy() {
+ // TODO Auto-generated method stub
+ return fuzzy;
+ }
+
+ /**
+ * @param i
+ */
+ public void setChaffness(int i) {
+ chaffness = i;
+ }
+
+ /**
+ * @param sourceCuri
+ */
+ public void setVia(CrawlURI sourceCuri) {
+ via = sourceCuri;
}
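To make the fuzzy-checksum idea concrete, a standalone sketch of the same fingerprinting; the class name is invented, while the width, token pattern and bit-setting mirror the diff. Two URIs that share most of their word-character tokens end up with nearly identical BitSets:

    import java.util.BitSet;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    class FuzzyFingerprintSketch {
        static final int FUZZY_WIDTH = 32;
        static final Pattern FUZZY_TOKENS = Pattern.compile("\\w+");

        static BitSet fingerprint(String uri) {
            BitSet fuzzy = new BitSet(FUZZY_WIDTH);
            Matcher tokens = FUZZY_TOKENS.matcher(uri);
            tokens.find();                  // skip the scheme token ("http")
            while (tokens.find()) {
                // hash each remaining token into one of FUZZY_WIDTH bit positions
                fuzzy.set(Math.abs(tokens.group().hashCode() % FUZZY_WIDTH));
            }
            return fuzzy;
        }
    }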
|
|
From: <go...@us...> - 2003-09-06 01:52:04
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv16218/src/org/archive/crawler/basic
Modified Files:
SimpleStore.java
Log Message:
improve robustness against wacky URIs
Index: SimpleStore.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v
retrieving revision 1.26
retrieving revision 1.27
diff -C2 -d -r1.26 -r1.27
*** SimpleStore.java 6 Sep 2003 01:48:41 -0000 1.26
--- SimpleStore.java 6 Sep 2003 01:52:01 -0000 1.27
***************
*** 411,415 ****
int newChaffness = sourceCuri.getChaffness();
! if(!sourceCuri.getUURI().getUri().getHost().equals(curi.getUURI().getUri().getHost())) {
newChaffness = 0;
} else {
--- 411,416 ----
int newChaffness = sourceCuri.getChaffness();
! if(sourceCuri.getUURI().getUri().getHost()==null ||
! sourceCuri.getUURI().getUri().getHost().equals(curi.getUURI().getUri().getHost())) {
newChaffness = 0;
} else {
|
|
From: <go...@us...> - 2003-09-06 01:48:48
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv15682/src/org/archive/crawler/basic
Modified Files:
SimpleStore.java
Log Message:
carryforward chaffness indicator
Index: SimpleStore.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v
retrieving revision 1.25
retrieving revision 1.26
diff -C2 -d -r1.25 -r1.26
*** SimpleStore.java 6 Aug 2003 01:23:10 -0000 1.25
--- SimpleStore.java 6 Sep 2003 01:48:41 -0000 1.26
***************
*** 7,10 ****
--- 7,11 ----
package org.archive.crawler.basic;
+ import java.util.BitSet;
import java.util.HashMap;
import java.util.LinkedList;
***************
*** 359,371 ****
* @param i
*/
! public void insert(UURI uuri, int dist) {
! if(filteredOut(uuri)) return;
CrawlURI curi = (CrawlURI)allCuris.get(uuri);
if(curi!=null) {
// already inserted
// TODO: perhaps yank to front?
// if curi is still locked out, ignore request to schedule
if(curi.getStoreState()!=URIStoreable.FINISHED || curi.dontFetchYet()){
! return;
}
// yank URI back into scheduling if necessary
--- 360,373 ----
* @param i
*/
! public CrawlURI insert(UURI uuri, CrawlURI sourceCuri, int extraHop) {
! if(filteredOut(uuri)) return null;
CrawlURI curi = (CrawlURI)allCuris.get(uuri);
if(curi!=null) {
// already inserted
// TODO: perhaps yank to front?
+ // TODO: increment inlink counter?
// if curi is still locked out, ignore request to schedule
if(curi.getStoreState()!=URIStoreable.FINISHED || curi.dontFetchYet()){
! return curi;
}
// yank URI back into scheduling if necessary
***************
*** 374,382 ****
curi = new CrawlURI(uuri);
}
! int newDist = dist;
! if(curi.getAList().containsKey(A_DISTANCE_FROM_SEED)) {
! newDist = Math.max(dist,curi.getAList().getInt(A_DISTANCE_FROM_SEED));
! }
! curi.getAList().putInt(A_DISTANCE_FROM_SEED,newDist);
allCuris.put(uuri,curi);
KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey());
--- 376,382 ----
curi = new CrawlURI(uuri);
}
!
! applyCarryforwards(curi,sourceCuri, extraHop );
!
allCuris.put(uuri,curi);
KeyedQueue classQueue = (KeyedQueue) allClassQueuesMap.get(curi.getClassKey());
***************
*** 385,392 ****
curi.setStoreState(URIStoreable.PENDING);
notify();
! return;
}
classQueue.addLast(curi);
curi.setStoreState(classQueue.getStoreState());
}
--- 385,430 ----
curi.setStoreState(URIStoreable.PENDING);
notify();
! return curi;
}
classQueue.addLast(curi);
curi.setStoreState(classQueue.getStoreState());
+ return curi;
+ }
+
+ /**
+ * @param curi
+ * @param sourceCuri
+ */
+ private void applyCarryforwards(CrawlURI curi, CrawlURI sourceCuri, int extraHop) {
+ int newDist = sourceCuri.getAList().getInt("distance-from-seed")+extraHop;
+ if(curi.getAList().containsKey(A_DISTANCE_FROM_SEED)) {
+ int oldDist = curi.getAList().getInt(A_DISTANCE_FROM_SEED);
+ if (oldDist>newDist) {
+ curi.getAList().putInt(A_DISTANCE_FROM_SEED,newDist);
+ curi.setVia(sourceCuri);
+ } // otherwise leave alone
+ } else {
+ curi.getAList().putInt(A_DISTANCE_FROM_SEED,newDist);
+ curi.setVia(sourceCuri);
+ }
+
+
+ int newChaffness = sourceCuri.getChaffness();
+ if(!sourceCuri.getUURI().getUri().getHost().equals(curi.getUURI().getUri().getHost())) {
+ newChaffness = 0;
+ } else {
+ BitSet scratch = (BitSet) sourceCuri.getFuzzy().clone();
+ scratch.xor(curi.getFuzzy());
+ int fuzzyDiff = scratch.cardinality();
+ if(fuzzyDiff<2) {
+ newChaffness += 1;
+ } else {
+ newChaffness -= 1;
+ }
+ }
+ if(newChaffness<0) {
+ newChaffness = 0;
+ }
+ curi.setChaffness(newChaffness);
}
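Put differently, the same-host branch above measures how similar a child URI is to its source by XOR-ing their fuzzy fingerprints: few differing bits means a near-duplicate, so chaffness climbs; otherwise it falls, never below zero. A compressed restatement of that branch, with variable names as in the diff:

    BitSet scratch = (BitSet) sourceCuri.getFuzzy().clone();
    scratch.xor(curi.getFuzzy());
    int fuzzyDiff = scratch.cardinality();          // low when most word tokens are shared
    int newChaffness = sourceCuri.getChaffness() + (fuzzyDiff < 2 ? 1 : -1);
    curi.setChaffness(Math.max(0, newChaffness));   // clamped at zero, as above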
|
|
From: <go...@us...> - 2003-09-06 01:46:44
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv15442/src/org/archive/crawler/basic
Modified Files:
SimpleSelector.java
Log Message:
insert at tail rather than head (for now)
Index: SimpleSelector.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleSelector.java,v
retrieving revision 1.23
retrieving revision 1.24
diff -C2 -d -r1.23 -r1.24
*** SimpleSelector.java 13 Aug 2003 19:17:29 -0000 1.23
--- SimpleSelector.java 6 Sep 2003 01:46:38 -0000 1.24
***************
*** 173,177 ****
//if(filtersAccept(u)) {
logger.fine("inserting header at head "+u);
! store.insertAtHead(u,curi.getAList().getInt("distance-from-seed"));
//}
} catch (URISyntaxException ex) {
--- 173,178 ----
//if(filtersAccept(u)) {
logger.fine("inserting header at head "+u);
! //store.insertAtHead(u,curi.getAList().getInt("distance-from-seed"));
! store.insert(u,curi,0);
//}
} catch (URISyntaxException ex) {
***************
*** 279,283 ****
if(filtersAccept(link)) {
logger.fine("inserting link "+link+" "+curi.getStoreState());
! store.insert(link,curi.getAList().getInt("distance-from-seed")+1);
}
} catch (URISyntaxException ex) {
--- 280,284 ----
if(filtersAccept(link)) {
logger.fine("inserting link "+link+" "+curi.getStoreState());
! store.insert(link,curi,1);
}
} catch (URISyntaxException ex) {
***************
*** 302,306 ****
--- 303,311 ----
//if(filtersAccept(embed)) {
logger.fine("inserting embed at head "+embed);
+ // For now, insert at tail instead of head
+ store.insert(embed,curi,0);
+ /*
store.insertAtHead(embed,curi.getAList().getInt("distance-from-seed"));
+ */
//}
} catch (URISyntaxException ex) {
***************
*** 327,331 ****
}
logger.fine("inserting prereq at head "+prereq);
! CrawlURI prereqCuri = store.insertAtHead(prereq,curi.getAList().getInt("distance-from-seed"));
if (prereqCuri.getStoreState()==URIStoreable.FINISHED) {
curi.setFetchStatus(S_PREREQUISITE_FAILURE);
--- 332,337 ----
}
logger.fine("inserting prereq at head "+prereq);
! //CrawlURI prereqCuri = store.insertAtHead(prereq,curi.getAList().getInt("distance-from-seed"));
! CrawlURI prereqCuri = store.insert(prereq,curi,0);
if (prereqCuri.getStoreState()==URIStoreable.FINISHED) {
curi.setFetchStatus(S_PREREQUISITE_FAILURE);
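Summed up, this commit and the SimpleStore change above move scheduling to the new insert() signature, which carries the source CrawlURI plus an extra-hop count instead of a precomputed distance. The calls are the ones shown in the diff; the comments are an invented gloss:

    store.insert(link,   curi, 1);   // ordinary link: one hop further from the seed
    store.insert(embed,  curi, 0);   // embedded resource: same distance as its page
    store.insert(prereq, curi, 0);   // prerequisite (dns/robots): same distance, was insertAtHead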
|
|
From: <go...@us...> - 2003-09-06 01:44:08
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv15073/src/org/archive/crawler/basic
Modified Files:
SimplePreconditionEnforcer.java
Log Message:
chaff threshold enforced
Index: SimplePreconditionEnforcer.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimplePreconditionEnforcer.java,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** SimplePreconditionEnforcer.java 6 Aug 2003 01:21:21 -0000 1.8
--- SimplePreconditionEnforcer.java 6 Sep 2003 01:44:05 -0000 1.9
***************
*** 25,30 ****
--- 25,32 ----
private static String XP_DELAY_FACTOR = "//params/@delay-factor";
private static String XP_MINIMUM_DELAY = "//params/@minimum-delay";
+ private static String XP_CHAFF_THRESHOLD = "//params/@chaff-threshold";
private static int DEFAULT_DELAY_FACTOR = 10;
private static int DEFAULT_MINIMUM_DELAY = 2000;
+ private static int DEFAULT_CHAFF_THRESHOLD = 3;
private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimplePolitenessEnforcer");
***************
*** 35,38 ****
--- 37,44 ----
protected void innerProcess(CrawlURI curi) {
+ if (considerChaff(curi)) {
+ return;
+ }
+
if (considerDnsPreconditions(curi)) {
return;
***************
*** 57,60 ****
--- 63,81 ----
return;
+ }
+
+ /**
+ * @param curi
+ * @return
+ */
+ private boolean considerChaff(CrawlURI curi) {
+ //if (curi.getChaffness()>1) {
+ // System.out.println(curi.getChaffness()+" "+curi.getUURI().toString());
+ //}
if(curi.getChaffness()>getIntAt(XP_CHAFF_THRESHOLD,DEFAULT_CHAFF_THRESHOLD)) {
+ curi.setFetchStatus(S_DEEMED_CHAFF);
+ curi.cancelFurtherProcessing();
+ return true;
+ }
+ return false;
}
|
|
From: <go...@us...> - 2003-09-06 01:43:13
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv14925/src/org/archive/crawler/basic
Modified Files:
FetcherHTTPSimple.java
Log Message:
share single httpclient instance, using multi connection manager: risk of sync issues, but benefit of single cookie space
Index: FetcherHTTPSimple.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/FetcherHTTPSimple.java,v
retrieving revision 1.6
retrieving revision 1.7
diff -C2 -d -r1.6 -r1.7
*** FetcherHTTPSimple.java 6 Aug 2003 01:19:43 -0000 1.6
--- FetcherHTTPSimple.java 6 Sep 2003 01:43:07 -0000 1.7
***************
*** 13,16 ****
--- 13,17 ----
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
+ import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
***************
*** 18,22 ****
import org.archive.crawler.datamodel.CrawlURI;
import org.archive.crawler.datamodel.FetchStatusCodes;
- import org.archive.crawler.datamodel.InstancePerThread;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Processor;
--- 19,22 ----
***************
*** 29,33 ****
*
*/
! public class FetcherHTTPSimple extends Processor implements InstancePerThread, CoreAttributeConstants, FetchStatusCodes {
private static String XP_TIMEOUT_SECONDS = "//params/@timeout-seconds";
private static int DEFAULT_TIMEOUT_SECONDS = 10;
--- 29,35 ----
*
*/
! public class FetcherHTTPSimple
! extends Processor
! implements CoreAttributeConstants, FetchStatusCodes {
private static String XP_TIMEOUT_SECONDS = "//params/@timeout-seconds";
private static int DEFAULT_TIMEOUT_SECONDS = 10;
***************
*** 124,127 ****
--- 126,130 ----
} finally {
//controller.getKicker().cancelKick(Thread.currentThread());
+ get.releaseConnection();
}
}
***************
*** 134,138 ****
timeout = 1000*getIntAt(XP_TIMEOUT_SECONDS, DEFAULT_TIMEOUT_SECONDS);
CookiePolicy.setDefaultPolicy(CookiePolicy.COMPATIBILITY);
! http = new HttpClient();
}
--- 137,143 ----
timeout = 1000*getIntAt(XP_TIMEOUT_SECONDS, DEFAULT_TIMEOUT_SECONDS);
CookiePolicy.setDefaultPolicy(CookiePolicy.COMPATIBILITY);
! MultiThreadedHttpConnectionManager connectionManager =
! new MultiThreadedHttpConnectionManager();
! http = new HttpClient(connectionManager);
}
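The gist of the change, restated as a minimal standalone sketch; the class and method names are invented, while the commons-httpclient calls are the same ones the fetcher now uses. One client backed by a MultiThreadedHttpConnectionManager is shared by all worker threads, giving a single cookie space, and each request releases its connection when done:

    import java.io.IOException;
    import org.apache.commons.httpclient.HttpClient;
    import org.apache.commons.httpclient.HttpException;
    import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
    import org.apache.commons.httpclient.methods.GetMethod;

    class SharedClientSketch {
        // one client instance, one cookie space, shared across worker threads
        static final HttpClient HTTP =
            new HttpClient(new MultiThreadedHttpConnectionManager());

        static int fetch(String uri) throws IOException, HttpException {
            GetMethod get = new GetMethod(uri);
            try {
                return HTTP.executeMethod(get);   // returns the HTTP status code
            } finally {
                get.releaseConnection();          // hand the connection back to the pool
            }
        }
    }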
|
|
From: <go...@us...> - 2003-09-03 01:51:11
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor
In directory sc8-pr-cvs1:/tmp/cvs-serv27258/src/org/archive/crawler/extractor
Modified Files:
ExtractorHTML.java
Log Message:
added proper NOT, adjusted substring begin index
Index: ExtractorHTML.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v
retrieving revision 1.11
retrieving revision 1.12
diff -C2 -d -r1.11 -r1.12
*** ExtractorHTML.java 26 Aug 2003 00:16:51 -0000 1.11
--- ExtractorHTML.java 3 Sep 2003 01:51:05 -0000 1.12
***************
*** 299,303 ****
return true;
}
! return NON_HTML_PATH_EXTENSION.matcher(path.substring(dot)).matches();
}
--- 299,304 ----
return true;
}
! String ext = path.substring(dot+1);
! return ! NON_HTML_PATH_EXTENSION.matcher(ext).matches();
}
|
|
From: <go...@us...> - 2003-08-26 00:17:14
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor
In directory sc8-pr-cvs1:/tmp/cvs-serv1585/src/org/archive/crawler/extractor
Modified Files:
ExtractorHTML.java
Log Message:
ignore HTML from paths which suggest non-HTML content (soft 404 protection)
Index: ExtractorHTML.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v
retrieving revision 1.10
retrieving revision 1.11
diff -C2 -d -r1.10 -r1.11
*** ExtractorHTML.java 12 Aug 2003 00:47:16 -0000 1.10
--- ExtractorHTML.java 26 Aug 2003 00:16:51 -0000 1.11
***************
*** 29,32 ****
--- 29,34 ----
*/
public class ExtractorHTML extends Processor implements CoreAttributeConstants {
+ private boolean ignoreUnexpectedHTML = true; // TODO: add config param to change
+
private static Logger logger = Logger.getLogger("org.archive.crawler.basic.ExtractorHTML");
***************
*** 230,233 ****
--- 232,245 ----
return;
}
+
+ if(ignoreUnexpectedHTML) {
+ if(!expectedHTML(curi)) {
+ // HTML was not expected (eg a GIF was expected) so ignore
+ // (as if a soft 404)
+ return;
+ }
+ }
+
+
GetMethod get = (GetMethod)curi.getAList().getObject(A_HTTP_TRANSACTION);
Header contentType = get.getResponseHeader("Content-Type");
***************
*** 268,272 ****
}
!
/**
* @param curi
--- 280,305 ----
}
!
! static Pattern NON_HTML_PATH_EXTENSION = Pattern.compile(
! "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
! "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)");
! /**
! * @param curi
! * @return
! */
! private boolean expectedHTML(CrawlURI curi) {
! String path = curi.getUURI().getUri().getPath();
! int dot = path.lastIndexOf('.');
! if (dot<0) {
! // no path extension, HTML is fine
! return true;
! }
! if(dot<(path.length()-5)) {
! // extension too long to recognize, HTML is fine
! return true;
! }
! return NON_HTML_PATH_EXTENSION.matcher(path.substring(dot)).matches();
! }
!
/**
* @param curi
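Taken together with the substring and negation fix of revision 1.12 (earlier in this archive), the guard is intended to classify paths like these invented examples:

    // Hypothetical paths and what expectedHTML() should return with the 1.12 fix applied:
    //   "/index.html"      -> true   ("html" does not match NON_HTML_PATH_EXTENSION)
    //   "/images/logo.gif" -> false  ("gif" matches, so HTML here is unexpected)
    //   "/download"        -> true   (no dot in the path, nothing to check)
    //   "/data.20030906"   -> true   (extension too long for the pattern to recognize)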
|