Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel In directory sc8-pr-cvs1:/tmp/cvs-serv27910/src/org/archive/crawler/datamodel Modified Files: Tag: gjm-refactor CandidateURI.java CrawlOrder.java CrawlerBehavior.java Added Files: Tag: gjm-refactor MemFPUURISet.java Log Message: beginnings of big refactor branch --- NEW FILE: MemFPUURISet.java --- /* * MemFPUURISet.java * Created on Oct 1, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/Attic/MemFPUURISet.java,v 1.1.2.1 2003/10/02 01:53:51 gojomo Exp $ */ package org.archive.crawler.datamodel; import java.util.Collection; import java.util.Iterator; /** * UURISet which only stores 64-bit UURI fingerprints. * @author gojomo * */ public class MemFPUURISet implements UURISet { /* (non-Javadoc) * @see org.archive.crawler.datamodel.UURISet#count() */ public long count() { // TODO Auto-generated method stub return 0; } /* (non-Javadoc) * @see org.archive.crawler.datamodel.UURISet#contains(org.archive.crawler.datamodel.UURI) */ public boolean contains(UURI u) { // TODO Auto-generated method stub return false; } /* (non-Javadoc) * @see org.archive.crawler.datamodel.UURISet#contains(org.archive.crawler.datamodel.CrawlURI) */ public boolean contains(CrawlURI curi) { // TODO Auto-generated method stub return false; } /* (non-Javadoc) * @see org.archive.crawler.datamodel.UURISet#add(org.archive.crawler.datamodel.UURI) */ public void add(UURI u) { // TODO Auto-generated method stub } /* (non-Javadoc) * @see org.archive.crawler.datamodel.UURISet#remove(org.archive.crawler.datamodel.UURI) */ public void remove(UURI u) { // TODO Auto-generated method stub } /* (non-Javadoc) * @see org.archive.crawler.datamodel.UURISet#add(org.archive.crawler.datamodel.CrawlURI) */ public void add(CrawlURI curi) { // TODO Auto-generated method stub } /* (non-Javadoc) * @see org.archive.crawler.datamodel.UURISet#remove(org.archive.crawler.datamodel.CrawlURI) */ public void remove(CrawlURI curi) { // TODO Auto-generated method stub } /* (non-Javadoc) * @see java.util.Collection#size() */ public int size() { // TODO Auto-generated method stub return 0; } /* (non-Javadoc) * @see java.util.Collection#clear() */ public void clear() { // TODO Auto-generated method stub } /* (non-Javadoc) * @see java.util.Collection#isEmpty() */ public boolean isEmpty() { // TODO Auto-generated method stub return false; } /* (non-Javadoc) * @see java.util.Collection#toArray() */ public Object[] toArray() { // TODO Auto-generated method stub return null; } /* (non-Javadoc) * @see java.util.Collection#add(java.lang.Object) */ public boolean add(Object o) { // TODO Auto-generated method stub return false; } /* (non-Javadoc) * @see java.util.Collection#contains(java.lang.Object) */ public boolean contains(Object o) { // TODO Auto-generated method stub return false; } /* (non-Javadoc) * @see java.util.Collection#remove(java.lang.Object) */ public boolean remove(Object o) { // TODO Auto-generated method stub return false; } /* (non-Javadoc) * @see java.util.Collection#addAll(java.util.Collection) */ public boolean addAll(Collection c) { // TODO Auto-generated method stub return false; } /* (non-Javadoc) * @see java.util.Collection#containsAll(java.util.Collection) */ public boolean containsAll(Collection c) { // TODO Auto-generated method stub return false; } /* (non-Javadoc) * @see java.util.Collection#removeAll(java.util.Collection) */ public boolean removeAll(Collection c) { // TODO Auto-generated method stub return 
false; } /* (non-Javadoc) * @see java.util.Collection#retainAll(java.util.Collection) */ public boolean retainAll(Collection c) { // TODO Auto-generated method stub return false; } /* (non-Javadoc) * @see java.util.Collection#iterator() */ public Iterator iterator() { // TODO Auto-generated method stub return null; } /* (non-Javadoc) * @see java.util.Collection#toArray(java.lang.Object[]) */ public Object[] toArray(Object[] a) { // TODO Auto-generated method stub return null; } } Index: CandidateURI.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CandidateURI.java,v retrieving revision 1.1 retrieving revision 1.1.2.1 diff -C2 -d -r1.1 -r1.1.2.1 *** CandidateURI.java 1 Oct 2003 16:33:39 -0000 1.1 --- CandidateURI.java 2 Oct 2003 01:53:51 -0000 1.1.2.1 *************** *** 22,26 **** /** Seed status */ boolean isSeed = false; ! /** Latest version of the inScope definition met*/ int inScopeVersion = -1; /** String of letters indicating how this URI was reached from a seed */ --- 22,26 ---- /** Seed status */ boolean isSeed = false; ! /** Latest version of the inScope definition met; (zero if not)*/ int inScopeVersion = -1; /** String of letters indicating how this URI was reached from a seed */ *************** *** 29,35 **** --- 29,51 ---- // E embedded (as frame, src, link, codebase, etc.) // L link + // for example LLLE (an embedded image on a page 3 links from seed) String pathFromSeed; /** Where this URI was (presently) discovered */ UURI precursorUuri; + + + /** + * @param u + */ + public CandidateURI(UURI u) { + uuri = u; + } + + /** + * @param b + */ + public void setSeed(boolean b) { + isSeed=b; + } } Index: CrawlOrder.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlOrder.java,v retrieving revision 1.12 retrieving revision 1.12.2.1 diff -C2 -d -r1.12 -r1.12.2.1 *** CrawlOrder.java 1 Aug 2003 22:41:24 -0000 1.12 --- CrawlOrder.java 2 Oct 2003 01:53:51 -0000 1.12.2.1 *************** *** 7,12 **** package org.archive.crawler.datamodel; ! import org.archive.crawler.basic.CrawlerConfigurationConstants; ! import java.io.IOException; --- 7,11 ---- package org.archive.crawler.datamodel; ! import java.io.File; import java.io.IOException; *************** *** 14,25 **** import org.apache.xpath.XPathAPI; ! import org.archive.crawler.framework.*; import org.w3c.dom.Document; - import java.io.File; - /** Read and manipulate configuration (order) file. */ ! public class CrawlOrder extends XMLConfig implements CrawlerConfigurationConstants { protected String name; protected CrawlerBehavior behavior; --- 13,22 ---- import org.apache.xpath.XPathAPI; ! import org.archive.crawler.framework.XMLConfig; import org.w3c.dom.Document; /** Read and manipulate configuration (order) file. */ ! 
public class CrawlOrder extends XMLConfig { protected String name; protected CrawlerBehavior behavior; Index: CrawlerBehavior.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlerBehavior.java,v retrieving revision 1.14 retrieving revision 1.14.2.1 diff -C2 -d -r1.14 -r1.14.2.1 *** CrawlerBehavior.java 1 Aug 2003 22:41:24 -0000 1.14 --- CrawlerBehavior.java 2 Oct 2003 01:53:51 -0000 1.14.2.1 *************** *** 13,17 **** import java.util.List; - import org.archive.crawler.basic.CrawlerConfigurationConstants; import org.archive.crawler.framework.XMLConfig; import org.w3c.dom.Node; --- 13,16 ---- *************** *** 21,25 **** * */ ! public class CrawlerBehavior extends XMLConfig implements CrawlerConfigurationConstants { List seeds = null; String caseFlattenedUserAgent = null; --- 20,24 ---- * */ ! public class CrawlerBehavior extends XMLConfig { List seeds = null; String caseFlattenedUserAgent = null; |
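The new MemFPUURISet above is still all TODO stubs. As a rough, hypothetical sketch of the idea its Javadoc describes (a UURISet that keeps only a 64-bit fingerprint per URI instead of the URI itself), something like the following would do; the class name, the String-based API and the fingerprint function here are illustrative assumptions, not the committed code.

import java.util.HashSet;
import java.util.Set;

/**
 * Sketch of a fingerprint-only URI set: stores a 64-bit hash per URI
 * rather than the URI itself, trading a small false-positive risk for memory.
 * Hypothetical illustration only -- the committed MemFPUURISet is unimplemented.
 */
public class FPUriSetSketch {
    private final Set<Long> fingerprints = new HashSet<Long>();

    /** Placeholder 64-bit fingerprint; a real crawler would use a stronger hash. */
    private long fingerprint(String uri) {
        long h = 1125899906842597L; // arbitrary seed
        for (int i = 0; i < uri.length(); i++) {
            h = 31 * h + uri.charAt(i);
        }
        return h;
    }

    public void add(String uri) {
        fingerprints.add(Long.valueOf(fingerprint(uri)));
    }

    public boolean contains(String uri) {
        return fingerprints.contains(Long.valueOf(fingerprint(uri)));
    }

    public long count() {
        return fingerprints.size();
    }

    public static void main(String[] args) {
        FPUriSetSketch set = new FPUriSetSketch();
        set.add("http://www.archive.org/");
        System.out.println(set.contains("http://www.archive.org/")); // true
        System.out.println(set.contains("http://example.com/"));     // false (with high probability)
    }
}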
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic In directory sc8-pr-cvs1:/tmp/cvs-serv27910/src/org/archive/crawler/basic Added Files: Tag: gjm-refactor BasicScope.java SimpleFrontier.java Removed Files: Tag: gjm-refactor CrawlerConfigurationConstants.java Log Message: beginnings of big refactor branch --- NEW FILE: BasicScope.java --- /* * BasicScope.java * Created on Oct 1, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/BasicScope.java,v 1.1.2.1 2003/10/02 01:53:51 gojomo Exp $ */ package org.archive.crawler.basic; import org.archive.crawler.framework.CrawlScope; /** * A core CrawlScope suitable for the most common * crawl needs. * * Roughly, its logic is that a URI is included if: * * ( isSeed(uri) || focusFilter.accepts(uri) ) * && ! excludeFilter.accepts(uri) * * @author gojomo * */ public class BasicScope extends CrawlScope { } --- NEW FILE: SimpleFrontier.java --- /* * SimpleFrontier.java * Created on Oct 1, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/Attic/SimpleFrontier.java,v 1.1.2.1 2003/10/02 01:53:51 gojomo Exp $ */ package org.archive.crawler.basic; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.logging.Logger; import org.archive.crawler.datamodel.*; import org.archive.crawler.datamodel.CandidateURI; import org.archive.crawler.datamodel.CrawlURI; import org.archive.crawler.datamodel.FatalConfigurationException; import org.archive.crawler.datamodel.UURI; import org.archive.crawler.datamodel.UURISet; import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.URIFrontier; /** * A basic in-memory mostly breadth-first frontier, which * refrains from emitting more than one CrawlURI of the same * 'key' (host) at once, and respects minimum-delay and * delay-factor specifications for politeness * * @author gojomo * */ public class SimpleFrontier implements URIFrontier { private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimpleFrontier"); HashMap allCuris = new HashMap(); // of UURI -> CrawlURI UURISet alreadyIncluded = new MemFPUURISet(); // every CandidateURI not yet in process or another queue; // all seeds start here; may contain duplicates LinkedList pendingQueue = new LinkedList(); // of CandidateURIs /* (non-Javadoc) * @see org.archive.crawler.framework.URIFrontier#initialize(org.archive.crawler.framework.CrawlController) */ public void initialize(CrawlController c) throws FatalConfigurationException { Iterator iter = c.getOrder().getBehavior().getSeeds().iterator(); while (iter.hasNext()) { UURI u = (UURI) iter.next(); CandidateURI caUri = new CandidateURI(u); caUri.setSeed(true); schedule(caUri); } } /* (non-Javadoc) * @see org.archive.crawler.framework.URIFrontier#schedule(org.archive.crawler.datamodel.CandidateURI) */ public void schedule(CandidateURI caUri) { // TODO Auto-generated method stub } /* (non-Javadoc) * @see org.archive.crawler.framework.URIFrontier#next(int) */ public CrawlURI next(int timeout) { // TODO Auto-generated method stub return null; } /* (non-Javadoc) * @see org.archive.crawler.framework.URIFrontier#finished(org.archive.crawler.datamodel.CrawlURI) */ public void finished(CrawlURI curi) { // TODO Auto-generated method stub } /* (non-Javadoc) * @see org.archive.crawler.framework.URIFrontier#isEmpty() */ public boolean isEmpty() { // TODO Auto-generated method stub return false; } /* (non-Javadoc) * @see 
org.archive.crawler.framework.URIFrontier#size() */ public long size() { // TODO Auto-generated method stub return 0; } } --- CrawlerConfigurationConstants.java DELETED --- |
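SimpleFrontier above declares an alreadyIncluded fingerprint set and a pendingQueue, but schedule() and next() are still stubs. A minimal sketch of the breadth-first core those fields suggest, ignoring the per-host politeness and delay handling the Javadoc promises; the names and the String-based API are assumptions, not the committed design.

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

/** Minimal breadth-first frontier sketch: enqueue each URI at most once,
 *  hand URIs out in FIFO order. Politeness/per-host delays omitted. */
class FrontierSketch {
    private final Set<String> alreadyIncluded = new HashSet<String>();
    private final Queue<String> pending = new LinkedList<String>();

    /** schedule(): drop duplicates, otherwise append to the pending queue. */
    public synchronized void schedule(String uri) {
        if (alreadyIncluded.add(uri)) {
            pending.add(uri);
        }
    }

    /** next(): return the oldest pending URI, or null if none is ready. */
    public synchronized String next() {
        return pending.poll();
    }

    public synchronized boolean isEmpty() {
        return pending.isEmpty();
    }

    public static void main(String[] args) {
        FrontierSketch f = new FrontierSketch();
        f.schedule("http://www.archive.org/");
        f.schedule("http://www.archive.org/");   // duplicate, ignored
        System.out.println(f.next());            // http://www.archive.org/
        System.out.println(f.next());            // null
    }
}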
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework In directory sc8-pr-cvs1:/tmp/cvs-serv27910/src/org/archive/crawler/framework Modified Files: Tag: gjm-refactor ToeThread.java CrawlController.java Added Files: Tag: gjm-refactor CrawlScope.java URIFrontier.java ToePool.java Log Message: beginnings of big refactor branch --- NEW FILE: CrawlScope.java --- /* * CrawlScope.java * Created on Oct 1, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/Attic/CrawlScope.java,v 1.1.2.1 2003/10/02 01:53:51 gojomo Exp $ */ package org.archive.crawler.framework; /** * Filter which determines, looking at the totality of * information available about a CandidateURI/CrawlURI, * instamce, if that URI should be scheduled for crawling. * * Dynamic information inherent in the discovery of the * URI -- such as the path by which it was discovered -- * may be considered. * * Dynamic information which requires the consultation * of external and potentially volatile information -- * such as current robots.txt requests and the history * of attempts to crawl the same URI -- should NOT be * considered. Those potentially high-latency decisions * should be made at another step. . * * @author gojomo * */ public class CrawlScope extends Filter { int version; /* (non-Javadoc) * @see org.archive.crawler.framework.Filter#innerAccepts(java.lang.Object) */ protected boolean innerAccepts(Object o) { // TODO Auto-generated method stub return false; } /** * @return */ public int getVersion() { return version; } /* (non-Javadoc) * @see java.lang.Object#toString() */ public String toString() { return "CrawlScope<"+name+">"; } } --- NEW FILE: URIFrontier.java --- /* * URIFrontier.java * Created on Oct 1, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/Attic/URIFrontier.java,v 1.4.2.1 2003/10/02 01:53:51 gojomo Exp $ */ package org.archive.crawler.framework; import org.archive.crawler.datamodel.CandidateURI; import org.archive.crawler.datamodel.CrawlURI; import org.archive.crawler.datamodel.FatalConfigurationException; /** * @author gojomo * */ public interface URIFrontier { void initialize(CrawlController c) throws FatalConfigurationException; void schedule(CandidateURI caUri); CrawlURI next(int timeout); void finished(CrawlURI curi); boolean isEmpty(); long size(); } --- NEW FILE: ToePool.java --- /* * ToePool.java * Created on Oct 1, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/Attic/ToePool.java,v 1.1.2.1 2003/10/02 01:53:51 gojomo Exp $ */ package org.archive.crawler.framework; import java.util.ArrayList; /** * @author gojomo * */ public class ToePool { protected CrawlController controller; protected ArrayList toes; /** * @param i */ public ToePool(CrawlController c, int count) { controller = c; toes = new ArrayList(count); // TODO make number of threads self-optimizing for(int i = 0; i<count; i++) { ToeThread newThread = new ToeThread(c,this,i); toes.add(newThread); newThread.start(); } } /** * */ public synchronized ToeThread available() { for(int i=0; i < toes.size();i++){ if(((ToeThread)toes.get(i)).isAvailable()) { return (ToeThread) toes.get(i); } } // nothing available try { wait(); } catch (InterruptedException e) { // TODO Auto-generated catch block e.printStackTrace(); } return available(); } /** * @param thread */ public synchronized void noteAvailable(ToeThread thread) { notify(); } /** * @return */ public int getActiveToeCount() { int 
count = 0; // will be an approximation for(int i=0; i < toes.size();i++){ if(((ToeThread)toes.get(i)).isAvailable()) { count++; } } return count; } } Index: ToeThread.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/ToeThread.java,v retrieving revision 1.14 retrieving revision 1.14.2.1 diff -C2 -d -r1.14 -r1.14.2.1 *** ToeThread.java 25 Sep 2003 00:14:03 -0000 1.14 --- ToeThread.java 2 Oct 2003 01:53:51 -0000 1.14.2.1 *************** *** 25,29 **** private static Logger logger = Logger.getLogger("org.archive.crawler.framework.ToeThread"); ! private boolean paused = false; private boolean shouldCrawl = true; CrawlController controller; --- 25,29 ---- private static Logger logger = Logger.getLogger("org.archive.crawler.framework.ToeThread"); ! private ToePool pool; private boolean shouldCrawl = true; CrawlController controller; *************** *** 33,36 **** --- 33,38 ---- CrawlURI currentCuri; + long lastStartTime; + long lastFinishTime; // in-process/on-hold curis? not for now // a queue of curis to do next? not for now *************** *** 39,44 **** * @param c */ ! public ToeThread(CrawlController c, int sn) { controller = c; serialNumber = sn; setName("ToeThread #"+serialNumber); --- 41,47 ---- * @param c */ ! public ToeThread(CrawlController c, ToePool p, int sn) { controller = c; + pool = p; serialNumber = sn; setName("ToeThread #"+serialNumber); *************** *** 46,49 **** --- 49,64 ---- } + + public synchronized void crawl(CrawlURI curi) { + assert currentCuri == null : "attempt to clobber crawlUri"; + currentCuri = curi; + currentCuri.setThreadNumber(serialNumber); + notify(); + } + + public boolean isAvailable() { + return currentCuri == null; + } + /* (non-Javadoc) * @see java.lang.Runnable#run() *************** *** 59,78 **** private synchronized void processingLoop() { - assert currentCuri == null; - - while ( paused ) { - try { - this.wait(); - } catch (InterruptedException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } try { - currentCuri = controller.crawlUriFor(this); - if ( currentCuri != null ) { ! try { while ( currentCuri.nextProcessor() != null ) { --- 74,82 ---- private synchronized void processingLoop() { try { if ( currentCuri != null ) { ! lastStartTime = System.currentTimeMillis(); ! try { while ( currentCuri.nextProcessor() != null ) { *************** *** 85,99 **** } ! controller.getSelector().inter(currentCuri); currentCuri = null; ! } else { ! // self-pause, because there's nothing left to crawl ! logger.warning(getName()+" pausing: nothing to crawl"); ! paused = true; } } catch (OutOfMemoryError e) { e.printStackTrace(); logger.warning(getName()+" pausing: out of memory error"); ! paused = true; } } --- 89,105 ---- } ! controller.getFrontier().finished(currentCuri); currentCuri = null; ! lastFinishTime = System.currentTimeMillis(); } + pool.noteAvailable(this); + wait(); } catch (OutOfMemoryError e) { e.printStackTrace(); logger.warning(getName()+" pausing: out of memory error"); ! // TODO: hard stop? notify elsewhere? ! } catch (InterruptedException e) { ! e.printStackTrace(); ! 
logger.warning(getName()+" interrupted"); } } *************** *** 122,138 **** return shouldCrawl; } - - /** - * - */ - public synchronized void unpause() { - if(!paused) return; - paused = false; - this.notify(); - } - - public void pauseAfterCurrent() { - paused = true; - } public void stopAfterCurrent() { --- 128,131 ---- *************** *** 146,153 **** return serialNumber; } ! ! public boolean isPaused(){ ! return paused; ! } /** * @return --- 139,143 ---- return serialNumber; } ! /** * @return Index: CrawlController.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/CrawlController.java,v retrieving revision 1.28 retrieving revision 1.28.2.1 diff -C2 -d -r1.28 -r1.28.2.1 *** CrawlController.java 23 Sep 2003 01:16:35 -0000 1.28 --- CrawlController.java 2 Oct 2003 01:53:51 -0000 1.28.2.1 *************** *** 18,22 **** import java.util.logging.Logger; - import org.archive.crawler.basic.CrawlerConfigurationConstants; import org.archive.crawler.basic.StatisticsTracker; import org.archive.crawler.datamodel.CrawlOrder; --- 18,21 ---- *************** *** 34,42 **** * @author Gordon Mohr */ ! public class CrawlController implements CrawlerConfigurationConstants { ! private File disk; ! public Logger uriProcessing = Logger.getLogger("uri-processing"); ! public Logger crawlErrors = Logger.getLogger("crawl-errors"); public Logger uriErrors = Logger.getLogger("uri-errors"); public Logger progressStats = Logger.getLogger("progress-statistics"); --- 33,47 ---- * @author Gordon Mohr */ ! public class CrawlController { ! private int timeout = 1000; // to wait for CrawlURI from frontier before spinning ! private ToePool toePool; ! private URIFrontier frontier; ! private boolean shouldCrawl; ! ! public static final int DEFAULT_STATISTICS_REPORT_INTERVAL = 60; ! private File disk; ! public Logger uriProcessing = Logger.getLogger("crawl"); ! public Logger crawlErrors = Logger.getLogger("runtime-errors"); public Logger uriErrors = Logger.getLogger("uri-errors"); public Logger progressStats = Logger.getLogger("progress-statistics"); *************** *** 46,49 **** --- 51,55 ---- CrawlOrder order; + CrawlScope scope; URIScheduler scheduler; *************** *** 138,179 **** //statistics.setLogLevel(StatisticsTracker.VERBOSE_LOGGING); ! store = (URIStore) order.getBehavior().instantiate("//store"); ! scheduler = (URIScheduler) order.getBehavior().instantiate("//scheduler"); ! selector = (URISelector) order.getBehavior().instantiate("//selector"); ! firstProcessor = (Processor) order.getBehavior().instantiateAllInto("//processors/processor",processors); ! // try to initialize each of the store, schduler, and selector from the config file ! try{ ! store.initialize(this); ! }catch(NullPointerException e){ ! throw new FatalConfigurationException( ! "Can't initialize store, class specified in configuration file not found", ! order.crawlOrderFilename, ! "//store" ! ); ! } ! try{ ! scheduler.initialize(this); ! }catch(NullPointerException e){ throw new FatalConfigurationException( ! "Can't initialize scheduler, class specified in configuration file not found", ! order.crawlOrderFilename, ! "//scheduler" ! ); } ! try{ ! selector.initialize(this); ! }catch(NullPointerException e){ throw new FatalConfigurationException( ! "Can't initialize selector, class specified in configuration file not found", ! order.crawlOrderFilename, ! "//selector" ! ); ! } ! 
hostCache = new ServerCache(); - //kicker = new ThreadKicker(); - //kicker.start(); Iterator iter = processors.entrySet().iterator(); --- 144,170 ---- //statistics.setLogLevel(StatisticsTracker.VERBOSE_LOGGING); ! scope = (CrawlScope) order.instantiate("//scope"); ! frontier = (URIFrontier) order.instantiate("//frontier"); ! firstProcessor = (Processor) order.getBehavior().instantiateAllInto("//processors/processor",processors); ! // try to initialize each scope and frontier from the config file ! try { ! scope.initialize(this); ! } catch (NullPointerException e) { throw new FatalConfigurationException( ! "Can't initialize scope, class specified in configuration file not found", ! order.crawlOrderFilename, ! "//scope"); } ! try { ! frontier.initialize(this); ! } catch (NullPointerException e) { throw new FatalConfigurationException( ! "Can't initialize frontier, class specified in configuration file not found", ! order.crawlOrderFilename, ! "//frontier"); ! } hostCache = new ServerCache(); Iterator iter = processors.entrySet().iterator(); *************** *** 234,259 **** } - - /** - * @param thread - * @return - */ - public CrawlURI crawlUriFor(ToeThread thread) { - if( paused ) { - thread.pauseAfterCurrent(); - return null; - } - // TODO check global limits, etc to see if finished - if ( finished ) { - thread.stopAfterCurrent(); - return null; - } - CrawlURI curi = scheduler.curiFor(thread); - if (curi != null) { - curi.setNextProcessor(firstProcessor); - curi.setThreadNumber(thread.getSerialNumber()); - } - return curi; - } /** Return the object this controller is using to track crawl statistics --- 225,228 ---- *************** *** 268,301 **** */ public void startCrawl() { ! // assume scheduler/URIStore already loaded state ! ! // start toes ! adjustToeCount(); ! Iterator iter = toes.iterator(); ! while(iter.hasNext()) { ! ((ToeThread)iter.next()).unpause(); } } public void stopCrawl() { ! // stop toes ! for (int i = 0; i < toes.size(); i++) ! ((ToeThread)toes.get(i)).stopAfterCurrent(); ! } public int getActiveToeCount(){ ! List toes = getToes(); ! int active = 0; ! ! Iterator list = toes.listIterator(); ! ! while(list.hasNext()){ ! ToeThread t = (ToeThread)list.next(); ! if(!t.isPaused()){ ! active++; ! } ! } ! return active; } --- 237,264 ---- */ public void startCrawl() { ! // assume Frontier state already loaded ! setupToePool(); ! shouldCrawl=true; ! runCrawl(); ! } ! ! ! public void runCrawl() { ! while(shouldCrawl) { ! CrawlURI curi = frontier.next(timeout); ! if(curi != null) { ! curi.setNextProcessor(firstProcessor); ! toePool.available().crawl(curi); ! } } } public void stopCrawl() { ! shouldCrawl = false; ! } public int getActiveToeCount(){ ! return toePool.getActiveToeCount(); } *************** *** 304,315 **** } ! private void adjustToeCount() { ! while(toes.size()<order.getBehavior().getMaxToes()) { ! // TODO make number of threads self-optimizing ! ToeThread newThread = new ToeThread(this,nextToeSerialNumber); ! nextToeSerialNumber++; ! toes.add(newThread); ! newThread.start(); ! } } --- 267,272 ---- } ! private void setupToePool() { ! toePool = new ToePool(this,order.getBehavior().getMaxToes()); } *************** *** 410,417 **** } }else{ ! System.out.println("No code sistribution statistics."); } } } --- 367,382 ---- } }else{ ! System.out.println("No code distribution statistics."); } + } + + + /** + * + */ + public URIFrontier getFrontier() { + return frontier; } } |
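The ToeThread/ToePool rework above replaces the old pause/unpause flags with a wait()/notify() hand-off: an idle toe parks in wait() inside its processing loop, the controller's runCrawl() loop asks the pool for an available() thread, and crawl()/noteAvailable() wake the waiters. A stripped-down sketch of that hand-off pattern follows, using a hypothetical worker class rather than the real ToeThread/ToePool API, and with interrupt() standing in for shutdown.

/** Sketch of the hand-off: the worker sleeps in wait() until the controller
 *  hands it work via assign(), runs it, then becomes available again. */
class WorkerSketch extends Thread {
    private Runnable current;   // null == idle/available

    public synchronized boolean isAvailable() {
        return current == null;
    }

    /** Called by the controller thread; mirrors ToeThread.crawl(CrawlURI). */
    public synchronized void assign(Runnable task) {
        assert current == null : "attempt to clobber in-progress task";
        current = task;
        notify();               // wake the worker out of wait()
    }

    public void run() {
        try {
            while (true) {
                Runnable task;
                synchronized (this) {
                    while (current == null) {
                        wait();          // idle until assign() notifies
                    }
                    task = current;
                }
                task.run();              // process outside the monitor
                synchronized (this) {
                    current = null;      // available again
                }
            }
        } catch (InterruptedException e) {
            // interrupt == shutdown request; fall through and exit
        }
    }

    public static void main(String[] args) throws InterruptedException {
        WorkerSketch w = new WorkerSketch();
        w.start();
        w.assign(new Runnable() {
            public void run() { System.out.println("crawling one URI"); }
        });
        Thread.sleep(200);
        w.interrupt();                   // shut the worker down
    }
}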
From: <go...@us...> - 2003-10-02 01:53:57
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util
In directory sc8-pr-cvs1:/tmp/cvs-serv27910/src/org/archive/util

Modified Files:
      Tag: gjm-refactor
        ARCReader.java
Log Message:
beginnings of big refactor branch

Index: ARCReader.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/ARCReader.java,v
retrieving revision 1.1
retrieving revision 1.1.2.1
diff -C2 -d -r1.1 -r1.1.2.1
*** ARCReader.java      27 Sep 2003 00:48:12 -0000      1.1
--- ARCReader.java      2 Oct 2003 01:53:51 -0000      1.1.2.1
***************
*** 24,27 ****
--- 24,29 ----
      protected FileInputStream arcStream;
      protected ARCResource lastResource;
+     protected int resourcePosition;
+     protected long filePosition;

      /**
From: <go...@us...> - 2003-10-01 16:33:43
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv19135/src/org/archive/crawler/datamodel

Added Files:
        CandidateURI.java
Log Message:
skeletal pre-CrawlURI object

--- NEW FILE: CandidateURI.java ---
/*
 * CandidateURI.java
 * Created on Sep 30, 2003
 *
 * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CandidateURI.java,v 1.1 2003/10/01 16:33:39 gojomo Exp $
 */
package org.archive.crawler.datamodel;

/**
 * A URI, discovered or passed-in, that may be scheduled (and
 * thus become a CrawlURI). Contains just the fields necessary
 * to perform quick in-scope analysis.
 *
 * A flexible AttributeList, as in CrawlURI, could be added,
 * possibly even subsuming the existing fields.
 *
 * @author Gordon Mohr
 */
public class CandidateURI {
    /** Usuable URI under consideration */
    UURI uuri;
    /** Seed status */
    boolean isSeed = false;
    /** Latest version of the inScope definition met*/
    int inScopeVersion = -1;
    /** String of letters indicating how this URI was reached from a seed */
    // P precondition
    // R redirection
    // E embedded (as frame, src, link, codebase, etc.)
    // L link
    String pathFromSeed;
    /** Where this URI was (presently) discovered */
    UURI precursorUuri;
}
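The pathFromSeed codes above accumulate one letter per hop from a seed, so a value such as "LLE" would describe an embed found on a page two links from a seed (the gjm-refactor branch revision earlier in this digest adds the example "LLLE" to the same comment). A small hypothetical helper, not part of the committed code, showing how such a path might be extended as URIs are discovered:

/** Hypothetical illustration of extending a CandidateURI-style discovery path:
 *  one letter per hop -- P precondition, R redirect, E embed, L link. */
final class PathFromSeedExample {
    static String extendPath(String parentPath, char hopType) {
        return (parentPath == null ? "" : parentPath) + hopType;
    }

    public static void main(String[] args) {
        String seed = "";                       // seeds start with an empty path
        String page = extendPath(seed, 'L');    // a link from the seed -> "L"
        String page2 = extendPath(page, 'L');   // a link from that page -> "LL"
        String image = extendPath(page2, 'E');  // an image embedded there -> "LLE"
        System.out.println(image);              // prints LLE
    }
}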
From: <go...@us...> - 2003-10-01 01:42:00
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/example-crawl In directory sc8-pr-cvs1:/tmp/cvs-serv16955/example-crawl Modified Files: example-order.xml Log Message: example Index: example-order.xml =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/example-crawl/example-order.xml,v retrieving revision 1.14 retrieving revision 1.15 diff -C2 -d -r1.14 -r1.15 *** example-order.xml 23 Sep 2003 01:16:35 -0000 1.14 --- example-order.xml 1 Oct 2003 01:41:56 -0000 1.15 *************** *** 39,43 **** #http://directory.google.com/Top/Games/ # http://www3.google.com/help/customize.html ! http://dmoz.org </seeds> --- 39,44 ---- #http://directory.google.com/Top/Games/ # http://www3.google.com/help/customize.html ! # http://www.yahoo.com ! http://www.archive.org/movies/movies.php </seeds> *************** *** 84,88 **** class="org.archive.crawler.basic.SimplePreselector" next="Preprocessor"> ! <params max-link-depth="1" max-embed-depth="2" /> <filter name="focus" --- 85,89 ---- class="org.archive.crawler.basic.SimplePreselector" next="Preprocessor"> ! <params max-link-depth="0" max-embed-depth="2" /> <filter name="focus" *************** *** 155,165 **** <!-- actual enforcement of these limits may depend on choice of SSS/processor instances that read and respect these limits --> ! <max-link-depth value="100" /> <!-- zero means crawl seeds only --> ! <max-embed-depth value="5" /> <!-- extra hops that can be taken for embeds --> ! <max-toe-threads value="20" /> </limits> </crawler-behavior> ! <disk path="example-crawl" /> <loggers> --- 156,164 ---- <!-- actual enforcement of these limits may depend on choice of SSS/processor instances that read and respect these limits --> ! <max-toe-threads value="2" /> </limits> </crawler-behavior> ! <disk path="disk" /> <loggers> |
From: <go...@us...> - 2003-10-01 01:41:46
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor In directory sc8-pr-cvs1:/tmp/cvs-serv16920/src/org/archive/crawler/extractor Modified Files: ExtractorHTML.java Log Message: use replayCharSequence; treat <link> hrefs as 'embed's (as they are usually css, ico, etc) Index: ExtractorHTML.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v retrieving revision 1.16 retrieving revision 1.17 diff -C2 -d -r1.16 -r1.17 *** ExtractorHTML.java 30 Sep 2003 18:07:53 -0000 1.16 --- ExtractorHTML.java 1 Oct 2003 01:41:42 -0000 1.17 *************** *** 113,117 **** if (attr.start(2)>-1) { // HREF ! processLink(curi, value); if (element.toString().equalsIgnoreCase("base")) { curi.getAList().putString(A_HTML_BASE,value.toString()); --- 113,123 ---- if (attr.start(2)>-1) { // HREF ! if(element.toString().equalsIgnoreCase("link")) { ! // <LINK> elements treated as embeds (css, ico, etc) ! processEmbed(curi, value); ! } else { ! // other HREFs treated as links ! processLink(curi, value); ! } if (element.toString().equalsIgnoreCase("base")) { curi.getAList().putString(A_HTML_BASE,value.toString()); *************** *** 259,263 **** ! CharSequence cs = get.getResponseBodyAsString(); if (cs==null) { --- 265,269 ---- ! CharSequence cs = get.getHttpRecorder().getRecordedInput().getCharSequence(); if (cs==null) { |
From: <go...@us...> - 2003-10-01 01:40:47
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic In directory sc8-pr-cvs1:/tmp/cvs-serv16728/src/org/archive/crawler/basic Modified Files: SimplePreconditionEnforcer.java SimplePreselector.java Log Message: params cleanup Index: SimplePreconditionEnforcer.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimplePreconditionEnforcer.java,v retrieving revision 1.10 retrieving revision 1.11 diff -C2 -d -r1.10 -r1.11 *** SimplePreconditionEnforcer.java 23 Sep 2003 01:16:34 -0000 1.10 --- SimplePreconditionEnforcer.java 1 Oct 2003 01:40:43 -0000 1.11 *************** *** 23,32 **** */ public class SimplePreconditionEnforcer extends Processor implements FetchStatusCodes { ! private static String XP_DELAY_FACTOR = "//params/@delay-factor"; ! private static String XP_MINIMUM_DELAY = "//params/@minimum-delay"; ! private static String XP_CHAFF_THRESHOLD = "//params/@chaff-threshold"; private static int DEFAULT_DELAY_FACTOR = 10; private static int DEFAULT_MINIMUM_DELAY = 2000; - private static int DEFAULT_CHAFF_THRESHOLD = 3; private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimplePolitenessEnforcer"); --- 23,30 ---- */ public class SimplePreconditionEnforcer extends Processor implements FetchStatusCodes { ! private static String XP_DELAY_FACTOR = "params/@delay-factor"; ! private static String XP_MINIMUM_DELAY = "params/@minimum-delay"; private static int DEFAULT_DELAY_FACTOR = 10; private static int DEFAULT_MINIMUM_DELAY = 2000; private static Logger logger = Logger.getLogger("org.archive.crawler.basic.SimplePolitenessEnforcer"); Index: SimplePreselector.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimplePreselector.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** SimplePreselector.java 24 Sep 2003 01:44:42 -0000 1.2 --- SimplePreselector.java 1 Oct 2003 01:40:43 -0000 1.3 *************** *** 24,29 **** */ public class SimplePreselector extends Processor implements FetchStatusCodes { ! private static String XP_MAX_LINK_DEPTH="//limits/max-link-depth/@value"; ! private static String XP_MAX_EMBED_DEPTH="//limits/max-embed-depth/@value"; private int maxLinkDepth = -1; private int maxEmbedDepth = -1; --- 24,29 ---- */ public class SimplePreselector extends Processor implements FetchStatusCodes { ! private static String XP_MAX_LINK_DEPTH="params/@max-link-depth"; ! private static String XP_MAX_EMBED_DEPTH="params/@max-embed-depth"; private int maxLinkDepth = -1; private int maxEmbedDepth = -1; |
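The "params cleanup" above switches the settings lookups from absolute XPaths such as "//params/@delay-factor" to relative ones such as "params/@delay-factor", so each processor reads attributes from its own configuration node rather than the first match anywhere in the order file. A short illustration of that difference, using the standard javax.xml.xpath API rather than the project's XMLConfig/XPathAPI helpers:

import java.io.ByteArrayInputStream;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

/** Absolute vs. relative XPath against a per-processor config node. */
public class XPathScopeExample {
    public static void main(String[] args) throws Exception {
        String xml =
            "<processors>"
            + "<processor name='a'><params delay-factor='10'/></processor>"
            + "<processor name='b'><params delay-factor='3'/></processor>"
            + "</processors>";
        Document doc = DocumentBuilderFactory.newInstance()
            .newDocumentBuilder()
            .parse(new ByteArrayInputStream(xml.getBytes("UTF-8")));
        XPath xp = XPathFactory.newInstance().newXPath();
        Node procB = (Node) xp.evaluate(
            "/processors/processor[@name='b']", doc, XPathConstants.NODE);
        // "//..." is rooted at the whole document: first match wins, prints 10
        System.out.println(xp.evaluate("//params/@delay-factor", procB));
        // the relative path is evaluated against procB itself: prints 3
        System.out.println(xp.evaluate("params/@delay-factor", procB));
    }
}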
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io In directory sc8-pr-cvs1:/tmp/cvs-serv16570/src/org/archive/crawler/io Modified Files: RecordingInputStream.java ReplayInputStream.java ReplayCharSequence.java RecordingOutputStream.java Added Files: CharSubSequence.java Log Message: improved record/replay --- NEW FILE: CharSubSequence.java --- /* * CharSubSequence.java * Created on Sep 30, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/CharSubSequence.java,v 1.1 2003/10/01 01:39:46 gojomo Exp $ */ package org.archive.crawler.io; /** * @author gojomo * */ public class CharSubSequence implements CharSequence { CharSequence inner; int start; int end; /** * */ public CharSubSequence(CharSequence inner, int start, int end) { // TODO bounds check super(); this.inner = inner; this.start = start; this.end = end; } /* (non-Javadoc) * @see java.lang.CharSequence#length() */ public int length() { return end-start; } /* (non-Javadoc) * @see java.lang.CharSequence#charAt(int) */ public char charAt(int index) { return inner.charAt(start+index); } /* (non-Javadoc) * @see java.lang.CharSequence#subSequence(int, int) */ public CharSequence subSequence(int start, int end) { return new CharSubSequence(this,start,end); } /* (non-Javadoc) * @see java.lang.Object#toString() */ public String toString() { StringBuffer sb = new StringBuffer((int)length()); for(int i=0; i<length(); i++) { sb.append(charAt(i)); } return sb.toString(); } } Index: RecordingInputStream.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/RecordingInputStream.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** RecordingInputStream.java 30 Sep 2003 18:07:53 -0000 1.2 --- RecordingInputStream.java 1 Oct 2003 01:39:46 -0000 1.3 *************** *** 10,15 **** import java.io.InputStream; - import javax.swing.text.Position; - import org.archive.util.NullOutputStream; --- 10,13 ---- *************** *** 81,84 **** --- 79,89 ---- public void markResponseBodyStart() { recordingOutputStream.markResponseBodyStart(); + } + + /** + * @return + */ + public CharSequence getCharSequence() { + return recordingOutputStream.getCharSequence(); } Index: ReplayInputStream.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/ReplayInputStream.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** ReplayInputStream.java 30 Sep 2003 18:07:53 -0000 1.2 --- ReplayInputStream.java 1 Oct 2003 01:39:46 -0000 1.3 *************** *** 14,17 **** --- 14,20 ---- /** + * Replays the bytes recorded from a RecordingInputStream or + * RecordingOutputStream. + * * @author gojomo * *************** *** 26,38 **** /** ! * @param buffer ! * @param size ! * @param responseBodyStart ! * @param backingFilename ! */ ! public ReplayInputStream(byte[] buffer, long size, long responseBodyStart, String backingFilename) throws IOException { ! this(buffer,size,backingFilename); ! this.responseBodyStart = responseBodyStart; ! } /** --- 29,41 ---- /** ! * @param buffer ! * @param size ! * @param responseBodyStart ! * @param backingFilename ! */ ! public ReplayInputStream(byte[] buffer, long size, long responseBodyStart, String backingFilename) throws IOException { ! this(buffer,size,backingFilename); ! this.responseBodyStart = responseBodyStart; ! 
} /** Index: ReplayCharSequence.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/ReplayCharSequence.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ReplayCharSequence.java 30 Sep 2003 18:07:53 -0000 1.1 --- ReplayCharSequence.java 1 Oct 2003 01:39:46 -0000 1.2 *************** *** 7,11 **** package org.archive.crawler.io; ! import java.io.BufferedInputStream; /** --- 7,12 ---- package org.archive.crawler.io; ! import java.io.IOException; ! import java.io.RandomAccessFile; /** *************** *** 20,32 **** * * When rereading of a location is necessary, the whole window is ! * recentered around the location requested. (??? Is this the best ! * strategy?) * ! * TODO determine in memory mapped files is better way to do this * * @author Gordon Mohr */ public class ReplayCharSequence implements CharSequence { - protected BufferedInputStream diskStream; protected byte[] prefixBuffer; protected long size; --- 21,34 ---- * * When rereading of a location is necessary, the whole window is ! * recentered around the location requested. (TODO: More research ! * into whether this is the best strategy.) * ! * TODO determine in memory mapped files is better way to do this; ! * probably not -- they don't offer the level of control over ! * total memory used that this approach does. * * @author Gordon Mohr */ public class ReplayCharSequence implements CharSequence { protected byte[] prefixBuffer; protected long size; *************** *** 34,39 **** protected byte[] wraparoundBuffer; ! protected long position; protected String backingFilename; /* (non-Javadoc) --- 36,73 ---- protected byte[] wraparoundBuffer; ! protected int wrapOrigin; // index in underlying bytestream where wraparound buffer starts ! protected int wrapOffset; // index in wraparoundBuffer that corresponds to wrapOrigin ! protected String backingFilename; + protected RandomAccessFile raFile; + + /** + * @param buffer + * @param size + * @param responseBodyStart + * @param backingFilename + */ + public ReplayCharSequence(byte[] buffer, long size, long responseBodyStart, String backingFilename) throws IOException { + this(buffer,size,backingFilename); + this.responseBodyStart = responseBodyStart; + } + + /** + * @param buffer + * @param size + * @param backingFilename + */ + public ReplayCharSequence(byte[] buffer, long size, String backingFilename) throws IOException { + this.prefixBuffer = buffer; + this.size = size; + if (size>buffer.length) { + this.backingFilename = backingFilename; + raFile = new RandomAccessFile(backingFilename,"r"); + wraparoundBuffer = new byte[buffer.length]; + wrapOrigin = prefixBuffer.length; + wrapOffset = 0; + loadBuffer(); + } + } /* (non-Javadoc) *************** *** 41,46 **** */ public int length() { ! // TODO Auto-generated method stub ! return 0; } --- 75,79 ---- */ public int length() { ! return (int) size; } *************** *** 49,54 **** */ public char charAt(int index) { ! // TODO Auto-generated method stub ! return 0; } --- 82,169 ---- */ public char charAt(int index) { ! if(index>size) { ! throw new IndexOutOfBoundsException(); ! } ! if(index < prefixBuffer.length) { ! return (char) prefixBuffer[index]; ! } ! if(index >= wrapOrigin && index-wrapOrigin < wraparoundBuffer.length) { ! try { ! return (char) wraparoundBuffer[(index-wrapOrigin+wrapOffset) % wraparoundBuffer.length]; ! } catch (ArrayIndexOutOfBoundsException aoe) { ! System.out.println("oops"); ! } ! } ! 
return faultCharAt(index); ! } ! ! /** ! * get a character that's outside the current buffers ! * ! * will cause the wraparoundBuffer to be changed to ! * cover a region including the index ! * ! * if index is higher than the highest index in the ! * wraparound buffer, buffer is moved forward such ! * that requested char is last item in buffer ! * ! * if index is lower than lowest index in the ! * wraparound buffer, buffet is reset centered around ! * index ! * ! * @param index ! * @return ! */ ! private char faultCharAt(int index) { ! if(index>=wrapOrigin+wraparoundBuffer.length) { ! // moving forward ! while (index>=wrapOrigin+wraparoundBuffer.length){ ! // TODO optimize this ! advanceBuffer(); ! } ! return charAt(index); ! } else { ! // moving backward ! recenterBuffer(index); ! return charAt(index); ! } ! } ! ! ! private void recenterBuffer(int index) { ! System.out.println("recentering around "+index+" in "+ backingFilename); ! wrapOrigin = index - (wraparoundBuffer.length/2); ! if(wrapOrigin<prefixBuffer.length) { ! wrapOrigin = prefixBuffer.length; ! } ! wrapOffset = 0; ! loadBuffer(); ! } ! ! private void loadBuffer() { ! try { ! raFile.seek(wrapOrigin-prefixBuffer.length); ! raFile.readFully(wraparoundBuffer,0,(int)Math.min(wraparoundBuffer.length, size-wrapOrigin )); ! } catch (IOException e) { ! // TODO convert this to a runtime error? ! e.printStackTrace(); ! } ! } ! ! /** ! * Roll the wraparound buffer forward one position ! * ! */ ! private void advanceBuffer() { ! try { ! wraparoundBuffer[wrapOffset] = (byte)raFile.read(); ! wrapOffset++; ! wrapOffset %= wraparoundBuffer.length; ! wrapOrigin++; ! } catch (IOException e) { ! // TODO convert this to a runtime error? ! e.printStackTrace(); ! } ! } *************** *** 57,63 **** */ public CharSequence subSequence(int start, int end) { ! // TODO Auto-generated method stub ! return null; } } --- 172,189 ---- */ public CharSequence subSequence(int start, int end) { ! return new CharSubSequence(this,start,end); } + + /* (non-Javadoc) + * @see java.lang.Object#toString() + */ + public String toString() { + StringBuffer sb = new StringBuffer((int)size); + for(int i=0; i<size; i++) { + sb.append(charAt(i)); + } + return sb.toString(); + } + } Index: RecordingOutputStream.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/RecordingOutputStream.java,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** RecordingOutputStream.java 30 Sep 2003 18:07:53 -0000 1.3 --- RecordingOutputStream.java 1 Oct 2003 01:39:46 -0000 1.4 *************** *** 133,135 **** --- 133,149 ---- } + + /** + * @return + */ + public CharSequence getCharSequence() { + try { + return new ReplayCharSequence(buffer,size,responseBodyStart,backingFilename); + } catch (IOException e) { + // TODO convert to runtime exception? + e.printStackTrace(); + } + return null; + } + } |
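Because ReplayCharSequence implements java.lang.CharSequence, the recorded response can be handed straight to java.util.regex without materializing the whole body as a String, which is the point of the Javadoc's advice to keep regexp access within the rolling window. A hedged usage sketch: only the CharSequence/regex interaction is shown, and a literal String stands in for a real ReplayCharSequence obtained from a recorder.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

class ReplayScanExample {
    /** Scan any CharSequence view of a recorded response for href values.
     *  Works the same whether cs is a String or a ReplayCharSequence. */
    static void printHrefs(CharSequence cs) {
        Pattern href = Pattern.compile("href\\s*=\\s*\"([^\"]+)\"",
                                       Pattern.CASE_INSENSITIVE);
        Matcher m = href.matcher(cs);   // Matcher accepts any CharSequence
        while (m.find()) {
            System.out.println(m.group(1));
        }
    }

    public static void main(String[] args) {
        printHrefs("<a href=\"http://www.archive.org/\">archive</a>");
    }
}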
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons/httpclient In directory sc8-pr-cvs1:/tmp/cvs-serv23505/oversrc/org/apache/commons/httpclient Modified Files: HttpConnection.java HttpMethodBase.java Added Files: MultiThreadedHttpConnectionManager.java Log Message: stream/http recording --- NEW FILE: MultiThreadedHttpConnectionManager.java --- /* * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons/httpclient/MultiThreadedHttpConnectionManager.java,v 1.1 2003/09/30 18:10:00 gojomo Exp $ * $Revision: 1.1 $ * $Date: 2003/09/30 18:10:00 $ * * ==================================================================== * * The Apache Software License, Version 1.1 * * Copyright (c) 2002-2003 The Apache Software Foundation. All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * [...1148 lines suppressed...] public int getSendBufferSize() throws SocketException { if (hasConnection()) { return wrappedConnection.getSendBufferSize(); } else { throw new IllegalStateException("Connection has been released"); } } public void setSendBufferSize(int sendBufferSize) throws SocketException { if (hasConnection()) { wrappedConnection.setSendBufferSize(sendBufferSize); } else { throw new IllegalStateException("Connection has been released"); } } } } Index: HttpConnection.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons/httpclient/HttpConnection.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** HttpConnection.java 24 Sep 2003 01:43:36 -0000 1.1 --- HttpConnection.java 30 Sep 2003 18:10:00 -0000 1.2 *************** *** 1373,1377 **** /** Optional recorder. */ ! private HttpRecorder recorder; /* (non-Javadoc) --- 1373,1377 ---- /** Optional recorder. */ ! protected HttpRecorder recorder; /* (non-Javadoc) Index: HttpMethodBase.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons/httpclient/HttpMethodBase.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** HttpMethodBase.java 24 Sep 2003 01:43:36 -0000 1.1 --- HttpMethodBase.java 30 Sep 2003 18:10:00 -0000 1.2 *************** *** 1615,1618 **** --- 1615,1621 ---- } } + if(recorder!=null) { + recorder.markResponseBodyStart(); + } readResponseBody(state, conn); processResponseBody(state, conn); |
From: <go...@us...> - 2003-09-30 18:07:58
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor
In directory sc8-pr-cvs1:/tmp/cvs-serv21645/src/org/archive/crawler/extractor

Modified Files:
        ExtractorHTML.java
Log Message:
stream/http recording

Index: ExtractorHTML.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v
retrieving revision 1.15
retrieving revision 1.16
diff -C2 -d -r1.15 -r1.16
*** ExtractorHTML.java      12 Sep 2003 02:03:24 -0000      1.15
--- ExtractorHTML.java      30 Sep 2003 18:07:53 -0000      1.16
***************
*** 257,260 ****
--- 257,261 ----
          return;
      }
+ 

      CharSequence cs = get.getResponseBodyAsString();
From: <go...@us...> - 2003-09-30 18:07:58
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util In directory sc8-pr-cvs1:/tmp/cvs-serv21645/src/org/archive/util Modified Files: HttpRecorder.java Log Message: stream/http recording Index: HttpRecorder.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/HttpRecorder.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** HttpRecorder.java 25 Sep 2003 00:14:03 -0000 1.2 --- HttpRecorder.java 30 Sep 2003 18:07:53 -0000 1.3 *************** *** 21,27 **** */ public class HttpRecorder { ! String backingFilenamePrefix; ! RecordingInputStream ris; ! RecordingOutputStream ros; /** --- 21,27 ---- */ public class HttpRecorder { ! protected String backingFilenamePrefix; ! protected RecordingInputStream ris; ! protected RecordingOutputStream ros; /** *************** *** 50,53 **** --- 50,75 ---- ros.open(os); return ros; + } + + /** + * + */ + public void close() throws IOException { + ris.close(); + ros.close(); + } + + /** + * + */ + public RecordingInputStream getRecordedInput() { + return ris; + } + + /** + * + */ + public void markResponseBodyStart() { + ris.markResponseBodyStart(); } |
From: <go...@us...> - 2003-09-30 18:07:58
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic In directory sc8-pr-cvs1:/tmp/cvs-serv21645/src/org/archive/crawler/basic Modified Files: FetcherHTTPSimple.java ARCWriter.java Log Message: stream/http recording Index: FetcherHTTPSimple.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/FetcherHTTPSimple.java,v retrieving revision 1.9 retrieving revision 1.10 diff -C2 -d -r1.9 -r1.10 *** FetcherHTTPSimple.java 25 Sep 2003 00:14:02 -0000 1.9 --- FetcherHTTPSimple.java 30 Sep 2003 18:07:52 -0000 1.10 *************** *** 106,109 **** --- 106,110 ---- InputStream is = get.getResponseBodyAsStream(); while(is.read()!=-1) {} // TODOSOON: read in bigger chunks! + get.getHttpRecorder().close(); Header contentLength = get.getResponseHeader("Content-Length"); Index: ARCWriter.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/ARCWriter.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** ARCWriter.java 6 Aug 2003 01:19:29 -0000 1.31 --- ARCWriter.java 30 Sep 2003 18:07:52 -0000 1.32 *************** *** 13,17 **** import java.io.OutputStream; ! import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.methods.GetMethod; import org.archive.crawler.basic.StatisticsTracker; --- 13,17 ---- import java.io.OutputStream; ! // import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.methods.GetMethod; import org.archive.crawler.basic.StatisticsTracker; *************** *** 281,308 **** } - int headersSize = 0; int recordLength = 0; - Header[] headers = get.getResponseHeaders(); - - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - baos.write(get.getStatusLine().toString().getBytes()); // get status line (it's not a header) - baos.write("\n".getBytes()); - for(int i=0; i < headers.length; i++){ - baos.write(headers[i].toExternalForm().getBytes()); - } - recordLength += baos.size(); ! // get body so we can calc length for metaline ! byte[] body = get.getResponseBody(); ! recordLength += body.length; ! // don't forget the extra CRLF between headers and body ! recordLength += 2; writeMetaLine(curi, recordLength); ! baos.writeTo(out); ! out.write("\r\n".getBytes()); ! out.write(body); ! out.write("\n".getBytes()); } --- 281,314 ---- } int recordLength = 0; ! // OLD WAY ! // Header[] headers = get.getResponseHeaders(); ! // ! // ByteArrayOutputStream baos = new ByteArrayOutputStream(); ! // baos.write(get.getStatusLine().toString().getBytes()); // get status line (it's not a header) ! // baos.write("\n".getBytes()); ! // for(int i=0; i < headers.length; i++){ ! // baos.write(headers[i].toExternalForm().getBytes()); ! // } ! // recordLength += baos.size(); ! // ! // // get body so we can calc length for metaline ! // byte[] body = get.getResponseBody(); ! // // don't forget the extra CRLF between headers and body ! // recordLength += 2; + recordLength += get.getHttpRecorder().getRecordedInput().getSize(); + writeMetaLine(curi, recordLength); ! get.getHttpRecorder().getRecordedInput().getReplayInputStream().readFullyTo(out); ! out.write('\n'); // trailing newline ! ! // OLD WAY ! // baos.writeTo(out); ! // out.write("\r\n".getBytes()); ! // out.write(body); ! // out.write("\n".getBytes()); } |
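The ARCWriter change above stops re-buffering the response in memory and instead measures and replays the bytes captured by the HttpRecorder before writing the ARC metaline. A generic record-then-replay sketch with plain java.io streams follows; unlike the committed RecordingOutputStream it keeps everything in memory and never spills to a backing file, and the names are illustrative, not the real API.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

/** Sketch of record-then-replay: copy the wire bytes aside while reading,
 *  then measure and re-stream them later (e.g., into an ARC record). */
class RecordReplaySketch {
    private final ByteArrayOutputStream recording = new ByteArrayOutputStream();

    /** Drain the network stream, keeping a copy of every byte. */
    void record(InputStream netIn) throws IOException {
        int c;
        while ((c = netIn.read()) != -1) {   // real code would read in chunks
            recording.write(c);
        }
    }

    long getSize() {
        return recording.size();             // known before writing the metaline
    }

    /** Replay the recorded bytes to the archive writer. */
    void readFullyTo(OutputStream out) throws IOException {
        InputStream replay = new ByteArrayInputStream(recording.toByteArray());
        int c;
        while ((c = replay.read()) != -1) {
            out.write(c);
        }
    }

    public static void main(String[] args) throws IOException {
        RecordReplaySketch r = new RecordReplaySketch();
        r.record(new ByteArrayInputStream(
            "HTTP/1.0 200 OK\r\n\r\nhello".getBytes("UTF-8")));
        System.out.println(r.getSize() + " bytes recorded");
        r.readFullyTo(System.out);
    }
}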
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io In directory sc8-pr-cvs1:/tmp/cvs-serv21645/src/org/archive/crawler/io Modified Files: RecordingInputStream.java ReplayInputStream.java RecordingOutputStream.java Added Files: ReplayCharSequence.java Log Message: stream/http recording --- NEW FILE: ReplayCharSequence.java --- /* * ReplayCharSequence.java * Created on Sep 30, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/ReplayCharSequence.java,v 1.1 2003/09/30 18:07:53 gojomo Exp $ */ package org.archive.crawler.io; import java.io.BufferedInputStream; /** * Provides a CharSequence view on recorded stream bytes (a prefix buffer * and overflow backing file). * * Uses a wraparound rolling buffer of the last windowSize bytes read * from disk in memory; as long as the 'random access' of a CharSequence * user stays within this window, access should remain fairly efficient. * (So design any regexps pointed at these CharSequences to work within * that range!) * * When rereading of a location is necessary, the whole window is * recentered around the location requested. (??? Is this the best * strategy?) * * TODO determine in memory mapped files is better way to do this * * @author Gordon Mohr */ public class ReplayCharSequence implements CharSequence { protected BufferedInputStream diskStream; protected byte[] prefixBuffer; protected long size; protected long responseBodyStart; // where the response body starts, if marked protected byte[] wraparoundBuffer; protected long position; protected String backingFilename; /* (non-Javadoc) * @see java.lang.CharSequence#length() */ public int length() { // TODO Auto-generated method stub return 0; } /* (non-Javadoc) * @see java.lang.CharSequence#charAt(int) */ public char charAt(int index) { // TODO Auto-generated method stub return 0; } /* (non-Javadoc) * @see java.lang.CharSequence#subSequence(int, int) */ public CharSequence subSequence(int start, int end) { // TODO Auto-generated method stub return null; } } Index: RecordingInputStream.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/RecordingInputStream.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** RecordingInputStream.java 25 Sep 2003 00:14:03 -0000 1.1 --- RecordingInputStream.java 30 Sep 2003 18:07:53 -0000 1.2 *************** *** 10,13 **** --- 10,15 ---- import java.io.InputStream; + import javax.swing.text.Position; + import org.archive.util.NullOutputStream; *************** *** 62,67 **** while(read()!=-1) { } ! return recordingOutputStream.size; } --- 64,84 ---- while(read()!=-1) { } ! 
return recordingOutputStream.getSize(); + } + + /** + * @return + */ + public long getSize() { + // TODO Auto-generated method stub + return recordingOutputStream.getSize(); + } + + /** + * + */ + public void markResponseBodyStart() { + recordingOutputStream.markResponseBodyStart(); } Index: ReplayInputStream.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/ReplayInputStream.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ReplayInputStream.java 25 Sep 2003 00:14:03 -0000 1.1 --- ReplayInputStream.java 30 Sep 2003 18:07:53 -0000 1.2 *************** *** 11,14 **** --- 11,15 ---- import java.io.IOException; import java.io.InputStream; + import java.io.OutputStream; /** *************** *** 17,27 **** */ public class ReplayInputStream extends InputStream { ! private BufferedInputStream diskStream; ! byte[] buffer; ! long size; ! long position; ! String backingFilename; /** * @param buffer * @param size --- 18,40 ---- */ public class ReplayInputStream extends InputStream { ! protected BufferedInputStream diskStream; ! protected byte[] buffer; ! protected long size; ! protected long responseBodyStart; // where the response body starts, if marked ! protected long position; ! protected String backingFilename; /** + * @param buffer + * @param size + * @param responseBodyStart + * @param backingFilename + */ + public ReplayInputStream(byte[] buffer, long size, long responseBodyStart, String backingFilename) throws IOException { + this(buffer,size,backingFilename); + this.responseBodyStart = responseBodyStart; + } + + /** * @param buffer * @param size *************** *** 37,44 **** --- 50,65 ---- } + public long setToResponseBodyStart() { + position = responseBodyStart; + return position; + } + /* (non-Javadoc) * @see java.io.InputStream#read() */ public int read() throws IOException { + if (position==size) { + return -1; // EOF + } if (position<buffer.length) { return buffer[(int)position++]; *************** *** 51,53 **** --- 72,83 ---- // TODO: implement other read()s for efficiency + + public void readFullyTo(OutputStream os) throws IOException { + // TODO make this more efficient + int c = read(); + while (c != -1) { + os.write(c); + c = read(); + } + } } Index: RecordingOutputStream.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/RecordingOutputStream.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** RecordingOutputStream.java 25 Sep 2003 00:14:03 -0000 1.2 --- RecordingOutputStream.java 30 Sep 2003 18:07:53 -0000 1.3 *************** *** 35,38 **** --- 35,40 ---- protected byte[] buffer; protected long position; + protected long responseBodyStart; // when recording HTTP, where the content-body starts + /** *************** *** 83,87 **** */ private void record(int b) throws IOException { ! if(position>buffer.length){ diskStream.write(b); } else { --- 85,89 ---- */ private void record(int b) throws IOException { ! if(position>=buffer.length){ diskStream.write(b); } else { *************** *** 112,116 **** public ReplayInputStream getReplayInputStream() throws IOException { ! return new ReplayInputStream(buffer,size,backingFilename); } --- 114,134 ---- public ReplayInputStream getReplayInputStream() throws IOException { ! return new ReplayInputStream(buffer,size,responseBodyStart,backingFilename); ! } ! ! ! /** ! * @return ! 
*/ ! public long getSize() { ! return size; ! } ! ! ! /** ! * ! */ ! public void markResponseBodyStart() { ! responseBodyStart = position; } |
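The piece added here is the response-body marker: markResponseBodyStart() is called while the response is being recorded, and setToResponseBodyStart() positions a replay just past the recorded status line and headers. A minimal round trip, driving RecordingOutputStream directly with a ByteArrayOutputStream as the pass-through target (the backing-file path is hypothetical and only matters if the recording outgrows the buffer):

import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.archive.crawler.io.RecordingOutputStream;
import org.archive.crawler.io.ReplayInputStream;

public class BodyStartSketch {
    public static void main(String[] args) throws IOException {
        RecordingOutputStream ros =
            new RecordingOutputStream(4096, "/tmp/bodystart.ros", 1 << 20);
        ros.open(new ByteArrayOutputStream());   // we only care about the recording

        byte[] headers = "HTTP/1.0 200 OK\r\nContent-Type: text/plain\r\n\r\n".getBytes();
        byte[] body = "hello".getBytes();
        for (int i = 0; i < headers.length; i++) {
            ros.write(headers[i]);
        }
        ros.markResponseBodyStart();             // remember where the entity body begins
        for (int i = 0; i < body.length; i++) {
            ros.write(body[i]);
        }
        ros.close();                             // fixes the recorded size

        ReplayInputStream replay = ros.getReplayInputStream();
        replay.setToResponseBodyStart();         // skip the recorded headers on replay
        int c;
        while ((c = replay.read()) != -1) {
            System.out.print((char) c);          // prints "hello"
        }
    }
}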
From: <go...@us...> - 2003-09-30 18:07:58
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel In directory sc8-pr-cvs1:/tmp/cvs-serv21645/src/org/archive/crawler/datamodel Modified Files: CrawlServer.java Log Message: stream/http recording Index: CrawlServer.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlServer.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** CrawlServer.java 6 Aug 2003 01:18:43 -0000 1.1 --- CrawlServer.java 30 Sep 2003 18:07:53 -0000 1.2 *************** *** 13,16 **** --- 13,17 ---- import org.apache.commons.httpclient.methods.GetMethod; + import org.archive.crawler.io.ReplayInputStream; /** *************** *** 91,97 **** // note that akamai will return 400 for some "not founds" try { BufferedReader reader = new BufferedReader( ! new InputStreamReader( ! get.getResponseBodyAsStream())); robots = RobotsExclusionPolicy.policyFor(reader); } catch (IOException e) { --- 92,99 ---- // note that akamai will return 400 for some "not founds" try { + ReplayInputStream contentBodyStream = get.getHttpRecorder().getRecordedInput().getReplayInputStream(); + contentBodyStream.setToResponseBodyStart(); BufferedReader reader = new BufferedReader( ! new InputStreamReader(contentBodyStream)); robots = RobotsExclusionPolicy.policyFor(reader); } catch (IOException e) { |
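The robots.txt change follows the same pattern: rather than asking HttpClient for the response body again, the recorded transaction is replayed, positioned at the body, and wrapped in a reader. A hedged sketch of that as a standalone helper (charset handling is left at the platform default, as in the commit; closing the reader is left to the caller):

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;

import org.archive.crawler.io.RecordingInputStream;
import org.archive.crawler.io.ReplayInputStream;

public class ReplayBodyReaderSketch {
    /**
     * Return a reader over just the entity body of a recorded HTTP response.
     * Assumes markResponseBodyStart() was called while the response was read.
     */
    static BufferedReader bodyReader(RecordingInputStream recorded) throws IOException {
        ReplayInputStream replay = recorded.getReplayInputStream();
        replay.setToResponseBodyStart();   // skip the recorded status line and headers
        return new BufferedReader(new InputStreamReader(replay));
    }
}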
From: <go...@us...> - 2003-09-27 01:10:02
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util In directory sc8-pr-cvs1:/tmp/cvs-serv15349/src/org/archive/util Added Files: FencedInputStream.java ARCReader.java ARCResource.java Log Message: skeletal first ARCReading work --- NEW FILE: FencedInputStream.java --- /* * FencedInputStream.java * Created on Sep 26, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/FencedInputStream.java,v 1.1 2003/09/27 00:48:12 gojomo Exp $ */ package org.archive.util; import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; /** * @author gojomo * */ public class FencedInputStream extends FilterInputStream { long maxToRead; long position = 0; /** * @param in */ protected FencedInputStream(InputStream in, long maxToRead) { super(in); this.maxToRead = maxToRead; } /* (non-Javadoc) * @see java.io.InputStream#read() */ public int read() throws IOException { if (position < maxToRead) { int b = super.read(); if (b>=0) { position++; } return b; } else { return -1; // virtual EOF } } /* (non-Javadoc) * @see java.io.InputStream#read(byte[], int, int) */ public int read(byte[] b, int off, int len) throws IOException { // TODO Auto-generated method stub return super.read(b, off, len); } /* (non-Javadoc) * @see java.io.InputStream#read(byte[]) */ public int read(byte[] b) throws IOException { // TODO Auto-generated method stub return super.read(b); } } --- NEW FILE: ARCReader.java --- /* * ARCReader.java * Created on Sep 26, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/ARCReader.java,v 1.1 2003/09/27 00:48:12 gojomo Exp $ */ package org.archive.util; import java.io.BufferedInputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.zip.GZIPInputStream; /** * Utility class for reading ARC files, including .arc.gz * files. * * @author gojomo * */ public class ARCReader { protected InputStream inStream; protected FileInputStream arcStream; protected ARCResource lastResource; /** * */ public ARCReader() { super(); } public void open(String filename) throws IOException { String flattenedFilename = filename.toLowerCase(); assert flattenedFilename.endsWith(".arc") || flattenedFilename.endsWith(".arc.gz") : "non-arc filename extension"; arcStream = new FileInputStream(filename); inStream = new BufferedInputStream(arcStream,4096); if (flattenedFilename.endsWith(".gz")) { inStream = new GZIPInputStream(inStream); } } public ARCResource getNextResource() { return null; } } --- NEW FILE: ARCResource.java --- /* * ARCResource.java * Created on Sep 26, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/ARCResource.java,v 1.1 2003/09/27 00:48:12 gojomo Exp $ */ package org.archive.util; /** * @author gojomo * */ public class ARCResource { } |
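In this first cut of FencedInputStream only the single-byte read() enforces the fence; the bulk read(byte[], int, int) still delegates straight to the wrapped stream, per its TODO. One way the bulk read could honor the limit -- a sketch, not the committed code -- keeping the same "virtual EOF" convention:

/* Drop-in replacement body for FencedInputStream.read(byte[], int, int): */
public int read(byte[] b, int off, int len) throws IOException {
    if (position >= maxToRead) {
        return -1;                                // virtual EOF, same as read()
    }
    long remaining = maxToRead - position;
    int toRead = (int) Math.min(len, remaining);  // never hand out bytes past the fence
    int n = super.read(b, off, toRead);
    if (n > 0) {
        position += n;
    }
    return n;
}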
From: <go...@us...> - 2003-09-25 00:14:08
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util In directory sc8-pr-cvs1:/tmp/cvs-serv4243/src/org/archive/util Modified Files: HttpRecorder.java Log Message: http byte-level recording (in progress) Index: HttpRecorder.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/HttpRecorder.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** HttpRecorder.java 24 Sep 2003 01:46:37 -0000 1.1 --- HttpRecorder.java 25 Sep 2003 00:14:03 -0000 1.2 *************** *** 7,13 **** --- 7,17 ---- package org.archive.util; + import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; + import org.archive.crawler.io.RecordingInputStream; + import org.archive.crawler.io.RecordingOutputStream; + /** * Initially only supports HTTP/1.0 (one request, one response per stream) *************** *** 17,20 **** --- 21,36 ---- */ public class HttpRecorder { + String backingFilenamePrefix; + RecordingInputStream ris; + RecordingOutputStream ros; + + /** + * + */ + public HttpRecorder(String backingFilenamePrefix) { + super(); + ris = new RecordingInputStream(32768,backingFilenamePrefix+".ris",2^20); + ros = new RecordingOutputStream(2048,backingFilenamePrefix+".ros",2^12); + } /** *************** *** 22,27 **** * @return */ ! public InputStream inputWrap(InputStream is) { ! return is; } --- 38,44 ---- * @return */ ! public InputStream inputWrap(InputStream is) throws IOException { ! ris.open(is); ! return ris; } *************** *** 30,35 **** * @return */ ! public OutputStream outputWrap(OutputStream outputStream) { ! return outputStream; } --- 47,53 ---- * @return */ ! public OutputStream outputWrap(OutputStream os) throws IOException { ! ros.open(os); ! return ros; } |
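One thing worth flagging in the new HttpRecorder constructor: in Java '^' is bitwise XOR, not exponentiation, so the 2^20 and 2^12 arguments evaluate to 22 and 14 rather than the intended 1 MiB and 4 KiB. The maxSize parameter does not appear to be consulted yet in the stream revisions shown in this archive, so the slip is harmless at this point, but the intended values would be spelled with shifts or plain literals:

// '^' is XOR in Java, not "to the power of":
int xorTwenty = 2 ^ 20;     // == 22   (00010 XOR 10100 = 10110)
int xorTwelve = 2 ^ 12;     // == 14
int oneMiB    = 1 << 20;    // == 1048576, the intended size
int fourKiB   = 1 << 12;    // == 4096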
From: <go...@us...> - 2003-09-25 00:14:07
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework In directory sc8-pr-cvs1:/tmp/cvs-serv4243/src/org/archive/crawler/framework Modified Files: ToeThread.java Log Message: http byte-level recording (in progress) Index: ToeThread.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/ToeThread.java,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** ToeThread.java 17 Jul 2003 22:21:05 -0000 1.13 --- ToeThread.java 25 Sep 2003 00:14:03 -0000 1.14 *************** *** 14,17 **** --- 14,18 ---- import org.archive.crawler.datamodel.FetchStatusCodes; import org.archive.crawler.datamodel.InstancePerThread; + import org.archive.util.HttpRecorder; /** *************** *** 28,31 **** --- 29,33 ---- CrawlController controller; int serialNumber; + HttpRecorder httpRecorder; HashMap localProcessors = new HashMap(); *************** *** 41,44 **** --- 43,47 ---- serialNumber = sn; setName("ToeThread #"+serialNumber); + httpRecorder = new HttpRecorder("tt"+sn+"http"); } *************** *** 147,149 **** --- 150,166 ---- return paused; } + /** + * @return + */ + public HttpRecorder getHttpRecorder() { + return httpRecorder; + } + + /** + * @param recorder + */ + public void setHttpRecorder(HttpRecorder recorder) { + httpRecorder = recorder; + } + } |
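With the recorder hung off ToeThread, processors reach it by downcasting the current thread. The committed fetcher casts unconditionally; a sketch of the same lookup with a defensive check (useful outside the crawler, e.g. in tests):

import org.archive.crawler.framework.ToeThread;
import org.archive.util.HttpRecorder;

public class RecorderLookupSketch {
    /** The current thread's recorder, or null when not running on a ToeThread. */
    static HttpRecorder currentRecorder() {
        Thread t = Thread.currentThread();
        if (t instanceof ToeThread) {
            return ((ToeThread) t).getHttpRecorder();
        }
        return null;
    }
}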
From: <go...@us...> - 2003-09-25 00:14:07
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic In directory sc8-pr-cvs1:/tmp/cvs-serv4243/src/org/archive/crawler/basic Modified Files: FetcherHTTPSimple.java Log Message: http byte-level recording (in progress) Index: FetcherHTTPSimple.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/FetcherHTTPSimple.java,v retrieving revision 1.8 retrieving revision 1.9 diff -C2 -d -r1.8 -r1.9 *** FetcherHTTPSimple.java 23 Sep 2003 01:15:19 -0000 1.8 --- FetcherHTTPSimple.java 25 Sep 2003 00:14:02 -0000 1.9 *************** *** 8,11 **** --- 8,12 ---- import java.io.IOException; + import java.io.InputStream; import java.util.logging.Logger; *************** *** 22,25 **** --- 23,27 ---- import org.archive.crawler.framework.CrawlController; import org.archive.crawler.framework.Processor; + import org.archive.crawler.framework.ToeThread; /** *************** *** 87,90 **** --- 89,93 ---- controller.getOrder().getBehavior().getFrom()); + get.setHttpRecorder(((ToeThread)Thread.currentThread()).getHttpRecorder()); //controller.getKicker().kickMeAt(Thread.currentThread(),now+timeout); *************** *** 101,105 **** // this might be wasteful. As it is, it just moves // the cost here rather than elsewhere. ) ! get.getResponseBody(); Header contentLength = get.getResponseHeader("Content-Length"); --- 104,109 ---- // this might be wasteful. As it is, it just moves // the cost here rather than elsewhere. ) ! InputStream is = get.getResponseBodyAsStream(); ! while(is.read()!=-1) {} // TODOSOON: read in bigger chunks! Header contentLength = get.getResponseHeader("Content-Length"); |
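The drain loop above reads the response one byte at a time, which is what the TODOSOON is about. A buffered drain is straightforward -- a sketch against any InputStream follows -- though note that until RecordingInputStream grows the bulk read() overrides flagged as TODO in its own source, a chunked read on it still falls back to the inherited byte-at-a-time loop.

import java.io.IOException;
import java.io.InputStream;

public class DrainSketch {
    /** Read a stream to exhaustion in chunks; returns the byte count. */
    static long drain(InputStream in) throws IOException {
        byte[] buf = new byte[4096];
        long total = 0;
        int n;
        while ((n = in.read(buf)) != -1) {
            total += n;    // bytes pass through any recording wrapper as they are read
        }
        return total;
    }
}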
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io In directory sc8-pr-cvs1:/tmp/cvs-serv4243/src/org/archive/crawler/io Modified Files: RecordingOutputStream.java Added Files: RecordingInputStream.java ReplayInputStream.java Log Message: http byte-level recording (in progress) --- NEW FILE: RecordingInputStream.java --- /* * RecordingInputStream.java * Created on Sep 24, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/RecordingInputStream.java,v 1.1 2003/09/25 00:14:03 gojomo Exp $ */ package org.archive.crawler.io; import java.io.IOException; import java.io.InputStream; import org.archive.util.NullOutputStream; /** * @author gojomo * */ public class RecordingInputStream extends InputStream { protected InputStream wrappedStream; protected RecordingOutputStream recordingOutputStream; /** * Create a new RecordingInputStream with the specified parameters. * * @param bufferSize * @param backingFile * @param maxSize */ public RecordingInputStream(int bufferSize, String backingFilename, int maxSize) { recordingOutputStream = new RecordingOutputStream(bufferSize, backingFilename, maxSize); } public void open(InputStream wrappedStream) throws IOException { this.wrappedStream = wrappedStream; recordingOutputStream.open(new NullOutputStream()); } /* (non-Javadoc) * @see java.io.InputStream#read() */ public int read() throws IOException { int b = wrappedStream.read(); recordingOutputStream.write(b); return b; } /* (non-Javadoc) * @see java.io.OutputStream#close() */ public void close() throws IOException { super.close(); wrappedStream.close(); recordingOutputStream.close(); } public ReplayInputStream getReplayInputStream() throws IOException { return recordingOutputStream.getReplayInputStream(); } public long readFully() throws IOException { while(read()!=-1) { } return recordingOutputStream.size; } } --- NEW FILE: ReplayInputStream.java --- /* * ReplayInputStream.java * Created on Sep 24, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/ReplayInputStream.java,v 1.1 2003/09/25 00:14:03 gojomo Exp $ */ package org.archive.crawler.io; import java.io.BufferedInputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; /** * @author gojomo * */ public class ReplayInputStream extends InputStream { private BufferedInputStream diskStream; byte[] buffer; long size; long position; String backingFilename; /** * @param buffer * @param size * @param backingFilename */ public ReplayInputStream(byte[] buffer, long size, String backingFilename) throws IOException { this.buffer = buffer; this.size = size; if (size>buffer.length) { this.backingFilename = backingFilename; diskStream = new BufferedInputStream(new FileInputStream(backingFilename),4096); } } /* (non-Javadoc) * @see java.io.InputStream#read() */ public int read() throws IOException { if (position<buffer.length) { return buffer[(int)position++]; } else { position++; return diskStream.read(); } } // TODO: implement other read()s for efficiency } Index: RecordingOutputStream.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/RecordingOutputStream.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** RecordingOutputStream.java 24 Sep 2003 01:46:37 -0000 1.1 --- RecordingOutputStream.java 25 Sep 2003 00:14:03 -0000 1.2 *************** *** 7,18 **** --- 7,71 ---- package 
org.archive.crawler.io; + import java.io.BufferedOutputStream; + import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; /** + * A RecordingOutputStream can be wrapped around any other + * OutputStream to record all bytes written to it. You can + * then request a ReplayInputStream to read those bytes. + * + * The RecordingOutputStream uses an in-memory buffer and + * backing disk file to allow it to record streams of + * arbitrary length, limited only by available disk space. + * + * As long as the stream recorded is smaller than the + * in-memory buffer, no disk access will occur. + * * @author gojomo * */ public class RecordingOutputStream extends OutputStream { + protected long size; + protected int maxSize; + protected String backingFilename; + protected BufferedOutputStream diskStream; + protected OutputStream wrappedStream; + protected byte[] buffer; + protected long position; + + /** + * Create a new RecordingInputStream with the specified parameters. + * + * @param bufferSize + * @param backingFile + * @param maxSize + */ + public RecordingOutputStream(int bufferSize, String backingFilename, int maxSize) { + buffer = new byte[bufferSize]; + this.backingFilename = backingFilename; + this.maxSize = maxSize; + } + + + public void open(OutputStream wrappedStream) throws IOException { + this.wrappedStream = wrappedStream; + this.position = 0; + diskStream = new BufferedOutputStream(new FileOutputStream(backingFilename),4096); + } + + /** + * Total reset -- discarding all + */ + public void clear() { + try { + diskStream.close(); + } catch (IOException e) { + // nothing + } + diskStream = null; + } + /* (non-Javadoc) *************** *** 20,25 **** */ public void write(int b) throws IOException { ! // TODO Auto-generated method stub } --- 73,116 ---- */ public void write(int b) throws IOException { ! wrappedStream.write(b); ! record(b); ! } ! ! // TODO implement other forms of write() for efficiency ! ! /** ! * @param b ! */ ! private void record(int b) throws IOException { ! if(position>buffer.length){ ! diskStream.write(b); ! } else { ! buffer[(int)position] = (byte)b; ! } ! position++; ! } ! ! // TODO implement other forms of record() for efficiency + /* (non-Javadoc) + * @see java.io.OutputStream#close() + */ + public void close() throws IOException { + super.close(); + wrappedStream.close(); + diskStream.close(); + this.size = position; + } + + /* (non-Javadoc) + * @see java.io.OutputStream#flush() + */ + public void flush() throws IOException { + super.flush(); + wrappedStream.flush(); + } + + public ReplayInputStream getReplayInputStream() throws IOException { + return new ReplayInputStream(buffer,size,backingFilename); } |
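The idea in RecordingOutputStream is that the first bufferSize bytes stay in memory and anything beyond that spills to the backing file; ReplayInputStream later reads the prefix from the buffer and the remainder from disk. (Note the boundary test here, position > buffer.length, would index past the end of the buffer on the byte at exactly bufferSize; the r1.3 revision earlier in this archive tightens it to >=.) A round trip that deliberately overflows a small buffer -- assuming the later revisions of these classes, which add readFullyTo() and that fix; the backing-file path is hypothetical:

import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.archive.crawler.io.RecordingOutputStream;
import org.archive.crawler.io.ReplayInputStream;

public class SpillSketch {
    public static void main(String[] args) throws IOException {
        // 16-byte buffer on purpose, so most of the recording spills to disk.
        RecordingOutputStream ros =
            new RecordingOutputStream(16, "/tmp/spill.ros", 1 << 12);
        ros.open(new ByteArrayOutputStream());

        byte[] data = "0123456789abcdefghijklmnopqrstuvwxyz".getBytes();  // 36 bytes
        for (int i = 0; i < data.length; i++) {
            ros.write(data[i]);
        }
        ros.close();

        ReplayInputStream replay = ros.getReplayInputStream();
        ByteArrayOutputStream copy = new ByteArrayOutputStream();
        replay.readFullyTo(copy);         // first 16 bytes from memory, the rest from disk
        System.out.println(copy.size());  // 36
    }
}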
From: <go...@us...> - 2003-09-24 01:46:42
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util In directory sc8-pr-cvs1:/tmp/cvs-serv27393/src/org/archive/util Added Files: HttpRecorder.java Removed Files: HTTPRecorder.java Log Message: skeletal http-recording (in progress) --- NEW FILE: HttpRecorder.java --- /* * HTTPRecorder.java * Created on Sep 22, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/HttpRecorder.java,v 1.1 2003/09/24 01:46:37 gojomo Exp $ */ package org.archive.util; import java.io.InputStream; import java.io.OutputStream; /** * Initially only supports HTTP/1.0 (one request, one response per stream) * * @author gojomo * */ public class HttpRecorder { /** * @param is * @return */ public InputStream inputWrap(InputStream is) { return is; } /** * @param outputStream * @return */ public OutputStream outputWrap(OutputStream outputStream) { return outputStream; } } --- HTTPRecorder.java DELETED --- |
From: <go...@us...> - 2003-09-24 01:46:42
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io In directory sc8-pr-cvs1:/tmp/cvs-serv27393/src/org/archive/crawler/io Added Files: RecordingOutputStream.java Log Message: skeletal http-recording (in progress) --- NEW FILE: RecordingOutputStream.java --- /* * ReplayableOutputStream.java * Created on Sep 23, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/RecordingOutputStream.java,v 1.1 2003/09/24 01:46:37 gojomo Exp $ */ package org.archive.crawler.io; import java.io.IOException; import java.io.OutputStream; /** * @author gojomo * */ public class RecordingOutputStream extends OutputStream { /* (non-Javadoc) * @see java.io.OutputStream#write(int) */ public void write(int b) throws IOException { // TODO Auto-generated method stub } } |
From: <go...@us...> - 2003-09-24 01:46:15
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io In directory sc8-pr-cvs1:/tmp/cvs-serv27323/src/org/archive/crawler/io Modified Files: UriProcessingFormatter.java Log Message: no '-' for blank fields, use '.' instead Index: UriProcessingFormatter.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/UriProcessingFormatter.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** UriProcessingFormatter.java 11 Jul 2003 02:30:42 -0000 1.2 --- UriProcessingFormatter.java 24 Sep 2003 01:46:11 -0000 1.3 *************** *** 32,37 **** CrawlURI curi = (CrawlURI) lr.getParameters()[0]; ! String length = "-"; ! String mime = "-"; String uri = curi.getUURI().getUri().toASCIIString(); if ( curi.getAList().containsKey(A_HTTP_TRANSACTION)) { --- 32,37 ---- CrawlURI curi = (CrawlURI) lr.getParameters()[0]; ! String length = "."; ! String mime = "."; String uri = curi.getUURI().getUri().toASCIIString(); if ( curi.getAList().containsKey(A_HTTP_TRANSACTION)) { |
From: <go...@us...> - 2003-09-24 01:45:30
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel In directory sc8-pr-cvs1:/tmp/cvs-serv27184/src/org/archive/crawler/datamodel Modified Files: CrawlURI.java Log Message: ensure sensible link, embed counts Index: CrawlURI.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlURI.java,v retrieving revision 1.39 retrieving revision 1.40 diff -C2 -d -r1.39 -r1.40 *** CrawlURI.java 23 Sep 2003 01:16:35 -0000 1.39 --- CrawlURI.java 24 Sep 2003 01:45:26 -0000 1.40 *************** *** 40,44 **** private long wakeTime; // if "snoozed", when this CrawlURI may awake private long dontRetryBefore = -1; - private int threadNumber; // Processing progress --- 40,43 ---- *************** *** 47,50 **** --- 46,50 ---- private int deferrals = 0; // count of postponements for prerequisites private int fetchAttempts = 0; // the number of fetch attempts that have been made + private int threadNumber; // flexible dynamic attributes *************** *** 54,58 **** private CrawlURI via; // curi that led to this (lowest hops from seed) private int linkHopCount = -1; // from seeds ! private int embedHopCount = -1; // from a sure link //////////////////////////////////////////////////////////////////// --- 54,58 ---- private CrawlURI via; // curi that led to this (lowest hops from seed) private int linkHopCount = -1; // from seeds ! private int embedHopCount = -1; // from a sure link; reset upon any link traversal //////////////////////////////////////////////////////////////////// *************** *** 396,401 **** public void setViaLinkFrom(CrawlURI sourceCuri) { via = sourceCuri; int candidateLinkHopCount = sourceCuri.getLinkHopCount()+1; - embedHopCount = 0; if (linkHopCount == -1) { linkHopCount = candidateLinkHopCount; --- 396,402 ---- public void setViaLinkFrom(CrawlURI sourceCuri) { via = sourceCuri; + // reset embedCount -- but only back to 1 if >0, so special embed handling still applies + embedHopCount = (embedHopCount > 0) ? 1 : 0; int candidateLinkHopCount = sourceCuri.getLinkHopCount()+1; if (linkHopCount == -1) { linkHopCount = candidateLinkHopCount; *************** *** 444,447 **** --- 445,456 ---- public int getLinkHopCount() { return linkHopCount; + } + + /** + * + */ + public void markAsSeed() { + linkHopCount = 0; + embedHopCount = 0; } |
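To make the hop-count rules concrete, here is a short trace using only the methods visible in this diff (the embed-side setter is not shown in this hunk, so the middle step simply assumes some earlier discovery left embedHopCount above zero):

// seed.markAsSeed();        -> linkHopCount = 0, embedHopCount = 0
// a.setViaLinkFrom(seed);   -> linkHopCount = 0 + 1 = 1, embedHopCount stays 0
//
// Suppose curi was first reached through an embed, so its embedHopCount > 0.
// curi.setViaLinkFrom(a);   -> linkHopCount becomes 2 if it was still unset
//                              (the remaining branches are cut off in this hunk),
//                              and embedHopCount collapses to 1 rather than 0,
//                              so embed-specific handling still applies even
//                              after the URI is also reachable by navlinks.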
From: <go...@us...> - 2003-09-24 01:45:30
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic In directory sc8-pr-cvs1:/tmp/cvs-serv27184/src/org/archive/crawler/basic Modified Files: SimpleStore.java Log Message: ensure sensible link, embed counts Index: SimpleStore.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v retrieving revision 1.31 retrieving revision 1.32 diff -C2 -d -r1.31 -r1.32 *** SimpleStore.java 23 Sep 2003 01:16:34 -0000 1.31 --- SimpleStore.java 24 Sep 2003 01:45:26 -0000 1.32 *************** *** 105,108 **** --- 105,109 ---- } CrawlURI curi = new CrawlURI(uuri); + curi.markAsSeed(); //curi.getAList().putInt("distance-from-seed",0); allCuris.put(uuri,curi); |