You can subscribe to this list here.
| 2003 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
|
Aug
(2) |
Sep
(50) |
Oct
(197) |
Nov
(305) |
Dec
(295) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2004 |
Jan
(429) |
Feb
(694) |
Mar
(443) |
Apr
(479) |
May
(357) |
Jun
(74) |
Jul
(218) |
Aug
(162) |
Sep
(156) |
Oct
(340) |
Nov
(132) |
Dec
(224) |
| 2005 |
Jan
(170) |
Feb
(122) |
Mar
(265) |
Apr
(215) |
May
(139) |
Jun
(247) |
Jul
(179) |
Aug
(116) |
Sep
(103) |
Oct
(125) |
Nov
(97) |
Dec
(221) |
| 2006 |
Jan
(132) |
Feb
(18) |
Mar
(23) |
Apr
(35) |
May
(71) |
Jun
(268) |
Jul
(220) |
Aug
(376) |
Sep
(181) |
Oct
(71) |
Nov
(131) |
Dec
(172) |
| 2007 |
Jan
(125) |
Feb
(79) |
Mar
(90) |
Apr
(76) |
May
(91) |
Jun
(64) |
Jul
(113) |
Aug
(96) |
Sep
(40) |
Oct
(30) |
Nov
(85) |
Dec
(56) |
| 2008 |
Jan
(37) |
Feb
(79) |
Mar
(22) |
Apr
(6) |
May
(13) |
Jun
(22) |
Jul
(83) |
Aug
(50) |
Sep
(8) |
Oct
(32) |
Nov
(55) |
Dec
(28) |
| 2009 |
Jan
(15) |
Feb
(30) |
Mar
(28) |
Apr
(69) |
May
(82) |
Jun
(19) |
Jul
(64) |
Aug
(71) |
Sep
(53) |
Oct
(84) |
Nov
(105) |
Dec
(40) |
| 2010 |
Jan
(11) |
Feb
(19) |
Mar
(24) |
Apr
(58) |
May
(15) |
Jun
(35) |
Jul
(14) |
Aug
(13) |
Sep
(31) |
Oct
(15) |
Nov
(39) |
Dec
(10) |
| 2011 |
Jan
(59) |
Feb
(32) |
Mar
(10) |
Apr
(37) |
May
(20) |
Jun
(21) |
Jul
(39) |
Aug
(9) |
Sep
(31) |
Oct
(29) |
Nov
(3) |
Dec
(1) |
| 2012 |
Jan
(7) |
Feb
(4) |
Mar
(5) |
Apr
(12) |
May
(5) |
Jun
(8) |
Jul
(9) |
Aug
(6) |
Sep
(15) |
Oct
(1) |
Nov
(3) |
Dec
(9) |
| 2013 |
Jan
(9) |
Feb
(2) |
Mar
(41) |
Apr
(13) |
May
(9) |
Jun
(20) |
Jul
(5) |
Aug
(22) |
Sep
(5) |
Oct
(3) |
Nov
(13) |
Dec
(8) |
| 2014 |
Jan
(27) |
Feb
(16) |
Mar
(7) |
Apr
(14) |
May
(10) |
Jun
(2) |
Jul
(16) |
Aug
(6) |
Sep
(6) |
Oct
(11) |
Nov
(7) |
Dec
|
| 2015 |
Jan
|
Feb
(7) |
Mar
(4) |
Apr
|
May
(2) |
Jun
|
Jul
|
Aug
(2) |
Sep
(2) |
Oct
(5) |
Nov
(1) |
Dec
|
| 2016 |
Jan
(15) |
Feb
(5) |
Mar
(4) |
Apr
(1) |
May
(7) |
Jun
(16) |
Jul
(6) |
Aug
(2) |
Sep
|
Oct
(1) |
Nov
|
Dec
|
| 2017 |
Jan
|
Feb
(1) |
Mar
(3) |
Apr
|
May
(4) |
Jun
(25) |
Jul
|
Aug
|
Sep
(4) |
Oct
(11) |
Nov
(9) |
Dec
(1) |
| 2018 |
Jan
(2) |
Feb
|
Mar
|
Apr
|
May
(2) |
Jun
|
Jul
(10) |
Aug
|
Sep
(1) |
Oct
(2) |
Nov
(12) |
Dec
(4) |
| 2019 |
Jan
(3) |
Feb
(21) |
Mar
(17) |
Apr
(13) |
May
(6) |
Jun
(4) |
Jul
|
Aug
(65) |
Sep
|
Oct
(4) |
Nov
(7) |
Dec
|
| 2020 |
Jan
(23) |
Feb
(6) |
Mar
(14) |
Apr
(25) |
May
(11) |
Jun
(9) |
Jul
(7) |
Aug
(7) |
Sep
(1) |
Oct
(4) |
Nov
(4) |
Dec
|
| 2021 |
Jan
(8) |
Feb
(11) |
Mar
(1) |
Apr
(6) |
May
(30) |
Jun
(60) |
Jul
(43) |
Aug
(23) |
Sep
(16) |
Oct
|
Nov
(7) |
Dec
(13) |
| 2022 |
Jan
(7) |
Feb
(2) |
Mar
(17) |
Apr
(16) |
May
(9) |
Jun
(2) |
Jul
(18) |
Aug
|
Sep
(3) |
Oct
(1) |
Nov
(2) |
Dec
|
| 2023 |
Jan
(7) |
Feb
|
Mar
(11) |
Apr
|
May
(1) |
Jun
|
Jul
|
Aug
|
Sep
(7) |
Oct
(5) |
Nov
(2) |
Dec
|
| 2024 |
Jan
|
Feb
(4) |
Mar
(8) |
Apr
(5) |
May
(5) |
Jun
(12) |
Jul
(2) |
Aug
(12) |
Sep
(25) |
Oct
(47) |
Nov
(46) |
Dec
(3) |
| 2025 |
Jan
(6) |
Feb
(14) |
Mar
(8) |
Apr
(23) |
May
(34) |
Jun
(44) |
Jul
(8) |
Aug
(14) |
Sep
(12) |
Oct
(61) |
Nov
(3) |
Dec
|
|
From: <go...@us...> - 2003-09-30 18:07:58
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor In directory sc8-pr-cvs1:/tmp/cvs-serv21645/src/org/archive/crawler/extractor Modified Files: ExtractorHTML.java Log Message: stream/http recording Index: ExtractorHTML.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/extractor/ExtractorHTML.java,v retrieving revision 1.15 retrieving revision 1.16 diff -C2 -d -r1.15 -r1.16 *** ExtractorHTML.java 12 Sep 2003 02:03:24 -0000 1.15 --- ExtractorHTML.java 30 Sep 2003 18:07:53 -0000 1.16 *************** *** 257,260 **** --- 257,261 ---- return; } + CharSequence cs = get.getResponseBodyAsString(); |
|
From: <go...@us...> - 2003-09-30 18:07:58
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util
In directory sc8-pr-cvs1:/tmp/cvs-serv21645/src/org/archive/util
Modified Files:
HttpRecorder.java
Log Message:
stream/http recording
Index: HttpRecorder.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/HttpRecorder.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** HttpRecorder.java 25 Sep 2003 00:14:03 -0000 1.2
--- HttpRecorder.java 30 Sep 2003 18:07:53 -0000 1.3
***************
*** 21,27 ****
*/
public class HttpRecorder {
! String backingFilenamePrefix;
! RecordingInputStream ris;
! RecordingOutputStream ros;
/**
--- 21,27 ----
*/
public class HttpRecorder {
! protected String backingFilenamePrefix;
! protected RecordingInputStream ris;
! protected RecordingOutputStream ros;
/**
***************
*** 50,53 ****
--- 50,75 ----
ros.open(os);
return ros;
+ }
+
+ /**
+ *
+ */
+ public void close() throws IOException {
+ ris.close();
+ ros.close();
+ }
+
+ /**
+ *
+ */
+ public RecordingInputStream getRecordedInput() {
+ return ris;
+ }
+
+ /**
+ *
+ */
+ public void markResponseBodyStart() {
+ ris.markResponseBodyStart();
}
|
|
From: <go...@us...> - 2003-09-30 18:07:58
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv21645/src/org/archive/crawler/basic
Modified Files:
FetcherHTTPSimple.java ARCWriter.java
Log Message:
stream/http recording
Index: FetcherHTTPSimple.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/FetcherHTTPSimple.java,v
retrieving revision 1.9
retrieving revision 1.10
diff -C2 -d -r1.9 -r1.10
*** FetcherHTTPSimple.java 25 Sep 2003 00:14:02 -0000 1.9
--- FetcherHTTPSimple.java 30 Sep 2003 18:07:52 -0000 1.10
***************
*** 106,109 ****
--- 106,110 ----
InputStream is = get.getResponseBodyAsStream();
while(is.read()!=-1) {} // TODOSOON: read in bigger chunks!
+ get.getHttpRecorder().close();
Header contentLength = get.getResponseHeader("Content-Length");
Index: ARCWriter.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/ARCWriter.java,v
retrieving revision 1.31
retrieving revision 1.32
diff -C2 -d -r1.31 -r1.32
*** ARCWriter.java 6 Aug 2003 01:19:29 -0000 1.31
--- ARCWriter.java 30 Sep 2003 18:07:52 -0000 1.32
***************
*** 13,17 ****
import java.io.OutputStream;
! import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.methods.GetMethod;
import org.archive.crawler.basic.StatisticsTracker;
--- 13,17 ----
import java.io.OutputStream;
! // import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.methods.GetMethod;
import org.archive.crawler.basic.StatisticsTracker;
***************
*** 281,308 ****
}
- int headersSize = 0;
int recordLength = 0;
- Header[] headers = get.getResponseHeaders();
-
- ByteArrayOutputStream baos = new ByteArrayOutputStream();
- baos.write(get.getStatusLine().toString().getBytes()); // get status line (it's not a header)
- baos.write("\n".getBytes());
- for(int i=0; i < headers.length; i++){
- baos.write(headers[i].toExternalForm().getBytes());
- }
- recordLength += baos.size();
! // get body so we can calc length for metaline
! byte[] body = get.getResponseBody();
! recordLength += body.length;
! // don't forget the extra CRLF between headers and body
! recordLength += 2;
writeMetaLine(curi, recordLength);
! baos.writeTo(out);
! out.write("\r\n".getBytes());
! out.write(body);
! out.write("\n".getBytes());
}
--- 281,314 ----
}
int recordLength = 0;
! // OLD WAY
! // Header[] headers = get.getResponseHeaders();
! //
! // ByteArrayOutputStream baos = new ByteArrayOutputStream();
! // baos.write(get.getStatusLine().toString().getBytes()); // get status line (it's not a header)
! // baos.write("\n".getBytes());
! // for(int i=0; i < headers.length; i++){
! // baos.write(headers[i].toExternalForm().getBytes());
! // }
! // recordLength += baos.size();
! //
! // // get body so we can calc length for metaline
! // byte[] body = get.getResponseBody();
! // // don't forget the extra CRLF between headers and body
! // recordLength += 2;
+ recordLength += get.getHttpRecorder().getRecordedInput().getSize();
+
writeMetaLine(curi, recordLength);
! get.getHttpRecorder().getRecordedInput().getReplayInputStream().readFullyTo(out);
! out.write('\n'); // trailing newline
!
! // OLD WAY
! // baos.writeTo(out);
! // out.write("\r\n".getBytes());
! // out.write(body);
! // out.write("\n".getBytes());
}
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io In directory sc8-pr-cvs1:/tmp/cvs-serv21645/src/org/archive/crawler/io Modified Files: RecordingInputStream.java ReplayInputStream.java RecordingOutputStream.java Added Files: ReplayCharSequence.java Log Message: stream/http recording --- NEW FILE: ReplayCharSequence.java --- /* * ReplayCharSequence.java * Created on Sep 30, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/ReplayCharSequence.java,v 1.1 2003/09/30 18:07:53 gojomo Exp $ */ package org.archive.crawler.io; import java.io.BufferedInputStream; /** * Provides a CharSequence view on recorded stream bytes (a prefix buffer * and overflow backing file). * * Uses a wraparound rolling buffer of the last windowSize bytes read * from disk in memory; as long as the 'random access' of a CharSequence * user stays within this window, access should remain fairly efficient. * (So design any regexps pointed at these CharSequences to work within * that range!) * * When rereading of a location is necessary, the whole window is * recentered around the location requested. (??? Is this the best * strategy?) 
* * TODO determine in memory mapped files is better way to do this * * @author Gordon Mohr */ public class ReplayCharSequence implements CharSequence { protected BufferedInputStream diskStream; protected byte[] prefixBuffer; protected long size; protected long responseBodyStart; // where the response body starts, if marked protected byte[] wraparoundBuffer; protected long position; protected String backingFilename; /* (non-Javadoc) * @see java.lang.CharSequence#length() */ public int length() { // TODO Auto-generated method stub return 0; } /* (non-Javadoc) * @see java.lang.CharSequence#charAt(int) */ public char charAt(int index) { // TODO Auto-generated method stub return 0; } /* (non-Javadoc) * @see java.lang.CharSequence#subSequence(int, int) */ public CharSequence subSequence(int start, int end) { // TODO Auto-generated method stub return null; } } Index: RecordingInputStream.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/RecordingInputStream.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** RecordingInputStream.java 25 Sep 2003 00:14:03 -0000 1.1 --- RecordingInputStream.java 30 Sep 2003 18:07:53 -0000 1.2 *************** *** 10,13 **** --- 10,15 ---- import java.io.InputStream; + import javax.swing.text.Position; + import org.archive.util.NullOutputStream; *************** *** 62,67 **** while(read()!=-1) { } ! return recordingOutputStream.size; } --- 64,84 ---- while(read()!=-1) { } ! 
return recordingOutputStream.getSize(); + } + + /** + * @return + */ + public long getSize() { + // TODO Auto-generated method stub + return recordingOutputStream.getSize(); + } + + /** + * + */ + public void markResponseBodyStart() { + recordingOutputStream.markResponseBodyStart(); } Index: ReplayInputStream.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/ReplayInputStream.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** ReplayInputStream.java 25 Sep 2003 00:14:03 -0000 1.1 --- ReplayInputStream.java 30 Sep 2003 18:07:53 -0000 1.2 *************** *** 11,14 **** --- 11,15 ---- import java.io.IOException; import java.io.InputStream; + import java.io.OutputStream; /** *************** *** 17,27 **** */ public class ReplayInputStream extends InputStream { ! private BufferedInputStream diskStream; ! byte[] buffer; ! long size; ! long position; ! String backingFilename; /** * @param buffer * @param size --- 18,40 ---- */ public class ReplayInputStream extends InputStream { ! protected BufferedInputStream diskStream; ! protected byte[] buffer; ! protected long size; ! protected long responseBodyStart; // where the response body starts, if marked ! protected long position; ! 
protected String backingFilename; /** + * @param buffer + * @param size + * @param responseBodyStart + * @param backingFilename + */ + public ReplayInputStream(byte[] buffer, long size, long responseBodyStart, String backingFilename) throws IOException { + this(buffer,size,backingFilename); + this.responseBodyStart = responseBodyStart; + } + + /** * @param buffer * @param size *************** *** 37,44 **** --- 50,65 ---- } + public long setToResponseBodyStart() { + position = responseBodyStart; + return position; + } + /* (non-Javadoc) * @see java.io.InputStream#read() */ public int read() throws IOException { + if (position==size) { + return -1; // EOF + } if (position<buffer.length) { return buffer[(int)position++]; *************** *** 51,53 **** --- 72,83 ---- // TODO: implement other read()s for efficiency + + public void readFullyTo(OutputStream os) throws IOException { + // TODO make this more efficient + int c = read(); + while (c != -1) { + os.write(c); + c = read(); + } + } } Index: RecordingOutputStream.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/RecordingOutputStream.java,v retrieving revision 1.2 retrieving revision 1.3 diff -C2 -d -r1.2 -r1.3 *** RecordingOutputStream.java 25 Sep 2003 00:14:03 -0000 1.2 --- RecordingOutputStream.java 30 Sep 2003 18:07:53 -0000 1.3 *************** *** 35,38 **** --- 35,40 ---- protected byte[] buffer; protected long position; + protected long responseBodyStart; // when recording HTTP, where the content-body starts + /** *************** *** 83,87 **** */ private void record(int b) throws IOException { ! if(position>buffer.length){ diskStream.write(b); } else { --- 85,89 ---- */ private void record(int b) throws IOException { ! if(position>=buffer.length){ diskStream.write(b); } else { *************** *** 112,116 **** public ReplayInputStream getReplayInputStream() throws IOException { ! 
return new ReplayInputStream(buffer,size,backingFilename); } --- 114,134 ---- public ReplayInputStream getReplayInputStream() throws IOException { ! return new ReplayInputStream(buffer,size,responseBodyStart,backingFilename); ! } ! ! ! /** ! * @return ! */ ! public long getSize() { ! return size; ! } ! ! ! /** ! * ! */ ! public void markResponseBodyStart() { ! responseBodyStart = position; } |
|
From: <go...@us...> - 2003-09-30 18:07:58
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv21645/src/org/archive/crawler/datamodel
Modified Files:
CrawlServer.java
Log Message:
stream/http recording
Index: CrawlServer.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlServer.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** CrawlServer.java 6 Aug 2003 01:18:43 -0000 1.1
--- CrawlServer.java 30 Sep 2003 18:07:53 -0000 1.2
***************
*** 13,16 ****
--- 13,17 ----
import org.apache.commons.httpclient.methods.GetMethod;
+ import org.archive.crawler.io.ReplayInputStream;
/**
***************
*** 91,97 ****
// note that akamai will return 400 for some "not founds"
try {
BufferedReader reader = new BufferedReader(
! new InputStreamReader(
! get.getResponseBodyAsStream()));
robots = RobotsExclusionPolicy.policyFor(reader);
} catch (IOException e) {
--- 92,99 ----
// note that akamai will return 400 for some "not founds"
try {
+ ReplayInputStream contentBodyStream = get.getHttpRecorder().getRecordedInput().getReplayInputStream();
+ contentBodyStream.setToResponseBodyStart();
BufferedReader reader = new BufferedReader(
! new InputStreamReader(contentBodyStream));
robots = RobotsExclusionPolicy.policyFor(reader);
} catch (IOException e) {
|
|
From: <go...@us...> - 2003-09-27 01:10:02
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util
In directory sc8-pr-cvs1:/tmp/cvs-serv15349/src/org/archive/util
Added Files:
FencedInputStream.java ARCReader.java ARCResource.java
Log Message:
skeletal first ARCReading work
--- NEW FILE: FencedInputStream.java ---
/*
* FencedInputStream.java
* Created on Sep 26, 2003
*
* $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/FencedInputStream.java,v 1.1 2003/09/27 00:48:12 gojomo Exp $
*/
package org.archive.util;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * An input stream wrapper that reports end-of-stream once a fixed number of
 * bytes (the "fence", {@code maxToRead}) has been consumed from the wrapped
 * stream, even if the wrapped stream still has data.
 *
 * The fence is enforced on every path that consumes bytes: single-byte
 * reads, bulk reads and skips.
 *
 * NOTE(review): mark()/reset() are passed straight through by
 * FilterInputStream and do not rewind {@code position}; avoid them on a
 * fenced stream.
 *
 * @author gojomo
 */
public class FencedInputStream extends FilterInputStream {
    /** Maximum number of bytes that may be consumed from the wrapped stream. */
    long maxToRead;
    /** Number of bytes consumed (read or skipped) so far. */
    long position = 0;

    /**
     * @param in stream to wrap
     * @param maxToRead number of bytes after which this stream reports EOF
     */
    protected FencedInputStream(InputStream in, long maxToRead) {
        super(in);
        this.maxToRead = maxToRead;
    }

    /**
     * Reads one byte, unless the fence has been reached.
     *
     * @return the byte read, or -1 at the real or virtual end of stream
     * @see java.io.InputStream#read()
     */
    public int read() throws IOException {
        if (position >= maxToRead) {
            return -1; // virtual EOF
        }
        int b = super.read();
        if (b >= 0) {
            position++;
        }
        return b;
    }

    /**
     * Reads up to {@code len} bytes, never more than remain before the fence.
     *
     * @return the number of bytes read, or -1 at the real or virtual end of
     *         stream
     * @see java.io.InputStream#read(byte[], int, int)
     */
    public int read(byte[] b, int off, int len) throws IOException {
        if (len <= 0) {
            // Let the wrapped stream apply its own argument checks and
            // zero-length semantics; nothing can be consumed here.
            return super.read(b, off, len);
        }
        long remaining = maxToRead - position;
        if (remaining <= 0) {
            return -1; // virtual EOF
        }
        // Clamp the request so the wrapped stream is never read past the fence.
        int allowed = (int) Math.min((long) len, remaining);
        int count = super.read(b, off, allowed);
        if (count > 0) {
            position += count;
        }
        return count;
    }

    /**
     * Reads up to {@code b.length} bytes, subject to the fence.
     *
     * @see java.io.InputStream#read(byte[])
     */
    public int read(byte[] b) throws IOException {
        return read(b, 0, b.length);
    }

    /**
     * Skips up to {@code n} bytes, never more than remain before the fence.
     *
     * @return the number of bytes actually skipped
     * @see java.io.InputStream#skip(long)
     */
    public long skip(long n) throws IOException {
        long allowed = Math.min(n, maxToRead - position);
        if (allowed <= 0) {
            return 0;
        }
        long skipped = super.skip(allowed);
        if (skipped > 0) {
            position += skipped;
        }
        return skipped;
    }
}
--- NEW FILE: ARCReader.java ---
/*
* ARCReader.java
* Created on Sep 26, 2003
*
* $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/ARCReader.java,v 1.1 2003/09/27 00:48:12 gojomo Exp $
*/
package org.archive.util;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
/**
 * Utility class for reading ARC files, including .arc.gz
 * files.
 *
 * Skeletal: a file can be opened, but resource iteration is not
 * implemented yet.
 *
 * @author gojomo
 *
 */
public class ARCReader {
    /** Buffered stream over the open file; gunzips on the fly for .gz names. */
    protected InputStream inStream;
    /** Raw file stream underlying {@link #inStream}. */
    protected FileInputStream arcStream;
    /** Not assigned or read anywhere in this class yet. */
    protected ARCResource lastResource;

    /**
     * Creates a reader with no file open; call {@link #open(String)} next.
     */
    public ARCReader() {
        super();
    }

    /**
     * Opens the named ARC file for reading. Names ending in ".gz"
     * (case-insensitive) are wrapped in a GZIPInputStream.
     *
     * If opening fails part-way (for example the gzip header is invalid),
     * the underlying file is closed and this reader's stream fields are
     * left untouched.
     *
     * @param filename path of a .arc or .arc.gz file
     * @throws IOException if the file cannot be opened or its gzip header
     *         cannot be read
     */
    public void open(String filename) throws IOException {
        String flattenedFilename = filename.toLowerCase();
        assert flattenedFilename.endsWith(".arc") || flattenedFilename.endsWith(".arc.gz") : "non-arc filename extension";
        FileInputStream rawStream = new FileInputStream(filename);
        boolean opened = false;
        try {
            InputStream stream = new BufferedInputStream(rawStream, 4096);
            if (flattenedFilename.endsWith(".gz")) {
                // The constructor reads the gzip header and may throw.
                stream = new GZIPInputStream(stream);
            }
            arcStream = rawStream;
            inStream = stream;
            opened = true;
        } finally {
            if (!opened) {
                // Don't leak the file handle when wrapping failed.
                try {
                    rawStream.close();
                } catch (IOException ignored) {
                    // the original failure is the one worth reporting
                }
            }
        }
    }

    /**
     * Not yet implemented.
     *
     * @return always null for now
     */
    public ARCResource getNextResource() {
        return null;
    }
}
--- NEW FILE: ARCResource.java ---
/*
* ARCResource.java
* Created on Sep 26, 2003
*
* $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/ARCResource.java,v 1.1 2003/09/27 00:48:12 gojomo Exp $
*/
package org.archive.util;
/**
 * Placeholder class: declares no fields or methods yet.
 *
 * NOTE(review): presumably meant to represent a single resource read from
 * an ARC file -- confirm once the ARC reading work fills this in.
 *
 * @author gojomo
 *
 */
public class ARCResource {
}
|
|
From: <go...@us...> - 2003-09-25 00:14:08
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util
In directory sc8-pr-cvs1:/tmp/cvs-serv4243/src/org/archive/util
Modified Files:
HttpRecorder.java
Log Message:
http byte-level recording (in progress)
Index: HttpRecorder.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/HttpRecorder.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** HttpRecorder.java 24 Sep 2003 01:46:37 -0000 1.1
--- HttpRecorder.java 25 Sep 2003 00:14:03 -0000 1.2
***************
*** 7,13 ****
--- 7,17 ----
package org.archive.util;
+ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
+ import org.archive.crawler.io.RecordingInputStream;
+ import org.archive.crawler.io.RecordingOutputStream;
+
/**
* Initially only supports HTTP/1.0 (one request, one response per stream)
***************
*** 17,20 ****
--- 21,36 ----
*/
public class HttpRecorder {
+ String backingFilenamePrefix;
+ RecordingInputStream ris;
+ RecordingOutputStream ros;
+
+ /**
+ *
+ */
+ public HttpRecorder(String backingFilenamePrefix) {
+ super();
+ ris = new RecordingInputStream(32768,backingFilenamePrefix+".ris",2^20);
+ ros = new RecordingOutputStream(2048,backingFilenamePrefix+".ros",2^12);
+ }
/**
***************
*** 22,27 ****
* @return
*/
! public InputStream inputWrap(InputStream is) {
! return is;
}
--- 38,44 ----
* @return
*/
! public InputStream inputWrap(InputStream is) throws IOException {
! ris.open(is);
! return ris;
}
***************
*** 30,35 ****
* @return
*/
! public OutputStream outputWrap(OutputStream outputStream) {
! return outputStream;
}
--- 47,53 ----
* @return
*/
! public OutputStream outputWrap(OutputStream os) throws IOException {
! ros.open(os);
! return ros;
}
|
|
From: <go...@us...> - 2003-09-25 00:14:07
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework
In directory sc8-pr-cvs1:/tmp/cvs-serv4243/src/org/archive/crawler/framework
Modified Files:
ToeThread.java
Log Message:
http byte-level recording (in progress)
Index: ToeThread.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/ToeThread.java,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -d -r1.13 -r1.14
*** ToeThread.java 17 Jul 2003 22:21:05 -0000 1.13
--- ToeThread.java 25 Sep 2003 00:14:03 -0000 1.14
***************
*** 14,17 ****
--- 14,18 ----
import org.archive.crawler.datamodel.FetchStatusCodes;
import org.archive.crawler.datamodel.InstancePerThread;
+ import org.archive.util.HttpRecorder;
/**
***************
*** 28,31 ****
--- 29,33 ----
CrawlController controller;
int serialNumber;
+ HttpRecorder httpRecorder;
HashMap localProcessors = new HashMap();
***************
*** 41,44 ****
--- 43,47 ----
serialNumber = sn;
setName("ToeThread #"+serialNumber);
+ httpRecorder = new HttpRecorder("tt"+sn+"http");
}
***************
*** 147,149 ****
--- 150,166 ----
return paused;
}
+ /**
+ * @return
+ */
+ public HttpRecorder getHttpRecorder() {
+ return httpRecorder;
+ }
+
+ /**
+ * @param recorder
+ */
+ public void setHttpRecorder(HttpRecorder recorder) {
+ httpRecorder = recorder;
+ }
+
}
|
|
From: <go...@us...> - 2003-09-25 00:14:07
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv4243/src/org/archive/crawler/basic
Modified Files:
FetcherHTTPSimple.java
Log Message:
http byte-level recording (in progress)
Index: FetcherHTTPSimple.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/FetcherHTTPSimple.java,v
retrieving revision 1.8
retrieving revision 1.9
diff -C2 -d -r1.8 -r1.9
*** FetcherHTTPSimple.java 23 Sep 2003 01:15:19 -0000 1.8
--- FetcherHTTPSimple.java 25 Sep 2003 00:14:02 -0000 1.9
***************
*** 8,11 ****
--- 8,12 ----
import java.io.IOException;
+ import java.io.InputStream;
import java.util.logging.Logger;
***************
*** 22,25 ****
--- 23,27 ----
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.Processor;
+ import org.archive.crawler.framework.ToeThread;
/**
***************
*** 87,90 ****
--- 89,93 ----
controller.getOrder().getBehavior().getFrom());
+ get.setHttpRecorder(((ToeThread)Thread.currentThread()).getHttpRecorder());
//controller.getKicker().kickMeAt(Thread.currentThread(),now+timeout);
***************
*** 101,105 ****
// this might be wasteful. As it is, it just moves
// the cost here rather than elsewhere. )
! get.getResponseBody();
Header contentLength = get.getResponseHeader("Content-Length");
--- 104,109 ----
// this might be wasteful. As it is, it just moves
// the cost here rather than elsewhere. )
! InputStream is = get.getResponseBodyAsStream();
! while(is.read()!=-1) {} // TODOSOON: read in bigger chunks!
Header contentLength = get.getResponseHeader("Content-Length");
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io In directory sc8-pr-cvs1:/tmp/cvs-serv4243/src/org/archive/crawler/io Modified Files: RecordingOutputStream.java Added Files: RecordingInputStream.java ReplayInputStream.java Log Message: http byte-level recording (in progress) --- NEW FILE: RecordingInputStream.java --- /* * RecordingInputStream.java * Created on Sep 24, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/RecordingInputStream.java,v 1.1 2003/09/25 00:14:03 gojomo Exp $ */ package org.archive.crawler.io; import java.io.IOException; import java.io.InputStream; import org.archive.util.NullOutputStream; /** * @author gojomo * */ public class RecordingInputStream extends InputStream { protected InputStream wrappedStream; protected RecordingOutputStream recordingOutputStream; /** * Create a new RecordingInputStream with the specified parameters. * * @param bufferSize * @param backingFile * @param maxSize */ public RecordingInputStream(int bufferSize, String backingFilename, int maxSize) { recordingOutputStream = new RecordingOutputStream(bufferSize, backingFilename, maxSize); } public void open(InputStream wrappedStream) throws IOException { this.wrappedStream = wrappedStream; recordingOutputStream.open(new NullOutputStream()); } /* (non-Javadoc) * @see java.io.InputStream#read() */ public int read() throws IOException { int b = wrappedStream.read(); recordingOutputStream.write(b); return b; } /* (non-Javadoc) * @see java.io.OutputStream#close() */ public void close() throws IOException { super.close(); wrappedStream.close(); recordingOutputStream.close(); } public ReplayInputStream getReplayInputStream() throws IOException { return recordingOutputStream.getReplayInputStream(); } public long readFully() throws IOException { while(read()!=-1) { } return recordingOutputStream.size; } } --- NEW FILE: ReplayInputStream.java --- /* * ReplayInputStream.java * Created on Sep 24, 2003 * * 
$Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/ReplayInputStream.java,v 1.1 2003/09/25 00:14:03 gojomo Exp $ */ package org.archive.crawler.io; import java.io.BufferedInputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; /** * @author gojomo * */ public class ReplayInputStream extends InputStream { private BufferedInputStream diskStream; byte[] buffer; long size; long position; String backingFilename; /** * @param buffer * @param size * @param backingFilename */ public ReplayInputStream(byte[] buffer, long size, String backingFilename) throws IOException { this.buffer = buffer; this.size = size; if (size>buffer.length) { this.backingFilename = backingFilename; diskStream = new BufferedInputStream(new FileInputStream(backingFilename),4096); } } /* (non-Javadoc) * @see java.io.InputStream#read() */ public int read() throws IOException { if (position<buffer.length) { return buffer[(int)position++]; } else { position++; return diskStream.read(); } } // TODO: implement other read()s for efficiency } Index: RecordingOutputStream.java =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/RecordingOutputStream.java,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** RecordingOutputStream.java 24 Sep 2003 01:46:37 -0000 1.1 --- RecordingOutputStream.java 25 Sep 2003 00:14:03 -0000 1.2 *************** *** 7,18 **** --- 7,71 ---- package org.archive.crawler.io; + import java.io.BufferedOutputStream; + import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStream; /** + * A RecordingOutputStream can be wrapped around any other + * OutputStream to record all bytes written to it. You can + * then request a ReplayInputStream to read those bytes. 
+ * + * The RecordingOutputStream uses an in-memory buffer and + * backing disk file to allow it to record streams of + * arbitrary length, limited only by available disk space. + * + * As long as the stream recorded is smaller than the + * in-memory buffer, no disk access will occur. + * * @author gojomo * */ public class RecordingOutputStream extends OutputStream { + protected long size; + protected int maxSize; + protected String backingFilename; + protected BufferedOutputStream diskStream; + protected OutputStream wrappedStream; + protected byte[] buffer; + protected long position; + + /** + * Create a new RecordingInputStream with the specified parameters. + * + * @param bufferSize + * @param backingFile + * @param maxSize + */ + public RecordingOutputStream(int bufferSize, String backingFilename, int maxSize) { + buffer = new byte[bufferSize]; + this.backingFilename = backingFilename; + this.maxSize = maxSize; + } + + + public void open(OutputStream wrappedStream) throws IOException { + this.wrappedStream = wrappedStream; + this.position = 0; + diskStream = new BufferedOutputStream(new FileOutputStream(backingFilename),4096); + } + + /** + * Total reset -- discarding all + */ + public void clear() { + try { + diskStream.close(); + } catch (IOException e) { + // nothing + } + diskStream = null; + } + /* (non-Javadoc) *************** *** 20,25 **** */ public void write(int b) throws IOException { ! // TODO Auto-generated method stub } --- 73,116 ---- */ public void write(int b) throws IOException { ! wrappedStream.write(b); ! record(b); ! } ! ! // TODO implement other forms of write() for efficiency ! ! /** ! * @param b ! */ ! private void record(int b) throws IOException { ! if(position>buffer.length){ ! diskStream.write(b); ! } else { ! buffer[(int)position] = (byte)b; ! } ! position++; ! } ! ! 
// TODO implement other forms of record() for efficiency + /* (non-Javadoc) + * @see java.io.OutputStream#close() + */ + public void close() throws IOException { + super.close(); + wrappedStream.close(); + diskStream.close(); + this.size = position; + } + + /* (non-Javadoc) + * @see java.io.OutputStream#flush() + */ + public void flush() throws IOException { + super.flush(); + wrappedStream.flush(); + } + + public ReplayInputStream getReplayInputStream() throws IOException { + return new ReplayInputStream(buffer,size,backingFilename); } |
|
From: <go...@us...> - 2003-09-24 01:46:42
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util
In directory sc8-pr-cvs1:/tmp/cvs-serv27393/src/org/archive/util
Added Files:
HttpRecorder.java
Removed Files:
HTTPRecorder.java
Log Message:
skeletal http-recording (in progress)
--- NEW FILE: HttpRecorder.java ---
/*
* HTTPRecorder.java
* Created on Sep 22, 2003
*
* $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/HttpRecorder.java,v 1.1 2003/09/24 01:46:37 gojomo Exp $
*/
package org.archive.util;
import java.io.InputStream;
import java.io.OutputStream;
/**
 * Hook point for recording HTTP traffic by wrapping a connection's streams.
 * Initially only supports HTTP/1.0 (one request, one response per stream).
 *
 * Skeletal: both wrap methods currently hand back the stream they were
 * given, so nothing is recorded.
 *
 * @author gojomo
 */
public class HttpRecorder {

    /**
     * Wraps the inbound (response) stream.
     *
     * @param is stream to wrap
     * @return the same stream instance, unchanged
     */
    public InputStream inputWrap(final InputStream is) {
        final InputStream passThrough = is;
        return passThrough;
    }

    /**
     * Wraps the outbound (request) stream.
     *
     * @param outputStream stream to wrap
     * @return the same stream instance, unchanged
     */
    public OutputStream outputWrap(final OutputStream outputStream) {
        final OutputStream passThrough = outputStream;
        return passThrough;
    }
}
--- HTTPRecorder.java DELETED ---
|
|
From: <go...@us...> - 2003-09-24 01:46:42
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io In directory sc8-pr-cvs1:/tmp/cvs-serv27393/src/org/archive/crawler/io Added Files: RecordingOutputStream.java Log Message: skeletal http-recording (in progress) --- NEW FILE: RecordingOutputStream.java --- /* * ReplayableOutputStream.java * Created on Sep 23, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/RecordingOutputStream.java,v 1.1 2003/09/24 01:46:37 gojomo Exp $ */ package org.archive.crawler.io; import java.io.IOException; import java.io.OutputStream; /** * @author gojomo * */ public class RecordingOutputStream extends OutputStream { /* (non-Javadoc) * @see java.io.OutputStream#write(int) */ public void write(int b) throws IOException { // TODO Auto-generated method stub } } |
|
From: <go...@us...> - 2003-09-24 01:46:15
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io
In directory sc8-pr-cvs1:/tmp/cvs-serv27323/src/org/archive/crawler/io
Modified Files:
UriProcessingFormatter.java
Log Message:
no '-' for blank fields, use '.' instead
Index: UriProcessingFormatter.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/io/UriProcessingFormatter.java,v
retrieving revision 1.2
retrieving revision 1.3
diff -C2 -d -r1.2 -r1.3
*** UriProcessingFormatter.java 11 Jul 2003 02:30:42 -0000 1.2
--- UriProcessingFormatter.java 24 Sep 2003 01:46:11 -0000 1.3
***************
*** 32,37 ****
CrawlURI curi = (CrawlURI) lr.getParameters()[0];
! String length = "-";
! String mime = "-";
String uri = curi.getUURI().getUri().toASCIIString();
if ( curi.getAList().containsKey(A_HTTP_TRANSACTION)) {
--- 32,37 ----
CrawlURI curi = (CrawlURI) lr.getParameters()[0];
! String length = ".";
! String mime = ".";
String uri = curi.getUURI().getUri().toASCIIString();
if ( curi.getAList().containsKey(A_HTTP_TRANSACTION)) {
|
|
From: <go...@us...> - 2003-09-24 01:45:30
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel
In directory sc8-pr-cvs1:/tmp/cvs-serv27184/src/org/archive/crawler/datamodel
Modified Files:
CrawlURI.java
Log Message:
ensure sensible link, embed counts
Index: CrawlURI.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/datamodel/CrawlURI.java,v
retrieving revision 1.39
retrieving revision 1.40
diff -C2 -d -r1.39 -r1.40
*** CrawlURI.java 23 Sep 2003 01:16:35 -0000 1.39
--- CrawlURI.java 24 Sep 2003 01:45:26 -0000 1.40
***************
*** 40,44 ****
private long wakeTime; // if "snoozed", when this CrawlURI may awake
private long dontRetryBefore = -1;
- private int threadNumber;
// Processing progress
--- 40,43 ----
***************
*** 47,50 ****
--- 46,50 ----
private int deferrals = 0; // count of postponements for prerequisites
private int fetchAttempts = 0; // the number of fetch attempts that have been made
+ private int threadNumber;
// flexible dynamic attributes
***************
*** 54,58 ****
private CrawlURI via; // curi that led to this (lowest hops from seed)
private int linkHopCount = -1; // from seeds
! private int embedHopCount = -1; // from a sure link
////////////////////////////////////////////////////////////////////
--- 54,58 ----
private CrawlURI via; // curi that led to this (lowest hops from seed)
private int linkHopCount = -1; // from seeds
! private int embedHopCount = -1; // from a sure link; reset upon any link traversal
////////////////////////////////////////////////////////////////////
***************
*** 396,401 ****
public void setViaLinkFrom(CrawlURI sourceCuri) {
via = sourceCuri;
int candidateLinkHopCount = sourceCuri.getLinkHopCount()+1;
- embedHopCount = 0;
if (linkHopCount == -1) {
linkHopCount = candidateLinkHopCount;
--- 396,402 ----
public void setViaLinkFrom(CrawlURI sourceCuri) {
via = sourceCuri;
+ // reset embedCount -- but only back to 1 if >0, so special embed handling still applies
+ embedHopCount = (embedHopCount > 0) ? 1 : 0;
int candidateLinkHopCount = sourceCuri.getLinkHopCount()+1;
if (linkHopCount == -1) {
linkHopCount = candidateLinkHopCount;
***************
*** 444,447 ****
--- 445,456 ----
public int getLinkHopCount() {
return linkHopCount;
+ }
+
+ /**
+ *
+ */
+ public void markAsSeed() {
+ linkHopCount = 0;
+ embedHopCount = 0;
}
|
|
From: <go...@us...> - 2003-09-24 01:45:30
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv27184/src/org/archive/crawler/basic
Modified Files:
SimpleStore.java
Log Message:
ensure sensible link, embed counts
Index: SimpleStore.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimpleStore.java,v
retrieving revision 1.31
retrieving revision 1.32
diff -C2 -d -r1.31 -r1.32
*** SimpleStore.java 23 Sep 2003 01:16:34 -0000 1.31
--- SimpleStore.java 24 Sep 2003 01:45:26 -0000 1.32
***************
*** 105,108 ****
--- 105,109 ----
}
CrawlURI curi = new CrawlURI(uuri);
+ curi.markAsSeed();
//curi.getAList().putInt("distance-from-seed",0);
allCuris.put(uuri,curi);
|
|
From: <go...@us...> - 2003-09-24 01:44:46
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic
In directory sc8-pr-cvs1:/tmp/cvs-serv26968/src/org/archive/crawler/basic
Modified Files:
SimplePreselector.java
Log Message:
don't apply scoping to 'embeds' (which includes prerequisites)
Index: SimplePreselector.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/basic/SimplePreselector.java,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** SimplePreselector.java 23 Sep 2003 01:16:34 -0000 1.1
--- SimplePreselector.java 24 Sep 2003 01:44:42 -0000 1.2
***************
*** 53,59 ****
protected void innerRejectProcess(CrawlURI curi) {
super.innerRejectProcess(curi);
! // filter-rejection means out-of-scope
! curi.setFetchStatus(S_OUT_OF_SCOPE);
! curi.cancelFurtherProcessing();
}
--- 53,63 ----
protected void innerRejectProcess(CrawlURI curi) {
super.innerRejectProcess(curi);
! // filter-rejection means out-of-scope for everything but embeds
! if (curi.getEmbedHopCount() < 1) {
! curi.setFetchStatus(S_OUT_OF_SCOPE);
! curi.cancelFurtherProcessing();
! } else {
! // never mind; scope filters don't apply
! }
}
|
|
From: <go...@us...> - 2003-09-24 01:43:45
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons In directory sc8-pr-cvs1:/tmp/cvs-serv26774/oversrc/org/apache/commons Log Message: Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons added to the repository |
|
From: <go...@us...> - 2003-09-24 01:43:42
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons/httpclient In directory sc8-pr-cvs1:/tmp/cvs-serv26774/oversrc/org/apache/commons/httpclient Log Message: Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons/httpclient added to the repository |
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons/httpclient
In directory sc8-pr-cvs1:/tmp/cvs-serv26822/oversrc/org/apache/commons/httpclient
Added Files:
HttpConnection.java HttpMethod.java HttpMethodDirector.java
HttpMethodBase.java
Log Message:
patched HTTPClient classes: to enable byte-for-byte HTTP traffic recording
--- NEW FILE: HttpConnection.java ---
/*
* $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons/httpclient/HttpConnection.java,v 1.1 2003/09/24 01:43:36 gojomo Exp $
* $Revision: 1.1 $
* $Date: 2003/09/24 01:43:36 $
*
* ====================================================================
*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999-2003 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
[...1345 lines suppressed...]
/** Timeout until connection established (Socket created). 0 means no timeout. */
private int connectTimeout = 0;
/** the connection manager that created this connection or null */
private HttpConnectionManager httpConnectionManager;
/** The local interface on which the connection is created, or null for the default */
private InetAddress localAddress;
/** Optional recorder. */
private HttpRecorder recorder;
/* (non-Javadoc)
* Stores the supplied recorder in this object's <code>recorder</code> field,
* replacing whatever was stored before. No null check is performed.
*
* @param recorder the recorder to remember
* @see org.apache.commons.httpclient.HttpMethod#setHttpRecorder(org.archive.util.HttpRecorder)
*/
public void setHttpRecorder(HttpRecorder recorder) {
this.recorder = recorder;
}
}
--- NEW FILE: HttpMethod.java ---
/*
* $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons/httpclient/HttpMethod.java,v 1.1 2003/09/24 01:43:36 gojomo Exp $
* $Revision: 1.1 $
* $Date: 2003/09/24 01:43:36 $
*
* ====================================================================
*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999-2003 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowlegement:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The names "The Jakarta Project", "Commons", and "Apache Software
* Foundation" must not be used to endorse or promote products derived
* from this software without prior written permission. For written
* permission, please contact ap...@ap....
*
* 5. Products derived from this software may not be called "Apache"
* nor may "Apache" appear in their names without prior written
* permission of the Apache Group.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*
* [Additional notices, if required by prior licensing conditions]
*
*/
package org.apache.commons.httpclient;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.httpclient.params.*;
import org.archive.util.HttpRecorder;
/**
* <p>
* HttpMethod interface represents a request to be sent via a
* {@link HttpConnection HTTP connection} and a corresponding response.
* </p>
* @author <a href="mailto:re...@ap...">Remy Maucherat</a>
* @author Rod Waldhoff
* @author <a href="js...@ap...">Jeff Dever</a>
* @author <a href="mailto:mb...@Ga...">Mike Bowler</a>
* @author <a href="mailto:ol...@ur...">Oleg Kalnichevski</a>
*
* @version $Revision: 1.1 $ $Date: 2003/09/24 01:43:36 $
*
* @since 1.0
*/
public interface HttpMethod {
// ------------------------------------------- Property Setters and Getters
/**
* Obtains the name of the HTTP method as used in the HTTP request line,
* for example <tt>"GET"</tt> or <tt>"POST"</tt>.
*
* @return the name of this method
*/
String getName();
/**
* Gets the host configuration for this method. The configuration specifies
* the server, port, protocol, and proxy server via which this method will
* send its HTTP request.
*
* @return the HostConfiguration or <code>null</code> if none is set
*/
HostConfiguration getHostConfiguration();
/**
* Sets the path of the HTTP method.
* It is the responsibility of the caller to ensure that the path is
* properly encoded (URL safe).
*
* @param path The path of the HTTP method. The path is expected
* to be URL encoded.
*/
void setPath(String path);
/**
* Returns the path of the HTTP method.
*
* Calling this method <em>after</em> the request has been executed will
* return the <em>actual</em> path, following any redirects automatically
* handled by this HTTP method.
*
* @return the path of the HTTP method, in URL encoded form
*/
String getPath();
/**
* Returns the URI for this method. The URI will be absolute if the host
* configuration has been set and relative otherwise.
*
* @return the URI for this method
*
* @throws URIException if a URI cannot be constructed
*/
URI getURI() throws URIException;
/**
* Sets the URI for this method.
*
* @param uri URI to be set
*
* @throws URIException if a URI cannot be set
*/
void setURI(URI uri) throws URIException;
/**
* Defines how strictly the method follows the HTTP protocol specification.
* (See RFC 2616 and other relevant RFCs.) In the strict mode the method precisely
* implements the requirements of the specification, whereas in non-strict mode
* it attempts to mimic the exact behaviour of commonly used HTTP agents,
* which many HTTP servers expect.
*
* @param strictMode <tt>true</tt> for strict mode, <tt>false</tt> otherwise
*
* @see #isStrictMode()
*/
void setStrictMode(boolean strictMode);
/**
* Returns the value of the strict mode flag.
*
* @return <tt>true</tt> if strict mode is enabled, <tt>false</tt> otherwise
*
* @see #setStrictMode(boolean)
*/
boolean isStrictMode();
/**
* Sets the specified request header, overwriting any
* previous value.
* Note that header-name matching is case insensitive.
* @param headerName the header's name
* @param headerValue the header's value
*
* @see #setRequestHeader(Header)
* @see #getRequestHeader(String)
* @see #removeRequestHeader(String)
*/
void setRequestHeader(String headerName, String headerValue);
/**
* Sets the specified request header, overwriting any
* previous value.
* Note that header-name matching is case insensitive.
* @param header the header to be set
*
* @see #setRequestHeader(String,String)
* @see #getRequestHeader(String)
* @see #removeRequestHeader(String)
*/
void setRequestHeader(Header header);
/**
* Adds the specified request header, <em>not</em> overwriting any previous value.
* If the same header is added multiple times, perhaps with different values,
* multiple instances of that header will be sent in the HTTP request.
* Note that header-name matching is case insensitive.
* @param headerName the header's name
* @param headerValue the header's value
*
* @see #addRequestHeader(Header)
* @see #getRequestHeader(String)
* @see #removeRequestHeader(String)
*/
void addRequestHeader(String headerName, String headerValue);
/**
* Adds the specified request header, <em>not</em> overwriting any previous value.
* If the same header is added multiple times, perhaps with different values,
* multiple instances of that header will be sent in the HTTP request.
* Note that header-name matching is case insensitive.
* @param header the header
*
* @see #addRequestHeader(String,String)
* @see #getRequestHeader(String)
* @see #removeRequestHeader(String)
*/
void addRequestHeader(Header header);
/**
* Gets the request header with the given name.
* If there are multiple headers with the same name,
* their values will be combined with the ',' separator as specified by RFC2616.
* Note that header-name matching is case insensitive.
* @param headerName the header name
* @return the header
*/
Header getRequestHeader(String headerName);
/**
* Removes all request headers with the given name.
* Note that header-name matching is case insensitive.
* @param headerName the header name
*/
void removeRequestHeader(String headerName);
/**
* Returns <tt>true</tt> if the HTTP method should automatically follow HTTP redirects
* (status code 302, etc.), <tt>false</tt> otherwise.
*
* @return <tt>true</tt> if the method will automatically follow HTTP redirects,
* <tt>false</tt> otherwise
*/
boolean getFollowRedirects();
/**
* Sets whether or not the HTTP method should automatically follow HTTP redirects
* (status code 302, etc.)
*
* @param followRedirects <tt>true</tt> if the method will automatically follow redirects,
* <tt>false</tt> otherwise.
*/
void setFollowRedirects(boolean followRedirects);
/**
* Sets the query string of the HTTP method.
* It is the responsibility of the caller to ensure that the query string is
* properly encoded (URL safe). The string must not include an initial '?' character.
*
* @param queryString the query to be used in the request, with no leading '?' character
*
* @see #getQueryString()
* @see #setQueryString(NameValuePair[])
*/
void setQueryString(String queryString);
/**
* Sets the query string of this HTTP method. The pairs are encoded as UTF-8 characters.
* To use a different charset the parameters can be encoded manually using EncodingUtil
* and set as a single String.
*
* @param params An array of <code>NameValuePair</code>s to use as the query string.
* The name/value pairs will be automatically URL encoded and should not
* have been encoded previously.
*
* @see #getQueryString()
* @see #setQueryString(String)
* @see org.apache.commons.httpclient.util.EncodingUtil#formUrlEncode(NameValuePair[], String)
*/
void setQueryString(NameValuePair[] params);
/**
* Returns the query string of this HTTP method.
*
* @return the query string in URL encoded form, without a leading '?'.
*
* @see #setQueryString(NameValuePair[])
* @see #setQueryString(String)
*/
String getQueryString();
/**
* Returns the current request headers for this HTTP method. The returned headers
* will be in the same order that they were added with <code>addRequestHeader</code>.
* If there are multiple request headers with the same name (e.g. <code>Cookie</code>),
* they will be returned as multiple entries in the array.
*
* @return an array containing all of the request headers
*
* @see #addRequestHeader(Header)
* @see #addRequestHeader(String,String)
*/
Header[] getRequestHeaders();
/**
* Returns the request headers with the given name. Note that header-name matching is
* case insensitive.
* @param headerName the name of the headers to be returned.
* @return an array of zero or more headers
*/
Header[] getRequestHeaders(String headerName);
// ---------------------------------------------------------------- Queries
/**
* Returns <tt>true</tt> if the method is ready to execute, <tt>false</tt> otherwise.
*
* @return <tt>true</tt> if the method is ready to execute, <tt>false</tt> otherwise.
*/
boolean validate();
/**
* Returns the status code associated with the latest response.
*
* @return The status code from the most recent execution of this method.
* If the method has not yet been executed, the result is undefined.
*/
int getStatusCode();
/**
* Returns the status text (or "reason phrase") associated with the latest
* response.
*
* @return The status text from the most recent execution of this method.
* If the method has not yet been executed, the result is undefined.
*/
String getStatusText();
/**
* Returns the response headers from the most recent execution of this request.
*
* @return A newly-created array containing all of the response headers,
* in the order in which they appeared in the response.
*/
Header[] getResponseHeaders();
/**
* Returns the specified response header. Note that header-name matching is
* case insensitive.
*
* @param headerName The name of the header to be returned.
*
* @return The specified response header. If the response contained multiple
* instances of the header, its values will be combined using the ','
* separator as specified by RFC2616.
*/
Header getResponseHeader(String headerName);
/**
* Returns the response headers with the given name. Note that header-name matching is
* case insensitive.
* @param headerName the name of the headers to be returned.
* @return an array of zero or more headers
*/
Header[] getResponseHeaders(String headerName);
/**
* Returns the response footers from the most recent execution of this request.
*
* @return an array containing the response footers in the order that they
* appeared in the response. If the response had no footers,
* an empty array will be returned.
*/
Header[] getResponseFooters();
/**
* Return the specified response footer. Note that footer-name matching is
* case insensitive.
*
* @param footerName The name of the footer.
* @return The response footer.
*/
Header getResponseFooter(String footerName);
/**
* Returns the response body of the HTTP method, if any, as an array of bytes.
* If the method has not yet been executed or the response has no body, <code>null</code>
* is returned. Note that this method does not propagate I/O exceptions.
* If an error occurs while reading the body, <code>null</code> will be returned.
*
* @return The response body, or <code>null</code> if the
* body is not available.
*/
byte[] getResponseBody();
/**
* Returns the response body of the HTTP method, if any, as a {@link String}.
* If response body is not available or cannot be read, <tt>null</tt> is returned.
* The raw bytes in the body are converted to a <code>String</code> using the
* character encoding specified in the response's <tt>Content-Type</tt> header, or
* ISO-8859-1 if the response did not specify a character set.
* <p>
* Note that this method does not propagate I/O exceptions.
* If an error occurs while reading the body, <code>null</code> will be returned.
*
* @return The response body converted to a <code>String</code>, or <code>null</code>
* if the body is not available.
*/
String getResponseBodyAsString();
/**
* Returns the response body of the HTTP method, if any, as an InputStream.
* If the response had no body or the method has not yet been executed,
* <code>null</code> is returned. Additionally, <code>null</code> may be returned
* if {@link #releaseConnection} has been called or
* if this method was called previously and the resulting stream was closed.
*
* @return The response body, or <code>null</code> if it is not available
*
* @throws IOException if an I/O (transport) problem occurs
*/
InputStream getResponseBodyAsStream() throws IOException;
/**
* Returns <tt>true</tt> if the HTTP method has been already {@link #execute executed},
* but not {@link #recycle recycled}.
*
* @return <tt>true</tt> if the method has been executed, <tt>false</tt> otherwise
*/
boolean hasBeenUsed();
// --------------------------------------------------------- Action Methods
/**
* Executes this method using the specified <code>HttpConnection</code> and
* <code>HttpState</code>.
*
* @param state the {@link HttpState state} information to associate with this method
* @param connection the {@link HttpConnection connection} used to execute
* this HTTP method
*
* @throws IOException If an I/O (transport) error occurs. Some transport exceptions
* can be recovered from.
* @throws HttpException If a protocol exception occurs. Usually protocol exceptions
* cannot be recovered from.
*
* @return the integer status code if one was obtained, or <tt>-1</tt>
*/
int execute(HttpState state, HttpConnection connection)
throws HttpException, IOException;
/**
* Recycles the HTTP method so that it can be used again.
* Note that all of the instance variables will be reset
* once this method has been called. This method will also
* release the connection being used by this HTTP method.
*
* @see #releaseConnection()
*/
void recycle();
/**
* Releases the connection being used by this HTTP method. In particular the
* connection is used to read the response (if there is one) and will be held
* until the response has been read. If the connection can be reused by other
* HTTP methods it is NOT closed at this point.
* <p>
* After this method is called, {@link #getResponseBodyAsStream} will return
* <code>null</code>, and {@link #getResponseBody} and {@link #getResponseBodyAsString}
* <em>may</em> return <code>null</code>.
*/
void releaseConnection();
/**
* Add a footer to this method's response.
* <p>
* <b>Note:</b> This method is for
* internal use only and should not be called by external clients.
*
* @param footer the footer to add
*
* @since 2.0
*/
void addResponseFooter(Header footer);
/**
* Returns the Status-Line from the most recent response for this method,
* or <code>null</code> if the method has not been executed.
*
* @return the status line, or <code>null</code> if the method has not been executed
*
* @since 2.0
*/
StatusLine getStatusLine();
/**
* Returns <tt>true</tt> if the HTTP method should automatically handle HTTP
* authentication challenges (status code 401, etc.), <tt>false</tt> otherwise
*
* @return <tt>true</tt> if authentication challenges will be processed
* automatically, <tt>false</tt> otherwise.
*
* @since 2.0
*
* @see #setDoAuthentication(boolean)
*/
boolean getDoAuthentication();
/**
* Sets whether or not the HTTP method should automatically handle HTTP
* authentication challenges (status code 401, etc.)
*
* @param doAuthentication <tt>true</tt> to process authentication challenges
* automatically, <tt>false</tt> otherwise.
*
* @since 2.0
*
* @see #getDoAuthentication()
*/
void setDoAuthentication(boolean doAuthentication);
/**
* Returns a collection of parameters associated with this method
*
* @return the {@link HttpMethodParams parameters} associated with this method
*
* @since 2.1
*
* @see HttpMethodParams
*/
public HttpMethodParams getParams();
/**
* Arrange for the provided HttpRecorder to record HTTP traffic.
*
* @param recorder the recorder that should receive this method's HTTP traffic
*
* @see #getHttpRecorder()
*/
public void setHttpRecorder(HttpRecorder recorder);
/**
* Retrieve the previously-supplied HttpRecorder.
*
* @return the recorder given to {@link #setHttpRecorder(HttpRecorder)}
* (presumably <code>null</code> if none was supplied -- confirm against
* the implementing classes)
*/
public HttpRecorder getHttpRecorder();
}
--- NEW FILE: HttpMethodDirector.java ---
/*
* $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons/httpclient/HttpMethodDirector.java,v 1.1 2003/09/24 01:43:36 gojomo Exp $
* $Revision: 1.1 $
* $Date: 2003/09/24 01:43:36 $
*
* ====================================================================
*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999-2003 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution, if
* any, must include the following acknowlegement:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowlegement may appear in the software itself,
* if and wherever such third-party acknowlegements normally appear.
*
* 4. The names "The Jakarta Project", "Commons", and "Apache Software
* Foundation" must not be used to endorse or promote products derived
* from this software without prior written permission. For written
* permission, please contact ap...@ap....
*
* 5. Products derived from this software may not be called "Apache"
* nor may "Apache" appear in their names without prior written
* permission of the Apache Group.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*
* [Additional notices, if required by prior licensing conditions]
*
*/
package org.apache.commons.httpclient;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.httpclient.auth.AuthScheme;
import org.apache.commons.httpclient.auth.AuthenticationException;
import org.apache.commons.httpclient.auth.CredentialsNotAvailableException;
import org.apache.commons.httpclient.auth.HttpAuthenticator;
import org.apache.commons.httpclient.auth.MalformedChallengeException;
import org.apache.commons.httpclient.params.*;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
* Handles the process of executing a method including authentication, redirection and retries.
*/
class HttpMethodDirector {
/** Maximum number of redirects and authentications that will be followed */
private static final int MAX_FORWARDS = 100;
private static final Log LOG = LogFactory.getLog(HttpMethodDirector.class);
private HttpMethod method;
private HttpState state;
private HostConfiguration hostConfiguration;
private HttpConnectionManager connectionManager;
private HttpConnection connection;
private HttpClientParams params;
/** A flag to indicate if the connection should be released after the method is executed. */
private boolean releaseConnection = false;
/** How many times did this transparently handle a recoverable exception? */
private int recoverableExceptionCount = 0;
/** Realms that we tried to authenticate to */
private Set realms = null;
/** Proxy Realms that we tried to authenticate to */
private Set proxyRealms = null;
/** Actual authentication realm */
private String realm = null;
/** Actual proxy authentication realm */
private String proxyRealm = null;
/**
 * Executes the method associated with this method director.
 *
 * The method is executed via <code>executeMethodForHost()</code> and
 * re-executed for as long as <code>isRetryNeeded()</code> reports that
 * another pass is required, up to <code>MAX_FORWARDS</code> attempts.
 * Before each re-execution the previous response stream, if any, is closed.
 * A <code>ProtocolException</code> is thrown only when all
 * <code>MAX_FORWARDS</code> attempts were used and a retry was still needed.
 *
 * Whatever the outcome, the connection (if any) is unlocked, and it is
 * released when the response has been fully processed.
 *
 * @throws IOException if an I/O problem occurs while executing the method
 *         or closing a superseded response stream
 * @throws HttpException if a protocol problem occurs
 */
public void executeMethod() throws IOException, HttpException {

    method.getParams().setDefaults(this.params);

    try {
        int forwardCount = 0; //protect from an infinite loop

        while (forwardCount++ < MAX_FORWARDS) {
            // on every retry, reset this state information.
            if (LOG.isDebugEnabled()) {
                LOG.debug("Execute loop try " + forwardCount);
            }

            executeMethodForHost();

            if (!isRetryNeeded()) {
                // nope, no retry needed, exit loop.
                break;
            }

            // retry - close previous stream. Caution - this causes
            // responseBodyConsumed to be called, which may also close the
            // connection.
            if (method.getResponseBodyAsStream() != null) {
                method.getResponseBodyAsStream().close();
            }

        } //end of retry loop

        // Because of the post-increment in the loop condition, forwardCount
        // ends at MAX_FORWARDS + 1 only when the loop ran out of attempts.
        // A successful break on the last permitted attempt leaves it at
        // exactly MAX_FORWARDS and must not be reported as a failure, so the
        // comparison is strict.
        if (forwardCount > MAX_FORWARDS) {
            LOG.error("Narrowly avoided an infinite loop in execute");
            throw new ProtocolException("Maximum redirects ("
                + MAX_FORWARDS + ") exceeded");
        }

    } finally {
        if (connection != null) {
            connection.setLocked(false);
        }
        // If the response has been fully processed, return the connection
        // to the pool. Use this flag, rather than other tests (like
        // responseStream == null), as subclasses, might reset the stream,
        // for example, reading the entire response into a file and then
        // setting the file as the stream.
        if (releaseConnection && connection != null) {
            connection.releaseConnection();
        } else if (method.getResponseBodyAsStream() == null) {
            method.releaseConnection();
        }
    }
}
/**
 * Applies the default credentials to the request up front when the client
 * parameters ask for preemptive authentication; does nothing otherwise.
 *
 * Proxy credentials are only attempted when the current connection is
 * proxied. An AuthenticationException is logged and otherwise ignored so
 * that the request can still be sent without the preemptive headers.
 *
 * @see HttpClientParams#isAuthenticationPreemptive()
 */
private void addPreemtiveAuthenticationHeaders() {
    if (!this.params.isAuthenticationPreemptive()) {
        return;
    }
    LOG.debug("Preemptively sending default basic credentials");
    try {
        boolean hostCredentialsApplied =
            HttpAuthenticator.authenticateDefault(method, connection, state);
        if (hostCredentialsApplied) {
            LOG.debug("Default basic credentials applied");
        }
        if (connection.isProxied()
            && HttpAuthenticator.authenticateProxyDefault(method, connection, state)) {
            LOG.debug("Default basic proxy credentials applied");
        }
    } catch (AuthenticationException authFailure) {
        // Log error and move on
        LOG.error(authFailure.getMessage(), authFailure);
    }
}
/**
 * Ensures that this director holds a connection to the configured host and
 * that the connection is open, tunnelling through the proxy with CONNECT
 * where a secure proxied connection requires it.
 *
 * @return <code>true</code> if a usable connection is available,
 *         <code>false</code> if the CONNECT needed for the tunnel failed
 *
 * @throws IOException if obtaining or opening the connection fails
 * @throws HttpException if a protocol error occurs
 */
private boolean establishValidOpenConnection() throws IOException, HttpException {
    // a connection to some other host is of no use: hand it back
    boolean wrongHost = (connection != null) && !hostConfiguration.hostEquals(connection);
    if (wrongHost) {
        connection.setLocked(false);
        connection.releaseConnection();
        connection = null;
    }
    // a fresh connection also means a fresh authentication history
    if (connection == null) {
        connection = connectionManager.getConnectionWithTimeout(
            hostConfiguration,
            this.params.getConnectionManagerTimeout()
        );
        connection.setLocked(true);
        realms = new HashSet();
        proxyRealms = new HashSet();
        addPreemtiveAuthenticationHeaders();
    }
    try {
        // Any failure from here on flags the connection for release: the
        // user may call Method->releaseConnection(), but the method does not
        // know about the connection until HttpMethod.execute() is called.
        if (connection.isOpen()) {
            // open, but the secure tunnel may not have been created yet
            boolean tunnelMissing = !(method instanceof ConnectMethod)
                && connection.isProxied()
                && connection.isSecure()
                && !connection.isTransparent();
            if (tunnelMissing && !executeConnect()) {
                // abort, the connect method failed
                return false;
            }
        } else {
            // configure and open the connection before first use
            connection.setSoTimeout(this.params.getSoTimeout());
            connection.setConnectionTimeout(this.params.getConnectionTimeout());
            if (method.getHttpRecorder() != null) {
                connection.setHttpRecorder(method.getHttpRecorder());
            }
            connection.open();
            // a secure tunnel is needed before the real method can run
            if (connection.isProxied() && connection.isSecure() && !executeConnect()) {
                // abort, the connect method failed
                return false;
            }
        }
    } catch (IOException ioFailure) {
        releaseConnection = true;
        throw ioFailure;
    } catch (RuntimeException unexpected) {
        releaseConnection = true;
        throw unexpected;
    }
    return true;
}
/**
 * Runs the method once against the current hostConfiguration, transparently
 * repeating it after a recoverable exception for as long as the method's
 * retry handler agrees.
 *
 * Returns without executing the method when no valid connection could be
 * established.
 *
 * @throws IOException if an I/O (transport) error occurs. Some transport exceptions
 *                     can be recovered from.
 * @throws HttpException if a protocol exception occurs. Usually protocol exceptions
 *                       cannot be recovered from.
 */
private void executeMethodForHost() throws IOException, HttpException {
    // TODO: how do we get requestSent?
    final boolean requestSent = false;
    // keep going until the method is processed, the retry handler declines
    // or a non-recoverable exception propagates
    for (int execCount = 1; ; execCount++) {
        if (!establishValidOpenConnection()) {
            return;
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("Attempt number " + execCount + " to process request");
        }
        try {
            method.execute(state, connection);
            return;
        } catch (HttpRecoverableException httpre) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Closing the connection.");
            }
            connection.close();
            LOG.info("Recoverable exception caught when processing request");
            // update the recoverable exception count.
            recoverableExceptionCount++;
            // ask the handler whether another attempt is worthwhile
            boolean retry = getMethodRetryHandler().retryMethod(
                method, connection, httpre, execCount, requestSent);
            if (!retry) {
                LOG.warn(
                    "Recoverable exception caught but MethodRetryHandler.retryMethod() "
                    + "returned false, rethrowing exception"
                );
                throw httpre;
            }
        }
    }
}
/**
 * Returns the retry handler to consult after a recoverable exception.
 *
 * @return the handler configured on the method when it is an
 *         HttpMethodBase, otherwise a new DefaultMethodRetryHandler
 */
private MethodRetryHandler getMethodRetryHandler() {
    if (!(method instanceof HttpMethodBase)) {
        return new DefaultMethodRetryHandler();
    }
    HttpMethodBase base = (HttpMethodBase) method;
    return base.getMethodRetryHandler();
}
/**
 * Executes a ConnectMethod to establish a tunneled connection.
 *
 * While the CONNECT is running, the director's current method is replaced by
 * the ConnectMethod; the original method is always put back afterwards, also
 * when the CONNECT fails with an unchecked exception.
 *
 * When the CONNECT is answered with a non-2xx status, its status line,
 * headers and body are copied onto the original method (if that method is an
 * HttpMethodBase) so that the caller sees the proxy's response.
 *
 * @return <code>true</code> if the connect was successful (2xx status)
 *
 * @throws IOException if an I/O (transport) error occurs
 * @throws HttpException if a protocol error occurs
 */
private boolean executeConnect() throws IOException, HttpException {
    final ConnectMethod connectMethod = new ConnectMethod();
    final HttpMethod originalMethod = this.method;
    this.method = connectMethod;
    try {
        executeMethod();
    } finally {
        // Restore unconditionally; restoring only for the checked exceptions
        // would leave this director executing the CONNECT method after a
        // RuntimeException.
        this.method = originalMethod;
    }
    int code = connectMethod.getStatusCode();
    if ((code >= 200) && (code < 300)) {
        return true;
    }
    // What is to follow is an ugly hack.
    // TODO: Connect method must be redesigned.
    // The only feasible solution is to split monolithic
    // HttpMethod into HttpRequest/HttpResponse pair.
    // That would allow to execute CONNECT method
    // behind the scene and return CONNECT HttpResponse
    // object in response to the original request that
    // contains the correct status line, headers &
    // response body.
    LOG.debug("CONNECT failed, fake the response for the original method");
    // Pass the status, headers and response stream to the wrapped
    // method.
    // To ensure that the connection is not released more than once
    // this method is still responsible for releasing the connection.
    // This will happen when the response body is consumed, or when
    // the wrapped method closes the response connection in
    // releaseConnection().
    if (originalMethod instanceof HttpMethodBase) {
        ((HttpMethodBase) originalMethod).fakeResponse(
            connectMethod.getStatusLine(),
            connectMethod.getResponseHeaderGroup(),
            connectMethod.getResponseBodyAsStream()
        );
    } else {
        // nobody will consume the CONNECT response, so release explicitly
        releaseConnection = true;
        LOG.warn(
            "Unable to fake response on method as it is not derived from HttpMethodBase.");
    }
    return false;
}
/**
 * Processes a redirect response by re-targeting the method and the host
 * configuration at the URI given in the <code>Location</code> header.
 *
 * The redirect is refused (and <code>false</code> returned) when the method
 * does not follow redirects, when no <code>Location</code> header is
 * present, when the location is malformed, or when it is relative and the
 * method is in strict mode. On success the record of attempted realms is
 * cleared and the authorization request header is removed.
 *
 * @return <code>true</code> if the redirect was accepted and the request
 *         should be executed again against the new location
 */
private boolean processRedirectResponse() {
    if (!method.getFollowRedirects()) {
        LOG.info("Redirect requested but followRedirects is "
            + "disabled");
        return false;
    }
    //get the location header to find out where to redirect to
    Header locationHeader = method.getResponseHeader("location");
    if (locationHeader == null) {
        // got a redirect response, but no location header
        LOG.error("Received redirect response " + method.getStatusCode()
            + " but no location header");
        return false;
    }
    String location = locationHeader.getValue();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Redirect requested to location '" + location
            + "'");
    }
    //rfc2616 demands the location value be a complete URI
    //Location = "Location" ":" absoluteURI
    URI redirectUri = null;
    URI currentUri = null;
    try {
        currentUri = new URI(
            connection.getProtocol().getScheme(),
            null,
            connection.getHost(),
            connection.getPort(),
            method.getPath()
        );
        redirectUri = new URI(location, true);
        if (redirectUri.isRelativeURI()) {
            if (method.isStrictMode()) {
                LOG.warn("Redirected location '" + location
                    + "' is not acceptable in strict mode");
                return false;
            } else {
                //location is incomplete, use current values for defaults
                LOG.debug("Redirect URI is not absolute - parsing as relative");
                redirectUri = new URI(currentUri, redirectUri);
            }
        }
    } catch (URIException e) {
        LOG.warn("Redirected location '" + location + "' is malformed");
        return false;
    }
    //invalidate the list of authentication attempts
    this.realms.clear();
    //remove existing authentication headers
    method.removeRequestHeader(HttpAuthenticator.WWW_AUTH_RESP);
    //update the current location with the redirect location.
    //avoiding use of URL.getPath() and URL.getQuery() to keep
    //jdk1.2 compliance.
    method.setPath(redirectUri.getEscapedPath());
    method.setQueryString(redirectUri.getEscapedQuery());
    hostConfiguration.setHost(redirectUri);
    if (LOG.isDebugEnabled()) {
        // closing quote added: the message previously ended in an
        // unterminated quoted URI
        LOG.debug("Redirecting from '" + currentUri.getEscapedURI()
            + "' to '" + redirectUri.getEscapedURI() + "'");
    }
    return true;
}
/**
 * Processes a response that requires authentication (401 or 407).
 *
 * Selects an authentication scheme from the challenge headers, refuses to
 * answer the same host/scheme combination twice, and otherwise asks
 * HttpAuthenticator to add the matching credentials to the request.
 *
 * A status code other than 401/407, or a response that carries no challenge
 * headers, is treated as "nothing to authenticate" and ends processing.
 *
 * @param state the current state
 * @param conn The connection
 *
 * @return <code>true</code> if the request has completed processing, <code>false</code>
 * if more attempts are needed
 */
private boolean processAuthenticationResponse(HttpState state, HttpConnection conn) {
    LOG.trace("enter HttpMethodBase.processAuthenticationResponse("
        + "HttpState, HttpConnection)");
    int statusCode = method.getStatusCode();
    // handle authentication required
    Header[] challenges = null;
    Set realmsUsed = null;
    String host = null;
    switch (statusCode) {
        case HttpStatus.SC_UNAUTHORIZED:
            challenges = method.getResponseHeaders(HttpAuthenticator.WWW_AUTH);
            realmsUsed = realms;
            host = conn.getVirtualHost();
            if (host == null) {
                host = conn.getHost();
            }
            break;
        case HttpStatus.SC_PROXY_AUTHENTICATION_REQUIRED:
            challenges = method.getResponseHeaders(HttpAuthenticator.PROXY_AUTH);
            realmsUsed = proxyRealms;
            host = conn.getProxyHost();
            break;
        default:
            // Not an authentication challenge. Without this branch the
            // challenges array stayed null and was dereferenced below.
            return true;
    }
    // no header requesting authentication: nothing more can be done
    if (challenges == null || challenges.length == 0) {
        return true;
    }
    AuthScheme authscheme = null;
    try {
        authscheme = HttpAuthenticator.selectAuthScheme(challenges);
    } catch (MalformedChallengeException e) {
        if (LOG.isErrorEnabled()) {
            LOG.error(e.getMessage(), e);
        }
        return true;
    } catch (UnsupportedOperationException e) {
        if (LOG.isErrorEnabled()) {
            LOG.error(e.getMessage(), e);
        }
        return true;
    }
    // Key under which this attempt is remembered. Named realmKey so that it
    // no longer shadows the field this.realm assigned further down.
    StringBuffer buffer = new StringBuffer();
    buffer.append(host);
    buffer.append('#');
    buffer.append(authscheme.getID());
    String realmKey = buffer.toString();
    if (realmsUsed.contains(realmKey)) {
        if (LOG.isInfoEnabled()) {
            buffer = new StringBuffer();
            buffer.append("Already tried to authenticate with '");
            buffer.append(authscheme.getRealm());
            buffer.append("' authentication realm at ");
            buffer.append(host);
            buffer.append(", but still receiving: ");
            buffer.append(method.getStatusLine().toString());
            LOG.info(buffer.toString());
        }
        return true;
    }
    realmsUsed.add(realmKey);
    boolean authenticated = false;
    try {
        //remove preemptive header and reauthenticate
        switch (statusCode) {
            case HttpStatus.SC_UNAUTHORIZED:
                method.removeRequestHeader(HttpAuthenticator.WWW_AUTH_RESP);
                authenticated = HttpAuthenticator.authenticate(
                    authscheme, method, conn, state);
                this.realm = authscheme.getRealm();
                break;
            case HttpStatus.SC_PROXY_AUTHENTICATION_REQUIRED:
                method.removeRequestHeader(HttpAuthenticator.PROXY_AUTH_RESP);
                authenticated = HttpAuthenticator.authenticateProxy(
                    authscheme, method, conn, state);
                this.proxyRealm = authscheme.getRealm();
                break;
        }
    } catch (CredentialsNotAvailableException e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn(e.getMessage());
        }
        return true; // finished request
    } catch (AuthenticationException e) {
        if (LOG.isErrorEnabled()) {
            LOG.error(e.getMessage(), e);
        }
        return true; // finished request
    }
    if (!authenticated) {
        // won't be able to authenticate to this challenge
        // without additional information
        LOG.debug("HttpMethodBase.execute(): Server demands "
            + "authentication credentials, but none are "
            + "available, so aborting.");
    } else {
        LOG.debug("HttpMethodBase.execute(): Server demanded "
            + "authentication credentials, will try again.");
        // let's try it again, using the credentials
    }
    return !authenticated; // finished processing if we aren't authenticated
}
/**
 * Decides from the response status whether the request has to be executed
 * again.
 *
 * A 401/407 response leads to a retry only when the method performs
 * authentication itself and the challenge could be answered; a redirect
 * status leads to a retry only when the redirect was accepted. Every other
 * status ends the exchange.
 *
 * @return boolean <code>true</code> if a retry is needed.
 */
private boolean isRetryNeeded() {
    final int status = method.getStatusCode();

    if (status == HttpStatus.SC_UNAUTHORIZED
        || status == HttpStatus.SC_PROXY_AUTHENTICATION_REQUIRED) {
        LOG.debug("Authorization required");
        if (!method.getDoAuthentication()) {
            // let the client handle the authentication
            return false;
        }
        // processAuthenticationResponse() reports true when processing is
        // finished, i.e. when no further attempt should be made
        boolean finished = processAuthenticationResponse(state, connection);
        return !finished;
    }

    if (status == HttpStatus.SC_MOVED_TEMPORARILY
        || status == HttpStatus.SC_MOVED_PERMANENTLY
        || status == HttpStatus.SC_SEE_OTHER
        || status == HttpStatus.SC_TEMPORARY_REDIRECT) {
        LOG.debug("Redirect required");
        return processRedirectResponse();
    }

    // neither an unauthorized nor a redirect response
    return false;
}
/**
 * Returns the host configuration this director executes against.
 *
 * @return the current host configuration
 */
public HostConfiguration getHostConfiguration() {
    return this.hostConfiguration;
}
/**
 * Sets the host configuration this director executes against.
 *
 * @param hostConfiguration the host configuration to use
 */
public void setHostConfiguration(final HostConfiguration hostConfiguration) {
    this.hostConfiguration = hostConfiguration;
}
/**
 * Returns the method this director is currently set to execute.
 *
 * @return the current method
 */
public HttpMethod getMethod() {
    return this.method;
}
/**
 * Sets the method to be executed by this director.
 *
 * @param method the method to execute
 */
public void setMethod(final HttpMethod method) {
    this.method = method;
}
/**
 * Returns the HTTP state used when executing and authenticating the method.
 *
 * @return the current state
 */
public HttpState getState() {
    return this.state;
}
/**
 * Sets the HTTP state used when executing and authenticating the method.
 *
 * @param state the state to use
 */
public void setState(final HttpState state) {
    this.state = state;
}
/**
 * Returns the connection manager from which connections are obtained.
 *
 * @return the current connection manager
 */
public HttpConnectionManager getConnectionManager() {
    return this.connectionManager;
}
/**
 * Sets the connection manager from which connections are obtained.
 *
 * @param connectionManager the connection manager to use
 */
public void setConnectionManager(final HttpConnectionManager connectionManager) {
    this.connectionManager = connectionManager;
}
/**
 * Returns the client parameters used by this director.
 *
 * @return the current parameters
 */
public HttpParams getParams() {
    return params;
}
/**
 * Sets the client parameters used by this director.
 *
 * @param params the client parameters to use
 */
public void setParams(final HttpClientParams params) {
this.params = params;
}
}
--- NEW FILE: HttpMethodBase.java ---
/*
* $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache/commons/httpclient/HttpMethodBase.java,v 1.1 2003/09/24 01:43:36 gojomo Exp $
* $Revision: 1.1 $
* $Date: 2003/09/24 01:43:36 $
*
* ====================================================================
*
* The Apache Software License, Version 1.1
*
* Copyright (c) 1999-2003 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
[...2278 lines suppressed...]
this.responseBody = null;
this.responseStream = responseStream;
}
/*
 * (non-Javadoc)
 * Returns the recorder currently associated with this method.
 *
 * @see org.apache.commons.httpclient.HttpMethod#getHttpRecorder()
 */
public HttpRecorder getHttpRecorder() {
    return this.recorder;
}
/*
 * (non-Javadoc)
 * Associates the given recorder with this method.
 *
 * @see org.apache.commons.httpclient.HttpMethod#setHttpRecorder(org.archive.util.HttpRecorder)
 */
public void setHttpRecorder(final HttpRecorder recorder) {
    this.recorder = recorder;
}
}
|
|
From: <go...@us...> - 2003-09-24 01:43:41
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org In directory sc8-pr-cvs1:/tmp/cvs-serv26774/oversrc/org Log Message: Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org added to the repository |
|
From: <go...@us...> - 2003-09-24 01:43:35
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc In directory sc8-pr-cvs1:/tmp/cvs-serv26774/oversrc Log Message: Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc added to the repository |
|
From: <go...@us...> - 2003-09-24 01:43:35
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache In directory sc8-pr-cvs1:/tmp/cvs-serv26774/oversrc/org/apache Log Message: Directory /cvsroot/archive-crawler/ArchiveOpenCrawler/oversrc/org/apache added to the repository |
|
From: <go...@us...> - 2003-09-23 01:16:42
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework
In directory sc8-pr-cvs1:/tmp/cvs-serv21672/src/org/archive/crawler/framework
Modified Files:
CrawlController.java Processor.java
Log Message:
refactorings(in progress)
Index: CrawlController.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/CrawlController.java,v
retrieving revision 1.27
retrieving revision 1.28
diff -C2 -d -r1.27 -r1.28
*** CrawlController.java 19 Sep 2003 01:37:20 -0000 1.27
--- CrawlController.java 23 Sep 2003 01:16:35 -0000 1.28
***************
*** 253,257 ****
curi.setNextProcessor(firstProcessor);
curi.setThreadNumber(thread.getSerialNumber());
- curi.setController(this);
}
return curi;
--- 253,256 ----
Index: Processor.java
===================================================================
RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/crawler/framework/Processor.java,v
retrieving revision 1.13
retrieving revision 1.14
diff -C2 -d -r1.13 -r1.14
*** Processor.java 19 Sep 2003 01:37:20 -0000 1.13
--- Processor.java 23 Sep 2003 01:16:35 -0000 1.14
***************
*** 36,42 ****
if(filtersAccept(curi)) {
innerProcess(curi);
! } // TODO: else perhaps send to different next?
}
/**
* @param curi
--- 36,51 ----
if(filtersAccept(curi)) {
innerProcess(curi);
! } else {
! innerRejectProcess(curi);
! }
}
+ /**
+ * @param curi
+ */
+ protected void innerRejectProcess(CrawlURI curi) {
+ // by default do nothing
+ }
+
/**
* @param curi
|
|
From: <go...@us...> - 2003-09-23 01:16:42
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/example-crawl In directory sc8-pr-cvs1:/tmp/cvs-serv21672/example-crawl Modified Files: example-order.xml Log Message: refactorings(in progress) Index: example-order.xml =================================================================== RCS file: /cvsroot/archive-crawler/ArchiveOpenCrawler/example-crawl/example-order.xml,v retrieving revision 1.13 retrieving revision 1.14 diff -C2 -d -r1.13 -r1.14 *** example-order.xml 19 Sep 2003 01:37:41 -0000 1.13 --- example-order.xml 23 Sep 2003 01:16:35 -0000 1.14 *************** *** 29,38 **** # http://www.sabre.mod.uk # http://www.archive.org/.. ! # http://www.yahoo.com/../../movies # http://www.creativecommons.org/../ ! http://www.royal-navy.mod.uk/rn/form/form.html?page=1 ! http://www.dfid.gov.uk/../../aboutdfid/files/glossary_l.htm #http://directory.google.com/Top/Games/ # http://www3.google.com/help/customize.html </seeds> --- 29,43 ---- # http://www.sabre.mod.uk # http://www.archive.org/.. ! #http://www.yahoo.com/../../movies ! #http://www.archive.org/movies/fake/../movies.php ! #http://www.archive.org/movies/../../blahblah ! #http://www.archive.org/movies/../../../../somethingelse ! #http://www.army.mod.uk/ceremonialandheritage/museums_main.htm # http://www.creativecommons.org/../ ! #http://www.royal-navy.mod.uk/rn/form/form.html?page=1 ! #http://www.dfid.gov.uk/../../aboutdfid/files/glossary_l.htm #http://directory.google.com/Top/Games/ # http://www3.google.com/help/customize.html + http://dmoz.org </seeds> *************** *** 43,46 **** --- 48,52 ---- regexp=".*yahoo\.com.*" /> --> + <!-- <filter name="pathological-path" *************** *** 53,57 **** modifier="not" regexp="[^/]*?//[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?" /> ! 
<!-- <filter --- 59,63 ---- modifier="not" regexp="[^/]*?//[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?/[^/]*?" /> ! --> <!-- <filter *************** *** 67,77 **** --> - <filter - name="focus" - class="org.archive.crawler.util.SeedExtensionFilter" - mode="domain" - /> - - </selector> --- 73,76 ---- *************** *** 81,84 **** --- 80,94 ---- <processors> + <processor + name="Preselector" + class="org.archive.crawler.basic.SimplePreselector" + next="Preprocessor"> + <params max-link-depth="1" max-embed-depth="2" /> + <filter + name="focus" + class="org.archive.crawler.util.SeedExtensionFilter" + mode="domain" + /> + </processor> <processor name="Preprocessor" |
|
From: <go...@us...> - 2003-09-23 01:16:42
|
Update of /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util
In directory sc8-pr-cvs1:/tmp/cvs-serv21672/src/org/archive/util
Added Files:
HTTPRecorder.java
Log Message:
refactorings(in progress)
--- NEW FILE: HTTPRecorder.java ---
/*
* HTTPRecorder.java
* Created on Sep 22, 2003
*
* $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/org/archive/util/HTTPRecorder.java,v 1.1 2003/09/23 01:16:35 gojomo Exp $
*/
package org.archive.util;
/**
 * Placeholder for an HTTP traffic recorder; the class declares no members
 * yet.
 *
 * Initially only supports HTTP/1.0 (one request, one response per stream)
 * @author gojomo
 *
 */
public class HTTPRecorder {
}
|