From: <bra...@us...> - 2010-04-14 21:21:01
|
Revision: 3038 http://archive-access.svn.sourceforge.net/archive-access/?rev=3038&view=rev Author: bradtofel Date: 2010-04-14 21:20:49 +0000 (Wed, 14 Apr 2010) Log Message: ----------- REFACTOR: removed buggy local-cache, added less buggy cache which relies on varnish/squid to do RAM/Filesystem caching, new implementation uses Heritrix HTTP Recorder code to write raw bytes from the web into ARCs, rather than whatever the apache HTTP client returned. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/FileRegion.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCRecordingProxy.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCUnwrappingProxy.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCachingProxy.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLCacher.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCachingProxy.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCachingProxy.java 2010-04-14 21:15:01 UTC (rev 3037) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCCachingProxy.java 2010-04-14 21:20:49 UTC (rev 3038) @@ -1,157 +0,0 @@ -/* ARCCachingProxy - * - * $Id$: - * - * Created on Dec 8, 2009. - * - * Copyright (C) 2006 Internet Archive. - * - * This file is part of Wayback. - * - * Wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * Wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with Wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -package org.archive.wayback.liveweb; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.io.OutputStream; -import java.io.PrintWriter; -import java.io.RandomAccessFile; -import java.net.URL; - -import javax.servlet.ServletException; -import javax.servlet.http.HttpServletRequest; -import javax.servlet.http.HttpServletResponse; - -import org.apache.log4j.Logger; -import org.archive.io.arc.ARCLocation; -import org.archive.io.arc.ARCRecord; -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.core.Resource; -import org.archive.wayback.exception.LiveDocumentNotAvailableException; -import org.archive.wayback.resourcestore.resourcefile.ArcResource; -import org.archive.wayback.webapp.ServletRequestContext; - -/** - * @author brad - * - */ -public class ARCCachingProxy extends ServletRequestContext { - - private final static String EXPIRES_HEADER = "Expires"; - - private final static String ARC_RECORD_CONTENT_TYPE = "application/x-arc-record"; - private static final Logger LOGGER = Logger.getLogger( - ARCCachingProxy.class.getName()); - private ARCCacheDirectory arcCacheDir = null; - private URLCacher cacher = null; - private long expiresMS = 60 * 60 * 1000; - /* (non-Javadoc) - * @see org.archive.wayback.webapp.ServletRequestContext#handleRequest(javax.servlet.http.HttpServletRequest, javax.servlet.http.HttpServletResponse) - */ - @Override - public boolean handleRequest(HttpServletRequest httpRequest, - HttpServletResponse httpResponse) throws ServletException, - IOException { - - StringBuffer sb = httpRequest.getRequestURL(); - String query = httpRequest.getQueryString(); - if(query != null) { - sb.append("?").append(query); - } - URL url = new URL(sb.toString()); - FileRegion r = null; - try { - r = getLiveResource(url); - httpResponse.setStatus(httpResponse.SC_OK); - httpResponse.setContentLength((int)r.getLength()); - httpResponse.setContentType(ARC_RECORD_CONTENT_TYPE); - httpResponse.setDateHeader("Expires", System.currentTimeMillis() + expiresMS); - r.copyToOutputStream(httpResponse.getOutputStream()); - - } catch (LiveDocumentNotAvailableException e) { - - e.printStackTrace(); - httpResponse.sendError(httpResponse.SC_NOT_FOUND); - } -// httpResponse.setContentType("text/plain"); -// PrintWriter pw = httpResponse.getWriter(); -// pw.println("PathInfo:" + httpRequest.getPathInfo()); -// pw.println("RequestURI:" + httpRequest.getRequestURI()); -// pw.println("RequestURL:" + httpRequest.getRequestURL()); -// pw.println("QueryString:" + httpRequest.getQueryString()); -// pw.println("PathTranslated:" + httpRequest.getPathTranslated()); -// pw.println("ServletPath:" + httpRequest.getServletPath()); -// pw.println("ContextPath:" + httpRequest.getContextPath()); -// if(r != null) { -// pw.println("CachePath:" + r.file.getAbsolutePath()); -// pw.println("CacheStart:" + r.start); -// pw.println("CacheEnd:" + r.end); -// } else { -// pw.println("FAILED CACHE!"); -// } - - return true; - } - - - private FileRegion getLiveResource(URL url) - throws LiveDocumentNotAvailableException, IOException { - - Resource resource = null; - - LOGGER.info("Caching URL(" + url.toString() + ")"); - FileRegion region = cacher.cache2(arcCacheDir, url.toString()); - if(region != null) { - LOGGER.info("Cached URL(" + url.toString() + ") in " + - "ARC(" + region.file.getAbsolutePath() + ") at (" - + region.start + " - " + region.end + ")"); - - } else { - throw new IOException("No location!"); - } - - return region; -} - - /** - * @return the arcCacheDir - */ - public ARCCacheDirectory getArcCacheDir() { - return arcCacheDir; - } - - /** - * @param arcCacheDir the arcCacheDir to set - */ - public void setArcCacheDir(ARCCacheDirectory arcCacheDir) { - this.arcCacheDir = arcCacheDir; - } - - /** - * @return the cacher - */ - public URLCacher getCacher() { - return cacher; - } - - /** - * @param cacher the cacher to set - */ - public void setCacher(URLCacher cacher) { - this.cacher = cacher; - } -} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCRecordingProxy.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCRecordingProxy.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCRecordingProxy.java 2010-04-14 21:20:49 UTC (rev 3038) @@ -0,0 +1,148 @@ +/* ARCRecordingProxy + * + * $Id$: + * + * Created on Apr 1, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.liveweb; + +import java.io.IOException; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.apache.commons.httpclient.URIException; +import org.apache.log4j.Logger; +import org.archive.wayback.webapp.ServletRequestContext; + +/** + * @author brad + * + */ +public class ARCRecordingProxy extends ServletRequestContext { + + private final static String EXPIRES_HEADER = "Expires"; + private long expiresMS = 60 * 60 * 1000; + private long fakeExpiresMS = 5 * 60 * 1000; + private final static String ARC_RECORD_CONTENT_TYPE = + "application/x-arc-record"; + + private static final Logger LOGGER = + Logger.getLogger(ARCRecordingProxy.class.getName()); + + private ARCCacheDirectory arcCacheDir = null; + private URLtoARCCacher cacher = null; + /* (non-Javadoc) + * @see org.archive.wayback.webapp.ServletRequestContext#handleRequest(javax.servlet.http.HttpServletRequest, javax.servlet.http.HttpServletResponse) + */ + @Override + public boolean handleRequest(HttpServletRequest httpRequest, + HttpServletResponse httpResponse) throws ServletException, + IOException { + + StringBuffer sb = httpRequest.getRequestURL(); + String query = httpRequest.getQueryString(); + if(query != null) { + sb.append("?").append(query); + } + FileRegion r = null; + try { + + String url = sb.toString(); + LOGGER.info("Caching URL(" + url + ")"); + r = cacher.cacheURL(url, arcCacheDir); + + httpResponse.setStatus(HttpServletResponse.SC_OK); + httpResponse.setContentLength((int)r.getLength()); + httpResponse.setContentType(ARC_RECORD_CONTENT_TYPE); + long exp = System.currentTimeMillis(); + exp += (r.isFake ? fakeExpiresMS : expiresMS); + + httpResponse.setDateHeader(EXPIRES_HEADER, exp); + + r.copyToOutputStream(httpResponse.getOutputStream()); + + } catch (URIException e) { + + e.printStackTrace(); + httpResponse.sendError(HttpServletResponse.SC_NOT_FOUND); + } + return true; + } + + /** + * @return the arcCacheDir + */ + public ARCCacheDirectory getArcCacheDir() { + return arcCacheDir; + } + + /** + * @param arcCacheDir the arcCacheDir to set + */ + public void setArcCacheDir(ARCCacheDirectory arcCacheDir) { + this.arcCacheDir = arcCacheDir; + } + + /** + * @return the cacher + */ + public URLtoARCCacher getCacher() { + return cacher; + } + + /** + * @param cacher the cacher to set + */ + public void setCacher(URLtoARCCacher cacher) { + this.cacher = cacher; + } + + /** + * @return the expiresMS + */ + public long getExpiresMS() { + return expiresMS; + } + + /** + * @param expiresMS the expiresMS to set + */ + public void setExpiresMS(long expiresMS) { + this.expiresMS = expiresMS; + } + + /** + * @return the fakeExpiresMS + */ + public long getFakeExpiresMS() { + return fakeExpiresMS; + } + + /** + * @param fakeExpiresMS the fakeExpiresMS to set + */ + public void setFakeExpiresMS(long fakeExpiresMS) { + this.fakeExpiresMS = fakeExpiresMS; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCRecordingProxy.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCUnwrappingProxy.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCUnwrappingProxy.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCUnwrappingProxy.java 2010-04-14 21:20:49 UTC (rev 3038) @@ -0,0 +1,128 @@ +/* ARCUnwrappingProxy + * + * $Id$: + * + * Created on Dec 10, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.liveweb; + +import java.io.IOException; +import java.util.zip.GZIPInputStream; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HostConfiguration; +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; +import org.apache.commons.httpclient.methods.GetMethod; +import org.archive.io.arc.ARCRecord; +import org.archive.wayback.util.ByteOp; +import org.archive.wayback.webapp.ServletRequestContext; + +/** + * + * ServletRequestContext which proxies to an ARCRecordingProxy, and unwraps + * the "application/x-arc-record" MIME response into the inner HTTP response, + * sending all HTTP headers AS-IS, and the HTTP Entity. + * + * Can be used to use an ARCRecordingProxy with a UserAgent expecting real + * HTTP responses, not "application/x-arc-record". A web browser for example. + * + * @author brad + * + */ +public class ARCUnwrappingProxy extends ServletRequestContext { + + private MultiThreadedHttpConnectionManager connectionManager = null; + private HostConfiguration hostConfiguration = null; + /** + * + */ + public ARCUnwrappingProxy() { + connectionManager = new MultiThreadedHttpConnectionManager(); + hostConfiguration = new HostConfiguration(); + } + +// protected HttpClient http = new HttpClient( +// new MultiThreadedHttpConnectionManager()); + + /* (non-Javadoc) + * @see org.archive.wayback.webapp.ServletRequestContext#handleRequest(javax.servlet.http.HttpServletRequest, javax.servlet.http.HttpServletResponse) + */ + @Override + public boolean handleRequest(HttpServletRequest httpRequest, + HttpServletResponse httpResponse) throws ServletException, + IOException { + StringBuffer sb = httpRequest.getRequestURL(); + String query = httpRequest.getQueryString(); + if(query != null) { + sb.append("?").append(query); + } +// URL url = new URL(sb.toString()); + HttpMethod method = new GetMethod(sb.toString()); +// method.addRequestHeader("User-Agent", userAgent); + boolean got200 = false; + try { + HttpClient http = new HttpClient(connectionManager); + http.setHostConfiguration(hostConfiguration); + + int status = http.executeMethod(method); + if(status == 200) { + ARCRecord r = + new ARCRecord(new GZIPInputStream( + method.getResponseBodyAsStream()), + "id",0L,false,false,true); + r.skipHttpHeader(); + httpResponse.setStatus(r.getStatusCode()); + Header headers[] = r.getHttpHeaders(); + for(Header header : headers) { + httpResponse.addHeader(header.getName(), header.getValue()); + } + + ByteOp.copyStream(r, httpResponse.getOutputStream()); + got200 = true; + } + } finally { + method.releaseConnection(); + + } + + return got200; + } + + /** + * @param hostPort location of ARCRecordingProxy ServletRequestContext, ex: + * "localhost:3128" + */ + public void setProxyHostPort(String hostPort) { + int colonIdx = hostPort.indexOf(':'); + if(colonIdx > 0) { + String host = hostPort.substring(0,colonIdx); + int port = Integer.valueOf(hostPort.substring(colonIdx+1)); + hostConfiguration.setProxy(host, port); + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/ARCUnwrappingProxy.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/FileRegion.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/FileRegion.java 2010-04-14 21:15:01 UTC (rev 3037) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/FileRegion.java 2010-04-14 21:20:49 UTC (rev 3038) @@ -38,9 +38,20 @@ File file = null; long start = -1; long end = -1; + boolean isFake = false; + /** + * @return the number of bytes in this record, including headers. If the + * containing file is compressed, then this represents the number of + * compressed bytes. + */ public long getLength() { return end - start; } + /** + * Copy this record to the provided OutputStream + * @param o the OutputStream where the bytes should be sent. + * @throws IOException for usual reasons + */ public void copyToOutputStream(OutputStream o) throws IOException { long left = end - start; int BUFF_SIZE = 4096; @@ -58,5 +69,4 @@ } raf.close(); } - } Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java 2010-04-14 21:15:01 UTC (rev 3037) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebLocalResourceIndex.java 2010-04-14 21:20:49 UTC (rev 3038) @@ -1,55 +0,0 @@ -/* LiveWebLocalResourceIndex - * - * $Id$ - * - * Created on 5:53:29 PM Mar 13, 2007. - * - * Copyright (C) 2007 Internet Archive. - * - * This file is part of wayback. - * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with wayback-svn; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.liveweb; - -import java.io.IOException; -import java.util.ArrayList; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.resourceindex.LocalResourceIndex; - -/** - * Alternate LocalResourceIndex that supports an alternate BDB configuration, - * and allows adding of SearchResults to the index. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class LiveWebLocalResourceIndex extends LocalResourceIndex { - - /** - * Add a single SearchResult to the index. - * @param result - * @throws IOException - * @throws UnsupportedOperationException - */ - public void addSearchResult(CaptureSearchResult result) - throws UnsupportedOperationException, IOException { - - ArrayList<CaptureSearchResult> l = new ArrayList<CaptureSearchResult>(); - l.add(result); - addSearchResults(l.iterator()); - } -} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java 2010-04-14 21:20:49 UTC (rev 3038) @@ -0,0 +1,136 @@ +/* RemoteLiveWebCache + * + * $Id$: + * + * Created on Dec 15, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.liveweb; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.net.ConnectException; +import java.net.URL; +import java.util.zip.GZIPInputStream; + +import org.apache.commons.httpclient.HostConfiguration; +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpMethod; +import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; +import org.apache.commons.httpclient.methods.GetMethod; +import org.archive.io.arc.ARCRecord; +import org.archive.wayback.core.Resource; +import org.archive.wayback.exception.LiveDocumentNotAvailableException; +import org.archive.wayback.exception.LiveWebCacheUnavailableException; +import org.archive.wayback.exception.ResourceNotAvailableException; +import org.archive.wayback.resourcestore.resourcefile.ArcResource; +import org.archive.wayback.resourcestore.resourcefile.ResourceFactory; + +/** + * @author brad + * + */ +public class RemoteLiveWebCache implements LiveWebCache { + + private MultiThreadedHttpConnectionManager connectionManager = null; + private HostConfiguration hostConfiguration = null; + private HttpClient http = null; + /** + * + */ + public RemoteLiveWebCache() { + connectionManager = new MultiThreadedHttpConnectionManager(); + hostConfiguration = new HostConfiguration(); + http = new HttpClient(connectionManager); + http.setHostConfiguration(hostConfiguration); + } + + /* (non-Javadoc) + * @see org.archive.wayback.liveweb.LiveWebCache#getCachedResource(java.net.URL, long, boolean) + */ + public Resource getCachedResource(URL url, long maxCacheMS, + boolean bUseOlder) throws LiveDocumentNotAvailableException, + LiveWebCacheUnavailableException, IOException { + String urlString = url.toExternalForm(); + HttpMethod method = new GetMethod(urlString); + try { + int status = http.executeMethod(method); + if(status == 200) { + ByteArrayInputStream bais = new ByteArrayInputStream(method.getResponseBody()); + ARCRecord r = new ARCRecord( + new GZIPInputStream(bais), + "id",0L,false,false,true); + ArcResource ar = (ArcResource) + ResourceFactory.ARCArchiveRecordToResource(r, null); + if(ar.getStatusCode() == 502) { + throw new LiveDocumentNotAvailableException(urlString); + } + return ar; + + } else { + throw new LiveWebCacheUnavailableException(urlString); + } + } catch (ResourceNotAvailableException e) { + throw new LiveDocumentNotAvailableException(urlString); + } catch (ConnectException e) { + throw new LiveWebCacheUnavailableException(e.getLocalizedMessage() + + " : " + urlString); + } finally { + method.releaseConnection(); + } + } + + /* (non-Javadoc) + * @see org.archive.wayback.liveweb.LiveWebCache#shutdown() + */ + public void shutdown() { + // TODO Auto-generated method stub + } + + + /** + * @param hostPort to proxy requests through - ex. "localhost:3128" + */ + public void setProxyHostPort(String hostPort) { + int colonIdx = hostPort.indexOf(':'); + if(colonIdx > 0) { + String host = hostPort.substring(0,colonIdx); + int port = Integer.valueOf(hostPort.substring(colonIdx+1)); + +// http.getHostConfiguration().setProxy(host, port); + hostConfiguration.setProxy(host, port); + } + } + /** + * @param maxTotalConnections the HttpConnectionManagerParams config + */ + public void setMaxTotalConnections(int maxTotalConnections) { + connectionManager.getParams(). + setMaxTotalConnections(maxTotalConnections); + } + /** + * @param maxHostConnections the HttpConnectionManagerParams config + */ + public void setMaxHostConnections(int maxHostConnections) { + connectionManager.getParams(). + setMaxConnectionsPerHost(hostConfiguration, maxHostConnections); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/RemoteLiveWebCache.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLCacher.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLCacher.java 2010-04-14 21:15:01 UTC (rev 3037) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLCacher.java 2010-04-14 21:20:49 UTC (rev 3038) @@ -1,525 +0,0 @@ -/* URLCacher - * - * $Id$ - * - * Created on 5:30:31 PM Mar 12, 2007. - * - * Copyright (C) 2007 Internet Archive. - * - * This file is part of wayback-svn. - * - * wayback-svn is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback-svn is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with wayback-svn; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.liveweb; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.net.ConnectException; -import java.net.MalformedURLException; -import java.net.NoRouteToHostException; -import java.net.URL; -import java.net.UnknownHostException; -import java.util.Arrays; -import java.util.Date; -import java.util.Properties; -import java.util.concurrent.atomic.AtomicInteger; - -import org.apache.commons.httpclient.ConnectTimeoutException; -import org.apache.commons.httpclient.Header; -import org.apache.commons.httpclient.HostConfiguration; -import org.apache.commons.httpclient.HttpClient; -import org.apache.commons.httpclient.HttpConnection; -import org.apache.commons.httpclient.HttpException; -import org.apache.commons.httpclient.HttpMethodBase; -import org.apache.commons.httpclient.HttpState; -import org.apache.commons.httpclient.SimpleHttpConnectionManager; -import org.apache.commons.httpclient.URIException; -import org.apache.log4j.Logger; -import org.archive.io.arc.ARCLocation; -import org.archive.io.arc.ARCWriter; -import org.archive.net.LaxURI; -import org.archive.wayback.exception.LiveDocumentNotAvailableException; - -/** - * Class for performing an HTTP GET request, and storing all related info - * required to create a valid ARC Record. This info is also actually stored in - * an ARC file via an ARCWriter. This should leverage more Heritrix fetcher code - * but because the Heritrix settings system is tightly coupled with the fetcher - * code, we'll try to limp by with this class until it gets untangled. - * - * @author brad - * @version $Date$, $Revision$ - */ -public class URLCacher { - private static final Logger LOGGER = Logger.getLogger( - URLCacher.class.getName()); - - private static final String CACHE_PATH = "liveweb.tmp.dir"; - - protected File tmpDir = null; - @SuppressWarnings("unchecked") - private final ThreadLocal tl = new ThreadLocal() { - protected synchronized Object initialValue() { - HttpClient http = new HttpClient(); - IPHttpConnectionManager manager = new IPHttpConnectionManager(); - manager.getParams().setConnectionTimeout(10000); - manager.getParams().setSoTimeout(10000); - http.setHttpConnectionManager(manager); - return http; - } - }; - private HttpClient getHttpClient() { - return (HttpClient) tl.get(); - } - - private File getTmpFile() { - String tmpName; - File tmpFile; - try { - tmpFile = File.createTempFile("robot-tmp-",null); - tmpName = tmpFile.getName(); - tmpFile.delete(); - } catch (IOException e) { - tmpName = "oops" + Thread.currentThread().getName(); - e.printStackTrace(); - } - tmpFile = new File(tmpDir,tmpName); - if (tmpFile.exists()) { - tmpFile.delete(); - } - return tmpFile; - } - - protected ExtendedGetMethod urlToFile(String urlString, File file) - throws LiveDocumentNotAvailableException, URIException, IOException { - - HttpClient http = getHttpClient(); - OutputStream os = new FileOutputStream(file); - ExtendedGetMethod method = new ExtendedGetMethod(os); - LaxURI lURI = new LaxURI(urlString,true); - method.setURI(lURI); - try { - int code = http.executeMethod(method); - os.close(); - // TODO: Constant 200 - if(code != 200) { - throw new LiveDocumentNotAvailableException(urlString); - } - } catch (HttpException e) { - e.printStackTrace(); - throw new LiveDocumentNotAvailableException(urlString); - } catch(UnknownHostException e) { - LOGGER.info("Unknown host for URL " + urlString); - throw new LiveDocumentNotAvailableException(urlString); - } catch(ConnectTimeoutException e) { - LOGGER.info("Connection Timeout for URL " + urlString); - throw new LiveDocumentNotAvailableException(urlString); - } catch(NoRouteToHostException e) { - LOGGER.info("No route to host for URL " + urlString); - throw new LiveDocumentNotAvailableException(urlString); - } catch(ConnectException e) { - LOGGER.info("ConnectException URL " + urlString); - throw new LiveDocumentNotAvailableException(urlString); - } - LOGGER.info("Stored " + urlString + " in " + file.getAbsolutePath()); - return method; - } - - private ARCLocation storeFile(File file, ARCWriter writer, String url, - ExtendedGetMethod method) throws IOException { - - FileInputStream fis = new FileInputStream(file); - int len = (int) file.length(); - String mime = method.getMime(); - String ip = method.getRemoteIP(); - Date captureDate = method.getCaptureDate(); - - writer.checkSize(); - final long arcOffset = writer.getPosition(); - final String arcPath = writer.getFile().getAbsolutePath(); - - writer.write(url,mime,ip,captureDate.getTime(),len,fis); - writer.checkSize(); - long newSize = writer.getPosition(); - long oSize = writer.getFile().length(); - final long arcEndOffset = oSize; - LOGGER.info("Wrote " + url + " at " + arcPath + ":" + arcOffset); - LOGGER.info("NewSize:" + newSize + " oSize: " + oSize); - fis.close(); - - return new ARCLocation() { - private String filename = arcPath; - private long offset = arcOffset; - private long endOffset = arcEndOffset; - - public String getName() { return this.filename; } - public long getOffset() { return this.offset; } - public long getEndOffset() { return this.endOffset; } - - }; - } - private FileRegion storeFile2(File file, ARCWriter writer, String url, - ExtendedGetMethod method) throws IOException { - - FileInputStream fis = new FileInputStream(file); - int len = (int) file.length(); - String mime = method.getMime(); - String ip = method.getRemoteIP(); - Date captureDate = method.getCaptureDate(); - - writer.checkSize(); - final long arcOffset = writer.getPosition(); - final String arcPath = writer.getFile().getAbsolutePath(); - - writer.write(url,mime,ip,captureDate.getTime(),len,fis); - writer.checkSize(); - long newSize = writer.getPosition(); - long oSize = writer.getFile().length(); - final long arcEndOffset = oSize; - LOGGER.info("Wrote " + url + " at " + arcPath + ":" + arcOffset); - LOGGER.info("NewSize:" + newSize + " oSize: " + oSize); - fis.close(); - FileRegion fr = new FileRegion(); - fr.file = writer.getFile(); - fr.start = arcOffset; - fr.end = oSize; - return fr; - } - - /** - * Retrieve urlString, and store using ARCWriter, returning - * ARCLocation where the document was stored. - * - * @param cache - * @param urlString - * @return ARCLocation where document was stored - * @throws LiveDocumentNotAvailableException - * @throws URIException - * @throws IOException if something internal went wrong. - */ - public ARCLocation cache(ARCCacheDirectory cache, String urlString) - throws LiveDocumentNotAvailableException, IOException, URIException { - - // localize URL - File tmpFile = getTmpFile(); - ExtendedGetMethod method; - try { - method = urlToFile(urlString,tmpFile); - } catch (LiveDocumentNotAvailableException e) { - LOGGER.info("Attempted to get " + urlString + " failed..."); - tmpFile.delete(); - throw e; - } catch (URIException e) { - tmpFile.delete(); - throw e; - } catch (IOException e) { - tmpFile.delete(); - throw e; - } - - // store URL - ARCLocation location = null; - ARCWriter writer = null; - try { - writer = cache.getWriter(); - location = storeFile(tmpFile, writer, urlString, method); - } catch(IOException e) { - e.printStackTrace(); - throw e; - } finally { - if(writer != null) { - cache.returnWriter(writer); - } - tmpFile.delete(); - } - return location; - } - public FileRegion cache2(ARCCacheDirectory cache, String urlString) - throws LiveDocumentNotAvailableException, IOException, URIException { - - // localize URL - File tmpFile = getTmpFile(); - ExtendedGetMethod method; - try { - method = urlToFile(urlString,tmpFile); - } catch (LiveDocumentNotAvailableException e) { - LOGGER.info("Attempted to get " + urlString + " failed..."); - tmpFile.delete(); - throw e; - } catch (URIException e) { - tmpFile.delete(); - throw e; - } catch (IOException e) { - tmpFile.delete(); - throw e; - } - - // store URL - FileRegion region = null; - ARCWriter writer = null; - try { - writer = cache.getWriter(); - region = storeFile2(tmpFile, writer, urlString, method); - } catch(IOException e) { - e.printStackTrace(); - throw e; - } finally { - if(writer != null) { - cache.returnWriter(writer); - } - tmpFile.delete(); - } - return region; -} - - /** - * @param args - */ - public static void main(String[] args) { - int DEFAULT_MAX_ARC_FILE_SIZE = 1024 * 1024 * 100; - File arcDir = new File(args[0]); - URL url; - if(!arcDir.isDirectory()) { - arcDir.mkdir(); - } - File [] files = {arcDir}; - boolean compress = true; - ARCWriter writer = new ARCWriter(new AtomicInteger(), - Arrays.asList(files), "test", compress, - DEFAULT_MAX_ARC_FILE_SIZE); - Properties p = new Properties(); - p.setProperty(ARCCacheDirectory.LIVE_WEB_ARC_DIR, args[0]); - p.setProperty(ARCCacheDirectory.LIVE_WEB_ARC_PREFIX, "test"); - p.setProperty(CACHE_PATH, arcDir.getAbsolutePath()); - - URLCacher uc = new URLCacher(); - ARCCacheDirectory cache = new ARCCacheDirectory(); -// try { -//// cache.init(p); -//// uc.init(p); -// } catch (ConfigurationException e) { -// e.printStackTrace(); -// System.exit(1); -// } - for(int k = 1; k < args.length; k++) { - try { - url = new URL(args[k]); - } catch (MalformedURLException e1) { - e1.printStackTrace(); - continue; - } - try { - uc.cache(cache, url.toString()); - } catch (URIException e) { - e.printStackTrace(); - } catch (LiveDocumentNotAvailableException e) { - e.printStackTrace(); - } catch (IOException e) { - e.printStackTrace(); - } - } - try { - writer.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - /* - * Get method which stores the entire HTTP response: message, headers & body - * in the OutputStream provided, and also provides access to the data needed - * to generate an ARC record: IP, Date and Mime - */ - private class ExtendedGetMethod extends HttpMethodBase { - - private String remoteIP = ""; - private Date captureDate = null; - private String mime = "unk"; - private OutputStream os = null; - - /** - * Constructor - * - * @param os - */ - public ExtendedGetMethod(OutputStream os) { - super(); - this.os = os; - } - - /* (non-Javadoc) - * @see org.apache.commons.httpclient.HttpMethodBase#getName() - */ - public String getName() { - return "GET"; - } - - protected void processStatusLine(HttpState state, HttpConnection conn) { - captureDate = new Date(); - IPStoringHttpConnection bhc = (IPStoringHttpConnection) conn; - remoteIP = bhc.getRemoteIP(); - try { - String statusLine = this.getStatusLine().toString() + "\r\n"; - os.write(statusLine.getBytes()); - } catch (IOException e) { - // TODO hrm..? - e.printStackTrace(); - } - } - - protected void processResponseBody(HttpState state, HttpConnection conn) { - try { - - // copy the HTTP Headers... - Header headers[] = this.getResponseHeaders(); - for (int i = 0; i < headers.length; i++) { - if(headers[i].getName().equals("Content-Type")) { - mime = headers[i].getValue(); - } - os.write(headers[i].toExternalForm().getBytes()); - } - os.write(new String("\r\n").getBytes()); - - // now copy the whole response body: - - InputStream is = this.getResponseStream(); - final int BUFFER_SIZE = 1024 * 4; - byte[] buffer = new byte[BUFFER_SIZE]; - while (true) { - int x = is.read(buffer); - if (x == -1) { - break; - } - os.write(buffer, 0, x); - } - //is.close(); - os.close(); - - } catch (IOException e) { - // TODO don't eat it - e.printStackTrace(); - } - } - - /** - * @return Returns the captureDate. - */ - public Date getCaptureDate() { - return captureDate; - } - - /** - * @return Returns the mime. - */ - public String getMime() { - return mime; - } - - /** - * @return Returns the remoteIP. - */ - public String getRemoteIP() { - return remoteIP; - } - - } - - /** - * HttpConnectionManager that returns IPHttpConnection objects, for - * accessing the IP address - */ - private class IPHttpConnectionManager extends SimpleHttpConnectionManager { - public HttpConnection getConnection(HostConfiguration hostConfiguration) { - IPStoringHttpConnection conn = new IPStoringHttpConnection(hostConfiguration); - conn.setHttpConnectionManager(this); - conn.getParams().setDefaults(this.getParams()); - return conn; - } - - public HttpConnection getConnectionWithTimeout( - HostConfiguration hostConfiguration, long timeout) { - // TODO: is this lying? have we really set the time out? - IPStoringHttpConnection conn = new IPStoringHttpConnection(hostConfiguration); - conn.setHttpConnectionManager(this); - conn.getParams().setDefaults(this.getParams()); - return conn; - } - - public HttpConnection getConnection( - HostConfiguration hostConfiguration, long timeout) { - - return new IPStoringHttpConnection(hostConfiguration); - } - public void releaseConnection(HttpConnection conn) { - // ensure connection is closed - conn.close(); - InputStream lastResponse = conn.getLastResponseInputStream(); - if (lastResponse != null) { - conn.setLastResponseInputStream(null); - try { - lastResponse.close(); - } catch (IOException ioe) { - //FIX ME: badness - close to force reconnect. - conn.close(); - } - } - } - } - - /** - * HttpConnection that allows access to the IP address which was - * used for the connection. - */ - private class IPStoringHttpConnection extends HttpConnection { - - /** - * @param hc - */ - public IPStoringHttpConnection(HostConfiguration hc) { - super(hc); - } - /** - * @return the remote IP address that was connected to, as a String - */ - public String getRemoteIP() { - return getSocket().getInetAddress().getHostAddress(); - } - } - - /** - * @return the tmpDir - */ - public String getTmpDir() { - if(tmpDir == null) { - return null; - } - return tmpDir.getAbsolutePath(); - } - - /** - * @param tmpDir the tmpDir to set - */ - public void setTmpDir(String tmpDir) { - this.tmpDir = new File(tmpDir); - if(!this.tmpDir.exists()) { - this.tmpDir.mkdirs(); - } - } - -} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java 2010-04-14 21:20:49 UTC (rev 3038) @@ -0,0 +1,421 @@ +/* URLtoARCCacher + * + * $Id$: + * + * Created on Mar 26, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.liveweb; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.UnknownHostException; +import java.util.Date; + +import org.apache.commons.httpclient.ConnectTimeoutException; +import org.apache.commons.httpclient.Header; +import org.apache.commons.httpclient.HostConfiguration; +import org.apache.commons.httpclient.HttpClient; +import org.apache.commons.httpclient.HttpConnection; +import org.apache.commons.httpclient.HttpException; +import org.apache.commons.httpclient.HttpState; +import org.apache.commons.httpclient.SimpleHttpConnectionManager; +import org.apache.commons.httpclient.URIException; +import org.apache.commons.httpclient.cookie.CookiePolicy; +import org.apache.commons.httpclient.params.HttpClientParams; +import org.apache.log4j.Logger; +import org.archive.httpclient.HttpRecorderGetMethod; +import org.archive.io.RecordingInputStream; +import org.archive.io.arc.ARCWriter; +import org.archive.net.LaxURI; +import org.archive.util.Recorder; +import org.archive.wayback.util.ByteOp; + +/** + * + * Takes an input URL String argument, downloads, stores in an ARCWriter, + * and returns a FileRegion consisting of the compressed ARCRecord containing + * the response, or a forged, "fake error response" ARCRecord which can be + * used to send the content to an OutputStream. + * + * @author brad + * + */ +public class URLtoARCCacher { + private static final Logger LOGGER = Logger.getLogger( + URLtoARCCacher.class.getName()); + + private static String CONTENT_TYPE_HEADER = "Content-Type".toLowerCase(); + private static String GET_METHOD_NAME = "GET"; + + private static String DEFAULT_RECORDER_DIR = "/var/tmp/brad/recorder"; + private File recorderCacheDir = new File(DEFAULT_RECORDER_DIR); + + private static String DEFAULT_BACKING_FILE_BASE = "recorder-tmp"; + private String backingFileBase = DEFAULT_BACKING_FILE_BASE; + private String userAgent = "genericUserAgent"; + private int connectionTimeoutMS = 10000; + private int socketTimeoutMS = 10000; + private int outBufferSize = 1024 * 100; + private int inBufferSize = 1024 * 100; + + private final ThreadLocal<HttpClient> tl = new ThreadLocal<HttpClient>() { + + protected synchronized HttpClient initialValue() { + HttpClient http = new HttpClient(); + IPHttpConnectionManager manager = new IPHttpConnectionManager(); + manager.getParams().setConnectionTimeout(connectionTimeoutMS); + manager.getParams().setSoTimeout(socketTimeoutMS); + http.setHttpConnectionManager(manager); + HttpClientParams clientParams = new HttpClientParams(); + clientParams.setParameter("http.useragent", userAgent); + return http; + } + }; + + private HttpClient getHttpClient() { + return tl.get(); + } + + + private static byte[] ERROR_BYTES = "HTTP 502 Bad Gateway\n\n".getBytes(); + private static String ERROR_MIME = "unk"; + private static String ERROR_IP = "0.0.0.0"; + + /** + * @param url to cache + * @param cache ARCCacheDirectory for storing result or faked result + * @return FileRegion of compressed byte range for ARCRecord. + * @throws IOException for the usual reasons + * @throws URIException if url argument isn't really an URL.. + */ + public FileRegion cacheURL(String url, ARCCacheDirectory cache) + throws IOException, URIException { + + FileRegion region = null; + + // to track if we got a response (any response) or an exception. + boolean gotUrl = false; + + Recorder recorder = new Recorder(recorderCacheDir,backingFileBase, + outBufferSize, inBufferSize); + + ExtendedGetMethod getMethod = null; + + // TWO STEPS: + // first do the GET, using a Recorder to get the response. + // then, if that worked, save the recorded value into an ARC + // and return it's region + // if we didn't get a response, forge a fake record and return that. + try { + Recorder.setHttpRecorder(recorder); + LaxURI lURI = new LaxURI(url,true); + getMethod = new ExtendedGetMethod(url,recorder); + getMethod.setURI(lURI); + HttpClient client = getHttpClient(); + getMethod.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES); + getMethod.setFollowRedirects(false); + int code = client.executeMethod(getMethod); + LOGGER.info("URL(" + url + ") HTTP:" + code); + ByteOp.discardStream(getMethod.getResponseBodyAsStream()); + getMethod.releaseConnection(); + recorder.closeRecorders(); + gotUrl = true; + + } catch (URIException e) { + e.printStackTrace(); + } catch (UnknownHostException e) { + LOGGER.warn("Unknown host for " + url); + } catch (ConnectTimeoutException e) { + // TODO: should we act like it's a full block? + LOGGER.warn("Timeout out connecting to " + url); + } catch (HttpException e) { + e.printStackTrace(); + // we have to let IOExceptions out, problems caused by local disk + // NEED to return errors, indicating that there is not an + // authoritative answer, and thus... NOTHING can be shown. +// } catch (IOException e) { +// e.printStackTrace(); + } finally { + Recorder.setHttpRecorder(null); + } + + // now write the content, or a fake record: + ARCWriter writer = null; + try { + writer = cache.getWriter(); + if(gotUrl) { + + RecordingInputStream ris = recorder.getRecordedInput(); + region = storeInputStreamARCRecord(writer, url, + getMethod.getMime(), getMethod.getRemoteIP(), + getMethod.getCaptureDate(), + ris.getReplayInputStream(), (int) ris.getSize()); + + } else { + region = storeNotAvailable(writer, url); + } + + } finally { + if(writer != null) { + cache.returnWriter(writer); + } + } + recorder.close(); + + return region; + } + + private FileRegion storeInputStreamARCRecord(ARCWriter writer, + String url, String mime, String ip, Date captureDate, + InputStream is, int length) throws IOException { + + writer.checkSize(); + final long arcOffset = writer.getPosition(); + final String arcPath = writer.getFile().getAbsolutePath(); + + writer.write(url,mime,ip,captureDate.getTime(),length,is); + writer.checkSize(); +// long newSize = writer.getPosition(); + long oSize = writer.getFile().length(); +// final long arcEndOffset = oSize; + LOGGER.info("Wrote " + url + ": " + arcPath + "(" + arcOffset + + "-" + oSize + ")"); + + FileRegion fr = new FileRegion(); + fr.file = writer.getFile(); + fr.start = arcOffset; + fr.end = oSize; + fr.isFake = false; + return fr; + } + + private FileRegion storeNotAvailable(ARCWriter writer, String url) + throws IOException { + + ByteArrayInputStream bais = new ByteArrayInputStream(ERROR_BYTES); + FileRegion fr = storeInputStreamARCRecord(writer, url, + ERROR_MIME, ERROR_IP, new Date(), bais, ERROR_BYTES.length); + fr.isFake = true; + return fr; + } + + /* + * Get method which ferrets away the Content-Type header, the remote IP + * and remembers when the HTTP Message header was received. + */ + private class ExtendedGetMethod extends HttpRecorderGetMethod { + + /** + * @param uri to be fetched + * @param recorder which is not currently used by base class, but + * we're going to require and send it on anyways. + */ + public ExtendedGetMethod(String uri, Recorder recorder) { + super(uri, recorder); + } + + private String remoteIP = ""; + private Date captureDate = null; + private String mime = "unk"; + + public String getName() { + return GET_METHOD_NAME; + } + + protected void processStatusLine(HttpState state, HttpConnection conn) { + // grab the remote IP, and record when we started getting bytes.. + // Sam thinks we should somehow record how fast we got it back.. + // and then replay it at the same rate we received it. + + captureDate = new Date(); + IPStoringHttpConnection bhc = (IPStoringHttpConnection) conn; + remoteIP = bhc.getRemoteIP(); + } + protected void processResponseBody(HttpState state, HttpConnection conn) { + // grab the mime.. + Header headers[] = this.getResponseHeaders(); + for (int i = 0; i < headers.length; i++) { + String lcHeader = headers[i].getName().toLowerCase(); + if(lcHeader.compareTo(CONTENT_TYPE_HEADER) == 0) { + mime = headers[i].getValue(); + } + } + } + + /** + * @return Returns the captureDate. + */ + public Date getCaptureDate() { + return captureDate; + } + + /** + * @return Returns the mime. + */ + public String getMime() { + return mime; + } + + /** + * @return Returns the remoteIP. + */ + public String getRemoteIP() { + return remoteIP; + } + } + + /** + * HttpConnectionManager that returns IPHttpConnection objects, for + * accessing the IP address + */ + private class IPHttpConnectionManager extends SimpleHttpConnectionManager { + public HttpConnection getConnection(HostConfiguration hostConfiguration) { + IPStoringHttpConnection conn = new IPStoringHttpConnection(hostConfiguration); + conn.setHttpConnectionManager(this); + conn.getParams().setDefaults(this.getParams()); + return conn; + } + + public HttpConnection getConnectionWithTimeout( + HostConfiguration hostConfiguration, long timeout) { + // TODO: is this lying? have we really set the time out? + IPStoringHttpConnection conn = + new IPStoringHttpConnection(hostConfiguration); + conn.setHttpConnectionManager(this); + conn.getParams().setDefaults(this.getParams()); + return conn; + } + + public HttpConnection getConnection( + HostConfiguration hostConfiguration, long timeout) { + + return new IPStoringHttpConnection(hostConfiguration); + } + public void releaseConnection(HttpConnection conn) { + // ensure connection is closed + conn.close(); + InputStream lastResponse = conn.getLastResponseInputStream(); + if (lastResponse != null) { + conn.setLastResponseInputStream(null); + try { + lastResponse.close(); + } catch (IOException ioe) { + //FIX ME: badness - close to force reconnect. + conn.close(); + } + } + } + } + + /** + * HttpConnection that allows access to the IP address which was + * used for the connection. + */ + private class IPStoringHttpConnection extends HttpConnection { + + /** + * @param hc HostConfiguration + */ + public IPStoringHttpConnection(HostConfiguration hc) { + super(hc); + } + /** + * @return the remote IP address that was connected to, as a String + */ + public String getRemoteIP() { + return getSocket().getInetAddress().getHostAddress(); + } + } + + /** + * @return the recorderCacheDir + */ + public String getRecorderCacheDir() { + return recorderCacheDir.getAbsolutePath(); + } + + /** + * @param recorderCacheDirPath the recorderCacheDir to set + */ + public void setRecorderCacheDir(String recorderCacheDirPath) { + this.recorderCacheDir = new File(recorderCacheDirPath); + } + + /** + * @return the backingFileBase + */ + public String getBackingFileBase() { + return backingFileBase; + } + + /** + * @param backingFileBase the backingFileBase to set + */ + public void setBackingFileBase(String backingFileBase) { + this.backingFileBase = backingFileBase; + } + + /** + * @return the userAgent + */ + public String getUserAgent() { + return userAgent; + } + + /** + * @param userAgent the userAgent to set + */ + public void setUserAgent(String userAgent) { + this.userAgent = userAgent; + } + + /** + * @return the connectionTimeoutMS + */ + public int getConnectionTimeoutMS() { + return connectionTimeoutMS; + } + + /** + * @param connectionTimeoutMS the connectionTimeoutMS to set + */ + public void setConnectionTimeoutMS(int connectionTimeoutMS) { + this.connectionTimeoutMS = connectionTimeoutMS; + } + + /** + * @return the socketTimeoutMS + */ + public int getSocketTimeoutMS() { + return socketTimeoutMS; + } + + /** + * @param socketTimeoutMS the socketTimeoutMS to set + */ + public void setSocketTimeoutMS(int socketTimeoutMS) { + this.socketTimeoutMS = socketTimeoutMS; + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/URLtoARCCacher.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |