From: <bra...@us...> - 2010-04-14 21:41:50
|
Revision: 3044 http://archive-access.svn.sourceforge.net/archive-access/?rev=3044&view=rev Author: bradtofel Date: 2010-04-14 21:41:44 +0000 (Wed, 14 Apr 2010) Log Message: ----------- INITIAL REV: very early /liveweb/ AccessPoint, which returns and rewrites resources from the live web according to configuration of an "inner" AccessPoint Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2010-04-14 21:41:44 UTC (rev 3044) @@ -0,0 +1,213 @@ +/* LiveWebAccessPoint + * + * $Id$: + * + * Created on Apr 1, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.webapp; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; + +import javax.servlet.ServletException; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +import org.archive.io.arc.ARCRecord; +import org.archive.wayback.accesscontrol.robotstxt.RobotExclusionFilterFactory; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.CaptureSearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.exception.RobotAccessControlException; +import org.archive.wayback.exception.WaybackException; +import org.archive.wayback.liveweb.LiveWebCache; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; +import org.archive.wayback.resourcestore.resourcefile.ArcResource; +import org.springframework.beans.factory.BeanNameAware; + +/** + * @author brad + * + * AccessPoint subclass which allows no Queries, but makes all replay requests + * through a LiveWebCache + * + */ +public class LiveWebAccessPoint extends ServletRequestContext implements BeanNameAware { + private AccessPoint inner = null; + private LiveWebCache cache = null; + private RobotExclusionFilterFactory robotFactory = null; + private long maxCacheMS = 86400000; + private String beanName = null; + private int contextPort = 0; + private String contextName = null; + + public void setBeanName(String beanName) { + this.beanName = beanName; + this.contextName = ""; + int idx = beanName.indexOf(":"); + if(idx > -1) { + contextPort = Integer.valueOf(beanName.substring(0,idx)); + contextName = 
beanName.substring(idx + 1); + } else { + try { + this.contextPort = Integer.valueOf(beanName); + } catch(NumberFormatException e) { + e.printStackTrace(); + } + } + } + /** + * @param httpRequest HttpServletRequest which is being handled + * @return the prefix of paths received by this server that are handled by + * this WaybackContext, including the trailing '/' + */ + public String getContextPath(HttpServletRequest httpRequest) { + String httpContextPath = httpRequest.getContextPath(); + if(contextName.length() == 0) { + return httpContextPath + "/"; + } + return httpContextPath + "/" + contextName + "/"; + } + + + protected String translateRequest(HttpServletRequest httpRequest, + boolean includeQuery) { + + String origRequestPath = httpRequest.getRequestURI(); + if(includeQuery) { + String queryString = httpRequest.getQueryString(); + if (queryString != null) { + origRequestPath += "?" + queryString; + } + } + String contextPath = getContextPath(httpRequest); + if (!origRequestPath.startsWith(contextPath)) { + if(contextPath.startsWith(origRequestPath)) { + // missing trailing '/', just omit: + return ""; + } + return null; + } + return origRequestPath.substring(contextPath.length()); + } + + public boolean handleRequest(HttpServletRequest httpRequest, + HttpServletResponse httpResponse) + throws ServletException, IOException { + + String urlString = translateRequest(httpRequest,true); + boolean handled = true; + WaybackRequest wbRequest = new WaybackRequest(); + wbRequest.setAccessPoint(inner); + wbRequest.setContextPrefix(inner.getAbsoluteServerPrefix(httpRequest)); + wbRequest.setServerPrefix(inner.getAbsoluteServerPrefix(httpRequest)); + wbRequest.setLiveWebRequest(true); + wbRequest.setRequestUrl(urlString); + URL url = null; + try { + try { + url = new URL(urlString); + } catch(MalformedURLException e) { + throw new BadQueryException("Bad URL(" + urlString + ")"); + } + + CaptureSearchResult result = new CaptureSearchResult(); + 
result.setOriginalUrl(urlString); + result.setUrlKey(urlString); + // should we check robots, first? + if(robotFactory != null) { + int ruling = robotFactory.get().filterObject(result); + if(ruling == ExclusionFilter.FILTER_EXCLUDE) { + throw new RobotAccessControlException(urlString + "is blocked by robots.txt"); + } + } + // robots says GO: + ArcResource r = (ArcResource) cache.getCachedResource(url, maxCacheMS , false); + ARCRecord ar = (ARCRecord) r.getArcRecord(); + int status = ar.getStatusCode(); + if((status == 200) || ((status >= 300) && (status < 400))) { + result.setCaptureTimestamp(ar.getMetaData().getDate()); + result.setMimeType(ar.getMetaData().getMimetype()); + CaptureSearchResults results = new CaptureSearchResults(); + results.addSearchResult(result); + + wbRequest.setReplayTimestamp(result.getCaptureTimestamp()); + + inner.getReplay().getRenderer(wbRequest, result, r).renderResource( + httpRequest, httpResponse, wbRequest, result, r, + inner.getUriConverter(), results); + } else { + throw new ResourceNotInArchiveException("Not In Archive - Not on Live web"); + } + + } catch(WaybackException e) { + inner.getException().renderException(httpRequest, httpResponse, wbRequest, + e, inner.getUriConverter()); + } + return handled; + } + + /** + * @return the cache + */ + public LiveWebCache getCache() { + return cache; + } + + /** + * @param cache the cache to set + */ + public void setCache(LiveWebCache cache) { + this.cache = cache; + } + + /** + * @return the robotFactory + */ + public RobotExclusionFilterFactory getRobotFactory() { + return robotFactory; + } + + /** + * @param robotFactory the robotFactory to set + */ + public void setRobotFactory(RobotExclusionFilterFactory robotFactory) { + this.robotFactory = robotFactory; + } + + /** + * @return the inner + */ + public AccessPoint getInner() { + return inner; + } + + /** + * @param inner the inner to set + */ + public void setInner(AccessPoint inner) { + this.inner = inner; + } +} Property 
changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-05-18 23:01:20
|
Revision: 3108 http://archive-access.svn.sourceforge.net/archive-access/?rev=3108&view=rev Author: bradtofel Date: 2010-05-18 23:01:13 +0000 (Tue, 18 May 2010) Log Message: ----------- TWEAK: removed calls to wbrequest setContextPrefix and setServerPrefix Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2010-05-18 23:00:00 UTC (rev 3107) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2010-05-18 23:01:13 UTC (rev 3108) @@ -70,9 +70,6 @@ WaybackRequest wbRequest = new WaybackRequest(); wbRequest.setAccessPoint(inner); - wbRequest.setContextPrefix(inner.getUrlRoot()); - wbRequest.setServerPrefix(inner.getUrlRoot()); - wbRequest.setLiveWebRequest(true); wbRequest.setRequestUrl(urlString); URL url = null; @@ -86,14 +83,14 @@ CaptureSearchResult result = new CaptureSearchResult(); result.setOriginalUrl(urlString); result.setUrlKey(urlString); - // should we check robots, first? + // check robots first, if configured if(robotFactory != null) { int ruling = robotFactory.get().filterObject(result); if(ruling == ExclusionFilter.FILTER_EXCLUDE) { throw new RobotAccessControlException(urlString + "is blocked by robots.txt"); } } - // robots says GO: + // no robots check, or robots.txt says GO: ArcResource r = (ArcResource) cache.getCachedResource(url, maxCacheMS , false); ARCRecord ar = (ARCRecord) r.getArcRecord(); int status = ar.getStatusCode(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-02-06 14:31:52
|
Revision: 3388 http://archive-access.svn.sourceforge.net/archive-access/?rev=3388&view=rev Author: bradtofel Date: 2011-02-06 14:31:46 +0000 (Sun, 06 Feb 2011) Log Message: ----------- Now blocks from excludes prior to lookup Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-02-06 14:31:04 UTC (rev 3387) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-02-06 14:31:46 UTC (rev 3388) @@ -22,6 +22,7 @@ import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; +import java.util.logging.Logger; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; @@ -29,9 +30,11 @@ import org.archive.io.arc.ARCRecord; import org.archive.wayback.accesscontrol.robotstxt.RobotExclusionFilterFactory; +import org.archive.wayback.accesscontrol.staticmap.StaticMapExclusionFilterFactory; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.AdministrativeAccessControlException; import org.archive.wayback.exception.BadQueryException; import org.archive.wayback.exception.ResourceNotInArchiveException; import org.archive.wayback.exception.RobotAccessControlException; @@ -39,6 +42,7 @@ import org.archive.wayback.liveweb.LiveWebCache; import org.archive.wayback.resourceindex.filters.ExclusionFilter; import org.archive.wayback.resourcestore.resourcefile.ArcResource; +import org.archive.wayback.util.url.UrlOperations; 
import org.archive.wayback.util.webapp.AbstractRequestHandler; /** @@ -49,9 +53,14 @@ * */ public class LiveWebAccessPoint extends AbstractRequestHandler { + private static final Logger LOGGER = Logger.getLogger( + LiveWebAccessPoint.class.getName()); + private AccessPoint inner = null; private LiveWebCache cache = null; private RobotExclusionFilterFactory robotFactory = null; + private StaticMapExclusionFilterFactory adminFactory = null; + private long maxCacheMS = 86400000; public boolean handleRequest(HttpServletRequest httpRequest, @@ -59,7 +68,7 @@ throws ServletException, IOException { String urlString = translateRequestPathQuery(httpRequest); - + urlString = UrlOperations.fixupHTTPUrlWithOneSlash(urlString); boolean handled = true; WaybackRequest wbRequest = new WaybackRequest(); wbRequest.setAccessPoint(inner); @@ -84,6 +93,17 @@ throw new RobotAccessControlException(urlString + "is blocked by robots.txt"); } } + if(adminFactory != null) { + ExclusionFilter f = adminFactory.get(); + if(f == null) { + LOGGER.severe("Unable to get administrative exclusion filter!"); + throw new AdministrativeAccessControlException(urlString + "is blocked."); + } + int ruling = f.filterObject(result); + if(ruling == ExclusionFilter.FILTER_EXCLUDE) { + throw new AdministrativeAccessControlException(urlString + "is blocked."); + } + } // no robots check, or robots.txt says GO: ArcResource r = (ArcResource) cache.getCachedResource(url, maxCacheMS , false); ARCRecord ar = (ARCRecord) r.getArcRecord(); @@ -151,4 +171,12 @@ public void setInner(AccessPoint inner) { this.inner = inner; } + + public StaticMapExclusionFilterFactory getAdminFactory() { + return adminFactory; + } + + public void setAdminFactory(StaticMapExclusionFilterFactory adminFactory) { + this.adminFactory = adminFactory; + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-02-09 07:03:32
|
Revision: 3415 http://archive-access.svn.sourceforge.net/archive-access/?rev=3415&view=rev Author: bradtofel Date: 2011-02-09 07:03:26 +0000 (Wed, 09 Feb 2011) Log Message: ----------- Swapped order of admin & robot exclusion checking - admin is now first Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-02-06 15:00:40 UTC (rev 3414) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-02-09 07:03:26 UTC (rev 3415) @@ -86,13 +86,7 @@ CaptureSearchResult result = new CaptureSearchResult(); result.setOriginalUrl(urlString); result.setUrlKey(urlString); - // check robots first, if configured - if(robotFactory != null) { - int ruling = robotFactory.get().filterObject(result); - if(ruling == ExclusionFilter.FILTER_EXCLUDE) { - throw new RobotAccessControlException(urlString + "is blocked by robots.txt"); - } - } + // check admin excludes first, if configured: if(adminFactory != null) { ExclusionFilter f = adminFactory.get(); if(f == null) { @@ -104,6 +98,13 @@ throw new AdministrativeAccessControlException(urlString + "is blocked."); } } + // check robots next, if configured + if(robotFactory != null) { + int ruling = robotFactory.get().filterObject(result); + if(ruling == ExclusionFilter.FILTER_EXCLUDE) { + throw new RobotAccessControlException(urlString + "is blocked by robots.txt"); + } + } // no robots check, or robots.txt says GO: ArcResource r = (ArcResource) cache.getCachedResource(url, maxCacheMS , false); ARCRecord ar = (ARCRecord) r.getArcRecord(); This was sent by the 
SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:04:17
|
Revision: 3441 http://archive-access.svn.sourceforge.net/archive-access/?rev=3441&view=rev Author: bradtofel Date: 2011-05-25 01:04:11 +0000 (Wed, 25 May 2011) Log Message: ----------- TWEAK: live web AccessPoint no longer attempts to retrieve non HTTP documents. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-05-25 01:03:17 UTC (rev 3440) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-05-25 01:04:11 UTC (rev 3441) @@ -77,6 +77,9 @@ wbRequest.setRequestUrl(urlString); URL url = null; try { + if(!urlString.startsWith(UrlOperations.HTTP_SCHEME)) { + throw new ResourceNotInArchiveException(urlString); + } Thread.currentThread().setName("Thread " + Thread.currentThread().getId() + " " + getBeanName() + " handling: " + urlString); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-06-16 17:23:14
|
Revision: 3477 http://archive-access.svn.sourceforge.net/archive-access/?rev=3477&view=rev Author: bradtofel Date: 2011-06-16 17:23:08 +0000 (Thu, 16 Jun 2011) Log Message: ----------- Performance logging, plus now throwing correct LiveDocumentNotAvailable exception, rather than NIA Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-06-16 17:22:14 UTC (rev 3476) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/LiveWebAccessPoint.java 2011-06-16 17:23:08 UTC (rev 3477) @@ -36,6 +36,7 @@ import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.AdministrativeAccessControlException; import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.LiveDocumentNotAvailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; import org.archive.wayback.exception.RobotAccessControlException; import org.archive.wayback.exception.WaybackException; @@ -113,7 +114,10 @@ } } // no robots check, or robots.txt says GO: + long start = System.currentTimeMillis(); ArcResource r = (ArcResource) cache.getCachedResource(url, maxCacheMS , false); + long elapsed = System.currentTimeMillis() - start; + PerformanceLogger.noteElapsed("LiveWebRequest",elapsed,urlString); ARCRecord ar = (ARCRecord) r.getArcRecord(); int status = ar.getStatusCode(); if((status == 200) || ((status >= 300) && (status < 400))) { @@ -128,7 +132,7 @@ httpRequest, httpResponse, wbRequest, result, r, inner.getUriConverter(), results); } else { - throw new 
ResourceNotInArchiveException("Not In Archive - Not on Live web"); + throw new LiveDocumentNotAvailableException(urlString); } } catch(WaybackException e) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |