Update of /cvsroot/archive-access/archive-access/projects/wayback/src/java/org/archive/wayback/archivalurl In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv30992/src/java/org/archive/wayback/archivalurl Added Files: ReplayFilter.java QueryFilter.java JSReplayRenderer.java ResultURIConverter.java Log Message: Massive overhaul decomposing into three main categories of changes: 1) All internal datatypes are now extensible (currently Properties, but should be Maps) including: a) WaybackRequest(was WBRequest) b) SearchResults (was ResourceResults) c) SearchResult (was ResourceResult) d) Resource so that there is no longer an assumption of Archival URL queries, or "CDX-style" index results. This will put more responsibility on the UI components to interrogate SearchResults to decide how to render, but should enable extension to data returned from Indexes, as well as allow far more flexibility in queries, predominantly geared towards free-text searching. This is still somewhat clunky, as there are no convenience accessor methods, so all users refer to constants when interacting with them. 2) Major cleanup of servlet and filter interaction with servlet container. ReplayUI and QueryUI are now just plain old servlets, and filters can be optionally added to allow non-CGI argument requests to be coerced into standard WaybackRequest objects. 3) Alternate "Proxy" Replay mode is now functional, and some work has been done towards an alternate Nutch ResourceIndex. Currently the web.xml contains example configurations for both Proxy and Archival Url replay modes, but the Proxy related configurations are commented out. Proxy mode *requires* changing the servlet context to ROOT. ArchivalUrl replay mode works as ROOT context and as any (I think) other context. There are some cosmetic double-slash issues to work out. --- NEW FILE: QueryFilter.java --- /* QueryFilter * * $Id: QueryFilter.java,v 1.1 2005/11/16 03:11:30 bradtofel Exp $ * * Created on 1:22:14 PM Nov 8, 2005. 
* * Copyright (C) 2005 Internet Archive. * * This file is part of wayback. * * wayback is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * wayback is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with wayback; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.wayback.archivalurl; import java.text.ParseException; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.servlet.http.HttpServletRequest; import org.apache.commons.httpclient.URIException; import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.RequestFilter; import org.archive.wayback.core.Timestamp; import org.archive.wayback.core.WaybackRequest; /** * * * @author brad * @version $Date: 2005/11/16 03:11:30 $, $Revision: 1.1 $ */ public class QueryFilter extends RequestFilter { private final static Pattern WB_QUERY_REGEX = Pattern .compile("^/(\\d{0,13})\\*/(.*[^*])$"); private final static Pattern WB_PATH_QUERY_REGEX = Pattern .compile("^/(\\d{0,13})\\*/(.*)\\*$"); public WaybackRequest parseRequest(HttpServletRequest request) { WaybackRequest wbRequest = null; Matcher matcher = null; String queryString = request.getQueryString(); String origRequestPath = request.getRequestURI(); if (queryString != null) { origRequestPath = request.getRequestURI() + "?" 
+ queryString; } String contextPath = request.getContextPath(); if (!origRequestPath.startsWith(contextPath)) { return null; } String requestPath = origRequestPath.substring(contextPath.length()); matcher = WB_QUERY_REGEX.matcher(requestPath); if (matcher != null && matcher.matches()) { wbRequest = new WaybackRequest(); String dateStr = matcher.group(1); String urlStr = matcher.group(2); try { String startDate = Timestamp.parseBefore(dateStr).getDateStr(); String endDate = Timestamp.parseAfter(dateStr).getDateStr(); wbRequest.put(WaybackConstants.REQUEST_START_DATE,startDate); wbRequest.put(WaybackConstants.REQUEST_END_DATE,endDate); // wbRequest.setStartTimestamp(Timestamp.parseBefore(dateStr)); // wbRequest.setEndTimestamp(Timestamp.parseAfter(dateStr)); } catch (ParseException e1) { e1.printStackTrace(); return null; } wbRequest.put(WaybackConstants.REQUEST_TYPE, WaybackConstants.REQUEST_URL_QUERY); // wbRequest.setQuery(); if (!urlStr.startsWith("http://")) { urlStr = "http://" + urlStr; } try { UURI requestURI = UURIFactory.getInstance(urlStr); wbRequest.put(WaybackConstants.REQUEST_URL, requestURI.toString()); // wbRequest.setRequestURI(requestURI); } catch (URIException e) { wbRequest = null; } } else { matcher = WB_PATH_QUERY_REGEX.matcher(requestPath); if (matcher != null && matcher.matches()) { wbRequest = new WaybackRequest(); String dateStr = matcher.group(1); String urlStr = matcher.group(2); try { String startDate = Timestamp.parseBefore(dateStr).getDateStr(); String endDate = Timestamp.parseAfter(dateStr).getDateStr(); wbRequest.put(WaybackConstants.REQUEST_START_DATE, startDate); wbRequest.put(WaybackConstants.REQUEST_END_DATE,endDate); // wbRequest.setStartTimestamp(Timestamp.parseBefore(dateStr)); // wbRequest.setEndTimestamp(Timestamp.parseAfter(dateStr)); } catch (ParseException e1) { e1.printStackTrace(); return null; } wbRequest.put(WaybackConstants.REQUEST_TYPE, WaybackConstants.REQUEST_URL_PREFIX_QUERY); // wbRequest.setPathQuery(); if 
(!urlStr.startsWith("http://")) { urlStr = "http://" + urlStr; } try { UURI requestURI = UURIFactory.getInstance(urlStr); wbRequest.put(WaybackConstants.REQUEST_URL,requestURI.toString()); } catch (URIException e) { wbRequest = null; } } } return wbRequest; } } --- NEW FILE: ResultURIConverter.java --- /* ResultURIConverter * * $Id: ResultURIConverter.java,v 1.1 2005/11/16 03:11:30 bradtofel Exp $ * * Created on 5:24:36 PM Nov 1, 2005. * * Copyright (C) 2005 Internet Archive. * * This file is part of wayback. * * wayback is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * wayback is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with wayback; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.wayback.archivalurl; import java.util.Properties; import org.apache.commons.httpclient.URIException; import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.wayback.WaybackConstants; import org.archive.wayback.ReplayResultURIConverter; import org.archive.wayback.core.SearchResult; import org.archive.wayback.exception.ConfigurationException; /** * * * @author brad * @version $Date: 2005/11/16 03:11:30 $, $Revision: 1.1 $ */ public class ResultURIConverter implements ReplayResultURIConverter { private final static String REPLAY_URI_PREFIX_PROPERTY = "replayuriprefix"; private String replayUriPrefix; /* (non-Javadoc) * @see org.archive.wayback.ReplayResultURIConverter#init(java.util.Properties) */ public void init(Properties p) throws ConfigurationException { 
// TODO Auto-generated method stub replayUriPrefix = (String) p.get( REPLAY_URI_PREFIX_PROPERTY); if (replayUriPrefix == null || replayUriPrefix.length() <= 0) { throw new ConfigurationException("Failed to find " + REPLAY_URI_PREFIX_PROPERTY); } } /* (non-Javadoc) * @see org.archive.wayback.ReplayResultURIConverter#makeReplayURI(org.archive.wayback.core.ResourceResult) */ public String makeReplayURI(SearchResult result) { return replayUriPrefix + "/" + result.get(WaybackConstants.RESULT_CAPTURE_DATE) + "/" + result.get(WaybackConstants.RESULT_URL); } /** * @return Returns the replayUriPrefix. */ public String getReplayUriPrefix() { return replayUriPrefix; } /* (non-Javadoc) * @see org.archive.wayback.ReplayResultURIConverter#makeRedirectReplayURI(org.archive.wayback.core.SearchResult, java.lang.String) */ public String makeRedirectReplayURI(SearchResult result, String url) { String finalUrl = url; try { UURI origURI = UURIFactory.getInstance(url); if(!origURI.isAbsoluteURI()) { String resultUrl = result.get(WaybackConstants.RESULT_URL); UURI absResultURI = UURIFactory.getInstance(resultUrl); UURI finalURI = absResultURI.resolve(url); finalUrl = finalURI.getEscapedURI(); } } catch (URIException e) { // TODO Auto-generated catch block e.printStackTrace(); } return replayUriPrefix + "/" + result.get(WaybackConstants.RESULT_CAPTURE_DATE) + "/" + finalUrl; } } --- NEW FILE: JSReplayRenderer.java --- /* JSRenderer * * $Id: JSReplayRenderer.java,v 1.1 2005/11/16 03:11:30 bradtofel Exp $ * * Created on 1:34:16 PM Nov 8, 2005. * * Copyright (C) 2005 Internet Archive. * * This file is part of wayback. * * wayback is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. 
* * wayback is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with wayback; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package org.archive.wayback.archivalurl; import java.io.IOException; import java.text.ParseException; import javax.servlet.ServletException; import javax.servlet.ServletOutputStream; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.archive.wayback.WaybackConstants; import org.archive.wayback.ReplayResultURIConverter; import org.archive.wayback.core.Resource; import org.archive.wayback.core.SearchResult; import org.archive.wayback.core.Timestamp; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.proxy.RawReplayRenderer; /** * * * @author brad * @version $Date: 2005/11/16 03:11:30 $, $Revision: 1.1 $ */ public class JSReplayRenderer extends RawReplayRenderer { private final static String TEXT_HTML_MIME = "text/html"; private boolean isRawReplayResult(SearchResult result) { if (-1 == result.get(WaybackConstants.RESULT_MIME_TYPE).indexOf( TEXT_HTML_MIME)) { return true; } return false; } private void redirectToBetterUrl(HttpServletResponse httpResponse, String url) throws IOException { httpResponse.sendRedirect(url); } public void renderResource(HttpServletRequest httpRequest, HttpServletResponse httpResponse, WaybackRequest wbRequest, SearchResult result, Resource resource, ReplayResultURIConverter uriConverter) throws ServletException, IOException { if (resource == null) { throw new IllegalArgumentException("No resource"); } if (result == null) { throw new IllegalArgumentException("No result"); } // redirect to actual date if diff than request: if 
(!wbRequest.get(WaybackConstants.REQUEST_EXACT_DATE).equals( result.get(WaybackConstants.RESULT_CAPTURE_DATE))) { String betterURI = uriConverter.makeReplayURI(result); redirectToBetterUrl(httpResponse, betterURI); } else { if (isRawReplayResult(result)) { super.renderResource(httpRequest, httpResponse, wbRequest, result, resource, uriConverter); } else { resource.parseHeaders(); copyRecordHttpHeader(httpResponse, resource, uriConverter, result, false); // slurp the whole thing into RAM: byte[] bbuffer = new byte[4 * 1024]; StringBuffer sbuffer = new StringBuffer(); for (int r = -1; (r = resource.read(bbuffer, 0, bbuffer.length)) != -1;) { String chunk = new String(bbuffer); sbuffer.append(chunk.substring(0, r)); } markUpPage(sbuffer, result, uriConverter); httpResponse.setHeader("Content-Length", "" + sbuffer.length()); ServletOutputStream out = httpResponse.getOutputStream(); out.print(new String(sbuffer)); } } } private void markUpPage(StringBuffer page, SearchResult result, ReplayResultURIConverter uriConverter) { // TODO deal with frames.. 
insertBaseTag(page, result); insertJavascript(page, result, uriConverter); } private void insertBaseTag(StringBuffer page, SearchResult result) { String resultUrl = result.get(WaybackConstants.RESULT_URL); String baseTag = "<BASE HREF=\"http://" + resultUrl + "\">"; int insertPoint = page.indexOf("<head>"); if (-1 == insertPoint) { insertPoint = page.indexOf("<HEAD>"); } if (-1 == insertPoint) { insertPoint = 0; } else { insertPoint += 6; // just after the tag } page.insert(insertPoint, baseTag); } private void insertJavascript(StringBuffer page, SearchResult result, ReplayResultURIConverter uriConverter) { String resourceTS = result.get(WaybackConstants.RESULT_CAPTURE_DATE); String nowTS; try { nowTS = Timestamp.currentTimestamp().getDateStr(); } catch (ParseException e) { nowTS = "UNKNOWN"; } String contextPath = uriConverter.getReplayUriPrefix() + "/" + resourceTS + "/"; String scriptInsert = "<SCRIPT language=\"Javascript\">\n" + "<!--\n" + "\n" + "// FILE ARCHIVED ON " + resourceTS + " AND RETRIEVED FROM THE\n" + "// INTERNET ARCHIVE ON " + nowTS + ".\n" + "// JAVASCRIPT APPENDED BY WAYBACK MACHINE, COPYRIGHT INTERNET ARCHIVE.\n" + "//\n" + "// ALL OTHER CONTENT MAY ALSO BE PROTECTED BY COPYRIGHT (17 U.S.C.\n" + "// SECTION 108(a)(3)).\n" + "\n" + " var sWayBackCGI = \"" + contextPath + "\";\n" + " \n" + "function xResolveUrl(url) {\n" + " var image = new Image();\n" + " image.src = url;\n" + " return image.src;\n" + "}\n" + "function xLateUrl(aCollection, sProp) {\n" + " var i = 0;\n" + " for(i = 0; i < aCollection.length; i++) {\n" + " if (typeof(aCollection[i][sProp]) == \"string\") {\n" + " if (aCollection[i][sProp].indexOf(\"mailto:\") == -1 &&\n" + " aCollection[i][sProp].indexOf(\"javascript:\") == -1) {\n" + " if(aCollection[i][sProp].indexOf(\"http\") == 0) {\n" + " aCollection[i][sProp] = sWayBackCGI + aCollection[i][sProp];\n" + " } else {\n" + " aCollection[i][sProp] = sWayBackCGI + xResolveUrl(aCollection[i][sProp]);\n" + " }\n" + " }\n" + " }\n" 
+ " }\n" + "}\n" + " \n" + " xLateUrl(document.getElementsByTagName(\"IMG\"),\"src\");\n" + " xLateUrl(document.getElementsByTagName(\"A\"),\"href\");\n" + " xLateUrl(document.getElementsByTagName(\"AREA\"),\"href\");\n" + " xLateUrl(document.getElementsByTagName(\"OBJECT\"),\"codebase\");\n" + " xLateUrl(document.getElementsByTagName(\"OBJECT\"),\"data\");\n" + " xLateUrl(document.getElementsByTagName(\"APPLET\"),\"codebase\");\n" + " xLateUrl(document.getElementsByTagName(\"APPLET\"),\"archive\");\n" + " xLateUrl(document.getElementsByTagName(\"EMBED\"),\"src\");\n" + " xLateUrl(document.getElementsByTagName(\"BODY\"),\"background\");\n" + "\n" + "// -->\n" + "\n" + "</SCRIPT>\n"; int insertPoint = page.indexOf("</body>"); if (-1 == insertPoint) { insertPoint = page.indexOf("</BODY>"); } if (-1 == insertPoint) { insertPoint = page.length(); } page.insert(insertPoint, scriptInsert); } } --- NEW FILE: ReplayFilter.java --- /* ReplayFilter * * $Id: ReplayFilter.java,v 1.1 2005/11/16 03:11:30 bradtofel Exp $ * * Created on 1:08:38 PM Nov 8, 2005. * * Copyright (C) 2005 Internet Archive. * * This file is part of wayback. * * wayback is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * wayback is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. 
*
 * You should have received a copy of the GNU Lesser Public License
 * along with wayback; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
package org.archive.wayback.archivalurl;

import java.text.ParseException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.servlet.http.HttpServletRequest;

import org.apache.commons.httpclient.URIException;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.wayback.WaybackConstants;
import org.archive.wayback.core.RequestFilter;
import org.archive.wayback.core.Timestamp;
import org.archive.wayback.core.WaybackRequest;

/**
 * Request filter that recognises Archival-URL style <em>replay</em> paths of
 * the form <code>/&lt;1-14 digits&gt;/&lt;url&gt;</code> (relative to the
 * servlet context path) and turns them into a {@link WaybackRequest}.
 *
 * @author brad
 * @version $Date: 2005/11/16 03:11:30 $, $Revision: 1.1 $
 */
public class ReplayFilter extends RequestFilter {

    /** Matches "/DATE/URL"; compiled once and shared by all instances. */
    private final static Pattern WB_REQUEST_REGEX = Pattern
            .compile("^/(\\d{1,14})/(.*)$");

    /**
     * Constructor
     */
    public ReplayFilter() {
        super();
    }

    /**
     * Attempts to interpret the incoming HTTP request as an Archival-URL
     * replay request.
     *
     * On success the returned request carries the exact date taken from the
     * path, a start/end date range obtained from
     * <code>Timestamp.earliestTimestamp()</code> and
     * <code>Timestamp.currentTimestamp()</code>, the replay request type,
     * the Referer header (only when the client sent one) and the requested
     * URL.
     *
     * @param httpRequest the incoming servlet request
     * @return a populated WaybackRequest, or <code>null</code> if the request
     *         path is outside the context path, does not match the replay
     *         pattern, or carries a URL that cannot be parsed
     */
    public WaybackRequest parseRequest(HttpServletRequest httpRequest) {
        String origRequestPath = httpRequest.getRequestURI();
        String queryString = httpRequest.getQueryString();
        if (queryString != null) {
            // the archived URL may itself carry a query string: keep it
            origRequestPath = origRequestPath + "?" + queryString;
        }

        String contextPath = httpRequest.getContextPath();
        if (!origRequestPath.startsWith(contextPath)) {
            return null;
        }
        String requestPath = origRequestPath.substring(contextPath.length());

        // Pattern.matcher() never returns null, so only matches() is tested.
        Matcher matcher = WB_REQUEST_REGEX.matcher(requestPath);
        if (!matcher.matches()) {
            return null;
        }

        WaybackRequest wbRequest = new WaybackRequest();
        String dateStr = matcher.group(1);
        String urlStr = matcher.group(2);
        if (!urlStr.startsWith("http://")) {
            urlStr = "http://" + urlStr;
        }

        wbRequest.put(WaybackConstants.REQUEST_EXACT_DATE, dateStr);
        try {
            String startDate = Timestamp.earliestTimestamp().getDateStr();
            String endDate = Timestamp.currentTimestamp().getDateStr();
            wbRequest.put(WaybackConstants.REQUEST_START_DATE, startDate);
            wbRequest.put(WaybackConstants.REQUEST_END_DATE, endDate);
        } catch (ParseException e1) {
            e1.printStackTrace();
            return null;
        }

        wbRequest.put(WaybackConstants.REQUEST_TYPE,
                WaybackConstants.REQUEST_REPLAY_QUERY);

        // Requests typed into the address bar carry no Referer header. The
        // value is stored only when present, because a Properties/Hashtable
        // backed request would reject a null value.
        // NOTE(review): assumes WaybackRequest.get() yields null for an
        // absent key - confirm against WaybackRequest.
        String referer = httpRequest.getHeader("REFERER");
        if (referer != null) {
            wbRequest.put(WaybackConstants.REQUEST_REFERER_URL, referer);
        }

        try {
            UURI requestURI = UURIFactory.getInstance(urlStr);
            wbRequest.put(WaybackConstants.REQUEST_URL,
                    requestURI.toString());
        } catch (URIException e) {
            // an unparseable URL means this is not a usable replay request
            return null;
        }
        return wbRequest;
    }
}
 |