From: <bra...@us...> - 2009-07-18 00:14:48
|
Revision: 2771 http://archive-access.svn.sourceforge.net/archive-access/?rev=2771&view=rev Author: bradtofel Date: 2009-07-18 00:14:42 +0000 (Sat, 18 Jul 2009) Log Message: ----------- FEATURE(ACC-32): AccessPoints now have an exactSchemeMatch property. If set to true, only documents with the same scheme as the request URL will be returned within this AccessPoint. Modified Paths: -------------- branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java Added Paths: ----------- branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SchemeMatchFilter.java Modified: branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java =================================================================== --- branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java 2009-07-18 00:05:07 UTC (rev 2770) +++ branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/core/WaybackRequest.java 2009-07-18 00:14:42 UTC (rev 2771) @@ -39,6 +39,7 @@ import org.archive.wayback.util.ObjectFilter; import org.archive.wayback.util.StringFormatter; import org.archive.wayback.util.Timestamp; +import org.archive.wayback.util.url.UrlOperations; import org.archive.wayback.webapp.AccessPoint; /** @@ -186,6 +187,12 @@ public static final String REQUEST_EXACT_HOST_ONLY = "requestexacthost"; /** + * Indicates user only wants results that were captured using the same + * scheme as that specified in REQUEST_URL. 
+ */ + public static final String REQUEST_EXACT_SCHEME_ONLY = "requestexactscheme"; + + /** * indicates positive value for any request boolean flag. */ public static final String REQUEST_YES = "yes"; @@ -556,16 +563,27 @@ * @param urlStr Request URL. */ public void setRequestUrl(String urlStr) { - // TODO: fix this to use other schemes - if (!urlStr.startsWith("http://")) { + + // This looks a little confusing: We're trying to fixup an incoming + // request URL that starts with: + // "http:/www.archive.org" + // so it becomes: + // "http://www.archive.org" + // (note the missing second "/" in the first) + // + // if that is not the case, then see if the incoming scheme + // is known, adding an implied "http://" scheme if there doesn't appear + // to be a scheme.. + // TODO: make the default "http://" configurable. + if (!urlStr.startsWith(UrlOperations.HTTP_SCHEME)) { if(urlStr.startsWith("http:/")) { - urlStr = "http://" + urlStr.substring(6); + urlStr = UrlOperations.HTTP_SCHEME + urlStr.substring(6); } else { - urlStr = "http://" + urlStr; + if(UrlOperations.urlToScheme(urlStr) == null) { + urlStr = UrlOperations.HTTP_SCHEME + urlStr; + } } } -// UURI requestURI = UURIFactory.getInstance(urlStr); -// put(REQUEST_URL_CLEANED, requestURI.toString()); put(REQUEST_URL, urlStr); } @@ -614,6 +632,13 @@ public boolean isExactHost() { return getBoolean(REQUEST_EXACT_HOST_ONLY); } + + public void setExactScheme(boolean isExactScheme) { + setBoolean(REQUEST_EXACT_SCHEME_ONLY,isExactScheme); + } + public boolean isExactScheme() { + return getBoolean(REQUEST_EXACT_SCHEME_ONLY); + } public String getAnchorTimestamp() { return get(REQUEST_ANCHOR_DATE); Modified: branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2009-07-18 00:05:07 UTC (rev 
2770) +++ branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2009-07-18 00:14:42 UTC (rev 2771) @@ -51,6 +51,7 @@ import org.archive.wayback.resourceindex.filters.EndDateFilter; import org.archive.wayback.resourceindex.filters.GuardRailFilter; import org.archive.wayback.resourceindex.filters.HostMatchFilter; +import org.archive.wayback.resourceindex.filters.SchemeMatchFilter; import org.archive.wayback.resourceindex.filters.SelfRedirectFilter; import org.archive.wayback.resourceindex.filters.UrlMatchFilter; import org.archive.wayback.resourceindex.filters.UrlPrefixMatchFilter; @@ -63,6 +64,7 @@ import org.archive.wayback.util.ObjectFilterIterator; import org.archive.wayback.util.Timestamp; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.UrlOperations; /** * @@ -370,6 +372,7 @@ filter.addFilter(drFilter); } else if(type == TYPE_URL) { filter.addFilter(new UrlPrefixMatchFilter(keyUrl)); + filter.addFilter(drFilter); } else { throw new BadQueryException("Unknown type"); } @@ -378,6 +381,10 @@ filter.addFilter(exactHost); } + if(request.isExactScheme()) { + filter.addFilter(new SchemeMatchFilter( + UrlOperations.urlToScheme(request.getRequestUrl()))); + } // count how many results got to the ExclusionFilter: filter.addFilter(preExclusionCounter); Added: branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SchemeMatchFilter.java =================================================================== --- branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SchemeMatchFilter.java (rev 0) +++ branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SchemeMatchFilter.java 2009-07-18 00:14:42 UTC (rev 2771) @@ -0,0 +1,60 @@ +/* SchemeMatchFilter + * + * $Id$ + * + * Created on 6:40:02 PM Nov 6, 2008. + * + * Copyright (C) 2008 Internet Archive. 
+ * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.filters; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.UrlOperations; + +/** + * ObjectFilter which omits CaptureSearchResult objects if their scheme does not + * match the specified scheme. + * + * @author brad + * @version $Date$, $Revision$ + */ + +public class SchemeMatchFilter implements ObjectFilter<CaptureSearchResult> { + + private String scheme = null; + + /** + * @param scheme String of the URL scheme to match + */ + public SchemeMatchFilter(final String scheme) { + this.scheme = scheme; + } + + /* (non-Javadoc) + * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object) + */ + public int filterObject(CaptureSearchResult r) { + String captureScheme = UrlOperations.urlToScheme(r.getOriginalUrl()); + if(scheme == null) { + return captureScheme == null ? FILTER_INCLUDE : FILTER_EXCLUDE; + } + return scheme.equals(captureScheme) ? 
FILTER_INCLUDE : FILTER_EXCLUDE; + } +} Modified: branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java =================================================================== --- branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2009-07-18 00:05:07 UTC (rev 2770) +++ branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2009-07-18 00:14:42 UTC (rev 2771) @@ -206,23 +206,17 @@ return urlString; } String searchUrl = canonicalize(urlString); - - // TODO: force https into http for the moment... - if(searchUrl.startsWith("https://")) { - searchUrl = searchUrl.substring(8); + String scheme = UrlOperations.urlToScheme(searchUrl); + if(scheme != null) { + searchUrl = searchUrl.substring(scheme.length()); + } else { + scheme = UrlOperations.HTTP_SCHEME; } - - // TODO: this will only work with http:// scheme. should work with all? - // force add of scheme and possible add '/' with empty path: - if (searchUrl.startsWith("http://")) { - if (-1 == searchUrl.indexOf('/', 8)) { - searchUrl = searchUrl + "/"; - } + + if (-1 == searchUrl.indexOf("/")) { + searchUrl = scheme + searchUrl + "/"; } else { - if (-1 == searchUrl.indexOf("/")) { - searchUrl = searchUrl + "/"; - } - searchUrl = "http://" + searchUrl; + searchUrl = scheme + searchUrl; } // TODO: These next few lines look crazy -- need to be reworked.. This @@ -250,23 +244,18 @@ // if((newPath.length() > 1) && newPath.endsWith("/")) { // newPath = newPath.substring(0,newPath.length()-1); // } -// searchURI.setEscapedPath(newPath); -// searchURI.setRawPath(newPath.toCharArray()); -// String query = searchURI.getEscapedQuery(); - // TODO: handle non HTTP port stripping, too. 
-// String portStr = ""; -// if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { -// portStr = ":" + searchURI.getPort(); -// } -// return searchURI.getHostBasename() + portStr + -// searchURI.getEscapedPathQuery(); - StringBuilder sb = new StringBuilder(searchUrl.length()); sb.append(searchURI.getHostBasename()); - if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { + + // omit port if scheme default: + int defaultSchemePort = UrlOperations.schemeToDefaultPort(scheme); + if(searchURI.getPort() != defaultSchemePort + && searchURI.getPort() != -1) { + sb.append(":").append(searchURI.getPort()); } + sb.append(newPath); if(searchURI.getEscapedQuery() != null) { sb.append("?").append(searchURI.getEscapedQuery()); Modified: branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2009-07-18 00:05:07 UTC (rev 2770) +++ branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2009-07-18 00:14:42 UTC (rev 2771) @@ -81,15 +81,16 @@ * @return url resolved against baseUrl, unless it is absolute already */ public static String resolveUrl(String baseUrl, String url) { - // TODO: this only works for http:// - if(url.startsWith("http://")) { - try { - return UURIFactory.getInstance(url).getEscapedURI(); - } catch (URIException e) { - e.printStackTrace(); - // can't let a space exist... send back close to whatever came - // in... - return url.replace(" ", "%20"); + for(final String scheme : ALL_SCHEMES) { + if(url.startsWith(scheme)) { + try { + return UURIFactory.getInstance(url).getEscapedURI(); + } catch (URIException e) { + e.printStackTrace(); + // can't let a space exist... send back close to whatever came + // in... 
+ return url.replace(" ", "%20"); + } } } UURI absBaseURI; @@ -99,11 +100,39 @@ resolvedURI = UURIFactory.getInstance(absBaseURI, url); } catch (URIException e) { e.printStackTrace(); - return url; + return url.replace(" ", "%20"); } return resolvedURI.getEscapedURI(); } + public static String urlToScheme(final String url) { + for(final String scheme : ALL_SCHEMES) { + if(url.startsWith(scheme)) { + return scheme; + } + } + return null; + } + + public static int schemeToDefaultPort(final String scheme) { + if(scheme.equals(HTTP_SCHEME)) { + return 80; + } + if(scheme.equals(HTTPS_SCHEME)) { + return 443; + } + if(scheme.equals(FTP_SCHEME)) { + return 21; + } + if(scheme.equals(RTSP_SCHEME)) { + return 554; + } + if(scheme.equals(MMS_SCHEME)) { + return 1755; + } + return -1; + } + public static String urlToHost(String url) { if(url.startsWith("dns:")) { return url.substring(4); Modified: branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java =================================================================== --- branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2009-07-18 00:05:07 UTC (rev 2770) +++ branches/wayback-1_4_2/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java 2009-07-18 00:14:42 UTC (rev 2771) @@ -80,6 +80,7 @@ private boolean useServerName = false; private boolean useAnchorWindow = false; + private boolean exactSchemeMatch = true; private int contextPort = 0; private String contextName = null; @@ -217,11 +218,6 @@ prefix.append(":").append(waybackPort); } String contextPath = getContextPath(httpRequest); -// if(contextPath.length() > 1) { -// prefix.append(contextPath); -// } else { -// prefix.append(contextPath); -// } prefix.append(contextPath); return prefix.toString(); } @@ -264,19 +260,6 @@ } catch(IOException e) { // TODO: figure out if we got IO because of a missing dispatcher } -// uiResults.storeInRequest(httpRequest,translated); -// 
RequestDispatcher dispatcher = null; -// // special case for the front '/' page: -// if(translated.length() == 0) { -// translated = "/"; -// } else { -// translated = "/" + translated; -// } -// dispatcher = httpRequest.getRequestDispatcher(translated); -// if(dispatcher != null) { -// dispatcher.forward(httpRequest, httpResponse); -// return true; -// } return false; } @@ -299,9 +282,13 @@ if(wbRequest != null) { handled = true; + + // TODO: refactor this code into RequestParser implementations wbRequest.setAccessPoint(this); wbRequest.setContextPrefix(getAbsoluteLocalPrefix(httpRequest)); wbRequest.fixup(httpRequest); + // end of refactor + if(authentication != null) { if(!authentication.isTrue(wbRequest)) { throw new AuthenticationControlException("Not authorized"); @@ -311,6 +298,12 @@ if(exclusionFactory != null) { wbRequest.setExclusionFilter(exclusionFactory.get()); } + // TODO: refactor this into RequestParser implementations, so a + // user could alter requests to change the behavior within a + // single AccessPoint. For now, this is a simple way to expose + // the feature to configuration. + wbRequest.setExactScheme(exactSchemeMatch); + if(wbRequest.isReplayRequest()) { handleReplay(wbRequest,httpRequest,httpResponse); @@ -488,7 +481,21 @@ public void setUseAnchorWindow(boolean useAnchorWindow) { this.useAnchorWindow = useAnchorWindow; } + + /** + * @return the exactSchemeMatch + */ + public boolean isExactSchemeMatch() { + return exactSchemeMatch; + } + /** + * @param exactSchemeMatch the exactSchemeMatch to set + */ + public void setExactSchemeMatch(boolean exactSchemeMatch) { + this.exactSchemeMatch = exactSchemeMatch; + } + public ExclusionFilterFactory getExclusionFactory() { return exclusionFactory; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |