archive-access-cvs Mailing List for Web Archive Access Utilities (Page 67)

Brought to you by: binzino, bradtofel, gojomo, ia_igor, and 5 others

archive-access-cvs — CVS commits

You can subscribe to this list here.

2005	Jan	Feb	Mar	Apr	May	Jun	Jul (1)	Aug (10)	Sep (36)	Oct (339)	Nov (103)	Dec (152)
2006	Jan (141)	Feb (102)	Mar (125)	Apr (203)	May (57)	Jun (30)	Jul (139)	Aug (46)	Sep (64)	Oct (105)	Nov (34)	Dec (162)
2007	Jan (81)	Feb (57)	Mar (141)	Apr (72)	May (9)	Jun (1)	Jul (144)	Aug (88)	Sep (40)	Oct (43)	Nov (34)	Dec (20)
2008	Jan (44)	Feb (45)	Mar (16)	Apr (36)	May (8)	Jun (77)	Jul (177)	Aug (66)	Sep (8)	Oct (33)	Nov (13)	Dec (37)
2009	Jan (2)	Feb (5)	Mar (8)	Apr	May (36)	Jun (19)	Jul (46)	Aug (8)	Sep (1)	Oct (66)	Nov (61)	Dec (10)
2010	Jan (13)	Feb (16)	Mar (38)	Apr (76)	May (47)	Jun (32)	Jul (35)	Aug (45)	Sep (20)	Oct (61)	Nov (24)	Dec (16)
2011	Jan (22)	Feb (34)	Mar (11)	Apr (8)	May (24)	Jun (23)	Jul (11)	Aug (42)	Sep (81)	Oct (48)	Nov (21)	Dec (20)
2012	Jan (30)	Feb (25)	Mar (4)	Apr (6)	May (1)	Jun (5)	Jul (5)	Aug (8)	Sep (6)	Oct (6)	Nov	Dec

Flat | Threaded

<< < 1 .. 65 66 67 68 69 .. 171 > >> (Page 67 of 171)

[Archive-access-cvs] SF.net SVN: archive-access: [2155] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/archivalurl/ ArchivalUrlReplayDispatcher.java

From: <bra...@us...> - 2008-01-30 03:01:23

Revision: 2155
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2155&view=rev
Author:   bradtofel
Date:     2008-01-29 19:01:18 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
FEATURE: added rewrite capability for text/css documents.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayDispatcher.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayDispatcher.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayDispatcher.java	2008-01-30 03:00:16 UTC (rev 2154)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayDispatcher.java	2008-01-30 03:01:18 UTC (rev 2155)
@@ -49,6 +49,7 @@
 	 */
 	private final static String TEXT_HTML_MIME = "text/html";
 	private final static String TEXT_XHTML_MIME = "application/xhtml";
+	private final static String TEXT_CSS_MIME = "text/css";
 
 	// TODO: make this configurable
 	private final static long MAX_HTML_MARKUP_LENGTH = 1024 * 1024 * 5;
@@ -59,6 +60,8 @@
 	private ReplayRenderer redirect = new DateRedirectReplayRenderer();
 	private ArchivalUrlReplayRenderer archivalHTML =
 		new ArchivalUrlReplayRenderer();
+	private ArchivalUrlCSSReplayRenderer archivalCSS =
+		new ArchivalUrlCSSReplayRenderer();
 
 	/* (non-Javadoc)
 	 * @see org.archive.wayback.replay.ReplayRendererDispatcher#getRenderer(org.archive.wayback.core.WaybackRequest, org.archive.wayback.core.SearchResult, org.archive.wayback.core.Resource)
@@ -76,11 +79,10 @@
 			return redirect;
 		}
 		
-		// TODO: handle .css docs -- embedded URLs there need to be fixed
-
-		// HTML and XHTML docs smaller than some size get marked up as HTML
+		// only bother attempting  markup on pages smaller than some size:
 		if (resource.getRecordLength() < MAX_HTML_MARKUP_LENGTH) {
 
+			// HTML and XHTML docs get marked up as HTML
 			if (-1 != result.get(WaybackConstants.RESULT_MIME_TYPE).indexOf(
 					TEXT_HTML_MIME)) {
 				return archivalHTML;
@@ -89,6 +91,11 @@
 					TEXT_XHTML_MIME)) {
 				return archivalHTML;
 			}
+			// CSS docs get marked up as CSS
+			if (-1 != result.get(WaybackConstants.RESULT_MIME_TYPE).indexOf(
+					TEXT_CSS_MIME)) {
+				return archivalCSS;
+			}
 		}
 		
 		// everything else goes transparently:
@@ -141,5 +148,6 @@
 	 */
 	public void setServerSideRendering(boolean isServerSideRendering) {
 		archivalHTML.setServerSideRendering(isServerSideRendering);
+		archivalCSS.setServerSideRendering(isServerSideRendering);
 	}
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2154] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/archivalurl/ ArchivalUrlCSSReplayRenderer.java

From: <bra...@us...> - 2008-01-30 03:00:15

Revision: 2154
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2154&view=rev
Author:   bradtofel
Date:     2008-01-29 19:00:16 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
INITIAL REV: Renderer implementation that is responsible for rewriting urls in text/css documents.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlCSSReplayRenderer.java

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlCSSReplayRenderer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlCSSReplayRenderer.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlCSSReplayRenderer.java	2008-01-30 03:00:16 UTC (rev 2154)
@@ -0,0 +1,48 @@
+package org.archive.wayback.archivalurl;
+
+import java.io.IOException;
+import java.util.Map;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.archive.wayback.ResultURIConverter;
+import org.archive.wayback.core.Resource;
+import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.core.SearchResults;
+import org.archive.wayback.core.WaybackRequest;
+import org.archive.wayback.exception.BadContentException;
+import org.archive.wayback.replay.HTMLPage;
+import org.archive.wayback.replay.HttpHeaderOperation;
+
+public class ArchivalUrlCSSReplayRenderer extends ArchivalUrlReplayRenderer {
+	/* (non-Javadoc)
+	 * @see org.archive.wayback.ReplayRenderer#renderResource(javax.servlet.http.HttpServletRequest, javax.servlet.http.HttpServletResponse, org.archive.wayback.core.WaybackRequest, org.archive.wayback.core.SearchResult, org.archive.wayback.core.Resource, org.archive.wayback.ResultURIConverter, org.archive.wayback.core.SearchResults)
+	 */
+	public void renderResource(HttpServletRequest httpRequest,
+			HttpServletResponse httpResponse, WaybackRequest wbRequest,
+			SearchResult result, Resource resource,
+			ResultURIConverter uriConverter, SearchResults results)
+			throws ServletException, IOException, BadContentException {
+		HttpHeaderOperation.copyHTTPMessageHeader(resource, httpResponse);
+
+		Map<String,String> headers = HttpHeaderOperation.processHeaders(
+				resource, result, uriConverter, this);
+	
+		// Load content into an HTML page, and resolve @import URLs:
+		HTMLPage page = new HTMLPage(resource,result,uriConverter);
+		page.readFully();
+
+		page.resolveCSSUrls();
+
+		// set the corrected length:
+		int bytes = page.getBytes().length;
+		headers.put(HTTP_LENGTH_HEADER, String.valueOf(bytes));
+
+		// send back the headers:
+		HttpHeaderOperation.sendHeaders(headers, httpResponse);
+
+		page.writeToOutputStream(httpResponse.getOutputStream());
+	}
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2153] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java

From: <bra...@us...> - 2008-01-30 02:59:13

Revision: 2153
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2153&view=rev
Author:   bradtofel
Date:     2008-01-29 18:59:01 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
FEATURE: now uses TagMagix css related rewrite methods.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java	2008-01-30 02:58:00 UTC (rev 2152)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java	2008-01-30 02:59:01 UTC (rev 2153)
@@ -232,6 +232,8 @@
 			TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl,
 					tagAttr[0], tagAttr[1]);
 		}
+		TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl);
+		TagMagix.markupStyleUrls(sb,uriConverter,captureDate,pageUrl);
 	}
 	
 	/**
@@ -271,7 +273,17 @@
 			TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl,
 					tagAttr[0], tagAttr[1]);
 		}
+		TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl);
+		TagMagix.markupStyleUrls(sb,uriConverter,captureDate,pageUrl);
 	}
+	
+	public void resolveCSSUrls() {
+		// TODO: get url from Resource instead of SearchResult?
+		String pageUrl = result.getAbsoluteUrl();
+		String captureDate = result.getCaptureDate();
+		TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl);
+	}
+
 	/**
 	 * @param charSet
 	 * @throws IOException 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2152] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java

From: <bra...@us...> - 2008-01-30 02:58:49

Revision: 2152
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2152&view=rev
Author:   bradtofel
Date:     2008-01-29 18:58:00 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
FEATURE: added functionality for rewriting CSS @import url() and style="... url();" parts of .css and .html pages.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java	2008-01-30 01:56:39 UTC (rev 2151)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java	2008-01-30 02:58:00 UTC (rev 2152)
@@ -59,10 +59,27 @@
 
 	private static String RAW_ATTR_VALUE = "(?:[^ \\t\\n\\x0B\\f\\r>\"']+)";
 
+	private static String ANY_TAGNAME = "[a-z]+";
+	
+	private static String STYLE_ATTR_NAME = "style";
+	
 	private static String ANY_ATTR_VALUE = QUOTED_ATTR_VALUE + "|"
 			+ APOSED_ATTR_VALUE + "|" + ESC_QUOTED_ATTR_VALUE + "|"
 			+ RAW_ATTR_VALUE;
+	
+//	private static String cssUrlPatString = 
+//		"url\\s*\\(\\s*(['\"]?.+?['\"]?)\\s*\\)";
+	private static String cssUrlPatString = 
+		"url\\s*\\(\\s*([\\\\\"']*.+?[\\\\\"']*)\\s*\\)";
+	
+	private static String cssImportPatString = 
+		"@import\\s+" + cssUrlPatString;
 
+	private static Pattern cssImportPattern = 
+		Pattern.compile(cssImportPatString);
+	
+	private static Pattern cssUrlPattern = Pattern.compile(cssUrlPatString);
+
 	/**
 	 * get (and cache) a regex Pattern for locating an HTML attribute value
 	 * within a particular tag. if found, the pattern will have the attribute
@@ -131,6 +148,65 @@
 		return pc;
 	}
 
+	public static void markupCSSImports(StringBuilder page,
+			ResultURIConverter uriConverter, String captureDate,
+			String baseUrl) {
+		markupTagREURIC(page,uriConverter,captureDate,baseUrl,cssImportPattern);
+	}
+	
+	public static void markupStyleUrls(StringBuilder page,
+			ResultURIConverter uriConverter, String captureDate,
+			String baseUrl) {
+		Pattern stylePattern = getPattern(ANY_TAGNAME, STYLE_ATTR_NAME);
+		Matcher matcher = stylePattern.matcher(page);
+
+		int idx = 0;
+		while (matcher.find(idx)) {
+			String attrValue = matcher.group(1);
+			int origAttrLength = attrValue.length();
+			int attrStart = matcher.start(1);
+			int attrEnd = matcher.end(1);
+			if (attrValue.charAt(0) == '"') {
+				attrValue = attrValue.substring(1, origAttrLength - 1);
+				attrStart += 1;
+			} else if (attrValue.charAt(0) == '\'') {
+				attrValue = attrValue.substring(1, origAttrLength - 1);
+				attrStart += 1;
+			} else if (attrValue.charAt(0) == '\\') {
+				attrValue = attrValue.substring(2, origAttrLength - 2);
+				attrStart += 2;
+			}
+			
+			idx = attrEnd;
+			Matcher urlMatcher = cssUrlPattern.matcher(attrValue);
+			int attrIdx = 0;
+			while(urlMatcher.find(attrIdx)) {
+				String url = urlMatcher.group(1);
+				int origUrlLength = url.length();
+				int urlStart = urlMatcher.start(1);
+				int urlEnd = urlMatcher.end(1);
+				attrIdx = urlEnd;
+				if (url.charAt(0) == '"') {
+					url = url.substring(1, origUrlLength - 1);
+					urlStart += 1;
+				} else if (url.charAt(0) == '\'') {
+					url = url.substring(1, origUrlLength - 1);
+					urlStart += 1;
+				} else if (url.charAt(0) == '\\') {
+					url = url.substring(2, origUrlLength - 2);
+					urlStart += 2;
+				}
+				int urlLength = url.length();
+				String finalUrl = UrlOperations.resolveUrl(baseUrl,url);
+				String replayUrl = uriConverter.makeReplayURI(captureDate, finalUrl);
+				int delta = replayUrl.length() - urlLength;
+				page.replace(attrStart + urlStart, attrStart + urlStart + urlLength , replayUrl);
+				idx += delta;
+				attrStart += delta;
+			}
+		}
+	}
+	
 	/**
 	 * Alter the HTML document in page, updating URLs in the attrName attributes
 	 * of all tagName tags such that:
@@ -152,8 +228,14 @@
 			String baseUrl, String tagName, String attrName) {
 
 		Pattern tagPat = getPattern(tagName, attrName);
-		Matcher matcher = tagPat.matcher(page);
+		markupTagREURIC(page,uriConverter,captureDate,baseUrl,tagPat);
+	}
 
+	public static void markupTagREURIC(StringBuilder page,
+			ResultURIConverter uriConverter, String captureDate,
+			String baseUrl, Pattern pattern) {
+		Matcher matcher = pattern.matcher(page);
+
 		int idx = 0;
 		while (matcher.find(idx)) {
 			String url = matcher.group(1);
@@ -163,13 +245,13 @@
 			String quote = "";
 			if (url.charAt(0) == '"') {
 				quote = "\"";
-				url = url.substring(1, url.length() - 1);
+				url = url.substring(1, origUrlLength - 1);
 			} else if (url.charAt(0) == '\'') {
 				quote = "'";
-				url = url.substring(1, url.length() - 1);
+				url = url.substring(1, origUrlLength - 1);
 			} else if (url.charAt(0) == '\\') {
 				quote = "\\\"";
-				url = url.substring(2, url.length() - 2);
+				url = url.substring(2, origUrlLength - 2);
 			}
 			String finalUrl = UrlOperations.resolveUrl(baseUrl,url);
 			String replayUrl = quote


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2151] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp/index.jsp

From: <bra...@us...> - 2008-01-30 01:56:36

Revision: 2151
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2151&view=rev
Author:   bradtofel
Date:     2008-01-29 17:56:39 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
FEATURE: added (quite ugly) message with links to configured AccessPoints when users visit the webapp context root, and have not specified an AccessPoint within that context.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp

Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp	2008-01-30 01:54:12 UTC (rev 2150)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/index.jsp	2008-01-30 01:56:39 UTC (rev 2151)
@@ -1,3 +1,4 @@
+<%@ page import="java.util.ArrayList" %>
 <%@ page import="org.archive.wayback.core.UIResults" %>
 <%@ page import="org.archive.wayback.util.StringFormatter" %>
 <jsp:include page="template/UI-header.jsp" flush="true" />
@@ -4,6 +5,22 @@
 <%
 UIResults results = UIResults.getFromRequest(request);
 StringFormatter fmt = results.getFormatter();
+Object names = request.getAttribute("AccessPointNames");
+if(names != null) {
+	if(names instanceof ArrayList) {
+		ArrayList<String> accessPoints = (ArrayList<String>) names;
+		if(accessPoints.size() > 0) {
+			%>
+			 You seems to be accessing this Wayback via an incorrect URL. Please try one of the following AccessPoints:<br></br>
+			<%
+		}
+		for(String accessPoint : accessPoints) {
+			%>
+			 <a href="<%= accessPoint %>/"><%= accessPoint %></a><br></br>
+			<%
+		}
+	}
+}
 %>
 <p>
 	<%= fmt.format("UIGlobal.indexPage") %>


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2150] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp/replay/disclaim-element.js

From: <bra...@us...> - 2008-01-30 01:54:17

Revision: 2150
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2150&view=rev
Author:   bradtofel
Date:     2008-01-29 17:54:12 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
INITIAL REV: currently unused, but this is the meat of the disclaimer javascript logic, which is generally usable by .jsps that insert page elements that need to appear at the top of HTML pages.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/disclaim-element.js

Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/disclaim-element.js
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/disclaim-element.js	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/disclaim-element.js	2008-01-30 01:54:12 UTC (rev 2150)
@@ -0,0 +1,29 @@
+function getFrameArea(frame) {
+  if(frame.innerWidth) return frame.innerWidth * frame.innerHeight;
+  if(frame.document.documentElement && frame.document.documentElement.clientHeight) return frame.document.documentElement.clientWidth * frame.document.documentElement.clientHeight;
+  if(frame.document.body) return frame.document.body.clientWidth * frame.document.body.clientHeight;
+  return 0;
+}
+
+function disclaimElement(element) {
+  if(top!=self) {
+    if(top.document.body.tagName == "BODY") {
+      return;
+    }
+    largestArea = 0;
+    largestFrame = null;
+    for(i=0;i<top.frames.length;i++) {
+      frame = top.frames[i];
+      area = getFrameArea(frame);
+      if(area > largestArea) {
+        largestFrame = frame;
+        largestArea = area;
+      }
+    }
+    if(self!=largestFrame) {
+      return;
+    }
+  }
+  element.style.display="block";
+  document.body.insertBefore(element,document.body.firstChild);
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2149] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp/replay/Timeline.jsp

From: <bra...@us...> - 2008-01-30 01:52:30

Revision: 2149
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2149&view=rev
Author:   bradtofel
Date:     2008-01-29 17:52:36 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
FEATURE: added two-month partition interval, removed broken meta and resolution elements (till cookie resolution tracking is fixed). Adjusted formatting slightly to account for removed elements, but it still isn't pretty.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/Timeline.jsp

Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/Timeline.jsp
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/Timeline.jsp	2008-01-30 01:49:58 UTC (rev 2148)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/Timeline.jsp	2008-01-30 01:52:36 UTC (rev 2149)
@@ -81,6 +81,8 @@
 	optimal = fmt.format("TimelineView.timeRange.days");
 } else if(minResolution.equals(WaybackConstants.REQUEST_RESOLUTION_MONTHS)) {
 	optimal = fmt.format("TimelineView.timeRange.months");
+} else if(minResolution.equals(WaybackConstants.REQUEST_RESOLUTION_TWO_MONTHS)) {
+	  optimal = fmt.format("TimelineView.timeRange.twomonths");
 } else if(minResolution.equals(WaybackConstants.REQUEST_RESOLUTION_YEARS)) {
 	optimal = fmt.format("TimelineView.timeRange.years");
 } else {
@@ -101,6 +103,10 @@
 	monthsOptSelected = "selected";
 	partitions = ResultsTimelinePartitionsFactory.getMonth(results.getResults(),
 		wbRequest);
+} else if(resolution.equals(WaybackConstants.REQUEST_RESOLUTION_TWO_MONTHS)) {
+	  monthsOptSelected = "selected";
+	  partitions = ResultsTimelinePartitionsFactory.getTwoMonth(results.getResults(),
+	    wbRequest);
 } else if(resolution.equals(WaybackConstants.REQUEST_RESOLUTION_YEARS)) {
 	yearsOptSelected = "selected";
 	partitions = ResultsTimelinePartitionsFactory.getYear(results.getResults(),
@@ -149,15 +155,9 @@
 
 
 </script>
-<!-- 
-overflow:hidden; border-width:1; border-style:outset; width:100%; height:80px; right:0; top:0; background-color:#dddddd;
- -->
+
 <div id="wm-ipp" style="position:relative;z-index:99999;border:1px solid;color:black;background-color:lightYellow;font-size:10px;font-family:sans-serif;padding:5px" >
-<!-- 
-<div onclick="handleDragClick()" id="wm-dragger" style="height:25px; width:100%; border-width:1; border-style:outset; background-color:#cccccc; text-align:right;">
-&lt;
-</div>
- -->
+
 <table cellspacing="0" border="0" cellpadding="0"  width="100%">
 	<tr>
 		<td width="1" nowrap></td>
@@ -174,7 +174,7 @@
 				</tr>
 			</table>
 		</td>
-		<td width="400">
+		<td width="400" align="center">
 			<table>
 				<tr>
 					<td width="50%"></td>
@@ -281,8 +281,10 @@
 				</tr>
 			</table>
 		</td>
-		<td align="right">
+		<td align="right" width="400">
 			<!-- Resolution -->
+			<!--
+			 need to get cookie data passing set up before this can be re-enabled:
 			<form wmSpecial="1" name="timeline" method="GET" target="_top" action="<%= contextRoot + "/frameset" %>">
 				<input type="hidden" name="url" value="<%= searchUrl %>">
 				<input type="hidden" name="exactdate" value="<%= exactDateStr %>">
@@ -306,7 +308,9 @@
 					fmt.format("TimelineView.metaDataCheck") 
 				%><input type="checkbox" name="metamode" value="yes" <%=
 					metaChecked 
-				%> onClick="changeMeta()">&nbsp<a href="help.php" target="_top"><%=
+				%> onClick="changeMeta()">&nbsp
+				  -->
+				  <a wmSpecial="1" href="<%= contextRoot %>/help.jsp" target="_top"><%=
 					fmt.format("UIGlobal.helpLink")
 				%></a>
 			</form>


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2148] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp/replay/JSLessTimeline.jsp

From: <bra...@us...> - 2008-01-30 01:49:56

Revision: 2148
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2148&view=rev
Author:   bradtofel
Date:     2008-01-29 17:49:58 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
INITIAL REV: copy of Timeline.jsp which does not use javascript to move the timeline element to the top of the page. Assumed to be used with serverSideRendering=true.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/JSLessTimeline.jsp

Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/JSLessTimeline.jsp
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/JSLessTimeline.jsp	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/JSLessTimeline.jsp	2008-01-30 01:49:58 UTC (rev 2148)
@@ -0,0 +1,306 @@
+<%@ page import="java.util.Iterator" %>
+<%@ page import="java.util.ArrayList" %>
+<%@ page import="java.util.Date" %>
+<%@ page import="java.text.ParseException" %>
+<%@ page import="org.archive.wayback.WaybackConstants" %>
+<%@ page import="org.archive.wayback.core.SearchResult" %>
+<%@ page import="org.archive.wayback.core.Timestamp" %>
+<%@ page import="org.archive.wayback.core.UIResults" %>
+<%@ page import="org.archive.wayback.core.WaybackRequest" %>
+<%@ page import="org.archive.wayback.query.UIQueryResults" %>
+<%@ page import="org.archive.wayback.query.resultspartitioner.ResultsTimelinePartitionsFactory" %>
+<%@ page import="org.archive.wayback.query.resultspartitioner.ResultsPartition" %>
+<%@ page import="org.archive.wayback.util.StringFormatter" %>
+<%
+
+String contextRoot = request.getScheme() + "://" + request.getServerName() + ":" 
+	+ request.getServerPort() + request.getContextPath();
+
+UIQueryResults results = (UIQueryResults) UIResults.getFromRequest(request);
+StringFormatter fmt = results.getFormatter();
+
+Timestamp searchStartTs = results.getStartTimestamp();
+Timestamp searchEndTs = results.getEndTimestamp();
+Timestamp exactTs = results.getExactRequestedTimestamp();
+String searchUrl = results.getSearchUrl();
+Date exactDate = exactTs.getDate();
+
+String exactDateStr = exactTs.getDateStr();
+WaybackRequest wbRequest = results.getWbRequest();
+String resolution = wbRequest.get(WaybackConstants.REQUEST_RESOLUTION);
+if(resolution == null) {
+	resolution = WaybackConstants.REQUEST_RESOLUTION_AUTO;
+}
+String metaMode = wbRequest.get(WaybackConstants.REQUEST_META_MODE);
+String metaChecked = "";
+if(metaMode != null && metaMode.equals("yes")) {
+	metaChecked = "checked";
+}
+
+String searchString = results.getSearchUrl();
+
+SearchResult first = null;
+SearchResult prev = null;
+SearchResult next = null;
+SearchResult last = null;
+
+int resultCount = results.getResultsReturned();
+int resultIndex = 1;
+Iterator<SearchResult> it = results.resultsIterator();
+while(it.hasNext()) {
+	SearchResult res = it.next();
+	String resDateStr = res.get(WaybackConstants.RESULT_CAPTURE_DATE);
+	int compared = resDateStr.compareTo(exactDateStr.substring(0,resDateStr.length()));
+	if(compared < 0) {
+		resultIndex++;
+		prev = res;
+		if(first == null) {
+			first = res;
+		}
+	} else if(compared > 0) {
+		last = res;
+		if(next == null) {
+			next = res;
+		}
+	}
+}
+// string to indicate which select option is currently active
+String yearsOptSelected = "";
+String monthsOptSelected = "";
+String daysOptSelected = "";
+String hoursOptSelected = "";
+String autoOptSelected = "";
+
+String minResolution = ResultsTimelinePartitionsFactory.getMinResolution(
+							results.getResults());
+
+String optimal = "";
+if(minResolution.equals(WaybackConstants.REQUEST_RESOLUTION_HOURS)) {
+	optimal = fmt.format("TimelineView.timeRange.hours");
+} else if(minResolution.equals(WaybackConstants.REQUEST_RESOLUTION_DAYS)) {
+	optimal = fmt.format("TimelineView.timeRange.days");
+} else if(minResolution.equals(WaybackConstants.REQUEST_RESOLUTION_MONTHS)) {
+	optimal = fmt.format("TimelineView.timeRange.months");
+} else if(minResolution.equals(WaybackConstants.REQUEST_RESOLUTION_TWO_MONTHS)) {
+	  optimal = fmt.format("TimelineView.timeRange.twomonths");
+} else if(minResolution.equals(WaybackConstants.REQUEST_RESOLUTION_YEARS)) {
+	optimal = fmt.format("TimelineView.timeRange.years");
+} else {
+	optimal = fmt.format("TimelineView.timeRange.unknown");
+}
+String autoOptString = fmt.format("TimelineView.timeRange.auto",optimal);
+
+ArrayList<ResultsPartition> partitions;
+if(resolution.equals(WaybackConstants.REQUEST_RESOLUTION_HOURS)) {
+	hoursOptSelected = "selected";
+	partitions = ResultsTimelinePartitionsFactory.getHour(results.getResults(),
+		wbRequest);
+} else if(resolution.equals(WaybackConstants.REQUEST_RESOLUTION_DAYS)) {
+	daysOptSelected = "selected";
+	partitions = ResultsTimelinePartitionsFactory.getDay(results.getResults(),
+		wbRequest);
+} else if(resolution.equals(WaybackConstants.REQUEST_RESOLUTION_MONTHS)) {
+	monthsOptSelected = "selected";
+	partitions = ResultsTimelinePartitionsFactory.getMonth(results.getResults(),
+		wbRequest);
+} else if(resolution.equals(WaybackConstants.REQUEST_RESOLUTION_TWO_MONTHS)) {
+	  monthsOptSelected = "selected";
+	  partitions = ResultsTimelinePartitionsFactory.getTwoMonth(results.getResults(),
+	    wbRequest);
+} else if(resolution.equals(WaybackConstants.REQUEST_RESOLUTION_YEARS)) {
+	yearsOptSelected = "selected";
+	partitions = ResultsTimelinePartitionsFactory.getYear(results.getResults(),
+		wbRequest);
+} else {
+	autoOptSelected = "selected";
+	partitions = ResultsTimelinePartitionsFactory.getAuto(results.getResults(),
+		wbRequest);
+}
+int numPartitions = partitions.size();
+ResultsPartition firstP = (ResultsPartition) partitions.get(0);
+ResultsPartition lastP = (ResultsPartition) partitions.get(numPartitions -1);
+
+String firstDate = firstP.getTitle();
+String lastDate = lastP.getTitle();
+String titleString = "";
+%>
+<!--
+  ======================================
+  BEGIN Wayback INSERTED TIMELINE BANNER
+
+  The following HTML has been inserted
+  by the Wayback application to enhance
+  the viewing experience, and was not
+  part of the original archived content.
+  ======================================
+-->
+<div id="wm-ipp" style="position:relative;z-index:99999;border:1px solid;color:black;background-color:lightYellow;font-size:10px;font-family:sans-serif;padding:5px" >
+
+<table cellspacing="0" border="0" cellpadding="0"  width="100%">
+	<tr>
+		<td width="1" nowrap></td>
+		<td>
+			<!-- Viewing -->
+			<table cellspacing="0" border="0" cellpadding="0" width="100%">
+				<tr>
+					<td>
+						<span><%= fmt.format("TimelineView.viewingVersion",resultIndex,resultCount) %>&nbsp;</span>
+					</td>
+				</tr>
+				<tr>
+					<td nowrap><span> <%= fmt.format("TimelineView.viewingVersionDate",exactDate) %> </span>&nbsp;&nbsp;</td>
+				</tr>
+			</table>
+		</td>
+		<td width="400" align="center">
+			<table>
+				<tr>
+					<td width="50%"></td>
+					<td>
+						<table cellspacing="0" border="0" cellpadding="0"  width="100%">
+							<tr>
+								<td width="48%" nowrap><span><%= firstDate %></span></td>
+								<td align="center" valign="bottom" nowrap><img wmSpecial="1" src="<%= contextRoot %>/images/mark.jpg"></td>
+								<td width="48%" nowrap align="right"><span><%= lastDate %></span></td>
+							</tr>
+						</table>
+					</td>
+					<td width="50%"></td>
+				</tr>
+				<tr>
+					<td nowrap align="right"><%
+						titleString = "";
+						if(first != null) {
+							titleString = "title=\"" + 
+								fmt.format("TimelineView.firstVersionTitle",
+									results.resultToDate(first)) + "\"";
+							%><a wmSpecial="1" href="<%= results.resultToReplayUrl(first) %>"><%
+						}
+						%><img <%= titleString %> wmSpecial="1" border=0 width=19 height=20 src="<%= contextRoot %>/images/first.jpg"><%
+						if(first != null) {
+							%></a><%
+						}
+						titleString = "";
+						if(prev != null) {
+							titleString = "title=\"" + 
+								fmt.format("TimelineView.prevVersionTitle",
+									results.resultToDate(prev)) + "\"";
+							%><a wmSpecial="1" href="<%= results.resultToReplayUrl(prev) %>"><%
+						}
+						%><img <%= titleString %> wmSpecial="1" border=0 width=13 height=20 src="<%= contextRoot %>/images/prev.jpg"><%
+						if(first != null) {
+							%></a><%
+						}
+					%></td>
+					<td nowrap><%
+			
+	for(int i = 0; i < numPartitions; i++) {
+		ResultsPartition partition = (ResultsPartition) partitions.get(i);
+		ArrayList partitionResults = partition.getMatches();
+		int numResults = partitionResults.size();
+		String imageUrl = contextRoot + "/images/line.jpg";
+		String replayUrl = null;
+		String prettyDateTime = null;
+		if(numResults == 1) {
+			imageUrl = contextRoot + "/images/mark_one.jpg";
+		  	SearchResult result = (SearchResult) partitionResults.get(0);
+			replayUrl = results.resultToReplayUrl(result);
+			prettyDateTime = fmt.format("TimelineView.markDateTitle",results.resultToDate(result));
+			
+		} else if (numResults > 1) {
+			imageUrl = contextRoot + "/images/mark_several.jpg";
+		  	SearchResult result = (SearchResult) partitionResults.get(numResults - 1);
+			replayUrl = results.resultToReplayUrl(result);
+			prettyDateTime = fmt.format("TimelineView.markDateTitle",results.resultToDate(result));
+
+		}
+		if((i > 0) && (i < numPartitions)) {
+
+%><img wmSpecial="1" border=0 width=1 height=16 src="<%= contextRoot %>/images/linemark.jpg"><%
+		
+		}
+
+		if(replayUrl == null) {
+
+%><img wmSpecial="1" border=0 width=7 height=16 src="<%= imageUrl %>"><%
+		
+		} else {
+
+%><a wmSpecial="1" href="<%= replayUrl %>"><img wmSpecial="1" border=0 width=7 height=16 title="<%= prettyDateTime %>" src="<%= imageUrl %>"></a><%
+
+		}
+	}
+
+%></td>
+					<td nowrap><%
+						titleString = "";
+						if(next != null) {
+							titleString = "title=\"" + 
+								fmt.format("TimelineView.nextVersionTitle",
+									results.resultToDate(next)) + "\"";
+							%><a wmSpecial="1" href="<%= results.resultToReplayUrl(next) %>"><%
+						}
+						%><img wmSpecial="1" <%= titleString %> border=0 width=13 height=20 src="<%= contextRoot %>/images/next.jpg"><%
+						if(first != null) {
+							%></a><%
+						}
+						titleString = "";
+						if(last != null) {
+							titleString = "title=\"" + 
+								fmt.format("TimelineView.lastVersionTitle",
+									results.resultToDate(last)) + "\"";
+							%><a wmSpecial="1" href="<%= results.resultToReplayUrl(last) %>"><%
+						}
+						%><img wmSpecial="1" <%= titleString %> border=0 width=19 height=20 src="<%= contextRoot %>/images/last.jpg"><%
+						if(first != null) {
+							%></a><%
+						}
+					%></td>
+				</tr>
+			</table>
+		</td>
+		<td align="right" width="400">
+			<!-- Resolution -->
+			<!--
+			 need to get cookie data passing set up before this can be re-enabled:
+			<form wmSpecial="1" name="timeline" method="GET" target="_top" action="<%= contextRoot + "/frameset" %>">
+				<input type="hidden" name="url" value="<%= searchUrl %>">
+				<input type="hidden" name="exactdate" value="<%= exactDateStr %>">
+				<input type="hidden" name="type" value="urlclosestquery">
+				<%= fmt.format("TimelineView.timeRange") %>
+				<select NAME="resolution" SIZE="1" onChange="changeResolution()">
+					<option <%= yearsOptSelected %> value="years">
+						<%= fmt.format("TimelineView.timeRange.years") %>
+					</option>
+					<option <%= monthsOptSelected %> value="months">
+						<%= fmt.format("TimelineView.timeRange.months") %>
+					</option>
+					<option <%= daysOptSelected %>  value="days">
+						<%= fmt.format("TimelineView.timeRange.days") %>
+					</option>
+					<option <%= hoursOptSelected %> value="hours">
+						<%= fmt.format("TimelineView.timeRange.hours") %>
+					</option>
+					<option <%= autoOptSelected %> value="auto"><%= autoOptString %></option>
+				</select>&nbsp;<%= 
+					fmt.format("TimelineView.metaDataCheck") 
+				%><input type="checkbox" name="metamode" value="yes" <%=
+					metaChecked 
+				%> onClick="changeMeta()">&nbsp
+				  -->
+				  <a wmSpecial="1" href="<%= contextRoot %>/help.jsp" target="_top"><%=
+					fmt.format("UIGlobal.helpLink")
+				%></a>
+			</form>
+		</td>
+		<td>
+			<img wmSpecial="1" alt='' height='1' src='<%= contextRoot %>/images/1px.gif' width='5'>
+		</td>
+	</tr>
+</table>
+</div>
+<!--
+  ======================================
+  END Wayback INSERTED TIMELINE BANNER
+  ======================================
+-->


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2147] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp/replay/Disclaimer.jsp

From: <bra...@us...> - 2008-01-30 01:48:23

Revision: 2147
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2147&view=rev
Author:   bradtofel
Date:     2008-01-29 17:48:22 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
INITIAL REV: extracted disclaimer functionality from ArchivalUrlReplayRenderer to here.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/Disclaimer.jsp

Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/Disclaimer.jsp
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/Disclaimer.jsp	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/Disclaimer.jsp	2008-01-30 01:48:22 UTC (rev 2147)
@@ -0,0 +1,45 @@
+<%@ page import="java.util.Date" %>
+<%@ page import="org.archive.wayback.WaybackConstants" %>
+<%@ page import="org.archive.wayback.core.Timestamp" %>
+<%@ page import="org.archive.wayback.core.SearchResult" %>
+<%@ page import="org.archive.wayback.core.UIResults" %>
+<%@ page import="org.archive.wayback.core.WaybackRequest" %>
+<%@ page import="org.archive.wayback.query.UIQueryResults" %>
+<%@ page import="org.archive.wayback.util.StringFormatter" %>
+<%
+UIQueryResults results = (UIQueryResults) UIResults.getFromRequest(request);
+
+StringFormatter fmt = results.getFormatter();
+SearchResult result = results.getResult();
+String dupeMsg = "";
+if(result != null) {
+        String dupeType = result.get(WaybackConstants.RESULT_DUPLICATE_ANNOTATION);
+        if(dupeType != null) {
+                String dupeDate = result.get(WaybackConstants.RESULT_DUPLICATE_STORED_DATE);
+                String prettyDate = "";
+                if(dupeDate != null) {
+                	  Timestamp dupeTS = Timestamp.parseBefore(dupeDate);
+                    prettyDate = "(" + 
+                    		fmt.format("MetaReplay.captureDateDisplay",
+                    				dupeTS.getDate()) + ")";
+                }
+                dupeMsg = " Note that this document was downloaded, and not saved because it was a duplicate of a previously captured version " + 
+                          prettyDate + ". HTTP headers presented here are from the original capture.";
+        }
+}
+
+Date requestDate = results.getExactRequestedTimestamp().getDate();
+String requestUrl = results.getSearchUrl();
+
+String wmNotice = fmt.format("ReplayView.banner", requestUrl, requestDate);
+String wmHideNotice = fmt.format("ReplayView.bannerHideLink");
+
+String contextRoot = request.getScheme() + "://" + request.getServerName() + ":"
++ request.getServerPort() + request.getContextPath();
+String jsUrl = contextRoot + "/replay/disclaim.js";
+%>
+<script type="text/javascript">
+  var wmNotice = "<%= wmNotice %><%= dupeMsg %>";
+  var wmHideNotice = "<%= wmHideNotice %>";
+</script>
+<script type="text/javascript" src="<%= jsUrl %>"></script>


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2146] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp/WEB-INF/web.xml

From: <bra...@us...> - 2008-01-30 01:44:21

Revision: 2146
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2146&view=rev
Author:   bradtofel
Date:     2008-01-29 17:44:21 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
TWEAK: changed declared encoding to UTF-8, changes webapp spec.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/web.xml

Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/web.xml
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/web.xml	2008-01-30 01:42:34 UTC (rev 2145)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/web.xml	2008-01-30 01:44:21 UTC (rev 2146)
@@ -1,8 +1,11 @@
-<?xml version="1.0"?>
-<!DOCTYPE web-app PUBLIC "-//Sun Microsystems, Inc.//DTD Web Application 2.3//EN"
-	"http://java.sun.com/dtd/web-app_2_3.dtd">
-<web-app>
+<?xml version="1.0" encoding="UTF-8"?>
 
+<web-app xmlns="http://java.sun.com/xml/ns/j2ee"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+        xsi:schemaLocation="http://java.sun.com/xml/ns/j2ee
+            http://java.sun.com/xml/ns/j2ee/web-app_2_4.xsd"
+        version="2.4">
+
 	<security-constraint>
 		<web-resource-collection>
 			<web-resource-name>Secured-Wayback</web-resource-name>


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2145] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp/WEB-INF/classes

From: <bra...@us...> - 2008-01-30 01:42:33

Revision: 2145
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2145&view=rev
Author:   bradtofel
Date:     2008-01-29 17:42:34 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
CONFIG: added two month partition text.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI_fr_CA.properties

Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties	2008-01-30 00:58:08 UTC (rev 2144)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI.properties	2008-01-30 01:42:34 UTC (rev 2145)
@@ -43,6 +43,7 @@
 TimelineView.viewingVersionDate={0,date,H:mm:ss MMM d, yyyy}
 TimelineView.timeRange=Time Range
 TimelineView.timeRange.years=Years
+TimelineView.timeRange.twomonths=Months
 TimelineView.timeRange.months=Months
 TimelineView.timeRange.days=Days
 TimelineView.timeRange.hours=Hours

Modified: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI_fr_CA.properties
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI_fr_CA.properties	2008-01-30 00:58:08 UTC (rev 2144)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/WEB-INF/classes/WaybackUI_fr_CA.properties	2008-01-30 01:42:34 UTC (rev 2145)
@@ -39,6 +39,7 @@
 TimelineView.viewingVersionDate={0,date,[H:mm:ss MMM d, yyyy]}
 TimelineView.timeRange=[Time Range]
 TimelineView.timeRange.years=[Years]
+TimelineView.timeRange.twomonths=[Months]
 TimelineView.timeRange.months=[Months]
 TimelineView.timeRange.days=[Days]
 TimelineView.timeRange.hours=[Hours]


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2144] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp/replay/ClientSideJSInsert.jsp

From: <bra...@us...> - 2008-01-30 00:58:06

Revision: 2144
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2144&view=rev
Author:   bradtofel
Date:     2008-01-29 16:58:08 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
INITIAL REV: split out functionality that had been in ArchivalUrlReplayRenderer into this .jsp.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/ClientSideJSInsert.jsp

Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/ClientSideJSInsert.jsp
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/ClientSideJSInsert.jsp	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/ClientSideJSInsert.jsp	2008-01-30 00:58:08 UTC (rev 2144)
@@ -0,0 +1,21 @@
+<%@ page import="java.util.Date" %>
+<%@ page import="org.archive.wayback.ResultURIConverter" %>
+<%@ page import="org.archive.wayback.core.Timestamp" %>
+<%@ page import="org.archive.wayback.core.UIResults" %>
+<%@ page import="org.archive.wayback.core.WaybackRequest" %>
+<%@ page import="org.archive.wayback.query.UIQueryResults" %>
+<%@ page import="org.archive.wayback.util.StringFormatter" %>
+<%
+UIQueryResults results = (UIQueryResults) UIResults.getFromRequest(request);
+ResultURIConverter uriConverter = results.getURIConverter();
+String requestDate = results.getExactRequestedTimestamp().getDateStr();
+String contextPath = uriConverter.makeReplayURI(requestDate, "");
+String contextRoot = request.getScheme() + "://" + request.getServerName() + ":" 
+  + request.getServerPort() + request.getContextPath();
+
+String jsUrl = contextRoot + "/replay/client-rewrite.js";
+%>
+<script type="text/javascript">
+  var sWayBackCGI = "<%= contextPath %>";
+</script>
+<script type="text/javascript" src="<%= jsUrl %>" ></script>


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2143] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp/replay

From: <bra...@us...> - 2008-01-30 00:53:45

Revision: 2143
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2143&view=rev
Author:   bradtofel
Date:     2008-01-29 16:53:32 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
RENAME: wm.js to client-rewrite.js, which is now all this code does.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/client-rewrite.js

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/wm.js

Copied: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/client-rewrite.js (from rev 2141, trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/wm.js)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/client-rewrite.js	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/client-rewrite.js	2008-01-30 00:53:32 UTC (rev 2143)
@@ -0,0 +1,62 @@
+
+function xResolveUrl(url) {
+   var image = new Image();
+   image.src = url;
+   return image.src;
+}
+function xLateUrl(aCollection, sProp) {
+   var i = 0;
+   for(i = 0; i < aCollection.length; i++) {
+      if(aCollection[i].getAttribute(sProp) &&
+         (aCollection[i].getAttribute(sProp).length > 0) &&
+         (typeof(aCollection[i][sProp]) == "string")) {
+
+         if(aCollection[i][sProp].indexOf("mailto:") == -1 &&
+            aCollection[i][sProp].indexOf("javascript:") == -1) {
+
+            var wmSpecial = aCollection[i].getAttribute("wmSpecial");
+            if(wmSpecial && wmSpecial.length > 0) {
+            } else {
+                if(aCollection[i][sProp].indexOf("http") == 0) {
+                    aCollection[i][sProp] = sWayBackCGI + aCollection[i][sProp];
+                } else {
+                    aCollection[i][sProp] = sWayBackCGI + xResolveUrl(aCollection[i][sProp]);
+                }
+            }
+         }
+      }
+   }
+}
+
+xLateUrl(document.getElementsByTagName("IMG"),"src");
+xLateUrl(document.getElementsByTagName("A"),"href");
+xLateUrl(document.getElementsByTagName("AREA"),"href");
+xLateUrl(document.getElementsByTagName("OBJECT"),"codebase");
+xLateUrl(document.getElementsByTagName("OBJECT"),"data");
+xLateUrl(document.getElementsByTagName("APPLET"),"codebase");
+xLateUrl(document.getElementsByTagName("APPLET"),"archive");
+xLateUrl(document.getElementsByTagName("EMBED"),"src");
+xLateUrl(document.getElementsByTagName("IFRAME"),"src");
+xLateUrl(document.getElementsByTagName("BODY"),"background");
+var forms = document.getElementsByTagName("FORM");
+if (forms) {
+		var j = 0;
+		for (j = 0; j < forms.length; j++) {
+			f = forms[j];
+			if (typeof(f.action)  == "string") {
+				if(typeof(f.method)  == "string") {
+					if(typeof(f.method) != "post") {
+						var resolved = "";
+						var orig = f.action;
+						if(f.action.indexOf("http") == 0) {
+							resolved = f.action;
+						} else {
+							resolved = xResolveUrl(f.action);
+						}
+						// this does not work on firefox...
+				    	f.action = sWayBackCGI + resolved;
+				    }
+				}
+			}
+		}
+}

Deleted: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/wm.js
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/wm.js	2008-01-30 00:50:40 UTC (rev 2142)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/wm.js	2008-01-30 00:53:32 UTC (rev 2143)
@@ -1,62 +0,0 @@
-
-function xResolveUrl(url) {
-   var image = new Image();
-   image.src = url;
-   return image.src;
-}
-function xLateUrl(aCollection, sProp) {
-   var i = 0;
-   for(i = 0; i < aCollection.length; i++) {
-      if(aCollection[i].getAttribute(sProp) &&
-         (aCollection[i].getAttribute(sProp).length > 0) &&
-         (typeof(aCollection[i][sProp]) == "string")) {
-
-         if(aCollection[i][sProp].indexOf("mailto:") == -1 &&
-            aCollection[i][sProp].indexOf("javascript:") == -1) {
-
-            var wmSpecial = aCollection[i].getAttribute("wmSpecial");
-            if(wmSpecial && wmSpecial.length > 0) {
-            } else {
-                if(aCollection[i][sProp].indexOf("http") == 0) {
-                    aCollection[i][sProp] = sWayBackCGI + aCollection[i][sProp];
-                } else {
-                    aCollection[i][sProp] = sWayBackCGI + xResolveUrl(aCollection[i][sProp]);
-                }
-            }
-         }
-      }
-   }
-}
-
-xLateUrl(document.getElementsByTagName("IMG"),"src");
-xLateUrl(document.getElementsByTagName("A"),"href");
-xLateUrl(document.getElementsByTagName("AREA"),"href");
-xLateUrl(document.getElementsByTagName("OBJECT"),"codebase");
-xLateUrl(document.getElementsByTagName("OBJECT"),"data");
-xLateUrl(document.getElementsByTagName("APPLET"),"codebase");
-xLateUrl(document.getElementsByTagName("APPLET"),"archive");
-xLateUrl(document.getElementsByTagName("EMBED"),"src");
-xLateUrl(document.getElementsByTagName("IFRAME"),"src");
-xLateUrl(document.getElementsByTagName("BODY"),"background");
-var forms = document.getElementsByTagName("FORM");
-if (forms) {
-		var j = 0;
-		for (j = 0; j < forms.length; j++) {
-			f = forms[j];
-			if (typeof(f.action)  == "string") {
-				if(typeof(f.method)  == "string") {
-					if(typeof(f.method) != "post") {
-						var resolved = "";
-						var orig = f.action;
-						if(f.action.indexOf("http") == 0) {
-							resolved = f.action;
-						} else {
-							resolved = xResolveUrl(f.action);
-						}
-						// this does not work on firefox...
-				    	f.action = sWayBackCGI + resolved;
-				    }
-				}
-			}
-		}
-}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2142] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp/replay/ArchiveComment.jsp

From: <bra...@us...> - 2008-01-30 00:50:35

Revision: 2142
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2142&view=rev
Author:   bradtofel
Date:     2008-01-29 16:50:40 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
INITIAL REV: split out archival comment functionality into this .jsp include.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/ArchiveComment.jsp

Added: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/ArchiveComment.jsp
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/ArchiveComment.jsp	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/ArchiveComment.jsp	2008-01-30 00:50:40 UTC (rev 2142)
@@ -0,0 +1,22 @@
+<%@ page import="java.util.Date" %>
+<%@ page import="org.archive.wayback.core.Timestamp" %>
+<%@ page import="org.archive.wayback.core.UIResults" %>
+<%@ page import="org.archive.wayback.query.UIQueryResults" %>
+<%@ page import="org.archive.wayback.util.StringFormatter" %>
+<%
+UIQueryResults results = (UIQueryResults) UIResults.getFromRequest(request);
+StringFormatter fmt = results.getFormatter();
+Date exactDate = results.getExactRequestedTimestamp().getDate();
+Date now = new Date();
+String prettyDateFormat = "{0,date,H:mm:ss MMM d, yyyy}";
+String prettyArchiveString = fmt.format(prettyDateFormat,exactDate);
+String prettyRequestString = fmt.format(prettyDateFormat,now);
+%>
+<!--
+     FILE ARCHIVED ON <%= prettyArchiveString %> AND RETRIEVED FROM THE
+     INTERNET ARCHIVE ON <%= prettyRequestString %>.
+     JAVASCRIPT APPENDED BY WAYBACK MACHINE, COPYRIGHT INTERNET ARCHIVE.
+
+     ALL OTHER CONTENT MAY ALSO BE PROTECTED BY COPYRIGHT (17 U.S.C.
+     SECTION 108(a)(3)).
+-->


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2141] trunk/archive-access/projects/wayback/ wayback-webapp/src/main/webapp

From: <bra...@us...> - 2008-01-30 00:49:32

Revision: 2141
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2141&view=rev
Author:   bradtofel
Date:     2008-01-29 16:49:30 -0800 (Tue, 29 Jan 2008)

Log Message:
-----------
MOVED: wm.js into replay/ directory

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/wm.js

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/wm.js

Copied: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/wm.js (from rev 2140, trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/wm.js)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/wm.js	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/replay/wm.js	2008-01-30 00:49:30 UTC (rev 2141)
@@ -0,0 +1,62 @@
+
+function xResolveUrl(url) {
+   var image = new Image();
+   image.src = url;
+   return image.src;
+}
+function xLateUrl(aCollection, sProp) {
+   var i = 0;
+   for(i = 0; i < aCollection.length; i++) {
+      if(aCollection[i].getAttribute(sProp) &&
+         (aCollection[i].getAttribute(sProp).length > 0) &&
+         (typeof(aCollection[i][sProp]) == "string")) {
+
+         if(aCollection[i][sProp].indexOf("mailto:") == -1 &&
+            aCollection[i][sProp].indexOf("javascript:") == -1) {
+
+            var wmSpecial = aCollection[i].getAttribute("wmSpecial");
+            if(wmSpecial && wmSpecial.length > 0) {
+            } else {
+                if(aCollection[i][sProp].indexOf("http") == 0) {
+                    aCollection[i][sProp] = sWayBackCGI + aCollection[i][sProp];
+                } else {
+                    aCollection[i][sProp] = sWayBackCGI + xResolveUrl(aCollection[i][sProp]);
+                }
+            }
+         }
+      }
+   }
+}
+
+xLateUrl(document.getElementsByTagName("IMG"),"src");
+xLateUrl(document.getElementsByTagName("A"),"href");
+xLateUrl(document.getElementsByTagName("AREA"),"href");
+xLateUrl(document.getElementsByTagName("OBJECT"),"codebase");
+xLateUrl(document.getElementsByTagName("OBJECT"),"data");
+xLateUrl(document.getElementsByTagName("APPLET"),"codebase");
+xLateUrl(document.getElementsByTagName("APPLET"),"archive");
+xLateUrl(document.getElementsByTagName("EMBED"),"src");
+xLateUrl(document.getElementsByTagName("IFRAME"),"src");
+xLateUrl(document.getElementsByTagName("BODY"),"background");
+var forms = document.getElementsByTagName("FORM");
+if (forms) {
+		var j = 0;
+		for (j = 0; j < forms.length; j++) {
+			f = forms[j];
+			if (typeof(f.action)  == "string") {
+				if(typeof(f.method)  == "string") {
+					if(typeof(f.method) != "post") {
+						var resolved = "";
+						var orig = f.action;
+						if(f.action.indexOf("http") == 0) {
+							resolved = f.action;
+						} else {
+							resolved = xResolveUrl(f.action);
+						}
+						// this does not work on firefox...
+				    	f.action = sWayBackCGI + resolved;
+				    }
+				}
+			}
+		}
+}

Deleted: trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/wm.js
===================================================================
--- trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/wm.js	2008-01-15 23:21:02 UTC (rev 2140)
+++ trunk/archive-access/projects/wayback/wayback-webapp/src/main/webapp/wm.js	2008-01-30 00:49:30 UTC (rev 2141)
@@ -1,62 +0,0 @@
-
-function xResolveUrl(url) {
-   var image = new Image();
-   image.src = url;
-   return image.src;
-}
-function xLateUrl(aCollection, sProp) {
-   var i = 0;
-   for(i = 0; i < aCollection.length; i++) {
-      if(aCollection[i].getAttribute(sProp) &&
-         (aCollection[i].getAttribute(sProp).length > 0) &&
-         (typeof(aCollection[i][sProp]) == "string")) {
-
-         if(aCollection[i][sProp].indexOf("mailto:") == -1 &&
-            aCollection[i][sProp].indexOf("javascript:") == -1) {
-
-            var wmSpecial = aCollection[i].getAttribute("wmSpecial");
-            if(wmSpecial && wmSpecial.length > 0) {
-            } else {
-                if(aCollection[i][sProp].indexOf("http") == 0) {
-                    aCollection[i][sProp] = sWayBackCGI + aCollection[i][sProp];
-                } else {
-                    aCollection[i][sProp] = sWayBackCGI + xResolveUrl(aCollection[i][sProp]);
-                }
-            }
-         }
-      }
-   }
-}
-
-xLateUrl(document.getElementsByTagName("IMG"),"src");
-xLateUrl(document.getElementsByTagName("A"),"href");
-xLateUrl(document.getElementsByTagName("AREA"),"href");
-xLateUrl(document.getElementsByTagName("OBJECT"),"codebase");
-xLateUrl(document.getElementsByTagName("OBJECT"),"data");
-xLateUrl(document.getElementsByTagName("APPLET"),"codebase");
-xLateUrl(document.getElementsByTagName("APPLET"),"archive");
-xLateUrl(document.getElementsByTagName("EMBED"),"src");
-xLateUrl(document.getElementsByTagName("IFRAME"),"src");
-xLateUrl(document.getElementsByTagName("BODY"),"background");
-var forms = document.getElementsByTagName("FORM");
-if (forms) {
-		var j = 0;
-		for (j = 0; j < forms.length; j++) {
-			f = forms[j];
-			if (typeof(f.action)  == "string") {
-				if(typeof(f.method)  == "string") {
-					if(typeof(f.method) != "post") {
-						var resolved = "";
-						var orig = f.action;
-						if(f.action.indexOf("http") == 0) {
-							resolved = f.action;
-						} else {
-							resolved = xResolveUrl(f.action);
-						}
-						// this does not work on firefox...
-				    	f.action = sWayBackCGI + resolved;
-				    }
-				}
-			}
-		}
-}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2140] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java

From: <bra...@us...> - 2008-01-15 23:21:01

Revision: 2140
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2140&view=rev
Author:   bradtofel
Date:     2008-01-15 15:21:02 -0800 (Tue, 15 Jan 2008)

Log Message:
-----------
FEATURE: reimplemented lost NotInArchive logging.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java	2008-01-15 03:06:37 UTC (rev 2139)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/webapp/AccessPoint.java	2008-01-15 23:21:02 UTC (rev 2140)
@@ -26,6 +26,7 @@
 
 import java.io.IOException;
 import java.util.Properties;
+import java.util.logging.Logger;
 
 import javax.servlet.RequestDispatcher;
 import javax.servlet.ServletException;
@@ -47,6 +48,7 @@
 import org.archive.wayback.exception.AuthenticationControlException;
 import org.archive.wayback.exception.BadQueryException;
 import org.archive.wayback.exception.ResourceNotAvailableException;
+import org.archive.wayback.exception.ResourceNotInArchiveException;
 import org.archive.wayback.exception.WaybackException;
 import org.archive.wayback.util.operator.BooleanOperator;
 import org.springframework.beans.factory.BeanNameAware;
@@ -66,6 +68,9 @@
  */
 public class AccessPoint implements RequestContext, BeanNameAware {
 
+	private static final Logger LOGGER = Logger.getLogger(
+			AccessPoint.class.getName());
+	
 	private boolean useServerName = false;
 	private int contextPort = 0;
 	private String contextName = null;
@@ -309,6 +314,7 @@
 			replay.renderResource(httpRequest, httpResponse, wbRequest,
 					closest, resource, uriConverter, captureResults);
 		} catch(WaybackException e) {
+			logNotInArchive(e,wbRequest);
 			replay.renderException(httpRequest, httpResponse, wbRequest, e);
 		} finally {
 			if(resource != null) {
@@ -337,9 +343,23 @@
 						results,uriConverter);
 			}
 		} catch(WaybackException e) {
+			logNotInArchive(e,wbRequest);
 			query.renderException(httpRequest, httpResponse, wbRequest, e);
 		}
 	}
+	
+	private void logNotInArchive(WaybackException e, WaybackRequest r) {
+		if(e instanceof ResourceNotInArchiveException) {
+			String url = r.get(WaybackConstants.REQUEST_URL);
+			StringBuilder sb = new StringBuilder(100);
+			sb.append("NotInArchive\t");
+			sb.append(contextName).append("\t");
+			sb.append(contextPort).append("\t");
+			sb.append(url);
+			
+			LOGGER.info(sb.toString());
+		}
+	}
 
 	/**
 	 * @param contextPort the contextPort to set


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2139] trunk/archive-access/projects/wayback/ dist/src/scripts/url-client

From: <bra...@us...> - 2008-01-15 03:06:31

Revision: 2139
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2139&view=rev
Author:   bradtofel
Date:     2008-01-14 19:06:37 -0800 (Mon, 14 Jan 2008)

Log Message:
-----------
REFACTOR: UrlCanonicalizer has changed names and packages.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/dist/src/scripts/url-client

Modified: trunk/archive-access/projects/wayback/dist/src/scripts/url-client
===================================================================
--- trunk/archive-access/projects/wayback/dist/src/scripts/url-client	2008-01-15 03:03:21 UTC (rev 2138)
+++ trunk/archive-access/projects/wayback/dist/src/scripts/url-client	2008-01-15 03:06:37 UTC (rev 2139)
@@ -75,7 +75,7 @@
 # Main ArcIndexer class.
 if [ -z "$CLASS_MAIN" ]
 then
-  CLASS_MAIN='org.archive.wayback.util.UrlCanonicalizer'
+  CLASS_MAIN='org.archive.wayback.util.url.AggressiveUrlCanonicalizer'
 fi
 
 CLASSPATH=${CP} $JAVACMD ${JAVA_OPTS} $CLASS_MAIN $@


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2138] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback

From: <bra...@us...> - 2008-01-15 03:03:16

Revision: 2138
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2138&view=rev
Author:   bradtofel
Date:     2008-01-14 19:03:21 -0800 (Mon, 14 Jan 2008)

Log Message:
-----------
FEATURE: UrlCanonicalizer customization capabilities. Previous hard-coded UrlCanonicalizer is now AggressiveUrlCanonicalizer, which is the default, but now it can be overridden with another UrlCanonicalizer implementation. main() of WarcIndexer and ArcIndexer now include parsing of "-identity" option, which causes the IdentityUrlCanonicalizer to be used -- passing through URLs to the CDX as they appear in the ARC file.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SelfRedirectFilter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java	2008-01-15 03:00:16 UTC (rev 2137)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java	2008-01-15 03:03:21 UTC (rev 2138)
@@ -32,6 +32,7 @@
 import org.apache.commons.httpclient.URIException;
 import org.archive.io.arc.ARCLocation;
 import org.archive.io.arc.ARCRecord;
+import org.archive.wayback.UrlCanonicalizer;
 import org.archive.wayback.WaybackConstants;
 import org.archive.wayback.core.CaptureSearchResults;
 import org.archive.wayback.core.Resource;
@@ -44,8 +45,7 @@
 import org.archive.wayback.exception.WaybackException;
 import org.archive.wayback.resourcestore.ARCRecordToSearchResultAdapter;
 import org.archive.wayback.resourcestore.ArcResource;
-import org.archive.wayback.util.Adapter;
-import org.archive.wayback.util.UrlCanonicalizer;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
 
 /**
  *
@@ -61,10 +61,15 @@
 	private ARCCacheDirectory arcCacheDir = null;
 	private URLCacher cacher = null;
 	private LiveWebLocalResourceIndex index = null;
-	static UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
-	private static Adapter<ARCRecord,SearchResult> adapter = 
-		new ARCRecordToSearchResultAdapter();
+	private UrlCanonicalizer canonicalizer = null;
+	private ARCRecordToSearchResultAdapter adapter = null;
 	
+	public LiveWebCache() {
+		canonicalizer = new AggressiveUrlCanonicalizer();
+		adapter = new ARCRecordToSearchResultAdapter();
+		adapter.setCanonicalizer(canonicalizer);
+	}
+	
 	/**
 	 * closes all resources
 	 */
@@ -330,4 +335,13 @@
 	public void setIndex(LiveWebLocalResourceIndex index) {
 		this.index = index;
 	}
+
+	public UrlCanonicalizer getCanonicalizer() {
+		return canonicalizer;
+	}
+
+	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
+		this.canonicalizer = canonicalizer;
+		adapter.setCanonicalizer(canonicalizer);
+	}
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java	2008-01-15 03:00:16 UTC (rev 2137)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java	2008-01-15 03:03:21 UTC (rev 2138)
@@ -25,11 +25,13 @@
 package org.archive.wayback.resourceindex;
 
 import java.io.IOException;
+import java.util.Iterator;
 
 import org.apache.commons.httpclient.URIException;
 import org.archive.net.UURI;
 import org.archive.net.UURIFactory;
 import org.archive.wayback.ResourceIndex;
+import org.archive.wayback.UrlCanonicalizer;
 import org.archive.wayback.WaybackConstants;
 import org.archive.wayback.resourceindex.filters.CaptureToUrlResultFilter;
 import org.archive.wayback.resourceindex.filters.CounterFilter;
@@ -39,7 +41,6 @@
 import org.archive.wayback.resourceindex.filters.GuardRailFilter;
 import org.archive.wayback.resourceindex.filters.HostMatchFilter;
 import org.archive.wayback.resourceindex.filters.SelfRedirectFilter;
-import org.archive.wayback.resourceindex.filters.StartDateFilter;
 import org.archive.wayback.resourceindex.filters.UrlMatchFilter;
 import org.archive.wayback.resourceindex.filters.UrlPrefixMatchFilter;
 import org.archive.wayback.resourceindex.filters.WindowEndFilter;
@@ -58,7 +59,7 @@
 import org.archive.wayback.util.CloseableIterator;
 import org.archive.wayback.util.ObjectFilter;
 import org.archive.wayback.util.ObjectFilterChain;
-import org.archive.wayback.util.UrlCanonicalizer;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
 
 /**
  * 
@@ -77,11 +78,15 @@
 
 	protected SearchResultSource source;
 	
-	private UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
+	private UrlCanonicalizer canonicalizer = null;
 	
 	private boolean dedupeRecords = false;
 
-	private void filterRecords(CloseableIterator<SearchResult> itr,
+	public LocalResourceIndex() {
+		canonicalizer = new AggressiveUrlCanonicalizer();
+	}
+	
+	private void filterRecords(Iterator<SearchResult> itr,
 			ObjectFilter<SearchResult> filter, SearchResults results,
 			boolean forwards) throws IOException {
 
@@ -98,7 +103,11 @@
 				results.addSearchResult(result, forwards);
 			}
 		}
-		source.cleanup(itr);
+		if(itr instanceof CloseableIterator) {
+			CloseableIterator<SearchResult> citr =
+				(CloseableIterator<SearchResult>) itr;
+			source.cleanup(citr);
+		}
 	}
 
 	private String getRequired(WaybackRequest wbRequest, String field,
@@ -216,27 +225,27 @@
 		if (searchType.equals(WaybackConstants.REQUEST_REPLAY_QUERY)
 				|| searchType.equals(WaybackConstants.REQUEST_CLOSEST_QUERY)) {
 
-			results = new CaptureSearchResults(); 
+			results = new CaptureSearchResults();
+
 			ObjectFilterChain<SearchResult> forwardFilters = 
 				new ObjectFilterChain<SearchResult>();
-			ObjectFilterChain<SearchResult> reverseFilters = 
-				new ObjectFilterChain<SearchResult>();
 
+//			ObjectFilterChain<SearchResult> reverseFilters = 
+//				new ObjectFilterChain<SearchResult>();
+
 			// use the same guardrail for both:
 			forwardFilters.addFilter(guardrail);
-			reverseFilters.addFilter(guardrail);
+//			reverseFilters.addFilter(guardrail);
 			
-			// BUGBUG: won't work when closest is a dupe!
 			forwardFilters.addFilter(new DuplicateRecordFilter());
-			reverseFilters.addFilter(new DuplicateRecordFilter());
 			
 			// match URL key:
 			forwardFilters.addFilter(new UrlMatchFilter(keyUrl));
-			reverseFilters.addFilter(new UrlMatchFilter(keyUrl));
+//			reverseFilters.addFilter(new UrlMatchFilter(keyUrl));
 
 			if(hostMatchFilter != null) {
 				forwardFilters.addFilter(hostMatchFilter);
-				reverseFilters.addFilter(hostMatchFilter);
+//				reverseFilters.addFilter(hostMatchFilter);
 			}
 			
 			// be sure to only include records within the date range we want:
@@ -246,11 +255,11 @@
 			// requested range.
 			DateRangeFilter drFilter = new DateRangeFilter(startDate,endDate);
 			forwardFilters.addFilter(drFilter);
-			reverseFilters.addFilter(drFilter);
+//			reverseFilters.addFilter(drFilter);
 			
 			// abort processing if we hit a date outside the search range:
 			forwardFilters.addFilter(new EndDateFilter(endDate));
-			reverseFilters.addFilter(new StartDateFilter(startDate));
+//			reverseFilters.addFilter(new StartDateFilter(startDate));
 
 			// for replay, do not include records that redirect to
 			// themselves.. We'll leave this for both closest and replays,
@@ -258,39 +267,54 @@
 			// timeline in which case, we don't want to show captures that
 			// redirect to themselves in the timeline if they are not viewable.
 			SelfRedirectFilter selfRedirectFilter = new SelfRedirectFilter();
+			selfRedirectFilter.setCanonicalizer(canonicalizer);
 			forwardFilters.addFilter(selfRedirectFilter);
-			reverseFilters.addFilter(selfRedirectFilter);
+//			reverseFilters.addFilter(selfRedirectFilter);
 			
 			// possibly filter via exclusions:
 			if(exclusion != null) {
 				forwardFilters.addFilter(preExCounter);
 				forwardFilters.addFilter(exclusion);
 
-				reverseFilters.addFilter(preExCounter);
-				reverseFilters.addFilter(exclusion);
+//				reverseFilters.addFilter(preExCounter);
+//				reverseFilters.addFilter(exclusion);
 			}
 			forwardFilters.addFilter(finalCounter);
-			reverseFilters.addFilter(finalCounter);
+//			reverseFilters.addFilter(finalCounter);
 
-			int resultsPerDirection = (int) Math.floor(resultsPerPage / 2);
-			if (resultsPerDirection * 2 == resultsPerPage) {
-				forwardFilters.addFilter(new WindowEndFilter(
-						resultsPerDirection));
-			} else {
-				forwardFilters.addFilter(new WindowEndFilter(
-						resultsPerDirection + 1));
-			}
-			reverseFilters.addFilter(new WindowEndFilter(resultsPerDirection));
+			forwardFilters.addFilter(new WindowEndFilter(resultsPerPage));
+//			int resultsPerDirection = (int) Math.floor(resultsPerPage / 2);
+//			reverseFilters.addFilter(new WindowEndFilter(resultsPerDirection));
 
-			startKey = keyUrl + " " + exactDate;
+			startKey = keyUrl;
 
-			// first the reverse search:
 			try {
-				filterRecords(source.getPrefixIterator(startKey), reverseFilters,
-						results, true);
-				// then the forwards:
-				filterRecords(source.getPrefixReverseIterator(startKey),
-						forwardFilters, results, false);
+//				CloseableIterator<SearchResult> reverse =
+//					new AdaptedObjectFilterIterator<SearchResult>(
+//					source.getPrefixReverseIterator(startKey),
+//					reverseFilters);
+
+//				// reverse the reverseResults:
+//				ArrayList<SearchResult> reverseResults = 
+//					new ArrayList<SearchResult>();
+//				while(reverse.hasNext()) {
+//					reverseResults.add(0, reverse.next());
+//				}
+				
+				// now make a composite of the reverse and forwards:
+				
+				CloseableIterator<SearchResult> forward =
+					source.getPrefixIterator(startKey);
+//				
+//				CompositeIterator<SearchResult> resultsItr =
+//					new CompositeIterator<SearchResult>();
+//				resultsItr.addComponent(reverseResults.iterator());
+//				resultsItr.addComponent(forward);
+				
+				// and filter:
+//				filterRecords(resultsItr, forwardFilters, results, true);
+				filterRecords(forward, forwardFilters, results, true);
+
 			} catch (IOException e) {
 				throw new ResourceIndexNotAvailableException(
 						e.getLocalizedMessage());
@@ -345,13 +369,11 @@
 			}
 			filters.addFilter(new DateRangeFilter(startDate, endDate));
 			// possibly filter via exclusions:
-			if (exclusion == null) {
-				filters.addFilter(new CaptureToUrlResultFilter());
-			} else {
+			if (exclusion != null) {
 				filters.addFilter(preExCounter);
 				filters.addFilter(exclusion);
-				filters.addFilter(new CaptureToUrlResultFilter());
 			}
+			filters.addFilter(new CaptureToUrlResultFilter());
 			filters.addFilter(finalCounter);
 			startKey = keyUrl;
 
@@ -430,4 +452,12 @@
 	public void setDedupeRecords(boolean dedupeRecords) {
 		this.dedupeRecords = dedupeRecords;
 	}
+
+	public UrlCanonicalizer getCanonicalizer() {
+		return canonicalizer;
+	}
+
+	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
+		this.canonicalizer = canonicalizer;
+	}
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java	2008-01-15 03:00:16 UTC (rev 2137)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java	2008-01-15 03:03:21 UTC (rev 2138)
@@ -35,6 +35,7 @@
 
 import org.apache.commons.httpclient.URIException;
 import org.archive.wayback.ResourceIndex;
+import org.archive.wayback.UrlCanonicalizer;
 import org.archive.wayback.WaybackConstants;
 import org.archive.wayback.core.SearchResults;
 import org.archive.wayback.core.WaybackRequest;
@@ -42,8 +43,8 @@
 import org.archive.wayback.exception.BadQueryException;
 import org.archive.wayback.exception.ResourceIndexNotAvailableException;
 import org.archive.wayback.exception.ResourceNotInArchiveException;
-import org.archive.wayback.util.UrlCanonicalizer;
 import org.archive.wayback.util.flatfile.FlatFile;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
 
 /**
  *
@@ -75,8 +76,12 @@
 	private String mapPath;
 	private static Comparator<RangeGroup> comparator = 
 		RangeGroup.getComparator();
-	private UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
+	private UrlCanonicalizer canonicalizer = null;
 
+	public AlphaPartitionedIndex() {
+		canonicalizer = new AggressiveUrlCanonicalizer();
+	}
+	
 	@SuppressWarnings("unchecked")
 	private void reloadMapFile() throws IOException {
 		FlatFile ff = new FlatFile(mapPath);
@@ -235,4 +240,12 @@
 	public void setMapPath(String mapPath) {
 		this.mapPath = mapPath;
 	}
+
+	public UrlCanonicalizer getCanonicalizer() {
+		return canonicalizer;
+	}
+
+	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
+		this.canonicalizer = canonicalizer;
+	}
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SelfRedirectFilter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SelfRedirectFilter.java	2008-01-15 03:00:16 UTC (rev 2137)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SelfRedirectFilter.java	2008-01-15 03:03:21 UTC (rev 2138)
@@ -25,10 +25,11 @@
 package org.archive.wayback.resourceindex.filters;
 
 import org.apache.commons.httpclient.URIException;
+import org.archive.wayback.UrlCanonicalizer;
 import org.archive.wayback.WaybackConstants;
 import org.archive.wayback.core.SearchResult;
 import org.archive.wayback.util.ObjectFilter;
-import org.archive.wayback.util.UrlCanonicalizer;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
 
 /**
  * SearchResultFilter which INCLUDEs all records, unless they redirect to 
@@ -39,7 +40,10 @@
  */
 public class SelfRedirectFilter implements ObjectFilter<SearchResult> {
 
-	private UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
+	private UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();
+	public SelfRedirectFilter() {
+		canonicalizer = new AggressiveUrlCanonicalizer();
+	}
 	/* (non-Javadoc)
 	 * @see org.archive.wayback.resourceindex.SearchResultFilter#filterSearchResult(org.archive.wayback.core.SearchResult)
 	 */
@@ -63,4 +67,10 @@
 		}
 		return FILTER_INCLUDE;
 	}
+	public UrlCanonicalizer getCanonicalizer() {
+		return canonicalizer;
+	}
+	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
+		this.canonicalizer = canonicalizer;
+	}
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java	2008-01-15 03:00:16 UTC (rev 2137)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java	2008-01-15 03:03:21 UTC (rev 2138)
@@ -34,10 +34,11 @@
 import org.archive.io.arc.ARCRecordMetaData;
 import org.archive.net.UURI;
 import org.archive.net.UURIFactory;
+import org.archive.wayback.UrlCanonicalizer;
 import org.archive.wayback.WaybackConstants;
 import org.archive.wayback.core.SearchResult;
 import org.archive.wayback.util.Adapter;
-import org.archive.wayback.util.UrlCanonicalizer;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
 
 /**
  *
@@ -51,9 +52,11 @@
 	private static final Logger LOGGER = Logger.getLogger(
 			ARCRecordToSearchResultAdapter.class.getName());
 
-	// TODO: make this configurable based on the ResourceIndex
-	private static UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
-
+	private UrlCanonicalizer canonicalizer = null;
+	
+	public ARCRecordToSearchResultAdapter() {
+		canonicalizer = new AggressiveUrlCanonicalizer();
+	}
 //	public static SearchResult arcRecordToSearchResult(final ARCRecord rec)
 //	throws IOException, ParseException {
 	/* (non-Javadoc)
@@ -161,4 +164,10 @@
 		}
 		return result;
 	}
+	public UrlCanonicalizer getCanonicalizer() {
+		return canonicalizer;
+	}
+	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
+		this.canonicalizer = canonicalizer;
+	}
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java	2008-01-15 03:00:16 UTC (rev 2137)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java	2008-01-15 03:03:21 UTC (rev 2138)
@@ -33,11 +33,14 @@
 import org.archive.io.arc.ARCReader;
 import org.archive.io.arc.ARCReaderFactory;
 import org.archive.io.arc.ARCRecord;
+import org.archive.wayback.UrlCanonicalizer;
 import org.archive.wayback.core.SearchResult;
 import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;
 import org.archive.wayback.util.AdaptedIterator;
 import org.archive.wayback.util.Adapter;
 import org.archive.wayback.util.CloseableIterator;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
+import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
 
 /**
  * Transforms an ARC file into Iterator<SearchResult>.
@@ -51,7 +54,12 @@
 	 * CDX Header line for these fields. not very configurable..
 	 */
 	public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g";
-
+	private UrlCanonicalizer canonicalizer = null;
+	
+	public ArcIndexer() {
+		canonicalizer = new AggressiveUrlCanonicalizer();
+	}
+	
 	/**
 	 * @param arc
 	 * @return Iterator of SearchResults for input arc File
@@ -65,8 +73,9 @@
 		Adapter<ArchiveRecord,ARCRecord> adapter1 =
 			new ArchiveRecordToARCRecordAdapter();
 
-		Adapter<ARCRecord,SearchResult> adapter2 =
+		ARCRecordToSearchResultAdapter adapter2 =
 			new ARCRecordToSearchResultAdapter();
+		adapter2.setCanonicalizer(canonicalizer);
 		
 		Iterator<ArchiveRecord> itr1 = arcReader.iterator();
 
@@ -76,29 +85,22 @@
 		return new AdaptedIterator<ARCRecord,SearchResult>(itr2,adapter2);
 	}
 	
-	
-	private class ArchiveRecordToARCRecordAdapter 
-	implements Adapter<ArchiveRecord,ARCRecord> {
+	public UrlCanonicalizer getCanonicalizer() {
+		return canonicalizer;
+	}
 
-		/* (non-Javadoc)
-		 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
-		 */
-		public ARCRecord adapt(ArchiveRecord o) {
-			ARCRecord rec = null;
-			if(o instanceof ARCRecord) {
-				rec = (ARCRecord) o;
-			}
-			return rec;
-		}
+	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
+		this.canonicalizer = canonicalizer;
 	}
 
 	private static void USAGE() {
 		System.err.println("USAGE:");
 		System.err.println("");
-		System.err.println("arc-indexer ARCFILE");
-		System.err.println("arc-indexer ARCFILE CDXFILE");
+		System.err.println("arc-indexer [-identity] ARCFILE");
+		System.err.println("arc-indexer [-identity] ARCFILE CDXFILE");
 		System.err.println("");
-		System.err.println("Create a CDX format index at CDXFILE or to STDOUT");
+		System.err.println("Create a CDX format index at CDXFILE or to STDOUT.");
+		System.err.println("With -identity, perform no url canonicalization.");
 		System.exit(1);
 	}
 	
@@ -107,14 +109,20 @@
 	 */
 	public static void main(String[] args) {
 		ArcIndexer indexer = new ArcIndexer();
-		File arc = new File(args[0]);
+		int idx = 0;
+		if(args[0] != null && args[0].equals("-identity")) {
+			indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
+			idx++;
+		}
+		File arc = new File(args[idx]);
+		idx++;
 		PrintWriter pw = null;
 		try {
-			if(args.length == 1) {
+			if(args.length == idx) {
 				// dump to STDOUT:
 				pw = new PrintWriter(System.out);
-			} else if(args.length == 2) {
-				pw = new PrintWriter(args[1]);
+			} else if(args.length == (idx + 1)) {
+				pw = new PrintWriter(args[idx]);
 			} else {
 				USAGE();
 			}
@@ -126,6 +134,22 @@
 			pw.close();
 		} catch (Exception e) {
 			e.printStackTrace();
+			System.exit(1);
 		}
 	}
+	
+	private class ArchiveRecordToARCRecordAdapter 
+	implements Adapter<ArchiveRecord,ARCRecord> {
+
+		/* (non-Javadoc)
+		 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+		 */
+		public ARCRecord adapt(ArchiveRecord o) {
+			ARCRecord rec = null;
+			if(o instanceof ARCRecord) {
+				rec = (ARCRecord) o;
+			}
+			return rec;
+		}
+	}
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java	2008-01-15 03:00:16 UTC (rev 2137)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java	2008-01-15 03:03:21 UTC (rev 2138)
@@ -16,10 +16,11 @@
 import org.archive.io.warc.WARCRecord;
 import org.archive.net.UURI;
 import org.archive.net.UURIFactory;
+import org.archive.wayback.UrlCanonicalizer;
 import org.archive.wayback.WaybackConstants;
 import org.archive.wayback.core.SearchResult;
 import org.archive.wayback.util.Adapter;
-import org.archive.wayback.util.UrlCanonicalizer;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
 
 /**
  * Adapts certain WARCRecords into SearchResults. DNS and response records are
@@ -52,9 +53,12 @@
 	private static final Logger LOGGER = Logger.getLogger(
 			WARCRecordToSearchResultAdapter.class.getName());
 
-	// TODO: make this configurable based on the ResourceIndex
-	private static UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
+	private UrlCanonicalizer canonicalizer = null;
 
+	public WARCRecordToSearchResultAdapter() {
+		canonicalizer = new AggressiveUrlCanonicalizer();
+	}
+
 	/* (non-Javadoc)
 	 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
 	 */
@@ -303,4 +307,12 @@
 
 		return result;
 	}
+
+	public UrlCanonicalizer getCanonicalizer() {
+		return canonicalizer;
+	}
+
+	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
+		this.canonicalizer = canonicalizer;
+	}
 }

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java	2008-01-15 03:00:16 UTC (rev 2137)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java	2008-01-15 03:03:21 UTC (rev 2138)
@@ -9,11 +9,14 @@
 import org.archive.io.warc.WARCReader;
 import org.archive.io.warc.WARCReaderFactory;
 import org.archive.io.warc.WARCRecord;
+import org.archive.wayback.UrlCanonicalizer;
 import org.archive.wayback.core.SearchResult;
 import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;
 import org.archive.wayback.util.AdaptedIterator;
 import org.archive.wayback.util.Adapter;
 import org.archive.wayback.util.CloseableIterator;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
+import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
 
 public class WarcIndexer {
 
@@ -22,6 +25,11 @@
 	 */
 	public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g";
 
+	private UrlCanonicalizer canonicalizer = null;
+	public WarcIndexer() {
+		canonicalizer = new AggressiveUrlCanonicalizer();
+	}
+	
 	/**
 	 * @param arc
 	 * @return Iterator of SearchResults for input arc File
@@ -32,7 +40,10 @@
 
 		Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter();
 
-		Adapter<WARCRecord, SearchResult> adapter2 = new WARCRecordToSearchResultAdapter();
+		WARCRecordToSearchResultAdapter adapter2 = 
+			new WARCRecordToSearchResultAdapter();
+		adapter2.setCanonicalizer(canonicalizer);
+		
 		WARCReader reader = WARCReaderFactory.get(warc);
 		
 		Iterator<ArchiveRecord> itr1 = reader.iterator();
@@ -43,28 +54,22 @@
 		return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2);
 	}
 
-	private class ArchiveRecordToWARCRecordAdapter implements
-			Adapter<ArchiveRecord, WARCRecord> {
+	public UrlCanonicalizer getCanonicalizer() {
+		return canonicalizer;
+	}
 
-		/* (non-Javadoc)
-		 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
-		 */
-		public WARCRecord adapt(ArchiveRecord o) {
-			WARCRecord rec = null;
-			if (o instanceof WARCRecord) {
-				rec = (WARCRecord) o;
-			}
-			return rec;
-		}
+	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
+		this.canonicalizer = canonicalizer;
 	}
-
+	
 	private static void USAGE() {
 		System.err.println("USAGE:");
 		System.err.println("");
-		System.err.println("warc-indexer WARCFILE");
-		System.err.println("warc-indexer WARCFILE CDXFILE");
+		System.err.println("warc-indexer [-identity] WARCFILE");
+		System.err.println("warc-indexer [-identity] WARCFILE CDXFILE");
 		System.err.println("");
 		System.err.println("Create a CDX format index at CDXFILE or to STDOUT");
+		System.err.println("With -identity, perform no url canonicalization.");
 		System.exit(1);
 	}
 
@@ -73,13 +78,19 @@
 	 */
 	public static void main(String[] args) {
 		WarcIndexer indexer = new WarcIndexer();
-		File arc = new File(args[0]);
+		int idx = 0;
+		if(args[0] != null && args[0].equals("-identity")) {
+			indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
+			idx++;
+		}
+		File arc = new File(args[idx]);
+		idx++;
 		PrintWriter pw = null;
 		try {
-			if (args.length == 1) {
+			if (args.length == idx) {
 				// dump to STDOUT:
 				pw = new PrintWriter(System.out);
-			} else if (args.length == 2) {
+			} else if (args.length == (idx+1)) {
 				pw = new PrintWriter(args[1]);
 			} else {
 				USAGE();
@@ -94,4 +105,19 @@
 			e.printStackTrace();
 		}
 	}
+
+	private class ArchiveRecordToWARCRecordAdapter implements
+			Adapter<ArchiveRecord, WARCRecord> {
+
+		/* (non-Javadoc)
+		 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+		 */
+		public WARCRecord adapt(ArchiveRecord o) {
+			WARCRecord rec = null;
+			if (o instanceof WARCRecord) {
+				rec = (WARCRecord) o;
+			}
+			return rec;
+		}
+	}
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2137] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourceindex/ DeduplicationSearchResultAnnotationAdapter.java

From: <bra...@us...> - 2008-01-15 03:00:17

Revision: 2137
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2137&view=rev
Author:   bradtofel
Date:     2008-01-14 19:00:16 -0800 (Mon, 14 Jan 2008)

Log Message:
-----------
FEATURE: now remembers the actual capture date for each stored capture, and subsequent deduped records are annotated with this date.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java	2008-01-15 02:58:30 UTC (rev 2136)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java	2008-01-15 03:00:16 UTC (rev 2137)
@@ -34,6 +34,7 @@
 		WaybackConstants.RESULT_REDIRECT_URL
 	};
 	private String lastDigest = null;
+	private String lastTimeStamp = null;
 	private HashMap<String,String> lastValues = new HashMap<String,String>();
 	private SearchResult annotate(SearchResult o) {
 		String thisDigest = o.get(WaybackConstants.RESULT_MD5_DIGEST);
@@ -45,10 +46,12 @@
 		}
 		o.put(WaybackConstants.RESULT_DUPLICATE_ANNOTATION, 
 				WaybackConstants.RESULT_DUPLICATE_DIGEST);
+		o.put(WaybackConstants.RESULT_DUPLICATE_STORED_DATE, lastTimeStamp);
 		return o;
 	}
 	private SearchResult remember(SearchResult o) {
 		lastDigest = o.get(WaybackConstants.RESULT_MD5_DIGEST);
+		lastTimeStamp = o.get(WaybackConstants.RESULT_CAPTURE_DATE);
 		for(String field : FIELDS) {
 			lastValues.put(field, o.get(field));
 		}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2136] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback

From: <bra...@us...> - 2008-01-15 02:58:25

Revision: 2136
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2136&view=rev
Author:   bradtofel
Date:     2008-01-14 18:58:30 -0800 (Mon, 14 Jan 2008)

Log Message:
-----------
REFACTOR: moved resolveUrl() from UrlCanonicalizer to UrlOperations

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayRenderer.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlTransparentReplayRenderer.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixReplayRenderer.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayRenderer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayRenderer.java	2008-01-15 02:57:30 UTC (rev 2135)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlReplayRenderer.java	2008-01-15 02:58:30 UTC (rev 2136)
@@ -43,7 +43,7 @@
 import org.archive.wayback.replay.HTMLPage;
 import org.archive.wayback.replay.HttpHeaderProcessor;
 import org.archive.wayback.replay.HttpHeaderOperation;
-import org.archive.wayback.util.UrlCanonicalizer;
+import org.archive.wayback.util.url.UrlOperations;
 
 /**
  * ReplayRenderer responsible for marking up HTML pages so they replay in
@@ -139,7 +139,7 @@
 			String baseUrl = result.getAbsoluteUrl();
 			String cd = result.getCaptureDate();
 			// by the spec, these should be absolute already, but just in case:
-			String u = UrlCanonicalizer.resolveUrl(baseUrl, value);
+			String u = UrlOperations.resolveUrl(baseUrl, value);
 
 			output.put(key, uriConverter.makeReplayURI(cd,u));
 

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlTransparentReplayRenderer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlTransparentReplayRenderer.java	2008-01-15 02:57:30 UTC (rev 2135)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/ArchivalUrlTransparentReplayRenderer.java	2008-01-15 02:58:30 UTC (rev 2136)
@@ -5,7 +5,7 @@
 import org.archive.wayback.ResultURIConverter;
 import org.archive.wayback.core.SearchResult;
 import org.archive.wayback.replay.TransparentReplayRenderer;
-import org.archive.wayback.util.UrlCanonicalizer;
+import org.archive.wayback.util.url.UrlOperations;
 
 /**
  * Slight extension to TransparentReplayRenderer, which rewrites Location and
@@ -32,7 +32,7 @@
 			String baseUrl = result.getAbsoluteUrl();
 			String cd = result.getCaptureDate();
 			// by the spec, these should be absolute already, but just in case:
-			String u = UrlCanonicalizer.resolveUrl(baseUrl, value);
+			String u = UrlOperations.resolveUrl(baseUrl, value);
 
 			output.put(key, uriConverter.makeReplayURI(cd,u));
 

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixReplayRenderer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixReplayRenderer.java	2008-01-15 02:57:30 UTC (rev 2135)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/domainprefix/DomainPrefixReplayRenderer.java	2008-01-15 02:58:30 UTC (rev 2136)
@@ -25,8 +25,6 @@
 package org.archive.wayback.domainprefix;
 
 import java.io.IOException;
-//import java.util.Date;
-//import java.util.Iterator;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -37,7 +35,6 @@
 
 import org.archive.wayback.ReplayRenderer;
 import org.archive.wayback.ResultURIConverter;
-//import org.archive.wayback.WaybackConstants;
 import org.archive.wayback.core.Resource;
 import org.archive.wayback.core.SearchResult;
 import org.archive.wayback.core.SearchResults;
@@ -47,8 +44,7 @@
 import org.archive.wayback.replay.HTMLPage;
 import org.archive.wayback.replay.HttpHeaderProcessor;
 import org.archive.wayback.replay.HttpHeaderOperation;
-//import org.archive.wayback.util.StringFormatter;
-import org.archive.wayback.util.UrlCanonicalizer;
+import org.archive.wayback.util.url.UrlOperations;
 
 /**
  *
@@ -126,7 +122,7 @@
 			String baseUrl = result.getAbsoluteUrl();
 			String cd = result.getCaptureDate();
 			// by the spec, these should be absolute already, but just in case:
-			String u = UrlCanonicalizer.resolveUrl(baseUrl, value);
+			String u = UrlOperations.resolveUrl(baseUrl, value);
 
 			output.put(key, uriConverter.makeReplayURI(cd,u));
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2135] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java

From: <bra...@us...> - 2008-01-15 02:57:24

Revision: 2135
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2135&view=rev
Author:   bradtofel
Date:     2008-01-14 18:57:30 -0800 (Mon, 14 Jan 2008)

Log Message:
-----------
REFACTOR: moved resolveUrl() from UrlCanonicalizer to UrlOperations

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java	2008-01-15 02:30:01 UTC (rev 2134)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TagMagix.java	2008-01-15 02:57:30 UTC (rev 2135)
@@ -29,7 +29,7 @@
 import java.util.regex.Pattern;
 
 import org.archive.wayback.ResultURIConverter;
-import org.archive.wayback.util.UrlCanonicalizer;
+import org.archive.wayback.util.url.UrlOperations;
 
 /**
  * Library for updating arbitrary attributes in arbitrary tags to rewrite HTML
@@ -171,7 +171,7 @@
 				quote = "\\\"";
 				url = url.substring(2, url.length() - 2);
 			}
-			String finalUrl = UrlCanonicalizer.resolveUrl(baseUrl,url);
+			String finalUrl = UrlOperations.resolveUrl(baseUrl,url);
 			String replayUrl = quote
 					+ uriConverter.makeReplayURI(captureDate, finalUrl) + quote;
 


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2134] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/util/url/ AggressiveUrlCanonicalizer.java

From: <bra...@us...> - 2008-01-15 02:30:01

Revision: 2134
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2134&view=rev
Author:   bradtofel
Date:     2008-01-14 18:30:01 -0800 (Mon, 14 Jan 2008)

Log Message:
-----------
INTERFACE: added UrlCanonicalizer interface to AggressiveUrlCanonicalizer

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java	2008-01-15 02:26:31 UTC (rev 2133)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java	2008-01-15 02:30:01 UTC (rev 2134)
@@ -34,6 +34,7 @@
 import org.apache.commons.httpclient.URIException;
 import org.archive.net.UURI;
 import org.archive.net.UURIFactory;
+import org.archive.wayback.UrlCanonicalizer;
 
 /**
  * Class that performs the standard Heritrix URL canonicalization. Eventually,
@@ -43,8 +44,7 @@
  * @author brad
  * @version $Date$, $Revision$
  */
-public class AggressiveUrlCanonicalizer {
-
+public class AggressiveUrlCanonicalizer implements UrlCanonicalizer {
 	
 	private static final String CDX_PREFIX = " CDX ";
     /**


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2133] trunk/archive-access/projects/wayback/ wayback-core/src/test/java/org/archive/wayback/util/url

From: <bra...@us...> - 2008-01-15 02:26:34

Revision: 2133
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2133&view=rev
Author:   bradtofel
Date:     2008-01-14 18:26:31 -0800 (Mon, 14 Jan 2008)

Log Message:
-----------
REFACTOR: renamed UrlCanonicalizer => AggressiveUrlCanonicalizer

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlCanonicalizerTest.java

Copied: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java (from rev 2131, trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlCanonicalizerTest.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java	2008-01-15 02:26:31 UTC (rev 2133)
@@ -0,0 +1,203 @@
+/* UrlCanonicalizerTest
+ *
+ * $Id$
+ *
+ * Created on 2:13:36 PM Oct 11, 2006.
+ *
+ * Copyright (C) 2006 Internet Archive.
+ *
+ * This file is part of Wayback.
+ *
+ * Wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * Wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.util.url;
+
+import org.apache.commons.httpclient.URIException;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
+
+import junit.framework.TestCase;
+
+/**
+ *
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class AggressiveUrlCanonicalizerTest extends TestCase {
+	private AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();
+	/**
+	 * Test method for 'org.archive.wayback.util.url.AggressiveUrlCanonicalizer.urlStringToKey(String)'
+	 */
+	public void testUrlStringToKey() {
+
+		// simple strip of http://
+		checkCanonicalization("http://foo.com/","foo.com/");
+
+// would be nice to handle other protocols...
+//		// simple strip of https://
+//		checkCanonicalization("https://foo.com/","foo.com/");
+//
+//		// simple strip of ftp://
+//		checkCanonicalization("ftp://foo.com/","foo.com/");
+//
+//		// simple strip of rtsp://
+//		checkCanonicalization("rtsp://foo.com/","foo.com/");
+
+		// strip leading 'www.'
+		checkCanonicalization("http://www.foo.com/","foo.com/");
+		
+		// add trailing '/' with empty path
+		checkCanonicalization("http://www.foo.com","foo.com/");
+		
+		// strip leading 'www##.'
+		checkCanonicalization("http://www12.foo.com/","foo.com/");
+		
+		// strip leading 'www##.' with no protocol
+		checkCanonicalization("www12.foo.com/","foo.com/");
+		
+		
+		// leave alone an url with no protocol but non-empty path
+		checkCanonicalization("foo.com/","foo.com/");
+		
+		// add trailing '/' with empty path and without protocol
+		checkCanonicalization("foo.com","foo.com/");
+
+		// add trailing '/' with empty path and no protocol, plus strip leading 'www##.'
+		checkCanonicalization("www12.foo.com","foo.com/");
+
+		// do not add trailing '/' non-empty path and without protocol
+		checkCanonicalization("foo.com/boo","foo.com/boo");
+
+		// TEST
+		// replace escaped ' ' with '+' in path and keep the query unchanged
+		checkCanonicalization("foo.com/pa%20th?a=b","foo.com/pa+th?a=b");
+		
+		
+		// replace escaped ' ' with '+' in path
+		checkCanonicalization("foo.com/pa%20th","foo.com/pa+th");
+		
+		// replace escaped ' ' with '+' in path plus leave trailing slash
+		checkCanonicalization("foo.com/pa%20th/","foo.com/pa+th/");
+
+		// replace multiple consecutive /'s in path
+		checkCanonicalization("foo.com//goo","foo.com/goo");
+
+		// replace multiple consecutive /'s in path
+		checkCanonicalization("foo.com///goo","foo.com/goo");
+
+		// leave alone consecutive /'s after ?
+		checkCanonicalization("foo.com/b?jar=//goo","foo.com/b?jar=//goo");
+
+		// replace multiple consecutive /'s in path, plus leave trailing /
+		checkCanonicalization("foo.com///goo/","foo.com/goo/");
+
+		// replace escaped ' ' with '+' in path plus keep trailing slash and query
+		checkCanonicalization("foo.com/pa%20th/?a=b","foo.com/pa+th/?a=b");
+		
+		
+		// replace escaped ' ' with '+' in path but not in query key
+		checkCanonicalization("foo.com/pa%20th?a%20a=b","foo.com/pa+th?a%20a=b");
+
+		// replace escaped ' ' with '+' in path but not in query value
+		checkCanonicalization("foo.com/pa%20th?a=b%20b","foo.com/pa+th?a=b%20b");
+
+		
+		// no change in '!' escaping
+		checkCanonicalization("foo.com/pa!th","foo.com/pa!th");
+
+		// no change in '+' escaping
+		checkCanonicalization("foo.com/pa+th","foo.com/pa+th");
+
+		// unescape legal escaped '!' (%21)
+		checkCanonicalization("foo.com/pa%21th","foo.com/pa!th");
+
+		// leave '%' (%25)
+		checkCanonicalization("foo.com/pa%th","foo.com/pa%th");
+
+		// unescape '%' (%25)
+		checkCanonicalization("foo.com/pa%25th","foo.com/pa%th");
+		
+		
+		// replace escaped ' ' with '+' in path, unescape legal '!' in path
+		// no change in query escaping
+		checkCanonicalization("foo.com/pa%20t%21h?a%20a=b","foo.com/pa+t!h?a%20a=b");
+		
+		// replace escaped ' ' with '+' in path, leave illegal '%02' in path
+		// no change in query escaping
+		checkCanonicalization("foo.com/pa%20t%02h?a%20a=b","foo.com/pa+t%02h?a%20a=b");
+
+		// strip jsessionid
+		String sid1 = "jsessionid=0123456789abcdefghijklemopqrstuv";
+		String sid2 = "PHPSESSID=9682993c8daa2c5497996114facdc805";
+		String sid3 = "sid=9682993c8daa2c5497996114facdc805";
+		String sid4 = "ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM";
+		String sid5 = "CFID=12412453&CFTOKEN=15501799";
+		//String sid6 = "CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A";
+
+		String fore = "http://foo.com/bar?bo=lo&";
+		String aft = "&gum=yum";
+		String want = "foo.com/bar?bo=lo&gum=yum";
+//		String fore = "http://www.archive.org/index.html?";
+//		String aft = "";
+//		String want = "archive.org/index.html";
+		
+		checkCanonicalization(fore + sid1 + aft,want);
+		checkCanonicalization(fore + sid2 + aft,want);
+		checkCanonicalization(fore + sid3 + aft,want);
+		checkCanonicalization(fore + sid4 + aft,want);
+		checkCanonicalization(fore + sid5 + aft,want);
+		//checkCanonicalization(fore + sid6 + aft,want);
+
+		// Check ASP_SESSIONID2:
+		checkCanonicalization(
+				"http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx",
+				"legislature.mi.gov/mileg.aspx");
+
+		// Check ASP_SESSIONID2 (again):
+		checkCanonicalization(
+				"http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx",
+				"legislature.mi.gov/mileg.aspx");
+
+		// Check ASP_SESSIONID3:
+		checkCanonicalization(
+				"http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules",
+				"legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules");
+		
+		// strip port 80
+		checkCanonicalization("http://www.chub.org:80/foo","chub.org/foo");
+
+		// but not other ports...
+		checkCanonicalization("http://www.chub.org:8080/foo","chub.org:8080/foo");
+
+	}
+	
+	private void checkCanonicalization(String orig, String want) {
+		String got;
+		try {
+			got = canonicalizer.urlStringToKey(orig);
+			assertEquals("Failed canonicalization (" + orig + ") => (" + got + 
+					") and not (" + want + ") as expected",want,got);
+			
+			String got2 = canonicalizer.urlStringToKey(got);
+			assertEquals("Failed 2nd canonicalization (" + got + ") => (" + 
+					got2 + ") and not (" + want + ") as expected",want,got2);
+			
+			
+		} catch (URIException e) {
+			e.printStackTrace();
+			assertTrue("Exception converting(" + orig + ")",false);
+		}
+	}
+}

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlCanonicalizerTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlCanonicalizerTest.java	2008-01-15 02:26:01 UTC (rev 2132)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlCanonicalizerTest.java	2008-01-15 02:26:31 UTC (rev 2133)
@@ -1,203 +0,0 @@
-/* UrlCanonicalizerTest
- *
- * $Id$
- *
- * Created on 2:13:36 PM Oct 11, 2006.
- *
- * Copyright (C) 2006 Internet Archive.
- *
- * This file is part of Wayback.
- *
- * Wayback is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * any later version.
- *
- * Wayback is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser Public License for more details.
- *
- * You should have received a copy of the GNU Lesser Public License
- * along with Wayback; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-package org.archive.wayback.util.url;
-
-import org.apache.commons.httpclient.URIException;
-import org.archive.wayback.util.url.UrlCanonicalizer;
-
-import junit.framework.TestCase;
-
-/**
- *
- *
- * @author brad
- * @version $Date$, $Revision$
- */
-public class UrlCanonicalizerTest extends TestCase {
-	private UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
-	/**
-	 * Test method for 'org.archive.wayback.cdx.CDXRecord.urlStringToKey(String)'
-	 */
-	public void testUrlStringToKey() {
-
-		// simple strip of http://
-		checkCanonicalization("http://foo.com/","foo.com/");
-
-// would be nice to handle other protocols...
-//		// simple strip of https://
-//		checkCanonicalization("https://foo.com/","foo.com/");
-//
-//		// simple strip of ftp://
-//		checkCanonicalization("ftp://foo.com/","foo.com/");
-//
-//		// simple strip of rtsp://
-//		checkCanonicalization("rtsp://foo.com/","foo.com/");
-
-		// strip leading 'www.'
-		checkCanonicalization("http://www.foo.com/","foo.com/");
-		
-		// add trailing '/' with empty path
-		checkCanonicalization("http://www.foo.com","foo.com/");
-		
-		// strip leading 'www##.'
-		checkCanonicalization("http://www12.foo.com/","foo.com/");
-		
-		// strip leading 'www##.' with no protocol
-		checkCanonicalization("www12.foo.com/","foo.com/");
-		
-		
-		// leave alone an url with no protocol but non-empty path
-		checkCanonicalization("foo.com/","foo.com/");
-		
-		// add trailing '/' with empty path and without protocol
-		checkCanonicalization("foo.com","foo.com/");
-
-		// add trailing '/' to with empty path and no protocol, plus massage
-		checkCanonicalization("www12.foo.com","foo.com/");
-
-		// do not add trailing '/' non-empty path and without protocol
-		checkCanonicalization("foo.com/boo","foo.com/boo");
-
-		// TEST
-		// replace escaped ' ' with '+' in path plus keep trailing slash and query
-		checkCanonicalization("foo.com/pa%20th?a=b","foo.com/pa+th?a=b");
-		
-		
-		// replace escaped ' ' with '+' in path
-		checkCanonicalization("foo.com/pa%20th","foo.com/pa+th");
-		
-		// replace escaped ' ' with '+' in path plus leave trailing slash
-		checkCanonicalization("foo.com/pa%20th/","foo.com/pa+th/");
-
-		// replace multiple consecutive /'s in path
-		checkCanonicalization("foo.com//goo","foo.com/goo");
-
-		// replace multiple consecutive /'s in path
-		checkCanonicalization("foo.com///goo","foo.com/goo");
-
-		// leave alone consecutive /'s after ?
-		checkCanonicalization("foo.com/b?jar=//goo","foo.com/b?jar=//goo");
-
-		// replace multiple consecutive /'s in path, plus leave trailing /
-		checkCanonicalization("foo.com///goo/","foo.com/goo/");
-
-		// replace escaped ' ' with '+' in path plus keep trailing slash and query
-		checkCanonicalization("foo.com/pa%20th/?a=b","foo.com/pa+th/?a=b");
-		
-		
-		// replace escaped ' ' with '+' in path but not in query key
-		checkCanonicalization("foo.com/pa%20th?a%20a=b","foo.com/pa+th?a%20a=b");
-
-		// replace escaped ' ' with '+' in path but not in query value
-		checkCanonicalization("foo.com/pa%20th?a=b%20b","foo.com/pa+th?a=b%20b");
-
-		
-		// no change in '!' escaping
-		checkCanonicalization("foo.com/pa!th","foo.com/pa!th");
-
-		// no change in '+' escaping
-		checkCanonicalization("foo.com/pa+th","foo.com/pa+th");
-
-		// unescape legal escaped '!' (%21)
-		checkCanonicalization("foo.com/pa%21th","foo.com/pa!th");
-
-		// leave '%' (%25)
-		checkCanonicalization("foo.com/pa%th","foo.com/pa%th");
-
-		// unescape '%' (%25)
-		checkCanonicalization("foo.com/pa%25th","foo.com/pa%th");
-		
-		
-		// replace escaped ' ' with '+' in path, unescape legal '!' in path
-		// no change in query escaping
-		checkCanonicalization("foo.com/pa%20t%21h?a%20a=b","foo.com/pa+t!h?a%20a=b");
-		
-		// replace escaped ' ' with '+' in path, leave illegal '%02' in path
-		// no change in query escaping
-		checkCanonicalization("foo.com/pa%20t%02h?a%20a=b","foo.com/pa+t%02h?a%20a=b");
-
-		// strip jsessionid
-		String sid1 = "jsessionid=0123456789abcdefghijklemopqrstuv";
-		String sid2 = "PHPSESSID=9682993c8daa2c5497996114facdc805";
-		String sid3 = "sid=9682993c8daa2c5497996114facdc805";
-		String sid4 = "ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM";
-		String sid5 = "CFID=12412453&CFTOKEN=15501799";
-		//String sid6 = "CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A";
-
-		String fore = "http://foo.com/bar?bo=lo&";
-		String aft = "&gum=yum";
-		String want = "foo.com/bar?bo=lo&gum=yum";
-//		String fore = "http://www.archive.org/index.html?";
-//		String aft = "";
-//		String want = "archive.org/index.html";
-		
-		checkCanonicalization(fore + sid1 + aft,want);
-		checkCanonicalization(fore + sid2 + aft,want);
-		checkCanonicalization(fore + sid3 + aft,want);
-		checkCanonicalization(fore + sid4 + aft,want);
-		checkCanonicalization(fore + sid5 + aft,want);
-		//checkCanonicalization(fore + sid6 + aft,want);
-
-		// Check ASP_SESSIONID2:
-		checkCanonicalization(
-				"http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx",
-				"legislature.mi.gov/mileg.aspx");
-
-		// Check ASP_SESSIONID2 (again):
-		checkCanonicalization(
-				"http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx",
-				"legislature.mi.gov/mileg.aspx");
-
-		// Check ASP_SESSIONID3:
-		checkCanonicalization(
-				"http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules",
-				"legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules");
-		
-		// strip port 80
-		checkCanonicalization("http://www.chub.org:80/foo","chub.org/foo");
-
-		// but not other ports...
-		checkCanonicalization("http://www.chub.org:8080/foo","chub.org:8080/foo");
-
-	}
-	
-	private void checkCanonicalization(String orig, String want) {
-		String got;
-		try {
-			got = canonicalizer.urlStringToKey(orig);
-			assertEquals("Failed canonicalization (" + orig + ") => (" + got + 
-					") and not (" + want + ") as expected",want,got);
-			
-			String got2 = canonicalizer.urlStringToKey(got);
-			assertEquals("Failed 2nd canonicalization (" + got + ") => (" + 
-					got2 + ") and not (" + want + ") as expected",want,got2);
-			
-			
-		} catch (URIException e) {
-			e.printStackTrace();
-			assertTrue("Exception converting(" + orig + ")",false);
-		}
-	}
-}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2132] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/util/url

From: <bra...@us...> - 2008-01-15 02:25:59

Revision: 2132
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2132&view=rev
Author:   bradtofel
Date:     2008-01-14 18:26:01 -0800 (Mon, 14 Jan 2008)

Log Message:
-----------
REFACTOR: renamed UrlCanonicalizer => AggressiveUrlCanonicalizer

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java (from rev 2130, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java	2008-01-15 02:26:01 UTC (rev 2132)
@@ -0,0 +1,391 @@
+/* UrlCanonicalizer
+ *
+ * $Id$
+ *
+ * Created on 2:08:07 PM Oct 11, 2006.
+ *
+ * Copyright (C) 2006 Internet Archive.
+ *
+ * This file is part of Wayback.
+ *
+ * Wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * Wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.util.url;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.httpclient.URIException;
+import org.archive.net.UURI;
+import org.archive.net.UURIFactory;
+
+/**
+ * Class that performs the standard Heritrix URL canonicalization. Eventually,
+ * this should all be configurable, or perhaps be able to read the settings
+ * used within a Heritrix crawler... or even multiple crawlers... this is hard.
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class AggressiveUrlCanonicalizer {
+
+	
+	private static final String CDX_PREFIX = " CDX ";
+    /**
+     * Strip leading 'www.'
+     */
+    private static final Pattern STRIP_WWW_REGEX =
+        Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$");
+    /**
+     * Strip leading 'www44.', 'www3.', etc.
+     */
+    private static final Pattern STRIP_WWWN_REGEX =
+        Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$");
+    /**
+     * Strip userinfo.
+     */
+    private static final Pattern STRIP_USERINFO_REGEX =
+        Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$",
+            Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A.
+     * Example: PHPSESSID=9682993c8daa2c5497996114facdc805.
+     */
+    private static final Pattern STRIP_SESSION_ID_REGEX =
+    	 Pattern.compile("^(.+)(?:(?:(?:jsessionid)|(?:phpsessid))=" +
+    	                 "[0-9a-zA-Z]{32})(?:&(.*))?$",  
+    	                 Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Example: sid=9682993c8daa2c5497996114facdc805. 
+     * 'sid=' can be tricky but all sid= followed by 32 byte string
+     * so far seen have been session ids.  Sid is a 32 byte string
+     * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid'
+     * so have to have it run after the phpsessid elimination.
+     */
+    private static final Pattern STRIP_SID_REGEX =
+        Pattern.compile("^(.+)" +
+                "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE);
+    
+    /**
+     * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM.
+     */
+    private static final Pattern STRIP_ASPSESSION_REGEX =
+        Pattern.compile("^(.+)" +
+                "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$",
+                    Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Examples:
+     *
+     *        (.NET 2.0)
+     *        http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx
+     *     => http://legislature.mi.gov/mileg.aspx
+     *
+     *		  (.NET 1.0/1.1)
+     *        http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx
+     *     => http://legislature.mi.gov/mileg.aspx
+     *     
+     *     For more info, see: 
+     *     	  http://msdn2.microsoft.com/en-us/library/aa479315.aspx
+     *     
+     */
+    private static final Pattern STRIP_ASPSESSION2_REGEX =
+    	Pattern.compile("^([^\\?]+/)" +
+    			"(?:\\((?:S\\(|)[0-9a-z]{24}\\)(?:\\)|)/)([^\\?]+\\.aspx.*)$",
+    			Pattern.CASE_INSENSITIVE);
+    
+    /**
+     * Examples:
+     *
+     *        (.NET 2.0)
+     *        http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
+     *     => http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
+     *
+     *     For more info, see: 
+     *     	  http://msdn2.microsoft.com/en-us/library/aa479315.aspx
+     *     
+     */   
+
+    private static final Pattern STRIP_ASPSESSION3_REGEX =
+    	Pattern.compile("^([^\\?]+/" +
+    			"\\((?:a\\([0-9a-z]{24}\\)))(?:S\\([0-9a-z]{24}\\))" +
+    			"((?:f\\([0-9a-z]{24}\\))\\)/[^\\?]+\\.aspx.*)$",
+    			Pattern.CASE_INSENSITIVE);
+    
+    /**
+     * Strip ColdFusion session IDs. Remove sessionids that look like the 
+     * following:
+     * CFID=12412453&CFTOKEN=15501799
+     * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A
+     */
+    private static final Pattern STRIP_CFSESSION_REGEX = 
+    	Pattern.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+(?:jsession=[^&]+)?)" +
+    			"(?:&(.*))?$",Pattern.CASE_INSENSITIVE);
+        
+    /**
+     * Run a regex that strips elements of a string.
+     * 
+     * Assumes the regex has a form that wants to strip elements of the passed
+     * string.  Assumes that if a match, appending group 1
+     * and group 2 yields desired result.
+     * @param url Url to search in.
+     * @param matcher Matcher whose form yields a group 1 and group 2 if a
+     * match (non-null).
+     * @return Original <code>url</code> else concatenation of group 1
+     * and group 2.
+     */
+    protected String doStripRegexMatch(String url, Matcher matcher) {
+        return (matcher != null && matcher.matches())?
+            checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)):
+            url;
+    }
+
+    /**
+     * @param string String to check.
+     * @return <code>string</code> if non-null, else empty string ("").
+     */
+    private String checkForNull(String string) {
+        return (string != null)? string: "";
+    }
+    
+	/**
+	 * return the canonical string key for the URL argument.
+	 * 
+	 * @param urlString
+	 * @return String lookup key for URL argument.
+	 * @throws URIException 
+	 */
+	public String urlStringToKey(final String urlString) throws URIException {
+
+		String searchUrl = canonicalize(urlString);
+
+		// TODO: force https into http for the moment...
+		if(searchUrl.startsWith("https://")) {
+			searchUrl = searchUrl.substring(8);
+		}
+		
+		// TODO: this will only work with http:// scheme. should work with all?
+		// force add of scheme and possible add '/' with empty path:
+		if (searchUrl.startsWith("http://")) {
+			if (-1 == searchUrl.indexOf('/', 8)) {
+				searchUrl = searchUrl + "/";
+			}
+		} else {
+			if (-1 == searchUrl.indexOf("/")) {
+				searchUrl = searchUrl + "/";
+			}
+			searchUrl = "http://" + searchUrl;
+		}
+
+		// unescape anythying that can be:
+		UURI tmpURI = UURIFactory.getInstance(searchUrl);
+		tmpURI.setPath(tmpURI.getPath());
+		
+		
+		// convert to UURI to perform require URI fixup:
+		UURI searchURI = UURIFactory.getInstance(tmpURI.getURI());
+
+
+		
+		
+		// replace ' ' with '+' (this is only to match Alexa's canonicalization)
+		String newPath = searchURI.getEscapedPath().replace("%20","+");
+//		String newPath = searchURI.getPath().replace(' ','+');
+		
+		// replace multiple consecutive '/'s in the path.
+		while(newPath.contains("//")) {
+			newPath = newPath.replace("//","/");
+		}
+		
+		// this would remove trailing a '/' character, unless the path is empty
+		// but we're not going to do this just yet..
+//		if((newPath.length() > 1) && newPath.endsWith("/")) {
+//			newPath = newPath.substring(0,newPath.length()-1);
+//		}
+//		searchURI.setEscapedPath(newPath);
+//		searchURI.setRawPath(newPath.toCharArray());
+//		String query = searchURI.getEscapedQuery();
+		
+		// TODO: handle non HTTP port stripping, too.
+//		String portStr = "";
+//		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
+//			portStr = ":" + searchURI.getPort();
+//		}
+//		return searchURI.getHostBasename() + portStr + 
+//		searchURI.getEscapedPathQuery();
+		
+		StringBuilder sb = new StringBuilder(searchUrl.length());
+		sb.append(searchURI.getHostBasename());
+		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
+			sb.append(":").append(searchURI.getPort());
+		}
+		sb.append(newPath);
+		if(searchURI.getEscapedQuery() != null) {
+			sb.append("?").append(searchURI.getEscapedQuery());
+		}
+		
+
+		return sb.toString();
+	}
+
+	
+	/**
+	 * Determines the 'fuzziest' form of the url argument; intended to be
+	 * idempotent. This operation is done prior to adding records to the
+	 * ResourceIndex, and prior to lookup. According to the original author
+	 * the rules mirror the defaults found in Heritrix; when the configuration
+	 * system for Heritrix stabilizes, hopefully this can use that system
+	 * directly.
+	 * 
+	 * Steps, in order: apply each STRIP_* pattern (userinfo, www, wwwN,
+	 * session ids), lower-case the result, then tidy a dangling '?' or '&'
+	 * around the last '?' in the url.
+	 * 
+	 * @param url to be canonicalized; null or empty is returned unchanged.
+	 * @return canonicalized version of url argument.
+	 */
+	public String canonicalize(String url) {
+        // Guard before first use: the original tested for null only after
+        // url had already been dereferenced, so the test could never help.
+        if (url == null || url.length() <= 0) {
+            return url;
+        }
+        url = doStripRegexMatch(url, STRIP_USERINFO_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_WWW_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_WWWN_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_SESSION_ID_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_ASPSESSION_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_ASPSESSION2_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_ASPSESSION3_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_SID_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_CFSESSION_REGEX.matcher(url));
+        // Fixed locale: with the default locale a Turkish JVM lower-cases
+        // 'I' to a dotless i, which would produce different keys.
+        url = url.toLowerCase(java.util.Locale.ENGLISH);
+        
+        int index = url.lastIndexOf('?');
+        if (index > 0) {
+            if (index == (url.length() - 1)) {
+                // '?' is last char in url.  Strip it.
+                url = url.substring(0, url.length() - 1);
+            } else if (url.charAt(index + 1) == '&') {
+                // Next char is '&'. Strip it.
+                if (url.length() == (index + 2)) {
+                    // Then url ends with '?&'.  Strip them.
+                    url = url.substring(0, url.length() - 2);
+                } else {
+                    // The '&' is redundant.  Strip it.
+                    url = url.substring(0, index + 1) +
+                    url.substring(index + 2);
+                }
+            } else if (url.charAt(url.length() - 1) == '&') {
+                // If we have a lone '&' on end of query str,
+                // strip it.
+                url = url.substring(0, url.length() - 1);
+            }
+        }
+        return url;
+	}
+	
+	/**
+	 * Prints the command-line synopsis to stderr and exits with status 3.
+	 * The synopsis lists -cdx as well, since main() accepts that flag.
+	 */
+	private static void USAGE() {
+		System.err.println("Usage: [-cdx] [-f FIELD]... [-d DELIM]");
+		System.exit(3);
+	}
+	/**
+	 * Command-line filter: reads lines from stdin, replaces the selected
+	 * fields with their urlStringToKey() form and writes the lines to stdout.
+	 * 
+	 * Options: "-f FIELD" selects a 1-based field (repeatable, default 1),
+	 * "-d DELIM" sets the field delimiter (default one space, taken
+	 * literally), "-cdx" copies lines starting with CDX_PREFIX through
+	 * unchanged. A line with too few fields, or with a field that cannot be
+	 * converted, is reported on stderr and not written to stdout.
+	 * 
+	 * @param args command-line options as described above.
+	 */
+	public static void main(String[] args) {
+		AggressiveUrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer();
+		int n = 0;
+		ArrayList<Integer> columns = new ArrayList<Integer>();
+		
+		long lineNumber = 0;
+		boolean cdxPassThru = false;
+		String delimiter = " ";
+		while(n < args.length) {
+			String arg = args[n];
+			if(arg.compareTo("-cdx") == 0) {
+				cdxPassThru = true;
+				n++;
+				continue;
+			}
+			if(n == (args.length -1)) {
+				USAGE();
+			}
+			String val = args[n+1];
+			if(arg.compareTo("-f") == 0) {
+				int field = 0;
+				try {
+					field = Integer.parseInt(val);
+				} catch (NumberFormatException e) {
+					// non-numeric FIELD: show usage rather than a stack trace
+					USAGE();
+				}
+				if(field < 1) {
+					// fields are 1-based; 0 or less would index before parts[0]
+					USAGE();
+				}
+				columns.add(Integer.valueOf(field));
+			} else if(arg.compareTo("-d") == 0) {
+				delimiter = val;
+			} else {
+				USAGE();
+			}
+			n += 2;
+		}
+		// default to the first field in case none specified:
+		if(columns.size() == 0) {
+			columns.add(Integer.valueOf(1));
+		}
+		
+		// convert to zero-based int[]:
+		int[] cols = new int[columns.size()];
+		for(int idx = 0; idx < columns.size(); idx++) {
+			cols[idx] = columns.get(idx).intValue() - 1;
+		}
+		// split on the delimiter literally, so that "-d |" or "-d ." behaves
+		// the same when splitting as it does when the fields are re-joined:
+		String splitRegex = Pattern.quote(delimiter);
+		BufferedReader r = new BufferedReader(new InputStreamReader(System.in));
+		StringBuilder sb = new StringBuilder();
+		String line = null;
+		
+		lines:
+		while(true) {
+			try {
+				line = r.readLine();
+			} catch (IOException e) {
+				e.printStackTrace();
+				System.exit(1);
+			}
+			if(line == null) {
+				break;
+			}
+			lineNumber++;
+			if(cdxPassThru && line.startsWith(CDX_PREFIX)) {
+				System.out.println(line);
+				continue;
+			}
+			// limit -1 keeps trailing empty fields so untouched text
+			// round-trips exactly:
+			String parts[] = line.split(splitRegex, -1);
+			for(int column : cols) {
+				if(column >= parts.length) {
+					System.err.println("Invalid line " + lineNumber + " (" +
+							line + ") skipped");
+					// really skip the line, as the message says
+					continue lines;
+				}
+				try {
+					parts[column] = canonicalizer.urlStringToKey(parts[column]);
+				} catch (URIException e) {
+					System.err.println("Invalid URL in line " + lineNumber + " (" +
+							line + ") skipped (" + parts[column] + ")");
+					e.printStackTrace();
+					// really skip the line, as the message says
+					continue lines;
+				}
+			}
+			sb.setLength(0);
+			for(int i = 0; i < parts.length; i++) {
+				sb.append(parts[i]);
+				if(i < (parts.length-1)) {
+					sb.append(delimiter);
+				}
+			}
+			System.out.println(sb.toString());
+		}
+	}
+}
\ No newline at end of file

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java	2008-01-15 02:22:35 UTC (rev 2131)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java	2008-01-15 02:26:01 UTC (rev 2132)
@@ -1,391 +0,0 @@
-/* UrlCanonicalizer
- *
- * $Id$
- *
- * Created on 2:08:07 PM Oct 11, 2006.
- *
- * Copyright (C) 2006 Internet Archive.
- *
- * This file is part of Wayback.
- *
- * Wayback is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * any later version.
- *
- * Wayback is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser Public License for more details.
- *
- * You should have received a copy of the GNU Lesser Public License
- * along with Wayback; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-package org.archive.wayback.util.url;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.httpclient.URIException;
-import org.archive.net.UURI;
-import org.archive.net.UURIFactory;
-
-/**
- * Class that performs the standard Heritrix URL canonicalization. Eventually,
- * this should all be configurable, or perhaps be able to read the settings
- * used within a Heritrix crawler... or even multiple crawlers... this is hard.
- *
- * @author brad
- * @version $Date$, $Revision$
- */
-public class UrlCanonicalizer {
-
-	
-	private static final String CDX_PREFIX = " CDX ";
-    /**
-     * Strip leading 'www.'
-     */
-    private static final Pattern STRIP_WWW_REGEX =
-        Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$");
-    /**
-     * Strip leading 'www44.', 'www3.', etc.
-     */
-    private static final Pattern STRIP_WWWN_REGEX =
-        Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$");
-    /**
-     * Strip userinfo.
-     */
-    private static final Pattern STRIP_USERINFO_REGEX =
-        Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$",
-            Pattern.CASE_INSENSITIVE);
-
-    /**
-     * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A.
-     * Example: PHPSESSID=9682993c8daa2c5497996114facdc805.
-     */
-    private static final Pattern STRIP_SESSION_ID_REGEX =
-    	 Pattern.compile("^(.+)(?:(?:(?:jsessionid)|(?:phpsessid))=" +
-    	                 "[0-9a-zA-Z]{32})(?:&(.*))?$",  
-    	                 Pattern.CASE_INSENSITIVE);
-
-    /**
-     * Example: sid=9682993c8daa2c5497996114facdc805. 
-     * 'sid=' can be tricky but all sid= followed by 32 byte string
-     * so far seen have been session ids.  Sid is a 32 byte string
-     * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid'
-     * so have to have it run after the phpsessid elimination.
-     */
-    private static final Pattern STRIP_SID_REGEX =
-        Pattern.compile("^(.+)" +
-                "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE);
-    
-    /**
-     * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM.
-     */
-    private static final Pattern STRIP_ASPSESSION_REGEX =
-        Pattern.compile("^(.+)" +
-                "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$",
-                    Pattern.CASE_INSENSITIVE);
-
-    /**
-     * Examples:
-     *
-     *        (.NET 2.0)
-     *        http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx
-     *     => http://legislature.mi.gov/mileg.aspx
-     *
-     *		  (.NET 1.0/1.1)
-     *        http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx
-     *     => http://legislature.mi.gov/mileg.aspx
-     *     
-     *     For more info, see: 
-     *     	  http://msdn2.microsoft.com/en-us/library/aa479315.aspx
-     *     
-     */
-    private static final Pattern STRIP_ASPSESSION2_REGEX =
-    	Pattern.compile("^([^\\?]+/)" +
-    			"(?:\\((?:S\\(|)[0-9a-z]{24}\\)(?:\\)|)/)([^\\?]+\\.aspx.*)$",
-    			Pattern.CASE_INSENSITIVE);
-    
-    /**
-     * Examples:
-     *
-     *        (.NET 2.0)
-     *        http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
-     *     => http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
-     *
-     *     For more info, see: 
-     *     	  http://msdn2.microsoft.com/en-us/library/aa479315.aspx
-     *     
-     */   
-
-    private static final Pattern STRIP_ASPSESSION3_REGEX =
-    	Pattern.compile("^([^\\?]+/" +
-    			"\\((?:a\\([0-9a-z]{24}\\)))(?:S\\([0-9a-z]{24}\\))" +
-    			"((?:f\\([0-9a-z]{24}\\))\\)/[^\\?]+\\.aspx.*)$",
-    			Pattern.CASE_INSENSITIVE);
-    
-    /**
-     * Strip ColdFusion session IDs. Remove sessionids that look like the 
-     * following:
-     * CFID=12412453&CFTOKEN=15501799
-     * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A
-     */
-    private static final Pattern STRIP_CFSESSION_REGEX = 
-    	Pattern.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+(?:jsession=[^&]+)?)" +
-    			"(?:&(.*))?$",Pattern.CASE_INSENSITIVE);
-        
-    /**
-     * Run a regex that strips elements of a string.
-     * 
-     * Assumes the regex has a form that wants to strip elements of the passed
-     * string.  Assumes that if a match, appending group 1
-     * and group 2 yields desired result.
-     * @param url Url to search in.
-     * @param matcher Matcher whose form yields a group 1 and group 2 if a
-     * match (non-null.
-     * @return Original <code>url</code> else concatenization of group 1
-     * and group 2.
-     */
-    protected String doStripRegexMatch(String url, Matcher matcher) {
-        return (matcher != null && matcher.matches())?
-            checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)):
-            url;
-    }
-
-    /**
-     * @param string String to check.
-     * @return <code>string</code> if non-null, else empty string ("").
-     */
-    private String checkForNull(String string) {
-        return (string != null)? string: "";
-    }
-    
-	/**
-	 * return the canonical string key for the URL argument.
-	 * 
-	 * @param urlString
-	 * @return String lookup key for URL argument.
-	 * @throws URIException 
-	 */
-	public String urlStringToKey(final String urlString) throws URIException {
-
-		String searchUrl = canonicalize(urlString);
-
-		// TODO: force https into http for the moment...
-		if(searchUrl.startsWith("https://")) {
-			searchUrl = searchUrl.substring(8);
-		}
-		
-		// TODO: this will only work with http:// scheme. should work with all?
-		// force add of scheme and possible add '/' with empty path:
-		if (searchUrl.startsWith("http://")) {
-			if (-1 == searchUrl.indexOf('/', 8)) {
-				searchUrl = searchUrl + "/";
-			}
-		} else {
-			if (-1 == searchUrl.indexOf("/")) {
-				searchUrl = searchUrl + "/";
-			}
-			searchUrl = "http://" + searchUrl;
-		}
-
-		// unescape anythying that can be:
-		UURI tmpURI = UURIFactory.getInstance(searchUrl);
-		tmpURI.setPath(tmpURI.getPath());
-		
-		
-		// convert to UURI to perform require URI fixup:
-		UURI searchURI = UURIFactory.getInstance(tmpURI.getURI());
-
-
-		
-		
-		// replace ' ' with '+' (this is only to match Alexa's canonicalization)
-		String newPath = searchURI.getEscapedPath().replace("%20","+");
-//		String newPath = searchURI.getPath().replace(' ','+');
-		
-		// replace multiple consecutive '/'s in the path.
-		while(newPath.contains("//")) {
-			newPath = newPath.replace("//","/");
-		}
-		
-		// this would remove trailing a '/' character, unless the path is empty
-		// but we're not going to do this just yet..
-//		if((newPath.length() > 1) && newPath.endsWith("/")) {
-//			newPath = newPath.substring(0,newPath.length()-1);
-//		}
-//		searchURI.setEscapedPath(newPath);
-//		searchURI.setRawPath(newPath.toCharArray());
-//		String query = searchURI.getEscapedQuery();
-		
-		// TODO: handle non HTTP port stripping, too.
-//		String portStr = "";
-//		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
-//			portStr = ":" + searchURI.getPort();
-//		}
-//		return searchURI.getHostBasename() + portStr + 
-//		searchURI.getEscapedPathQuery();
-		
-		StringBuilder sb = new StringBuilder(searchUrl.length());
-		sb.append(searchURI.getHostBasename());
-		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
-			sb.append(":").append(searchURI.getPort());
-		}
-		sb.append(newPath);
-		if(searchURI.getEscapedQuery() != null) {
-			sb.append("?").append(searchURI.getEscapedQuery());
-		}
-		
-
-		return sb.toString();
-	}
-
-	
-	/**
-	 * Idempotent operation that will determine the 'fuzziest'
-	 * form of the url argument. This operation is done prior to adding records
-	 * to the ResourceIndex, and prior to lookup. Current version is exactly
-	 * the default found in Heritrix. When the configuration system for
-	 * Heritrix stabilizes, hopefully this can use the system directly within
-	 * Heritrix.
-	 * 
-	 * @param url to be canonicalized.
-	 * @return canonicalized version of url argument.
-	 */
-	public String canonicalize(String url) {
-        url = doStripRegexMatch(url, STRIP_USERINFO_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_WWW_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_WWWN_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_SESSION_ID_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_ASPSESSION_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_ASPSESSION2_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_ASPSESSION3_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_SID_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_CFSESSION_REGEX.matcher(url));
-        url = url.toLowerCase();
-        if (url == null || url.length() <= 0) {
-            return url;
-        }
-        
-        int index = url.lastIndexOf('?');
-        if (index > 0) {
-            if (index == (url.length() - 1)) {
-                // '?' is last char in url.  Strip it.
-                url = url.substring(0, url.length() - 1);
-            } else if (url.charAt(index + 1) == '&') {
-                // Next char is '&'. Strip it.
-                if (url.length() == (index + 2)) {
-                    // Then url ends with '?&'.  Strip them.
-                    url = url.substring(0, url.length() - 2);
-                } else {
-                    // The '&' is redundant.  Strip it.
-                    url = url.substring(0, index + 1) +
-                    url.substring(index + 2);
-                }
-            } else if (url.charAt(url.length() - 1) == '&') {
-                // If we have a lone '&' on end of query str,
-                // strip it.
-                url = url.substring(0, url.length() - 1);
-            }
-        }
-        return url;
-	}
-	
-	private static void USAGE() {
-		System.err.println("Usage: [-f FIELD] [-d DELIM]");
-		System.exit(3);
-	}
-	/**
-	 * @param args
-	 */
-	public static void main(String[] args) {
-		UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
-		int n = 0;
-		int i = 0;
-		ArrayList<Integer> columns = new ArrayList<Integer>();
-		
-		long lineNumber = 0;
-		boolean cdxPassThru = false;
-		String delimiter = " ";
-		while(n < args.length) {
-			String arg = args[n];
-			if(arg.compareTo("-cdx") == 0) {
-				cdxPassThru = true;
-				n++;
-				continue;
-			}
-			if(n == (args.length -1)) {
-				USAGE();
-			}
-			String val = args[n+1];
-			if(arg.compareTo("-f") == 0) {
-				columns.add(new Integer(val));
-			} else if(arg.compareTo("-d") == 0) {
-				delimiter = val;
-			} else {
-				USAGE();
-			}
-			n += 2;
-		}
-		// place default '0' in case none specified:
-		if(columns.size() == 0) {
-			columns.add(new Integer(1));
-		}
-		
-		// convert to int[]:
-		int[] cols = new int[columns.size()];
-		for(int idx = 0; idx < columns.size(); idx++) {
-			cols[idx] = columns.get(idx).intValue() - 1;
-		}
-		BufferedReader r = new BufferedReader(new InputStreamReader(System.in));
-		StringBuilder sb = new StringBuilder();
-		String line = null;
-		
-		while(true) {
-			try {
-				line = r.readLine();
-			} catch (IOException e) {
-				e.printStackTrace();
-				System.exit(1);
-			}
-			if(line == null) {
-				break;
-			}
-			lineNumber++;
-			if(cdxPassThru && line.startsWith(CDX_PREFIX)) {
-				System.out.println(line);
-				continue;
-			}
-			String parts[] = line.split(delimiter);
-			for(int column : cols) {
-				if(column >= parts.length) {
-					System.err.println("Invalid line " + lineNumber + " (" +
-							line + ") skipped");
-				} else {
-					try {
-						parts[column] = canonicalizer.urlStringToKey(parts[column]);
-					} catch (URIException e) {
-						System.err.println("Invalid URL in line " + lineNumber + " (" +
-								line + ") skipped (" + parts[column] + ")");
-						e.printStackTrace();
-						continue;
-					}
-				}
-			}
-			sb.setLength(0);
-			for(i = 0; i < parts.length; i++) {
-				sb.append(parts[i]);
-				if(i < (parts.length-1)) {
-					sb.append(delimiter);
-				}
-			}
-			System.out.println(sb.toString());
-		}
-	}
-}
\ No newline at end of file


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

[Archive-access-cvs] SF.net SVN: archive-access: [2131] trunk/archive-access/projects/wayback/ wayback-core/src/test/java/org/archive/wayback/util

From: <bra...@us...> - 2008-01-15 02:22:31

Revision: 2131
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2131&view=rev
Author:   bradtofel
Date:     2008-01-14 18:22:35 -0800 (Mon, 14 Jan 2008)

Log Message:
-----------
REFACTOR: moved isAuthority() tests from UrlCanonicalizerTest to UrlOperationsTest. Moved UrlCanonicalizerTest to package org.archive.wayback.util.url

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/
    trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlCanonicalizerTest.java
    trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/UrlCanonicalizerTest.java

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/UrlCanonicalizerTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/UrlCanonicalizerTest.java	2008-01-15 02:17:09 UTC (rev 2130)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/UrlCanonicalizerTest.java	2008-01-15 02:22:35 UTC (rev 2131)
@@ -1,221 +0,0 @@
-/* UrlCanonicalizerTest
- *
- * $Id$
- *
- * Created on 2:13:36 PM Oct 11, 2006.
- *
- * Copyright (C) 2006 Internet Archive.
- *
- * This file is part of Wayback.
- *
- * Wayback is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * any later version.
- *
- * Wayback is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser Public License for more details.
- *
- * You should have received a copy of the GNU Lesser Public License
- * along with Wayback; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-package org.archive.wayback.util;
-
-import org.apache.commons.httpclient.URIException;
-
-import junit.framework.TestCase;
-
-/**
- *
- *
- * @author brad
- * @version $Date$, $Revision$
- */
-public class UrlCanonicalizerTest extends TestCase {
-	private UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
-	/**
-	 * Test method for 'org.archive.wayback.cdx.CDXRecord.urlStringToKey(String)'
-	 */
-	public void testUrlStringToKey() {
-
-		checkAuthority("foo.com",true);
-		checkAuthority("foo.con",false);
-		checkAuthority("foo.de",true);
-		checkAuthority("foo.denny",false);
-		checkAuthority("1.1.1.1",true);
-		checkAuthority("23.4.4.foo",false);
-		checkAuthority("23.4.4.com",true);
-		checkAuthority("com.23.4.4.134",false);
-		
-		
-		// simple strip of http://
-		checkCanonicalization("http://foo.com/","foo.com/");
-
-// would be nice to handle other protocols...
-//		// simple strip of https://
-//		checkCanonicalization("https://foo.com/","foo.com/");
-//
-//		// simple strip of ftp://
-//		checkCanonicalization("ftp://foo.com/","foo.com/");
-//
-//		// simple strip of rtsp://
-//		checkCanonicalization("rtsp://foo.com/","foo.com/");
-
-		// strip leading 'www.'
-		checkCanonicalization("http://www.foo.com/","foo.com/");
-		
-		// add trailing '/' with empty path
-		checkCanonicalization("http://www.foo.com","foo.com/");
-		
-		// strip leading 'www##.'
-		checkCanonicalization("http://www12.foo.com/","foo.com/");
-		
-		// strip leading 'www##.' with no protocol
-		checkCanonicalization("www12.foo.com/","foo.com/");
-		
-		
-		// leave alone an url with no protocol but non-empty path
-		checkCanonicalization("foo.com/","foo.com/");
-		
-		// add trailing '/' with empty path and without protocol
-		checkCanonicalization("foo.com","foo.com/");
-
-		// add trailing '/' to with empty path and no protocol, plus massage
-		checkCanonicalization("www12.foo.com","foo.com/");
-
-		// do not add trailing '/' non-empty path and without protocol
-		checkCanonicalization("foo.com/boo","foo.com/boo");
-
-		// TEST
-		// replace escaped ' ' with '+' in path plus keep trailing slash and query
-		checkCanonicalization("foo.com/pa%20th?a=b","foo.com/pa+th?a=b");
-		
-		
-		// replace escaped ' ' with '+' in path
-		checkCanonicalization("foo.com/pa%20th","foo.com/pa+th");
-		
-		// replace escaped ' ' with '+' in path plus leave trailing slash
-		checkCanonicalization("foo.com/pa%20th/","foo.com/pa+th/");
-
-		// replace multiple consecutive /'s in path
-		checkCanonicalization("foo.com//goo","foo.com/goo");
-
-		// replace multiple consecutive /'s in path
-		checkCanonicalization("foo.com///goo","foo.com/goo");
-
-		// leave alone consecutive /'s after ?
-		checkCanonicalization("foo.com/b?jar=//goo","foo.com/b?jar=//goo");
-
-		// replace multiple consecutive /'s in path, plus leave trailing /
-		checkCanonicalization("foo.com///goo/","foo.com/goo/");
-
-		// replace escaped ' ' with '+' in path plus keep trailing slash and query
-		checkCanonicalization("foo.com/pa%20th/?a=b","foo.com/pa+th/?a=b");
-		
-		
-		// replace escaped ' ' with '+' in path but not in query key
-		checkCanonicalization("foo.com/pa%20th?a%20a=b","foo.com/pa+th?a%20a=b");
-
-		// replace escaped ' ' with '+' in path but not in query value
-		checkCanonicalization("foo.com/pa%20th?a=b%20b","foo.com/pa+th?a=b%20b");
-
-		
-		// no change in '!' escaping
-		checkCanonicalization("foo.com/pa!th","foo.com/pa!th");
-
-		// no change in '+' escaping
-		checkCanonicalization("foo.com/pa+th","foo.com/pa+th");
-
-		// unescape legal escaped '!' (%21)
-		checkCanonicalization("foo.com/pa%21th","foo.com/pa!th");
-
-		// leave '%' (%25)
-		checkCanonicalization("foo.com/pa%th","foo.com/pa%th");
-
-		// unescape '%' (%25)
-		checkCanonicalization("foo.com/pa%25th","foo.com/pa%th");
-		
-		
-		// replace escaped ' ' with '+' in path, unescape legal '!' in path
-		// no change in query escaping
-		checkCanonicalization("foo.com/pa%20t%21h?a%20a=b","foo.com/pa+t!h?a%20a=b");
-		
-		// replace escaped ' ' with '+' in path, leave illegal '%02' in path
-		// no change in query escaping
-		checkCanonicalization("foo.com/pa%20t%02h?a%20a=b","foo.com/pa+t%02h?a%20a=b");
-
-		// strip jsessionid
-		String sid1 = "jsessionid=0123456789abcdefghijklemopqrstuv";
-		String sid2 = "PHPSESSID=9682993c8daa2c5497996114facdc805";
-		String sid3 = "sid=9682993c8daa2c5497996114facdc805";
-		String sid4 = "ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM";
-		String sid5 = "CFID=12412453&CFTOKEN=15501799";
-		//String sid6 = "CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A";
-
-		String fore = "http://foo.com/bar?bo=lo&";
-		String aft = "&gum=yum";
-		String want = "foo.com/bar?bo=lo&gum=yum";
-//		String fore = "http://www.archive.org/index.html?";
-//		String aft = "";
-//		String want = "archive.org/index.html";
-		
-		checkCanonicalization(fore + sid1 + aft,want);
-		checkCanonicalization(fore + sid2 + aft,want);
-		checkCanonicalization(fore + sid3 + aft,want);
-		checkCanonicalization(fore + sid4 + aft,want);
-		checkCanonicalization(fore + sid5 + aft,want);
-		//checkCanonicalization(fore + sid6 + aft,want);
-
-		// Check ASP_SESSIONID2:
-		checkCanonicalization(
-				"http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx",
-				"legislature.mi.gov/mileg.aspx");
-
-		// Check ASP_SESSIONID2 (again):
-		checkCanonicalization(
-				"http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx",
-				"legislature.mi.gov/mileg.aspx");
-
-		// Check ASP_SESSIONID3:
-		checkCanonicalization(
-				"http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules",
-				"legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules");
-		
-		// strip port 80
-		checkCanonicalization("http://www.chub.org:80/foo","chub.org/foo");
-
-		// but not other ports...
-		checkCanonicalization("http://www.chub.org:8080/foo","chub.org:8080/foo");
-
-	}
-	
-	private void checkAuthority(String s, boolean want) {
-		boolean got = canonicalizer.isAuthority(s);
-		if(want) {
-			assertTrue("String("+s+") could be an Authority",want == got);
-		} else {
-			assertTrue("String("+s+") is not an Authority",want == got);	
-		}
-	}
-	
-	private void checkCanonicalization(String orig, String want) {
-		String got;
-		try {
-			got = canonicalizer.urlStringToKey(orig);
-			assertEquals("Failed canonicalization (" + orig + ") => (" + got + 
-					") and not (" + want + ") as expected",want,got);
-			
-			String got2 = canonicalizer.urlStringToKey(got);
-			assertEquals("Failed 2nd canonicalization (" + got + ") => (" + 
-					got2 + ") and not (" + want + ") as expected",want,got2);
-			
-			
-		} catch (URIException e) {
-			e.printStackTrace();
-			assertTrue("Exception converting(" + orig + ")",false);
-		}
-	}
-}

Copied: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlCanonicalizerTest.java (from rev 2118, trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/UrlCanonicalizerTest.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlCanonicalizerTest.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlCanonicalizerTest.java	2008-01-15 02:22:35 UTC (rev 2131)
@@ -0,0 +1,203 @@
+/* UrlCanonicalizerTest
+ *
+ * $Id$
+ *
+ * Created on 2:13:36 PM Oct 11, 2006.
+ *
+ * Copyright (C) 2006 Internet Archive.
+ *
+ * This file is part of Wayback.
+ *
+ * Wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * Wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.util.url;
+
+import org.apache.commons.httpclient.URIException;
+import org.archive.wayback.util.url.UrlCanonicalizer;
+
+import junit.framework.TestCase;
+
+/**
+ *
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class UrlCanonicalizerTest extends TestCase {
+	private UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
+	/**
+	 * Test method for 'org.archive.wayback.util.url.UrlCanonicalizer.urlStringToKey(String)'
+	 */
+	public void testUrlStringToKey() {
+
+		// simple strip of http://
+		checkCanonicalization("http://foo.com/","foo.com/");
+
+// would be nice to handle other protocols...
+//		// simple strip of https://
+//		checkCanonicalization("https://foo.com/","foo.com/");
+//
+//		// simple strip of ftp://
+//		checkCanonicalization("ftp://foo.com/","foo.com/");
+//
+//		// simple strip of rtsp://
+//		checkCanonicalization("rtsp://foo.com/","foo.com/");
+
+		// strip leading 'www.'
+		checkCanonicalization("http://www.foo.com/","foo.com/");
+		
+		// add trailing '/' with empty path
+		checkCanonicalization("http://www.foo.com","foo.com/");
+		
+		// strip leading 'www##.'
+		checkCanonicalization("http://www12.foo.com/","foo.com/");
+		
+		// strip leading 'www##.' with no protocol
+		checkCanonicalization("www12.foo.com/","foo.com/");
+		
+		
+		// leave alone a URL with no protocol but non-empty path
+		checkCanonicalization("foo.com/","foo.com/");
+		
+		// add trailing '/' with empty path and without protocol
+		checkCanonicalization("foo.com","foo.com/");
+
+		// add trailing '/' with empty path and no protocol, plus strip leading 'www##.'
+		checkCanonicalization("www12.foo.com","foo.com/");
+
+		// do not add trailing '/' non-empty path and without protocol
+		checkCanonicalization("foo.com/boo","foo.com/boo");
+
+		// TEST
+		// replace escaped ' ' with '+' in path (no trailing slash) and keep query
+		checkCanonicalization("foo.com/pa%20th?a=b","foo.com/pa+th?a=b");
+		
+		
+		// replace escaped ' ' with '+' in path
+		checkCanonicalization("foo.com/pa%20th","foo.com/pa+th");
+		
+		// replace escaped ' ' with '+' in path plus leave trailing slash
+		checkCanonicalization("foo.com/pa%20th/","foo.com/pa+th/");
+
+		// replace multiple consecutive /'s in path
+		checkCanonicalization("foo.com//goo","foo.com/goo");
+
+		// replace multiple consecutive /'s in path
+		checkCanonicalization("foo.com///goo","foo.com/goo");
+
+		// leave alone consecutive /'s after ?
+		checkCanonicalization("foo.com/b?jar=//goo","foo.com/b?jar=//goo");
+
+		// replace multiple consecutive /'s in path, plus leave trailing /
+		checkCanonicalization("foo.com///goo/","foo.com/goo/");
+
+		// replace escaped ' ' with '+' in path plus keep trailing slash and query
+		checkCanonicalization("foo.com/pa%20th/?a=b","foo.com/pa+th/?a=b");
+		
+		
+		// replace escaped ' ' with '+' in path but not in query key
+		checkCanonicalization("foo.com/pa%20th?a%20a=b","foo.com/pa+th?a%20a=b");
+
+		// replace escaped ' ' with '+' in path but not in query value
+		checkCanonicalization("foo.com/pa%20th?a=b%20b","foo.com/pa+th?a=b%20b");
+
+		
+		// no change in '!' escaping
+		checkCanonicalization("foo.com/pa!th","foo.com/pa!th");
+
+		// no change in '+' escaping
+		checkCanonicalization("foo.com/pa+th","foo.com/pa+th");
+
+		// unescape legal escaped '!' (%21)
+		checkCanonicalization("foo.com/pa%21th","foo.com/pa!th");
+
+		// leave '%' (%25)
+		checkCanonicalization("foo.com/pa%th","foo.com/pa%th");
+
+		// unescape '%' (%25)
+		checkCanonicalization("foo.com/pa%25th","foo.com/pa%th");
+		
+		
+		// replace escaped ' ' with '+' in path, unescape legal '!' in path
+		// no change in query escaping
+		checkCanonicalization("foo.com/pa%20t%21h?a%20a=b","foo.com/pa+t!h?a%20a=b");
+		
+		// replace escaped ' ' with '+' in path, leave illegal '%02' in path
+		// no change in query escaping
+		checkCanonicalization("foo.com/pa%20t%02h?a%20a=b","foo.com/pa+t%02h?a%20a=b");
+
+		// strip session ids (jsessionid, PHPSESSID, sid, ASPSESSIONID*, CFID/CFTOKEN)
+		String sid1 = "jsessionid=0123456789abcdefghijklemopqrstuv";
+		String sid2 = "PHPSESSID=9682993c8daa2c5497996114facdc805";
+		String sid3 = "sid=9682993c8daa2c5497996114facdc805";
+		String sid4 = "ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM";
+		String sid5 = "CFID=12412453&CFTOKEN=15501799";
+		//String sid6 = "CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A";
+
+		String fore = "http://foo.com/bar?bo=lo&";
+		String aft = "&gum=yum";
+		String want = "foo.com/bar?bo=lo&gum=yum";
+//		String fore = "http://www.archive.org/index.html?";
+//		String aft = "";
+//		String want = "archive.org/index.html";
+		
+		checkCanonicalization(fore + sid1 + aft,want);
+		checkCanonicalization(fore + sid2 + aft,want);
+		checkCanonicalization(fore + sid3 + aft,want);
+		checkCanonicalization(fore + sid4 + aft,want);
+		checkCanonicalization(fore + sid5 + aft,want);
+		//checkCanonicalization(fore + sid6 + aft,want);
+
+		// Check ASP_SESSIONID2:
+		checkCanonicalization(
+				"http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx",
+				"legislature.mi.gov/mileg.aspx");
+
+		// Check ASP_SESSIONID2 (again):
+		checkCanonicalization(
+				"http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx",
+				"legislature.mi.gov/mileg.aspx");
+
+		// Check ASP_SESSIONID3:
+		checkCanonicalization(
+				"http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules",
+				"legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules");
+		
+		// strip port 80
+		checkCanonicalization("http://www.chub.org:80/foo","chub.org/foo");
+
+		// but not other ports...
+		checkCanonicalization("http://www.chub.org:8080/foo","chub.org:8080/foo");
+
+	}
+	
+	private void checkCanonicalization(String orig, String want) {
+		String got;
+		try {
+			got = canonicalizer.urlStringToKey(orig);
+			assertEquals("Failed canonicalization (" + orig + ") => (" + got + 
+					") and not (" + want + ") as expected",want,got);
+			
+			String got2 = canonicalizer.urlStringToKey(got);
+			assertEquals("Failed 2nd canonicalization (" + got + ") => (" + 
+					got2 + ") and not (" + want + ") as expected",want,got2);
+			
+			
+		} catch (URIException e) {
+			e.printStackTrace();
+			assertTrue("Exception converting(" + orig + ")",false);
+		}
+	}
+}

Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java	2008-01-15 02:22:35 UTC (rev 2131)
@@ -0,0 +1,31 @@
+package org.archive.wayback.util.url;
+
+import junit.framework.TestCase;
+
+/**
+ * Stub for testing UrlOperations static methods
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class UrlOperationsTest extends TestCase {
+	public void testIsAuthority() {
+		checkAuthority("foo.com",true);
+		checkAuthority("foo.con",false);
+		checkAuthority("foo.de",true);
+		checkAuthority("foo.denny",false);
+		checkAuthority("1.1.1.1",true);
+		checkAuthority("23.4.4.foo",false);
+		checkAuthority("23.4.4.com",true);
+		checkAuthority("com.23.4.4.134",false);
+	}
+	
+	private void checkAuthority(String s, boolean want) {
+		boolean got = UrlOperations.isAuthority(s);
+		if(want) {
+			assertTrue("String("+s+") could be an Authority",want == got);
+		} else {
+			assertTrue("String("+s+") is not an Authority",want == got);	
+		}
+	}	
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.

Flat | Threaded

<< < 1 .. 65 66 67 68 69 .. 171 > >> (Page 67 of 171)