[Archive-access-cvs] SF.net SVN: archive-access: [2130] trunk/archive-access/projects/wayback/ way

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2130
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2130&view=rev
Author:   bradtofel
Date:     2008-01-14 18:17:09 -0800 (Mon, 14 Jan 2008)

Log Message:
-----------
REFACTOR: moved UrlCanonicalizer to org.archive.wayback.util.url

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java
===================================================================

--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java	2008-01-15 01:46:54 UTC (rev 2129)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java	2008-01-15 02:17:09 UTC (rev 2130)
@@ -1,391 +0,0 @@
-/* UrlCanonicalizer
- *
- * $Id$
- *
- * Created on 2:08:07 PM Oct 11, 2006.
- *
- * Copyright (C) 2006 Internet Archive.
- *
- * This file is part of Wayback.
- *
- * Wayback is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * any later version.
- *
- * Wayback is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser Public License for more details.
- *
- * You should have received a copy of the GNU Lesser Public License
- * along with Wayback; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-package org.archive.wayback.util;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.httpclient.URIException;
-import org.archive.net.UURI;
-import org.archive.net.UURIFactory;
-
-/**
- * Class that performs the standard Heritrix URL canonicalization. Eventually,
- * this should all be configurable, or perhaps be able to read the settings
- * used within a Heritrix crawler... or even multiple crawlers... this is hard.
- *
- * @author brad
- * @version $Date$, $Revision$
- */
-public class UrlCanonicalizer {
-
-	
-	private static final String CDX_PREFIX = " CDX ";
-    /**
-     * Strip leading 'www.'
-     */
-    private static final Pattern STRIP_WWW_REGEX =
-        Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$");
-    /**
-     * Strip leading 'www44.', 'www3.', etc.
-     */
-    private static final Pattern STRIP_WWWN_REGEX =
-        Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$");
-    /**
-     * Strip userinfo.
-     */
-    private static final Pattern STRIP_USERINFO_REGEX =
-        Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$",
-            Pattern.CASE_INSENSITIVE);
-
-    /**
-     * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A.
-     * Example: PHPSESSID=9682993c8daa2c5497996114facdc805.
-     */
-    private static final Pattern STRIP_SESSION_ID_REGEX =
-    	 Pattern.compile("^(.+)(?:(?:(?:jsessionid)|(?:phpsessid))=" +
-    	                 "[0-9a-zA-Z]{32})(?:&(.*))?$",  
-    	                 Pattern.CASE_INSENSITIVE);
-
-    /**
-     * Example: sid=9682993c8daa2c5497996114facdc805. 
-     * 'sid=' can be tricky but all sid= followed by 32 byte string
-     * so far seen have been session ids.  Sid is a 32 byte string
-     * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid'
-     * so have to have it run after the phpsessid elimination.
-     */
-    private static final Pattern STRIP_SID_REGEX =
-        Pattern.compile("^(.+)" +
-                "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE);
-    
-    /**
-     * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM.
-     */
-    private static final Pattern STRIP_ASPSESSION_REGEX =
-        Pattern.compile("^(.+)" +
-                "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$",
-                    Pattern.CASE_INSENSITIVE);
-
-    /**
-     * Examples:
-     *
-     *        (.NET 2.0)
-     *        http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx
-     *     => http://legislature.mi.gov/mileg.aspx
-     *
-     *		  (.NET 1.0/1.1)
-     *        http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx
-     *     => http://legislature.mi.gov/mileg.aspx
-     *     
-     *     For more info, see: 
-     *     	  http://msdn2.microsoft.com/en-us/library/aa479315.aspx
-     *     
-     */
-    private static final Pattern STRIP_ASPSESSION2_REGEX =
-    	Pattern.compile("^([^\\?]+/)" +
-    			"(?:\\((?:S\\(|)[0-9a-z]{24}\\)(?:\\)|)/)([^\\?]+\\.aspx.*)$",
-    			Pattern.CASE_INSENSITIVE);
-    
-    /**
-     * Examples:
-     *
-     *        (.NET 2.0)
-     *        http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
-     *     => http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
-     *
-     *     For more info, see: 
-     *     	  http://msdn2.microsoft.com/en-us/library/aa479315.aspx
-     *     
-     */   
-
-    private static final Pattern STRIP_ASPSESSION3_REGEX =
-    	Pattern.compile("^([^\\?]+/" +
-    			"\\((?:a\\([0-9a-z]{24}\\)))(?:S\\([0-9a-z]{24}\\))" +
-    			"((?:f\\([0-9a-z]{24}\\))\\)/[^\\?]+\\.aspx.*)$",
-    			Pattern.CASE_INSENSITIVE);
-    
-    /**
-     * Strip ColdFusion session IDs. Remove sessionids that look like the 
-     * following:
-     * CFID=12412453&CFTOKEN=15501799
-     * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A
-     */
-    private static final Pattern STRIP_CFSESSION_REGEX = 
-    	Pattern.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+(?:jsession=[^&]+)?)" +
-    			"(?:&(.*))?$",Pattern.CASE_INSENSITIVE);
-        
-    /**
-     * Run a regex that strips elements of a string.
-     * 
-     * Assumes the regex has a form that wants to strip elements of the passed
-     * string.  Assumes that if a match, appending group 1
-     * and group 2 yields desired result.
-     * @param url Url to search in.
-     * @param matcher Matcher whose form yields a group 1 and group 2 if a
-     * match (non-null.
-     * @return Original <code>url</code> else concatenization of group 1
-     * and group 2.
-     */
-    protected String doStripRegexMatch(String url, Matcher matcher) {
-        return (matcher != null && matcher.matches())?
-            checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)):
-            url;
-    }
-
-    /**
-     * @param string String to check.
-     * @return <code>string</code> if non-null, else empty string ("").
-     */
-    private String checkForNull(String string) {
-        return (string != null)? string: "";
-    }
-    
-	/**
-	 * return the canonical string key for the URL argument.
-	 * 
-	 * @param urlString
-	 * @return String lookup key for URL argument.
-	 * @throws URIException 
-	 */
-	public String urlStringToKey(final String urlString) throws URIException {
-
-		String searchUrl = canonicalize(urlString);
-
-		// TODO: force https into http for the moment...
-		if(searchUrl.startsWith("https://")) {
-			searchUrl = searchUrl.substring(8);
-		}
-		
-		// TODO: this will only work with http:// scheme. should work with all?
-		// force add of scheme and possible add '/' with empty path:
-		if (searchUrl.startsWith("http://")) {
-			if (-1 == searchUrl.indexOf('/', 8)) {
-				searchUrl = searchUrl + "/";
-			}
-		} else {
-			if (-1 == searchUrl.indexOf("/")) {
-				searchUrl = searchUrl + "/";
-			}
-			searchUrl = "http://" + searchUrl;
-		}
-
-		// unescape anythying that can be:
-		UURI tmpURI = UURIFactory.getInstance(searchUrl);
-		tmpURI.setPath(tmpURI.getPath());
-		
-		
-		// convert to UURI to perform require URI fixup:
-		UURI searchURI = UURIFactory.getInstance(tmpURI.getURI());
-
-
-		
-		
-		// replace ' ' with '+' (this is only to match Alexa's canonicalization)
-		String newPath = searchURI.getEscapedPath().replace("%20","+");
-//		String newPath = searchURI.getPath().replace(' ','+');
-		
-		// replace multiple consecutive '/'s in the path.
-		while(newPath.contains("//")) {
-			newPath = newPath.replace("//","/");
-		}
-		
-		// this would remove trailing a '/' character, unless the path is empty
-		// but we're not going to do this just yet..
-//		if((newPath.length() > 1) && newPath.endsWith("/")) {
-//			newPath = newPath.substring(0,newPath.length()-1);
-//		}
-//		searchURI.setEscapedPath(newPath);
-//		searchURI.setRawPath(newPath.toCharArray());
-//		String query = searchURI.getEscapedQuery();
-		
-		// TODO: handle non HTTP port stripping, too.
-//		String portStr = "";
-//		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
-//			portStr = ":" + searchURI.getPort();
-//		}
-//		return searchURI.getHostBasename() + portStr + 
-//		searchURI.getEscapedPathQuery();
-		
-		StringBuilder sb = new StringBuilder(searchUrl.length());
-		sb.append(searchURI.getHostBasename());
-		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
-			sb.append(":").append(searchURI.getPort());
-		}
-		sb.append(newPath);
-		if(searchURI.getEscapedQuery() != null) {
-			sb.append("?").append(searchURI.getEscapedQuery());
-		}
-		
-
-		return sb.toString();
-	}
-
-	
-	/**
-	 * Idempotent operation that will determine the 'fuzziest'
-	 * form of the url argument. This operation is done prior to adding records
-	 * to the ResourceIndex, and prior to lookup. Current version is exactly
-	 * the default found in Heritrix. When the configuration system for
-	 * Heritrix stabilizes, hopefully this can use the system directly within
-	 * Heritrix.
-	 * 
-	 * @param url to be canonicalized.
-	 * @return canonicalized version of url argument.
-	 */
-	public String canonicalize(String url) {
-        url = doStripRegexMatch(url, STRIP_USERINFO_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_WWW_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_WWWN_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_SESSION_ID_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_ASPSESSION_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_ASPSESSION2_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_ASPSESSION3_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_SID_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_CFSESSION_REGEX.matcher(url));
-        url = url.toLowerCase();
-        if (url == null || url.length() <= 0) {
-            return url;
-        }
-        
-        int index = url.lastIndexOf('?');
-        if (index > 0) {
-            if (index == (url.length() - 1)) {
-                // '?' is last char in url.  Strip it.
-                url = url.substring(0, url.length() - 1);
-            } else if (url.charAt(index + 1) == '&') {
-                // Next char is '&'. Strip it.
-                if (url.length() == (index + 2)) {
-                    // Then url ends with '?&'.  Strip them.
-                    url = url.substring(0, url.length() - 2);
-                } else {
-                    // The '&' is redundant.  Strip it.
-                    url = url.substring(0, index + 1) +
-                    url.substring(index + 2);
-                }
-            } else if (url.charAt(url.length() - 1) == '&') {
-                // If we have a lone '&' on end of query str,
-                // strip it.
-                url = url.substring(0, url.length() - 1);
-            }
-        }
-        return url;
-	}
-	
-	private static void USAGE() {
-		System.err.println("Usage: [-f FIELD] [-d DELIM]");
-		System.exit(3);
-	}
-	/**
-	 * @param args
-	 */
-	public static void main(String[] args) {
-		UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
-		int n = 0;
-		int i = 0;
-		ArrayList<Integer> columns = new ArrayList<Integer>();
-		
-		long lineNumber = 0;
-		boolean cdxPassThru = false;
-		String delimiter = " ";
-		while(n < args.length) {
-			String arg = args[n];
-			if(arg.compareTo("-cdx") == 0) {
-				cdxPassThru = true;
-				n++;
-				continue;
-			}
-			if(n == (args.length -1)) {
-				USAGE();
-			}
-			String val = args[n+1];
-			if(arg.compareTo("-f") == 0) {
-				columns.add(new Integer(val));
-			} else if(arg.compareTo("-d") == 0) {
-				delimiter = val;
-			} else {
-				USAGE();
-			}
-			n += 2;
-		}
-		// place default '0' in case none specified:
-		if(columns.size() == 0) {
-			columns.add(new Integer(1));
-		}
-		
-		// convert to int[]:
-		int[] cols = new int[columns.size()];
-		for(int idx = 0; idx < columns.size(); idx++) {
-			cols[idx] = columns.get(idx).intValue() - 1;
-		}
-		BufferedReader r = new BufferedReader(new InputStreamReader(System.in));
-		StringBuilder sb = new StringBuilder();
-		String line = null;
-		
-		while(true) {
-			try {
-				line = r.readLine();
-			} catch (IOException e) {
-				e.printStackTrace();
-				System.exit(1);
-			}
-			if(line == null) {
-				break;
-			}
-			lineNumber++;
-			if(cdxPassThru && line.startsWith(CDX_PREFIX)) {
-				System.out.println(line);
-				continue;
-			}
-			String parts[] = line.split(delimiter);
-			for(int column : cols) {
-				if(column >= parts.length) {
-					System.err.println("Invalid line " + lineNumber + " (" +
-							line + ") skipped");
-				} else {
-					try {
-						parts[column] = canonicalizer.urlStringToKey(parts[column]);
-					} catch (URIException e) {
-						System.err.println("Invalid URL in line " + lineNumber + " (" +
-								line + ") skipped (" + parts[column] + ")");
-						e.printStackTrace();
-						continue;
-					}
-				}
-			}
-			sb.setLength(0);
-			for(i = 0; i < parts.length; i++) {
-				sb.append(parts[i]);
-				if(i < (parts.length-1)) {
-					sb.append(delimiter);
-				}
-			}
-			System.out.println(sb.toString());
-		}
-	}
-}
\ No newline at end of file

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java (from rev 2128, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java	2008-01-15 02:17:09 UTC (rev 2130)
@@ -0,0 +1,391 @@
+/* UrlCanonicalizer
+ *
+ * $Id$
+ *
+ * Created on 2:08:07 PM Oct 11, 2006.
+ *
+ * Copyright (C) 2006 Internet Archive.
+ *
+ * This file is part of Wayback.
+ *
+ * Wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * Wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.util.url;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.httpclient.URIException;
+import org.archive.net.UURI;
+import org.archive.net.UURIFactory;
+
+/**
+ * Class that performs the standard Heritrix URL canonicalization. Eventually,
+ * this should all be configurable, or perhaps be able to read the settings
+ * used within a Heritrix crawler... or even multiple crawlers... this is hard.
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class UrlCanonicalizer {
+
+	
+	private static final String CDX_PREFIX = " CDX ";
+    /**
+     * Strip leading 'www.'
+     */
+    private static final Pattern STRIP_WWW_REGEX =
+        Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$");
+    /**
+     * Strip leading 'www44.', 'www3.', etc.
+     */
+    private static final Pattern STRIP_WWWN_REGEX =
+        Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$");
+    /**
+     * Strip userinfo.
+     */
+    private static final Pattern STRIP_USERINFO_REGEX =
+        Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$",
+            Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A.
+     * Example: PHPSESSID=9682993c8daa2c5497996114facdc805.
+     */
+    private static final Pattern STRIP_SESSION_ID_REGEX =
+    	 Pattern.compile("^(.+)(?:(?:(?:jsessionid)|(?:phpsessid))=" +
+    	                 "[0-9a-zA-Z]{32})(?:&(.*))?$",  
+    	                 Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Example: sid=9682993c8daa2c5497996114facdc805. 
+     * 'sid=' can be tricky but all sid= followed by 32 byte string
+     * so far seen have been session ids.  Sid is a 32 byte string
+     * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid'
+     * so have to have it run after the phpsessid elimination.
+     */
+    private static final Pattern STRIP_SID_REGEX =
+        Pattern.compile("^(.+)" +
+                "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE);
+    
+    /**
+     * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM.
+     */
+    private static final Pattern STRIP_ASPSESSION_REGEX =
+        Pattern.compile("^(.+)" +
+                "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$",
+                    Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Examples:
+     *
+     *        (.NET 2.0)
+     *        http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx
+     *     => http://legislature.mi.gov/mileg.aspx
+     *
+     *		  (.NET 1.0/1.1)
+     *        http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx
+     *     => http://legislature.mi.gov/mileg.aspx
+     *     
+     *     For more info, see: 
+     *     	  http://msdn2.microsoft.com/en-us/library/aa479315.aspx
+     *     
+     */
+    private static final Pattern STRIP_ASPSESSION2_REGEX =
+    	Pattern.compile("^([^\\?]+/)" +
+    			"(?:\\((?:S\\(|)[0-9a-z]{24}\\)(?:\\)|)/)([^\\?]+\\.aspx.*)$",
+    			Pattern.CASE_INSENSITIVE);
+    
+    /**
+     * Examples:
+     *
+     *        (.NET 2.0)
+     *        http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
+     *     => http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
+     *
+     *     For more info, see: 
+     *     	  http://msdn2.microsoft.com/en-us/library/aa479315.aspx
+     *     
+     */   
+
+    private static final Pattern STRIP_ASPSESSION3_REGEX =
+    	Pattern.compile("^([^\\?]+/" +
+    			"\\((?:a\\([0-9a-z]{24}\\)))(?:S\\([0-9a-z]{24}\\))" +
+    			"((?:f\\([0-9a-z]{24}\\))\\)/[^\\?]+\\.aspx.*)$",
+    			Pattern.CASE_INSENSITIVE);
+    
+    /**
+     * Strip ColdFusion session IDs. Remove sessionids that look like the 
+     * following:
+     * CFID=12412453&CFTOKEN=15501799
+     * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A
+     */
+    private static final Pattern STRIP_CFSESSION_REGEX = 
+    	Pattern.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+(?:jsession=[^&]+)?)" +
+    			"(?:&(.*))?$",Pattern.CASE_INSENSITIVE);
+        
+    /**
+     * Run a regex that strips elements of a string.
+     * 
+     * Assumes the regex has a form that wants to strip elements of the passed
+     * string.  Assumes that if a match, appending group 1
+     * and group 2 yields desired result.
+     * @param url Url to search in.
+     * @param matcher Matcher whose form yields a group 1 and group 2 if a
+     * match (non-null.
+     * @return Original <code>url</code> else concatenization of group 1
+     * and group 2.
+     */
+    protected String doStripRegexMatch(String url, Matcher matcher) {
+        return (matcher != null && matcher.matches())?
+            checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)):
+            url;
+    }
+
+    /**
+     * @param string String to check.
+     * @return <code>string</code> if non-null, else empty string ("").
+     */
+    private String checkForNull(String string) {
+        return (string != null)? string: "";
+    }
+    
+	/**
+	 * return the canonical string key for the URL argument.
+	 * 
+	 * @param urlString
+	 * @return String lookup key for URL argument.
+	 * @throws URIException 
+	 */
+	public String urlStringToKey(final String urlString) throws URIException {
+
+		String searchUrl = canonicalize(urlString);
+
+		// TODO: force https into http for the moment...
+		if(searchUrl.startsWith("https://")) {
+			searchUrl = searchUrl.substring(8);
+		}
+		
+		// TODO: this will only work with http:// scheme. should work with all?
+		// force add of scheme and possible add '/' with empty path:
+		if (searchUrl.startsWith("http://")) {
+			if (-1 == searchUrl.indexOf('/', 8)) {
+				searchUrl = searchUrl + "/";
+			}
+		} else {
+			if (-1 == searchUrl.indexOf("/")) {
+				searchUrl = searchUrl + "/";
+			}
+			searchUrl = "http://" + searchUrl;
+		}
+
+		// unescape anythying that can be:
+		UURI tmpURI = UURIFactory.getInstance(searchUrl);
+		tmpURI.setPath(tmpURI.getPath());
+		
+		
+		// convert to UURI to perform require URI fixup:
+		UURI searchURI = UURIFactory.getInstance(tmpURI.getURI());
+
+
+		
+		
+		// replace ' ' with '+' (this is only to match Alexa's canonicalization)
+		String newPath = searchURI.getEscapedPath().replace("%20","+");
+//		String newPath = searchURI.getPath().replace(' ','+');
+		
+		// replace multiple consecutive '/'s in the path.
+		while(newPath.contains("//")) {
+			newPath = newPath.replace("//","/");
+		}
+		
+		// this would remove trailing a '/' character, unless the path is empty
+		// but we're not going to do this just yet..
+//		if((newPath.length() > 1) && newPath.endsWith("/")) {
+//			newPath = newPath.substring(0,newPath.length()-1);
+//		}
+//		searchURI.setEscapedPath(newPath);
+//		searchURI.setRawPath(newPath.toCharArray());
+//		String query = searchURI.getEscapedQuery();
+		
+		// TODO: handle non HTTP port stripping, too.
+//		String portStr = "";
+//		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
+//			portStr = ":" + searchURI.getPort();
+//		}
+//		return searchURI.getHostBasename() + portStr + 
+//		searchURI.getEscapedPathQuery();
+		
+		StringBuilder sb = new StringBuilder(searchUrl.length());
+		sb.append(searchURI.getHostBasename());
+		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
+			sb.append(":").append(searchURI.getPort());
+		}
+		sb.append(newPath);
+		if(searchURI.getEscapedQuery() != null) {
+			sb.append("?").append(searchURI.getEscapedQuery());
+		}
+		
+
+		return sb.toString();
+	}
+
+	
+	/**
+	 * Idempotent operation that will determine the 'fuzziest'
+	 * form of the url argument. This operation is done prior to adding records
+	 * to the ResourceIndex, and prior to lookup. Current version is exactly
+	 * the default found in Heritrix. When the configuration system for
+	 * Heritrix stabilizes, hopefully this can use the system directly within
+	 * Heritrix.
+	 * 
+	 * @param url to be canonicalized.
+	 * @return canonicalized version of url argument.
+	 */
+	public String canonicalize(String url) {
+        url = doStripRegexMatch(url, STRIP_USERINFO_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_WWW_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_WWWN_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_SESSION_ID_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_ASPSESSION_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_ASPSESSION2_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_ASPSESSION3_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_SID_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_CFSESSION_REGEX.matcher(url));
+        url = url.toLowerCase();
+        if (url == null || url.length() <= 0) {
+            return url;
+        }
+        
+        int index = url.lastIndexOf('?');
+        if (index > 0) {
+            if (index == (url.length() - 1)) {
+                // '?' is last char in url.  Strip it.
+                url = url.substring(0, url.length() - 1);
+            } else if (url.charAt(index + 1) == '&') {
+                // Next char is '&'. Strip it.
+                if (url.length() == (index + 2)) {
+                    // Then url ends with '?&'.  Strip them.
+                    url = url.substring(0, url.length() - 2);
+                } else {
+                    // The '&' is redundant.  Strip it.
+                    url = url.substring(0, index + 1) +
+                    url.substring(index + 2);
+                }
+            } else if (url.charAt(url.length() - 1) == '&') {
+                // If we have a lone '&' on end of query str,
+                // strip it.
+                url = url.substring(0, url.length() - 1);
+            }
+        }
+        return url;
+	}
+	
+	private static void USAGE() {
+		System.err.println("Usage: [-f FIELD] [-d DELIM]");
+		System.exit(3);
+	}
+	/**
+	 * @param args
+	 */
+	public static void main(String[] args) {
+		UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
+		int n = 0;
+		int i = 0;
+		ArrayList<Integer> columns = new ArrayList<Integer>();
+		
+		long lineNumber = 0;
+		boolean cdxPassThru = false;
+		String delimiter = " ";
+		while(n < args.length) {
+			String arg = args[n];
+			if(arg.compareTo("-cdx") == 0) {
+				cdxPassThru = true;
+				n++;
+				continue;
+			}
+			if(n == (args.length -1)) {
+				USAGE();
+			}
+			String val = args[n+1];
+			if(arg.compareTo("-f") == 0) {
+				columns.add(new Integer(val));
+			} else if(arg.compareTo("-d") == 0) {
+				delimiter = val;
+			} else {
+				USAGE();
+			}
+			n += 2;
+		}
+		// place default '0' in case none specified:
+		if(columns.size() == 0) {
+			columns.add(new Integer(1));
+		}
+		
+		// convert to int[]:
+		int[] cols = new int[columns.size()];
+		for(int idx = 0; idx < columns.size(); idx++) {
+			cols[idx] = columns.get(idx).intValue() - 1;
+		}
+		BufferedReader r = new BufferedReader(new InputStreamReader(System.in));
+		StringBuilder sb = new StringBuilder();
+		String line = null;
+		
+		while(true) {
+			try {
+				line = r.readLine();
+			} catch (IOException e) {
+				e.printStackTrace();
+				System.exit(1);
+			}
+			if(line == null) {
+				break;
+			}
+			lineNumber++;
+			if(cdxPassThru && line.startsWith(CDX_PREFIX)) {
+				System.out.println(line);
+				continue;
+			}
+			String parts[] = line.split(delimiter);
+			for(int column : cols) {
+				if(column >= parts.length) {
+					System.err.println("Invalid line " + lineNumber + " (" +
+							line + ") skipped");
+				} else {
+					try {
+						parts[column] = canonicalizer.urlStringToKey(parts[column]);
+					} catch (URIException e) {
+						System.err.println("Invalid URL in line " + lineNumber + " (" +
+								line + ") skipped (" + parts[column] + ")");
+						e.printStackTrace();
+						continue;
+					}
+				}
+			}
+			sb.setLength(0);
+			for(i = 0; i < parts.length; i++) {
+				sb.append(parts[i]);
+				if(i < (parts.length-1)) {
+					sb.append(delimiter);
+				}
+			}
+			System.out.println(sb.toString());
+		}
+	}
+}
\ No newline at end of file


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Archive-access-cvs] SF.net SVN: archive-access: [2130] trunk/archive-access/projects/wayback/ way

[Archive-access-cvs] SF.net SVN: archive-access: [2130] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/util