archive-access-cvs Mailing List for Web Archive Access Utilities (Page 68)

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2130
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2130&view=rev
Author:   bradtofel
Date:     2008-01-14 18:17:09 -0800 (Mon, 14 Jan 2008)

Log Message:
-----------
REFACTOR: moved UrlCanonicalizer to org.archive.wayback.util.url

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java
===================================================================

--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java	2008-01-15 01:46:54 UTC (rev 2129)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java	2008-01-15 02:17:09 UTC (rev 2130)
@@ -1,391 +0,0 @@
-/* UrlCanonicalizer
- *
- * $Id$
- *
- * Created on 2:08:07 PM Oct 11, 2006.
- *
- * Copyright (C) 2006 Internet Archive.
- *
- * This file is part of Wayback.
- *
- * Wayback is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * any later version.
- *
- * Wayback is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser Public License for more details.
- *
- * You should have received a copy of the GNU Lesser Public License
- * along with Wayback; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-package org.archive.wayback.util;
-
-import java.io.BufferedReader;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.util.ArrayList;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.httpclient.URIException;
-import org.archive.net.UURI;
-import org.archive.net.UURIFactory;
-
-/**
- * Class that performs the standard Heritrix URL canonicalization. Eventually,
- * this should all be configurable, or perhaps be able to read the settings
- * used within a Heritrix crawler... or even multiple crawlers... this is hard.
- *
- * @author brad
- * @version $Date$, $Revision$
- */
-public class UrlCanonicalizer {
-
-	
-	private static final String CDX_PREFIX = " CDX ";
-    /**
-     * Strip leading 'www.'
-     */
-    private static final Pattern STRIP_WWW_REGEX =
-        Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$");
-    /**
-     * Strip leading 'www44.', 'www3.', etc.
-     */
-    private static final Pattern STRIP_WWWN_REGEX =
-        Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$");
-    /**
-     * Strip userinfo.
-     */
-    private static final Pattern STRIP_USERINFO_REGEX =
-        Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$",
-            Pattern.CASE_INSENSITIVE);
-
-    /**
-     * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A.
-     * Example: PHPSESSID=9682993c8daa2c5497996114facdc805.
-     */
-    private static final Pattern STRIP_SESSION_ID_REGEX =
-    	 Pattern.compile("^(.+)(?:(?:(?:jsessionid)|(?:phpsessid))=" +
-    	                 "[0-9a-zA-Z]{32})(?:&(.*))?$",  
-    	                 Pattern.CASE_INSENSITIVE);
-
-    /**
-     * Example: sid=9682993c8daa2c5497996114facdc805. 
-     * 'sid=' can be tricky but all sid= followed by 32 byte string
-     * so far seen have been session ids.  Sid is a 32 byte string
-     * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid'
-     * so have to have it run after the phpsessid elimination.
-     */
-    private static final Pattern STRIP_SID_REGEX =
-        Pattern.compile("^(.+)" +
-                "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE);
-    
-    /**
-     * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM.
-     */
-    private static final Pattern STRIP_ASPSESSION_REGEX =
-        Pattern.compile("^(.+)" +
-                "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$",
-                    Pattern.CASE_INSENSITIVE);
-
-    /**
-     * Examples:
-     *
-     *        (.NET 2.0)
-     *        http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx
-     *     => http://legislature.mi.gov/mileg.aspx
-     *
-     *		  (.NET 1.0/1.1)
-     *        http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx
-     *     => http://legislature.mi.gov/mileg.aspx
-     *     
-     *     For more info, see: 
-     *     	  http://msdn2.microsoft.com/en-us/library/aa479315.aspx
-     *     
-     */
-    private static final Pattern STRIP_ASPSESSION2_REGEX =
-    	Pattern.compile("^([^\\?]+/)" +
-    			"(?:\\((?:S\\(|)[0-9a-z]{24}\\)(?:\\)|)/)([^\\?]+\\.aspx.*)$",
-    			Pattern.CASE_INSENSITIVE);
-    
-    /**
-     * Examples:
-     *
-     *        (.NET 2.0)
-     *        http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
-     *     => http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
-     *
-     *     For more info, see: 
-     *     	  http://msdn2.microsoft.com/en-us/library/aa479315.aspx
-     *     
-     */   
-
-    private static final Pattern STRIP_ASPSESSION3_REGEX =
-    	Pattern.compile("^([^\\?]+/" +
-    			"\\((?:a\\([0-9a-z]{24}\\)))(?:S\\([0-9a-z]{24}\\))" +
-    			"((?:f\\([0-9a-z]{24}\\))\\)/[^\\?]+\\.aspx.*)$",
-    			Pattern.CASE_INSENSITIVE);
-    
-    /**
-     * Strip ColdFusion session IDs. Remove sessionids that look like the 
-     * following:
-     * CFID=12412453&CFTOKEN=15501799
-     * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A
-     */
-    private static final Pattern STRIP_CFSESSION_REGEX = 
-    	Pattern.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+(?:jsession=[^&]+)?)" +
-    			"(?:&(.*))?$",Pattern.CASE_INSENSITIVE);
-        
-    /**
-     * Run a regex that strips elements of a string.
-     * 
-     * Assumes the regex has a form that wants to strip elements of the passed
-     * string.  Assumes that if a match, appending group 1
-     * and group 2 yields desired result.
-     * @param url Url to search in.
-     * @param matcher Matcher whose form yields a group 1 and group 2 if a
-     * match (non-null.
-     * @return Original <code>url</code> else concatenization of group 1
-     * and group 2.
-     */
-    protected String doStripRegexMatch(String url, Matcher matcher) {
-        return (matcher != null && matcher.matches())?
-            checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)):
-            url;
-    }
-
-    /**
-     * @param string String to check.
-     * @return <code>string</code> if non-null, else empty string ("").
-     */
-    private String checkForNull(String string) {
-        return (string != null)? string: "";
-    }
-    
-	/**
-	 * return the canonical string key for the URL argument.
-	 * 
-	 * @param urlString
-	 * @return String lookup key for URL argument.
-	 * @throws URIException 
-	 */
-	public String urlStringToKey(final String urlString) throws URIException {
-
-		String searchUrl = canonicalize(urlString);
-
-		// TODO: force https into http for the moment...
-		if(searchUrl.startsWith("https://")) {
-			searchUrl = searchUrl.substring(8);
-		}
-		
-		// TODO: this will only work with http:// scheme. should work with all?
-		// force add of scheme and possible add '/' with empty path:
-		if (searchUrl.startsWith("http://")) {
-			if (-1 == searchUrl.indexOf('/', 8)) {
-				searchUrl = searchUrl + "/";
-			}
-		} else {
-			if (-1 == searchUrl.indexOf("/")) {
-				searchUrl = searchUrl + "/";
-			}
-			searchUrl = "http://" + searchUrl;
-		}
-
-		// unescape anythying that can be:
-		UURI tmpURI = UURIFactory.getInstance(searchUrl);
-		tmpURI.setPath(tmpURI.getPath());
-		
-		
-		// convert to UURI to perform require URI fixup:
-		UURI searchURI = UURIFactory.getInstance(tmpURI.getURI());
-
-
-		
-		
-		// replace ' ' with '+' (this is only to match Alexa's canonicalization)
-		String newPath = searchURI.getEscapedPath().replace("%20","+");
-//		String newPath = searchURI.getPath().replace(' ','+');
-		
-		// replace multiple consecutive '/'s in the path.
-		while(newPath.contains("//")) {
-			newPath = newPath.replace("//","/");
-		}
-		
-		// this would remove trailing a '/' character, unless the path is empty
-		// but we're not going to do this just yet..
-//		if((newPath.length() > 1) && newPath.endsWith("/")) {
-//			newPath = newPath.substring(0,newPath.length()-1);
-//		}
-//		searchURI.setEscapedPath(newPath);
-//		searchURI.setRawPath(newPath.toCharArray());
-//		String query = searchURI.getEscapedQuery();
-		
-		// TODO: handle non HTTP port stripping, too.
-//		String portStr = "";
-//		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
-//			portStr = ":" + searchURI.getPort();
-//		}
-//		return searchURI.getHostBasename() + portStr + 
-//		searchURI.getEscapedPathQuery();
-		
-		StringBuilder sb = new StringBuilder(searchUrl.length());
-		sb.append(searchURI.getHostBasename());
-		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
-			sb.append(":").append(searchURI.getPort());
-		}
-		sb.append(newPath);
-		if(searchURI.getEscapedQuery() != null) {
-			sb.append("?").append(searchURI.getEscapedQuery());
-		}
-		
-
-		return sb.toString();
-	}
-
-	
-	/**
-	 * Idempotent operation that will determine the 'fuzziest'
-	 * form of the url argument. This operation is done prior to adding records
-	 * to the ResourceIndex, and prior to lookup. Current version is exactly
-	 * the default found in Heritrix. When the configuration system for
-	 * Heritrix stabilizes, hopefully this can use the system directly within
-	 * Heritrix.
-	 * 
-	 * @param url to be canonicalized.
-	 * @return canonicalized version of url argument.
-	 */
-	public String canonicalize(String url) {
-        url = doStripRegexMatch(url, STRIP_USERINFO_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_WWW_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_WWWN_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_SESSION_ID_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_ASPSESSION_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_ASPSESSION2_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_ASPSESSION3_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_SID_REGEX.matcher(url));
-        url = doStripRegexMatch(url, STRIP_CFSESSION_REGEX.matcher(url));
-        url = url.toLowerCase();
-        if (url == null || url.length() <= 0) {
-            return url;
-        }
-        
-        int index = url.lastIndexOf('?');
-        if (index > 0) {
-            if (index == (url.length() - 1)) {
-                // '?' is last char in url.  Strip it.
-                url = url.substring(0, url.length() - 1);
-            } else if (url.charAt(index + 1) == '&') {
-                // Next char is '&'. Strip it.
-                if (url.length() == (index + 2)) {
-                    // Then url ends with '?&'.  Strip them.
-                    url = url.substring(0, url.length() - 2);
-                } else {
-                    // The '&' is redundant.  Strip it.
-                    url = url.substring(0, index + 1) +
-                    url.substring(index + 2);
-                }
-            } else if (url.charAt(url.length() - 1) == '&') {
-                // If we have a lone '&' on end of query str,
-                // strip it.
-                url = url.substring(0, url.length() - 1);
-            }
-        }
-        return url;
-	}
-	
-	private static void USAGE() {
-		System.err.println("Usage: [-f FIELD] [-d DELIM]");
-		System.exit(3);
-	}
-	/**
-	 * @param args
-	 */
-	public static void main(String[] args) {
-		UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
-		int n = 0;
-		int i = 0;
-		ArrayList<Integer> columns = new ArrayList<Integer>();
-		
-		long lineNumber = 0;
-		boolean cdxPassThru = false;
-		String delimiter = " ";
-		while(n < args.length) {
-			String arg = args[n];
-			if(arg.compareTo("-cdx") == 0) {
-				cdxPassThru = true;
-				n++;
-				continue;
-			}
-			if(n == (args.length -1)) {
-				USAGE();
-			}
-			String val = args[n+1];
-			if(arg.compareTo("-f") == 0) {
-				columns.add(new Integer(val));
-			} else if(arg.compareTo("-d") == 0) {
-				delimiter = val;
-			} else {
-				USAGE();
-			}
-			n += 2;
-		}
-		// place default '0' in case none specified:
-		if(columns.size() == 0) {
-			columns.add(new Integer(1));
-		}
-		
-		// convert to int[]:
-		int[] cols = new int[columns.size()];
-		for(int idx = 0; idx < columns.size(); idx++) {
-			cols[idx] = columns.get(idx).intValue() - 1;
-		}
-		BufferedReader r = new BufferedReader(new InputStreamReader(System.in));
-		StringBuilder sb = new StringBuilder();
-		String line = null;
-		
-		while(true) {
-			try {
-				line = r.readLine();
-			} catch (IOException e) {
-				e.printStackTrace();
-				System.exit(1);
-			}
-			if(line == null) {
-				break;
-			}
-			lineNumber++;
-			if(cdxPassThru && line.startsWith(CDX_PREFIX)) {
-				System.out.println(line);
-				continue;
-			}
-			String parts[] = line.split(delimiter);
-			for(int column : cols) {
-				if(column >= parts.length) {
-					System.err.println("Invalid line " + lineNumber + " (" +
-							line + ") skipped");
-				} else {
-					try {
-						parts[column] = canonicalizer.urlStringToKey(parts[column]);
-					} catch (URIException e) {
-						System.err.println("Invalid URL in line " + lineNumber + " (" +
-								line + ") skipped (" + parts[column] + ")");
-						e.printStackTrace();
-						continue;
-					}
-				}
-			}
-			sb.setLength(0);
-			for(i = 0; i < parts.length; i++) {
-				sb.append(parts[i]);
-				if(i < (parts.length-1)) {
-					sb.append(delimiter);
-				}
-			}
-			System.out.println(sb.toString());
-		}
-	}
-}
\ No newline at end of file

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java (from rev 2128, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlCanonicalizer.java	2008-01-15 02:17:09 UTC (rev 2130)
@@ -0,0 +1,391 @@
+/* UrlCanonicalizer
+ *
+ * $Id$
+ *
+ * Created on 2:08:07 PM Oct 11, 2006.
+ *
+ * Copyright (C) 2006 Internet Archive.
+ *
+ * This file is part of Wayback.
+ *
+ * Wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * Wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.util.url;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.httpclient.URIException;
+import org.archive.net.UURI;
+import org.archive.net.UURIFactory;
+
+/**
+ * Class that performs the standard Heritrix URL canonicalization. Eventually,
+ * this should all be configurable, or perhaps be able to read the settings
+ * used within a Heritrix crawler... or even multiple crawlers... this is hard.
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class UrlCanonicalizer {
+
+	
+	private static final String CDX_PREFIX = " CDX ";
+    /**
+     * Strip leading 'www.'
+     */
+    private static final Pattern STRIP_WWW_REGEX =
+        Pattern.compile("(?i)^(https?://)(?:www\\.)([^/]*/.+)$");
+    /**
+     * Strip leading 'www44.', 'www3.', etc.
+     */
+    private static final Pattern STRIP_WWWN_REGEX =
+        Pattern.compile("(?i)^(https?://)(?:www[0-9]+\\.)([^/]*/.+)$");
+    /**
+     * Strip userinfo.
+     */
+    private static final Pattern STRIP_USERINFO_REGEX =
+        Pattern.compile("^((?:(?:https?)|(?:ftps?))://)(?:[^/]+@)(.*)$",
+            Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Example: jsessionid=999A9EF028317A82AC83F0FDFE59385A.
+     * Example: PHPSESSID=9682993c8daa2c5497996114facdc805.
+     */
+    private static final Pattern STRIP_SESSION_ID_REGEX =
+    	 Pattern.compile("^(.+)(?:(?:(?:jsessionid)|(?:phpsessid))=" +
+    	                 "[0-9a-zA-Z]{32})(?:&(.*))?$",  
+    	                 Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Example: sid=9682993c8daa2c5497996114facdc805. 
+     * 'sid=' can be tricky but all sid= followed by 32 byte string
+     * so far seen have been session ids.  Sid is a 32 byte string
+     * like the BASE_PATTERN only 'sid' is the tail of 'phpsessid'
+     * so have to have it run after the phpsessid elimination.
+     */
+    private static final Pattern STRIP_SID_REGEX =
+        Pattern.compile("^(.+)" +
+                "(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", Pattern.CASE_INSENSITIVE);
+    
+    /**
+     * Example:ASPSESSIONIDAQBSDSRT=EOHBLBDDPFCLHKPGGKLILNAM.
+     */
+    private static final Pattern STRIP_ASPSESSION_REGEX =
+        Pattern.compile("^(.+)" +
+                "(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$",
+                    Pattern.CASE_INSENSITIVE);
+
+    /**
+     * Examples:
+     *
+     *        (.NET 2.0)
+     *        http://legislature.mi.gov/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx
+     *     => http://legislature.mi.gov/mileg.aspx
+     *
+     *		  (.NET 1.0/1.1)
+     *        http://legislature.mi.gov/(4hqa0555fwsecu455xqckv45)/mileg.aspx
+     *     => http://legislature.mi.gov/mileg.aspx
+     *     
+     *     For more info, see: 
+     *     	  http://msdn2.microsoft.com/en-us/library/aa479315.aspx
+     *     
+     */
+    private static final Pattern STRIP_ASPSESSION2_REGEX =
+    	Pattern.compile("^([^\\?]+/)" +
+    			"(?:\\((?:S\\(|)[0-9a-z]{24}\\)(?:\\)|)/)([^\\?]+\\.aspx.*)$",
+    			Pattern.CASE_INSENSITIVE);
+    
+    /**
+     * Examples:
+     *
+     *        (.NET 2.0)
+     *        http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
+     *     => http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=SessionSchedules
+     *
+     *     For more info, see: 
+     *     	  http://msdn2.microsoft.com/en-us/library/aa479315.aspx
+     *     
+     */   
+
+    private static final Pattern STRIP_ASPSESSION3_REGEX =
+    	Pattern.compile("^([^\\?]+/" +
+    			"\\((?:a\\([0-9a-z]{24}\\)))(?:S\\([0-9a-z]{24}\\))" +
+    			"((?:f\\([0-9a-z]{24}\\))\\)/[^\\?]+\\.aspx.*)$",
+    			Pattern.CASE_INSENSITIVE);
+    
+    /**
+     * Strip ColdFusion session IDs. Remove sessionids that look like the 
+     * following:
+     * CFID=12412453&CFTOKEN=15501799
+     * CFID=3304324&CFTOKEN=57491900&jsessionid=a63098d96360$B0$D9$A
+     */
+    private static final Pattern STRIP_CFSESSION_REGEX = 
+    	Pattern.compile("^(.+)(?:cfid=[^&]+&cftoken=[^&]+(?:jsession=[^&]+)?)" +
+    			"(?:&(.*))?$",Pattern.CASE_INSENSITIVE);
+        
+    /**
+     * Run a regex that strips elements of a string.
+     * 
+     * Assumes the regex has a form that wants to strip elements of the passed
+     * string.  Assumes that if a match, appending group 1
+     * and group 2 yields desired result.
+     * @param url Url to search in.
+     * @param matcher Matcher whose form yields a group 1 and group 2 if a
+     * match (non-null).
+     * @return Original <code>url</code> if no match, else concatenation of
+     * group 1 and group 2.
+     */
+    protected String doStripRegexMatch(String url, Matcher matcher) {
+        return (matcher != null && matcher.matches())?
+            checkForNull(matcher.group(1)) + checkForNull(matcher.group(2)):
+            url;
+    }
+
+    /**
+     * @param string String to check.
+     * @return <code>string</code> if non-null, else empty string ("").
+     */
+    private String checkForNull(String string) {
+        return (string != null)? string: "";
+    }
+    
+	/**
+	 * Return the canonical string key for the URL argument.
+	 * 
+	 * @param urlString URL to convert into a lookup key.
+	 * @return String lookup key for URL argument.
+	 * @throws URIException if the URL cannot be turned into a UURI.
+	 */
+	public String urlStringToKey(final String urlString) throws URIException {
+
+		String searchUrl = canonicalize(urlString);
+
+		// TODO: force https into http for the moment...
+		if(searchUrl.startsWith("https://")) {
+			searchUrl = searchUrl.substring(8);
+		}
+		
+		// TODO: this will only work with http:// scheme. should work with all?
+		// force add of scheme and possible add '/' with empty path:
+		if (searchUrl.startsWith("http://")) {
+			if (-1 == searchUrl.indexOf('/', 8)) {
+				searchUrl = searchUrl + "/";
+			}
+		} else {
+			if (-1 == searchUrl.indexOf("/")) {
+				searchUrl = searchUrl + "/";
+			}
+			searchUrl = "http://" + searchUrl;
+		}
+
+		// unescape anything that can be:
+		UURI tmpURI = UURIFactory.getInstance(searchUrl);
+		tmpURI.setPath(tmpURI.getPath());
+		
+		
+		// convert to UURI to perform required URI fixup:
+		UURI searchURI = UURIFactory.getInstance(tmpURI.getURI());
+
+
+		
+		
+		// replace ' ' with '+' (this is only to match Alexa's canonicalization)
+		String newPath = searchURI.getEscapedPath().replace("%20","+");
+//		String newPath = searchURI.getPath().replace(' ','+');
+		
+		// replace multiple consecutive '/'s in the path.
+		while(newPath.contains("//")) {
+			newPath = newPath.replace("//","/");
+		}
+		
+		// this would remove a trailing '/' character, unless the path is empty
+		// but we're not going to do this just yet..
+//		if((newPath.length() > 1) && newPath.endsWith("/")) {
+//			newPath = newPath.substring(0,newPath.length()-1);
+//		}
+//		searchURI.setEscapedPath(newPath);
+//		searchURI.setRawPath(newPath.toCharArray());
+//		String query = searchURI.getEscapedQuery();
+		
+		// TODO: handle non HTTP port stripping, too.
+//		String portStr = "";
+//		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
+//			portStr = ":" + searchURI.getPort();
+//		}
+//		return searchURI.getHostBasename() + portStr + 
+//		searchURI.getEscapedPathQuery();
+		
+		StringBuilder sb = new StringBuilder(searchUrl.length());
+		sb.append(searchURI.getHostBasename());
+		if(searchURI.getPort() != 80 && searchURI.getPort() != -1) {
+			sb.append(":").append(searchURI.getPort());
+		}
+		sb.append(newPath);
+		if(searchURI.getEscapedQuery() != null) {
+			sb.append("?").append(searchURI.getEscapedQuery());
+		}
+		
+
+		return sb.toString();
+	}
+
+	
+	/**
+	 * Idempotent operation that will determine the 'fuzziest'
+	 * form of the url argument. This operation is done prior to adding records
+	 * to the ResourceIndex, and prior to lookup. Current version is exactly
+	 * the default found in Heritrix. When the configuration system for
+	 * Heritrix stabilizes, hopefully this can use the system directly within
+	 * Heritrix.
+	 * 
+	 * @param url to be canonicalized.
+	 * @return canonicalized version of url argument.
+	 */
+	public String canonicalize(String url) {
+        url = doStripRegexMatch(url, STRIP_USERINFO_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_WWW_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_WWWN_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_SESSION_ID_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_ASPSESSION_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_ASPSESSION2_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_ASPSESSION3_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_SID_REGEX.matcher(url));
+        url = doStripRegexMatch(url, STRIP_CFSESSION_REGEX.matcher(url));
+        url = url.toLowerCase();
+        if (url == null || url.length() <= 0) {
+            return url;
+        }
+        
+        int index = url.lastIndexOf('?');
+        if (index > 0) {
+            if (index == (url.length() - 1)) {
+                // '?' is last char in url.  Strip it.
+                url = url.substring(0, url.length() - 1);
+            } else if (url.charAt(index + 1) == '&') {
+                // Next char is '&'. Strip it.
+                if (url.length() == (index + 2)) {
+                    // Then url ends with '?&'.  Strip them.
+                    url = url.substring(0, url.length() - 2);
+                } else {
+                    // The '&' is redundant.  Strip it.
+                    url = url.substring(0, index + 1) +
+                    url.substring(index + 2);
+                }
+            } else if (url.charAt(url.length() - 1) == '&') {
+                // If we have a lone '&' on end of query str,
+                // strip it.
+                url = url.substring(0, url.length() - 1);
+            }
+        }
+        return url;
+	}
+	
+	private static void USAGE() {
+		System.err.println("Usage: [-f FIELD] [-d DELIM]");
+		System.exit(3);
+	}
+	/**
+	 * @param args optional flags: -cdx, -f FIELD (repeatable), -d DELIM
+	 */
+	public static void main(String[] args) {
+		UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
+		int n = 0;
+		int i = 0;
+		ArrayList<Integer> columns = new ArrayList<Integer>();
+		
+		long lineNumber = 0;
+		boolean cdxPassThru = false;
+		String delimiter = " ";
+		while(n < args.length) {
+			String arg = args[n];
+			if(arg.compareTo("-cdx") == 0) {
+				cdxPassThru = true;
+				n++;
+				continue;
+			}
+			if(n == (args.length -1)) {
+				USAGE();
+			}
+			String val = args[n+1];
+			if(arg.compareTo("-f") == 0) {
+				columns.add(new Integer(val));
+			} else if(arg.compareTo("-d") == 0) {
+				delimiter = val;
+			} else {
+				USAGE();
+			}
+			n += 2;
+		}
+		// default to field 1 (zero-based column 0) in case none specified:
+		if(columns.size() == 0) {
+			columns.add(new Integer(1));
+		}
+		
+		// convert to int[]:
+		int[] cols = new int[columns.size()];
+		for(int idx = 0; idx < columns.size(); idx++) {
+			cols[idx] = columns.get(idx).intValue() - 1;
+		}
+		BufferedReader r = new BufferedReader(new InputStreamReader(System.in));
+		StringBuilder sb = new StringBuilder();
+		String line = null;
+		
+		while(true) {
+			try {
+				line = r.readLine();
+			} catch (IOException e) {
+				e.printStackTrace();
+				System.exit(1);
+			}
+			if(line == null) {
+				break;
+			}
+			lineNumber++;
+			if(cdxPassThru && line.startsWith(CDX_PREFIX)) {
+				System.out.println(line);
+				continue;
+			}
+			String parts[] = line.split(delimiter);
+			for(int column : cols) {
+				if(column >= parts.length) {
+					System.err.println("Invalid line " + lineNumber + " (" +
+							line + ") skipped");
+				} else {
+					try {
+						parts[column] = canonicalizer.urlStringToKey(parts[column]);
+					} catch (URIException e) {
+						System.err.println("Invalid URL in line " + lineNumber + " (" +
+								line + ") skipped (" + parts[column] + ")");
+						e.printStackTrace();
+						continue;
+					}
+				}
+			}
+			sb.setLength(0);
+			for(i = 0; i < parts.length; i++) {
+				sb.append(parts[i]);
+				if(i < (parts.length-1)) {
+					sb.append(delimiter);
+				}
+			}
+			System.out.println(sb.toString());
+		}
+	}
+}
\ No newline at end of file


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




2005	Jan	Feb	Mar	Apr	May	Jun	Jul (1)	Aug (10)	Sep (36)	Oct (339)	Nov (103)	Dec (152)
2006	Jan (141)	Feb (102)	Mar (125)	Apr (203)	May (57)	Jun (30)	Jul (139)	Aug (46)	Sep (64)	Oct (105)	Nov (34)	Dec (162)
2007	Jan (81)	Feb (57)	Mar (141)	Apr (72)	May (9)	Jun (1)	Jul (144)	Aug (88)	Sep (40)	Oct (43)	Nov (34)	Dec (20)
2008	Jan (44)	Feb (45)	Mar (16)	Apr (36)	May (8)	Jun (77)	Jul (177)	Aug (66)	Sep (8)	Oct (33)	Nov (13)	Dec (37)
2009	Jan (2)	Feb (5)	Mar (8)	Apr	May (36)	Jun (19)	Jul (46)	Aug (8)	Sep (1)	Oct (66)	Nov (61)	Dec (10)
2010	Jan (13)	Feb (16)	Mar (38)	Apr (76)	May (47)	Jun (32)	Jul (35)	Aug (45)	Sep (20)	Oct (61)	Nov (24)	Dec (16)
2011	Jan (22)	Feb (34)	Mar (11)	Apr (8)	May (24)	Jun (23)	Jul (11)	Aug (42)	Sep (81)	Oct (48)	Nov (21)	Dec (20)
2012	Jan (30)	Feb (25)	Mar (4)	Apr (6)	May (1)	Jun (5)	Jul (5)	Aug (8)	Sep (6)	Oct (6)	Nov	Dec

archive-access-cvs Mailing List for Web Archive Access Utilities (Page 68)

archive-access-cvs — CVS commits