From: <bra...@us...> - 2007-07-16 22:37:30
|
Revision: 1775 http://archive-access.svn.sourceforge.net/archive-access/?rev=1775&view=rev Author: bradtofel Date: 2007-07-16 15:37:32 -0700 (Mon, 16 Jul 2007) Log Message: ----------- REFACTOR: moved url resolving code here from various other occurrences Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2007-07-16 22:36:05 UTC (rev 1774) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2007-07-16 22:37:32 UTC (rev 1775) @@ -303,6 +303,28 @@ return (m != null) && m.matches(); } + + /** + * @param baseUrl + * @param url + * @return url resolved against baseUrl, unless it is absolute already + */ + public static String resolveUrl(String baseUrl, String url) { + // TODO: this only works for http:// + if(url.startsWith("http://")) { + return url; + } + UURI absBaseURI; + UURI resolvedURI = null; + try { + absBaseURI = UURIFactory.getInstance(baseUrl); + resolvedURI = UURIFactory.getInstance(absBaseURI, url); + } catch (URIException e) { + e.printStackTrace(); + return url; + } + return resolvedURI.getEscapedURI(); + } private static void USAGE() { System.err.println("Usage: [-f FIELD] [-d DELIM]"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-08-23 21:29:23
|
Revision: 1924 http://archive-access.svn.sourceforge.net/archive-access/?rev=1924&view=rev Author: bradtofel Date: 2007-08-23 14:29:23 -0700 (Thu, 23 Aug 2007) Log Message: ----------- BUGFIX: (unreported) makes https urls work as though they were http. still not the right long-term solution.. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2007-08-23 21:27:07 UTC (rev 1923) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2007-08-23 21:29:23 UTC (rev 1924) @@ -198,11 +198,15 @@ * @return String lookup key for URL argument. * @throws URIException */ - public String urlStringToKey(final String urlString) - throws URIException { + public String urlStringToKey(final String urlString) throws URIException { String searchUrl = canonicalize(urlString); + // TODO: force https into http for the moment... + if(searchUrl.startsWith("https://")) { + searchUrl = searchUrl.substring(8); + } + // TODO: this will only work with http:// scheme. should work with all? // force add of scheme and possible add '/' with empty path: if (searchUrl.startsWith("http://")) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2007-12-11 02:26:13
|
Revision: 2102 http://archive-access.svn.sourceforge.net/archive-access/?rev=2102&view=rev Author: bradtofel Date: 2007-12-10 18:26:18 -0800 (Mon, 10 Dec 2007) Log Message: ----------- FEATURE: Command line main() now accepts multiple fields to canonicalize in a single pass. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2007-12-11 02:25:10 UTC (rev 2101) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2007-12-11 02:26:18 UTC (rev 2102) @@ -27,6 +27,7 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; +import java.util.ArrayList; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -341,7 +342,8 @@ UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); int n = 0; int i = 0; - int column = 0; + ArrayList<Integer> columns = new ArrayList<Integer>(); + long lineNumber = 0; boolean cdxPassThru = false; String delimiter = " "; @@ -357,7 +359,7 @@ } String val = args[n+1]; if(arg.compareTo("-f") == 0) { - column = Integer.parseInt(val) - 1; + columns.add(new Integer(val)); } else if(arg.compareTo("-d") == 0) { delimiter = val; } else { @@ -365,9 +367,20 @@ } n += 2; } + // place default '0' in case none specified: + if(columns.size() == 0) { + columns.add(new Integer(1)); + } + + // convert to int[]: + int[] cols = new int[columns.size()]; + for(int idx = 0; idx < columns.size(); idx++) { + cols[idx] = columns.get(idx).intValue() - 1; + } BufferedReader r = new BufferedReader(new InputStreamReader(System.in)); StringBuilder sb = new StringBuilder(); 
String line = null; + while(true) { try { line = r.readLine(); @@ -384,27 +397,29 @@ continue; } String parts[] = line.split(delimiter); - if(column >= parts.length) { - System.err.println("Invalid line " + lineNumber + " (" + - line + ") skipped"); - } else { - try { - parts[column] = canonicalizer.urlStringToKey(parts[column]); - } catch (URIException e) { - System.err.println("Invalid URL in line " + lineNumber + " (" + - line + ") skipped"); - e.printStackTrace(); - continue; - } - sb.setLength(0); - for(i = 0; i < parts.length; i++) { - sb.append(parts[i]); - if(i < (parts.length-1)) { - sb.append(delimiter); + for(int column : cols) { + if(column >= parts.length) { + System.err.println("Invalid line " + lineNumber + " (" + + line + ") skipped"); + } else { + try { + parts[column] = canonicalizer.urlStringToKey(parts[column]); + } catch (URIException e) { + System.err.println("Invalid URL in line " + lineNumber + " (" + + line + ") skipped (" + parts[column] + ")"); + e.printStackTrace(); + continue; } } - System.out.println(sb.toString()); } + sb.setLength(0); + for(i = 0; i < parts.length; i++) { + sb.append(parts[i]); + if(i < (parts.length-1)) { + sb.append(delimiter); + } + } + System.out.println(sb.toString()); } } } \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-01-15 01:43:31
|
Revision: 2128 http://archive-access.svn.sourceforge.net/archive-access/?rev=2128&view=rev Author: bradtofel Date: 2008-01-14 17:43:29 -0800 (Mon, 14 Jan 2008) Log Message: ----------- REFACTOR: removed isAuthority() and resolveUrl() Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2008-01-15 01:41:32 UTC (rev 2127) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/UrlCanonicalizer.java 2008-01-15 01:43:29 UTC (rev 2128) @@ -44,34 +44,8 @@ * @version $Date$, $Revision$ */ public class UrlCanonicalizer { - - private static final String CC_TLDS = "ac|ad|ae|af|ag|ai|al|am|an|ao|aq" + - "|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs" + - "|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cx" + - "|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo" + - "|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk" + - "|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg" + - "|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma" + - "|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz" + - "|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm" + - "|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj" + - "|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn" + - "|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu" + - "|wf|ws|ye|yt|yu|za|zm|zw"; - - private static final String GEN_TLDS = "aero|biz|cat|com|coop|edu|gov" + - "|info|int|jobs|mil|mobi|museum|name|net|org|pro|travel"; - - - private static final 
String ALL_TLD_PATTERN = CC_TLDS + "|" + GEN_TLDS; - private static final String IP_PATTERN = "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+"; - private static final Pattern AUTHORITY_REGEX = - Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLD_PATTERN + "))|" + - "(" + IP_PATTERN + ")"); - - private static final String CDX_PREFIX = " CDX "; /** * Strip leading 'www.' @@ -323,38 +297,6 @@ return url; } - /** - * @param urlPart - * @return boolean indicating whether urlPart might be an Authority. - */ - public boolean isAuthority(String urlPart) { - Matcher m = AUTHORITY_REGEX.matcher(urlPart); - - return (m != null) && m.matches(); - } - - /** - * @param baseUrl - * @param url - * @return url resolved against baseUrl, unless it is absolute already - */ - public static String resolveUrl(String baseUrl, String url) { - // TODO: this only works for http:// - if(url.startsWith("http://")) { - return url; - } - UURI absBaseURI; - UURI resolvedURI = null; - try { - absBaseURI = UURIFactory.getInstance(baseUrl); - resolvedURI = UURIFactory.getInstance(absBaseURI, url); - } catch (URIException e) { - e.printStackTrace(); - return url; - } - return resolvedURI.getEscapedURI(); - } - private static void USAGE() { System.err.println("Usage: [-f FIELD] [-d DELIM]"); System.exit(3); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |