From: <bra...@us...> - 2008-11-07 22:35:28
|
Revision: 2638 http://archive-access.svn.sourceforge.net/archive-access/?rev=2638&view=rev Author: bradtofel Date: 2008-11-07 22:35:24 +0000 (Fri, 07 Nov 2008) Log Message: ----------- FEATURE: Now supports canonicalization of some non-http:// schemes. TWEAK: removed unused commented out code Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2008-11-07 22:34:00 UTC (rev 2637) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2008-11-07 22:35:24 UTC (rev 2638) @@ -206,25 +206,32 @@ return urlString; } String searchUrl = canonicalize(urlString); - - // TODO: force https into http for the moment... - if(searchUrl.startsWith("https://")) { - searchUrl = searchUrl.substring(8); + String scheme = UrlOperations.urlToScheme(searchUrl); + if(scheme != null) { + searchUrl = searchUrl.substring(scheme.length()); + } else { + scheme = UrlOperations.HTTP_SCHEME; } - - // TODO: this will only work with http:// scheme. should work with all? 
- // force add of scheme and possible add '/' with empty path: - if (searchUrl.startsWith("http://")) { - if (-1 == searchUrl.indexOf('/', 8)) { - searchUrl = searchUrl + "/"; - } + + if (-1 == searchUrl.indexOf("/")) { + searchUrl = scheme + searchUrl + "/"; } else { - if (-1 == searchUrl.indexOf("/")) { - searchUrl = searchUrl + "/"; - } - searchUrl = "http://" + searchUrl; + searchUrl = scheme + searchUrl; } + // TODO: this will only work with http:// scheme. should work with all? + // force add of scheme and possible add '/' with empty path: +// if (searchUrl.startsWith("http://")) { +// if (-1 == searchUrl.indexOf('/', 8)) { +// searchUrl = searchUrl + "/"; +// } +// } else { +// if (-1 == searchUrl.indexOf("/")) { +// searchUrl = searchUrl + "/"; +// } +// searchUrl = "http://" + searchUrl; +// } + // TODO: These next few lines look crazy -- need to be reworked.. This // was the only easy way I could find to get the correct unescaping // out of UURIs, possible a bug. Definitely needs some TLC in any case, @@ -250,23 +257,18 @@ // if((newPath.length() > 1) && newPath.endsWith("/")) { // newPath = newPath.substring(0,newPath.length()-1); // } -// searchURI.setEscapedPath(newPath); -// searchURI.setRawPath(newPath.toCharArray()); -// String query = searchURI.getEscapedQuery(); - // TODO: handle non HTTP port stripping, too. 
-// String portStr = ""; -// if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { -// portStr = ":" + searchURI.getPort(); -// } -// return searchURI.getHostBasename() + portStr + -// searchURI.getEscapedPathQuery(); - StringBuilder sb = new StringBuilder(searchUrl.length()); sb.append(searchURI.getHostBasename()); - if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { + + // omit port if scheme default: + int defaultSchemePort = UrlOperations.schemeToDefaultPort(scheme); + if(searchURI.getPort() != defaultSchemePort + && searchURI.getPort() != -1) { + sb.append(":").append(searchURI.getPort()); } + sb.append(newPath); if(searchURI.getEscapedQuery() != null) { sb.append("?").append(searchURI.getEscapedQuery()); Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2008-11-07 22:34:00 UTC (rev 2637) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2008-11-07 22:35:24 UTC (rev 2638) @@ -45,16 +45,15 @@ // simple strip of http:// checkCanonicalization("http://foo.com/","foo.com/"); -// would be nice to handle other protocols... -// // simple strip of https:// -// checkCanonicalization("https://foo.com/","foo.com/"); -// -// // simple strip of ftp:// -// checkCanonicalization("ftp://foo.com/","foo.com/"); -// -// // simple strip of rtsp:// -// checkCanonicalization("rtsp://foo.com/","foo.com/"); + // simple strip of https:// + checkCanonicalization("https://foo.com/","foo.com/"); + // simple strip of ftp:// + checkCanonicalization("ftp://foo.com/","foo.com/"); + + // simple strip of rtsp:// + checkCanonicalization("rtsp://foo.com/","foo.com/"); + // strip leading 'www.' 
checkCanonicalization("http://www.foo.com/","foo.com/"); @@ -63,6 +62,9 @@ // strip leading 'www##.' checkCanonicalization("http://www12.foo.com/","foo.com/"); + + // strip leading 'www##.' with https + checkCanonicalization("https://www12.foo.com/","foo.com/"); // strip leading 'www##.' with no protocol checkCanonicalization("www12.foo.com/","foo.com/"); @@ -174,13 +176,53 @@ checkCanonicalization( "http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules", "legislature.mi.gov/mileg.aspx?page=sessionschedules"); + + + + + // default port stripping: + // FIRST the easy-on-the-eyes + // strip port 80 checkCanonicalization("http://www.chub.org:80/foo","chub.org/foo"); // but not other ports... checkCanonicalization("http://www.chub.org:8080/foo","chub.org:8080/foo"); + + // but not other ports... with "www#." massage + checkCanonicalization("http://www232.chub.org:8080/foo","chub.org:8080/foo"); + // default HTTP (:80) stripping without a scheme: + checkCanonicalization("www.chub.org:80/foo","chub.org/foo"); + + // no strip https port (443) without scheme: + checkCanonicalization("www.chub.org:443/foo","chub.org:443/foo"); + + // yes strip https port (443) with scheme: + checkCanonicalization("https://www.chub.org:443/foo","chub.org/foo"); + + // NEXT the exhaustive: + String origHost = "www.chub.org"; + String massagedHost = "chub.org"; + String path = "/foo"; + for(String scheme : UrlOperations.ALL_SCHEMES) { + + int defaultPort = UrlOperations.schemeToDefaultPort(scheme); + int nonDefaultPort = 19991; + + String origDefault = scheme + origHost + ":" + defaultPort + path; + String canonDefault = massagedHost + path; + + String origNonDefault = + scheme + origHost + ":" + nonDefaultPort + path; + String canonNonDefault = + massagedHost + ":" + nonDefaultPort + path; + + checkCanonicalization(origDefault,canonDefault); + checkCanonicalization(origNonDefault,canonNonDefault); + } + } 
private void checkCanonicalization(String orig, String want) {

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.