From: <bra...@us...> - 2010-04-23 23:35:19
|
Revision: 3051 http://archive-access.svn.sourceforge.net/archive-access/?rev=3051&view=rev Author: bradtofel Date: 2010-04-23 23:35:12 +0000 (Fri, 23 Apr 2010) Log Message: ----------- BUGFIX: fixed(hopefully) problem when extracting hostname from URLs containing userinfo FEATURE: added urlToUserInfo() Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-04-15 00:23:54 UTC (rev 3050) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-04-23 23:35:12 UTC (rev 3051) @@ -24,6 +24,8 @@ */ package org.archive.wayback.util.url; +import java.net.MalformedURLException; +import java.net.URL; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -97,6 +99,10 @@ private static final Pattern AUTHORITY_REGEX_SIMPLE = Pattern.compile("([0-9a-z_.-]++)"); + private static final Pattern HOST_REGEX_SIMPLE = + Pattern.compile("(?:[0-9a-z_.:-]+@)?([0-9a-z_.-]++)"); + private static final Pattern USERINFO_REGEX_SIMPLE = + Pattern.compile("([0-9a-z_.:-]+)(?:@[0-9a-z_.-]++)"); /** * @param urlPart @@ -184,23 +190,44 @@ return url.substring(pathIdx); } } - + public static String urlToHost(String url) { - if(url.startsWith("dns:")) { - return url.substring(4); + String lcUrl = url.toLowerCase(); + if(lcUrl.startsWith("dns:")) { + return lcUrl.substring(4); } for(String scheme : ALL_SCHEMES) { - if(url.startsWith(scheme)) { - int hostIdx = scheme.length(); + if(lcUrl.startsWith(scheme)) { + int 
authorityIdx = scheme.length(); - Matcher m = AUTHORITY_REGEX_SIMPLE.matcher(url.substring(hostIdx)); + Matcher m = + HOST_REGEX_SIMPLE.matcher(lcUrl.substring(authorityIdx)); if(m.find()) { - return m.group(0); + return m.group(1); } } } return url; } + + public static String urlToUserInfo(String url) { + String lcUrl = url.toLowerCase(); + if(lcUrl.startsWith("dns:")) { + return null; + } + for(String scheme : ALL_SCHEMES) { + if(lcUrl.startsWith(scheme)) { + int authorityIdx = scheme.length(); + + Matcher m = + USERINFO_REGEX_SIMPLE.matcher(lcUrl.substring(authorityIdx)); + if(m.find()) { + return m.group(1); + } + } + } + return null; + } /** * Find and return the parent directory of the URL argument Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-04-15 00:23:54 UTC (rev 3050) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-04-23 23:35:12 UTC (rev 3051) @@ -35,7 +35,14 @@ assertEquals("foo.com",UrlOperations.urlToHost("http://foo.com")); assertEquals("foo.com",UrlOperations.urlToHost("https://foo.com")); assertEquals("foo.com",UrlOperations.urlToHost("ftp://foo.com")); - + + assertEquals("www.google.com",UrlOperations.urlToHost("http://www.GOOGLE.COM")); + assertEquals("google.com",UrlOperations.urlToHost("http://GOOGLE.COM/")); + assertEquals("google.com",UrlOperations.urlToHost("http://GOOGLE.COM")); + assertEquals("google.com",UrlOperations.urlToHost("http://GOOGLE.COM:80")); + assertEquals("google.com",UrlOperations.urlToHost("http://GOOGLE.COM:80/")); + assertEquals("google.com",UrlOperations.urlToHost("http://GOOGLE.COM:80/foo")); + assertEquals("foo.com",UrlOperations.urlToHost("http://foo.com/")); 
assertEquals("foo.com",UrlOperations.urlToHost("https://foo.com/")); assertEquals("foo.com",UrlOperations.urlToHost("ftp://foo.com/")); @@ -66,8 +73,34 @@ assertEquals("foo.com",UrlOperations.urlToHost("ftp://foo.com\\")); assertEquals("www.foo.com",UrlOperations.urlToHost("http://www.foo.com\\")); assertEquals("www.foo.com",UrlOperations.urlToHost("http://www.foo.com:80\\")); + + + assertEquals("foo.com",UrlOperations.urlToHost("http://user@foo.com")); + assertEquals("www.foo.com",UrlOperations.urlToHost("http://user@www.foo.com")); + assertEquals("www.foo.com",UrlOperations.urlToHost("http://user:pass@www.foo.com")); + + assertEquals("www.foo.com",UrlOperations.urlToHost("http://user:pass@www.foo.com/")); + assertEquals("www.foo.com",UrlOperations.urlToHost("http://user:pass@www.foo.com/boo@foo")); } + public void testUrlToUserInfo() { + assertEquals(null,UrlOperations.urlToUserInfo("dns:foo.com")); + assertEquals(null,UrlOperations.urlToUserInfo("http://foo.com")); + assertEquals(null,UrlOperations.urlToUserInfo("https://foo.com")); + assertEquals(null,UrlOperations.urlToUserInfo("ftp://foo.com")); + assertEquals(null,UrlOperations.urlToUserInfo("ftp://foo.com/")); + assertEquals(null,UrlOperations.urlToUserInfo("http://foo.com:80/")); + assertEquals(null,UrlOperations.urlToUserInfo("http://foo.com:80")); + assertEquals(null,UrlOperations.urlToUserInfo("http://www.foo.com:80\\")); + + assertEquals("user",UrlOperations.urlToUserInfo("http://user@foo.com")); + assertEquals("user",UrlOperations.urlToUserInfo("http://user@www.foo.com")); + assertEquals("user:pass",UrlOperations.urlToUserInfo("http://user:pass@www.foo.com")); + assertEquals("user:pass",UrlOperations.urlToUserInfo("http://user:pass@www.foo.com:8080")); + assertEquals("user:pass",UrlOperations.urlToUserInfo("http://user:pass@www.foo.com:8080/boo@arb")); + assertEquals("www.foo.com",UrlOperations.urlToHost("http://user:pass@www.foo.com/")); + assertEquals("www.foo.com",UrlOperations.urlToHost("http://user:pass@www.foo.com/boo@foo")); + } public void 
testResolveUrl() { for(String scheme : UrlOperations.ALL_SCHEMES) { @@ -83,9 +116,7 @@ assertEquals(scheme + "a.org/1/2", UrlOperations.resolveUrl(scheme + "a.org/3","1/2")); - } - } public void testUrlToScheme() { assertEquals("http://",UrlOperations.urlToScheme("http://a.com/")); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |