|
From: <bra...@us...> - 2010-05-13 18:34:44
|
Revision: 3090
http://archive-access.svn.sourceforge.net/archive-access/?rev=3090&view=rev
Author: bradtofel
Date: 2010-05-13 18:34:37 +0000 (Thu, 13 May 2010)
Log Message:
-----------
FEATURE: added new method stripDefaultPortFromUrl() and tests for that
TWEAK: Updated TLD list
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java
trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-05-07 23:11:24 UTC (rev 3089)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-05-13 18:34:37 UTC (rev 3090)
@@ -101,31 +101,29 @@
*/
public final static char PATH_START = '/';
-
- private static final String CC_TLDS = "ac|ad|ae|af|ag|ai|al|am|an|ao|aq" +
- "|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs" +
- "|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cx" +
- "|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo" +
- "|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk" +
- "|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg" +
- "|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma" +
- "|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz" +
- "|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm" +
- "|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj" +
- "|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn" +
- "|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu" +
- "|wf|ws|ye|yt|yu|za|zm|zw";
-
- private static final String GEN_TLDS = "aero|biz|cat|com|coop|edu|gov" +
- "|info|int|jobs|mil|mobi|museum|name|net|org|pro|travel";
-
-
- private static final String ALL_TLD_PATTERN = CC_TLDS + "|" + GEN_TLDS;
+ private static final String ALL_TLDS = "ac|ad|ae|aero|af|ag|ai|al|am|an" +
+ "|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi" +
+ "|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci" +
+ "|ck|cl|cm|cn|co|com|coop|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec" +
+ "|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh" +
+ "|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id" +
+ "|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh" +
+ "|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc" +
+ "|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum" +
+ "|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz" +
+ "|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro" +
+ "|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv" +
+ "|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv" +
+ "|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|xn--0zwm56d" +
+ "|xn--11b5bs3a9aj6g|xn--80akhbyknj4f|xn--9t4b11yi5a|xn--deba0ad" +
+ "|xn--g6w251d|xn--hgbk6aj7f53bba|xn--hlcj6aya9esc7a|xn--jxalpdlp" +
+ "|xn--kgbechtv|xn--mgbaam7a8h|xn--mgberp4a5d4ar|xn--p1ai" +
+ "|xn--wgbh1c|xn--zckzah|ye|yt|za|zm|zw";
private static final String IP_PATTERN = "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+";
private static final Pattern AUTHORITY_REGEX =
- Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLD_PATTERN + "))|" +
+ Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLDS + "))|" +
"(" + IP_PATTERN + ")");
// private static final Pattern AUTHORITY_REGEX_SIMPLE =
@@ -244,7 +242,48 @@
return url.substring(pathIdx);
}
}
+
+ /**
+ * Strip an explicit default port (for the URL's scheme) from a URL string.
+ * @param url the original URL, possibly including a port
+ * @return url without ":port" when the scheme is recognized and its default
+ * port ends the URL or precedes the first path slash; otherwise url as given.
+ */
+ public static String stripDefaultPortFromUrl(String url) {
+ String scheme = urlToScheme(url);
+ if(scheme == null) {
+ return url;
+ }
+ int defaultPort = schemeToDefaultPort(scheme);
+ if(defaultPort == -1) {
+ return url;
+ }
+ String portStr = null;
+ // first '/' at or after index scheme.length(), or -1 when there is none:
+ int slashIdx = url.indexOf('/', scheme.length());
+ if(slashIdx == -1) {
+ portStr = String.format(":%d", defaultPort);
+ if(url.endsWith(portStr)) {
+ return url.substring(0,url.length() - portStr.length());
+ }
+ }
+ portStr = String.format(":%d/", defaultPort);
+ int idx = url.indexOf(portStr);
+ if(idx == -1) {
+ return url;
+ }
+ // a match located after that first '/' is inside the path: leave url alone
+ if(slashIdx < idx) {
+ return url;
+ }
+ // cut ":port" out of the URL but keep the '/' that portStr ends with:
+ StringBuilder sb = new StringBuilder(url.length());
+ sb.append(url.substring(0,idx));
+ sb.append(url.substring(idx + (portStr.length()-1)));
+ return sb.toString();
+ }
+
/**
* Attempt to extract the hostname component of an absolute URL argument.
* @param url the url String from which to extract the hostname
Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-05-07 23:11:24 UTC (rev 3089)
+++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-05-13 18:34:37 UTC (rev 3090)
@@ -161,4 +161,35 @@
}
+
+ public void testStripDefaultPort() {
+ assertSDP("http://foo.com/","http://foo.com/");
+ assertSDP("http://foo.com","http://foo.com");
+ assertSDP("http://foo.com","http://foo.com:80");
+ assertSDP("foo.com:80/","foo.com:80/");
+ assertSDP("http://foo.com:8080/","http://foo.com:8080/");
+ assertSDP("http://foo.com:8081/","http://foo.com:8081/");
+ assertSDP("https://foo.com:8081/","https://foo.com:8081/");
+ assertSDP("https://foo.com/","https://foo.com:443/");
+ assertSDP("https://foo.com","https://foo.com:443");
+ assertSDP("ftp://foo.com/","ftp://foo.com/");
+ assertSDP("ftp://foo.com","ftp://foo.com");
+ assertSDP("ftp://foo.com:1234","ftp://foo.com:1234");
+ assertSDP("ftp://foo.com","ftp://foo.com:21");
+ assertSDP("ftp://foo.com/","ftp://foo.com:21/");
+ assertSDP("ftp://foo.com/bla","ftp://foo.com:21/bla");
+ assertSDP("s3://foo.com/","s3://foo.com/");
+ assertSDP("s3://foo.com/bar","s3://foo.com/bar");
+ assertSDP("s3://foo.com:80/bar","s3://foo.com:80/bar");
+ assertSDP("http://b@foo.com/bar","http://b@foo.com:80/bar");
+ assertSDP("http://b@foo.com/bar","http://b@foo.com/bar");
+ assertSDP("http://b:80@foo.com/bar","http://b:80@foo.com/bar");
+ assertSDP("http://b:80@foo.com/bar","http://b:80@foo.com:80/bar");
+ assertSDP("http://b:80@foo.com:8080/ba","http://b:80@foo.com:8080/ba");
+ }
+ private void assertSDP(String want, String orig) {
+ String got = UrlOperations.stripDefaultPortFromUrl(orig);
+ assertEquals(want,got);
+ }
+
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|