From: <bra...@us...> - 2008-01-15 01:39:59
|
Revision: 2126
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2126&view=rev
Author:   bradtofel
Date:     2008-01-14 17:39:58 -0800 (Mon, 14 Jan 2008)

Log Message:
-----------
REFACTOR: moved isAuthority() and resolveUrl() from the generic UrlCanonicalizer class, in preparation for making UrlCanonicalizers configurable.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java	(rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java	2008-01-15 01:39:58 UTC (rev 2126)
@@ -0,0 +1,76 @@
+package org.archive.wayback.util.url;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.httpclient.URIException;
+import org.archive.net.UURI;
+import org.archive.net.UURIFactory;
+
+/**
+ * Class containing common static URL methods. Primarily resolveUrl() and
+ * the (currently) unused isAuthority().
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class UrlOperations {
+
+	private static final String CC_TLDS = "ac|ad|ae|af|ag|ai|al|am|an|ao|aq" +
+			"|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs" +
+			"|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cx" +
+			"|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo" +
+			"|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk" +
+			"|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg" +
+			"|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma" +
+			"|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz" +
+			"|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm" +
+			"|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj" +
+			"|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn" +
+			"|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu" +
+			"|wf|ws|ye|yt|yu|za|zm|zw";
+
+	private static final String GEN_TLDS = "aero|biz|cat|com|coop|edu|gov" +
+			"|info|int|jobs|mil|mobi|museum|name|net|org|pro|travel";
+
+
+	private static final String ALL_TLD_PATTERN = CC_TLDS + "|" + GEN_TLDS;
+
+	private static final String IP_PATTERN = "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+";
+
+	private static final Pattern AUTHORITY_REGEX =
+		Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLD_PATTERN + "))|" +
+				"(" + IP_PATTERN + ")");
+
+	/**
+	 * @param urlPart
+	 * @return boolean indicating whether urlPart might be an Authority.
+	 */
+	public static boolean isAuthority(String urlPart) {
+		Matcher m = AUTHORITY_REGEX.matcher(urlPart);
+
+		return (m != null) && m.matches();
+	}
+
+	/**
+	 * @param baseUrl
+	 * @param url
+	 * @return url resolved against baseUrl, unless it is absolute already
+	 */
+	public static String resolveUrl(String baseUrl, String url) {
+		// TODO: this only works for http://
+		if(url.startsWith("http://")) {
+			return url;
+		}
+		UURI absBaseURI;
+		UURI resolvedURI = null;
+		try {
+			absBaseURI = UURIFactory.getInstance(baseUrl);
+			resolvedURI = UURIFactory.getInstance(absBaseURI, url);
+		} catch (URIException e) {
+			e.printStackTrace();
+			return url;
+		}
+		return resolvedURI.getEscapedURI();
+	}
+}

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|