From: <bra...@us...> - 2008-09-04 21:35:59
|
Revision: 2590 http://archive-access.svn.sourceforge.net/archive-access/?rev=2590&view=rev Author: bradtofel Date: 2008-09-04 21:36:09 +0000 (Thu, 04 Sep 2008) Log Message: ----------- BUGFIX(ACC-31): now escapes URLs as they are resolved in UrlOperations. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2008-09-02 23:26:08 UTC (rev 2589) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2008-09-04 21:36:09 UTC (rev 2590) @@ -83,7 +83,14 @@ public static String resolveUrl(String baseUrl, String url) { // TODO: this only works for http:// if(url.startsWith("http://")) { - return url; + try { + return UURIFactory.getInstance(url).getEscapedURI(); + } catch (URIException e) { + e.printStackTrace(); + // can't let a space exist... send back close to whatever came + // in... + return url.replace(" ", "%20"); + } } UURI absBaseURI; UURI resolvedURI = null; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-15 22:51:36
|
Revision: 2809 http://archive-access.svn.sourceforge.net/archive-access/?rev=2809&view=rev Author: bradtofel Date: 2009-10-15 22:51:23 +0000 (Thu, 15 Oct 2009) Log Message: ----------- REFACTOR: moved parsing of path from a URL String here... it should be further refactored into URL, or UURI... Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2009-10-15 22:44:10 UTC (rev 2808) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2009-10-15 22:51:23 UTC (rev 2809) @@ -22,6 +22,9 @@ public final static String FTP_SCHEME = "ftp://"; public final static String MMS_SCHEME = "mms://"; public final static String RTSP_SCHEME = "rtsp://"; + + public final static String DEFAULT_SCHEME = HTTP_SCHEME; + // go brewster public final static String WAIS_SCHEME = "wais://"; @@ -132,6 +135,25 @@ } return -1; } + + public static String getURLPath(String url) { + int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); + int pathIdx = url.indexOf(UrlOperations.PATH_START); + if(portIdx == -1 && pathIdx == -1) { + return ""; + } + if(portIdx == -1) { + return url.substring(pathIdx); + } + if(pathIdx == -1) { + return url.substring(portIdx); + } + if(pathIdx > portIdx) { + return url.substring(portIdx); + } else { + return url.substring(pathIdx); + } + } public static String urlToHost(String url) { if(url.startsWith("dns:")) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-03-20 01:05:45
|
Revision: 2988 http://archive-access.svn.sourceforge.net/archive-access/?rev=2988&view=rev Author: bradtofel Date: 2010-03-20 01:05:39 +0000 (Sat, 20 Mar 2010) Log Message: ----------- BUGFIX(unreported): replaced URL to host processing with REGEX, to better handle URLs with freakish illegal characters before the port/path start. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-03-20 01:02:49 UTC (rev 2987) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-03-20 01:05:39 UTC (rev 2988) @@ -92,6 +92,9 @@ Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLD_PATTERN + "))|" + "(" + IP_PATTERN + ")"); + private static final Pattern AUTHORITY_REGEX_SIMPLE = + Pattern.compile("([0-9a-z_.-]++)"); + /** * @param urlPart * @return boolean indicating whether urlPart might be an Authority. @@ -186,22 +189,11 @@ for(String scheme : ALL_SCHEMES) { if(url.startsWith(scheme)) { int hostIdx = scheme.length(); - int portIdx = url.indexOf(PORT_SEPARATOR, hostIdx + 1); - int pathIdx = url.indexOf(PATH_START, hostIdx + 1); - if(portIdx == -1 && pathIdx == -1) { - return url.substring(hostIdx); + + Matcher m = AUTHORITY_REGEX_SIMPLE.matcher(url.substring(hostIdx)); + if(m.find()) { + return m.group(0); } - if(portIdx == -1) { - return url.substring(hostIdx,pathIdx); - } - if(pathIdx == -1) { - return url.substring(hostIdx,portIdx); - } - if(pathIdx > portIdx) { - return url.substring(hostIdx,portIdx); - } else { - return url.substring(hostIdx,pathIdx); - } } } return url; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-04-27 20:52:26
|
Revision: 3068 http://archive-access.svn.sourceforge.net/archive-access/?rev=3068&view=rev Author: bradtofel Date: 2010-04-27 20:52:20 +0000 (Tue, 27 Apr 2010) Log Message: ----------- JAVADOC Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-04-27 20:51:27 UTC (rev 3067) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-04-27 20:52:20 UTC (rev 3068) @@ -24,8 +24,6 @@ */ package org.archive.wayback.util.url; -import java.net.MalformedURLException; -import java.net.URL; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -45,18 +43,44 @@ private static final Logger LOGGER = Logger.getLogger( UrlOperations.class.getName()); + /** + * ARC/WARC specific DNS resolution record. + */ public final static String DNS_SCHEME = "dns:"; + /** + * HTTP + */ public final static String HTTP_SCHEME = "http://"; + /** + * HTTPS + */ public final static String HTTPS_SCHEME = "https://"; + /** + * FTP + */ public final static String FTP_SCHEME = "ftp://"; + /** + * MMS + */ public final static String MMS_SCHEME = "mms://"; + /** + * RTSP + */ public final static String RTSP_SCHEME = "rtsp://"; + /** + * Default scheme to assume if unspecified. No context implied... + */ public final static String DEFAULT_SCHEME = HTTP_SCHEME; - // go brewster + /** + * go brewster + */ public final static String WAIS_SCHEME = "wais://"; + /** + * array of static Strings for all "known" schemes + */ public final static String ALL_SCHEMES[] = { HTTP_SCHEME, HTTPS_SCHEME, @@ -67,7 +91,14 @@ }; + /** + * character separating host from port within a URL authority + */ public final static char PORT_SEPARATOR = ':'; + /** + * character which delimits the path from the authority in a... in some + * URLs. + */ public final static char PATH_START = '/'; @@ -97,27 +128,35 @@ Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLD_PATTERN + "))|" + "(" + IP_PATTERN + ")"); - private static final Pattern AUTHORITY_REGEX_SIMPLE = - Pattern.compile("([0-9a-z_.-]++)"); +// private static final Pattern AUTHORITY_REGEX_SIMPLE = +// Pattern.compile("([0-9a-z_.-]++)"); private static final Pattern HOST_REGEX_SIMPLE = Pattern.compile("(?:[0-9a-z_.:-]+@)?([0-9a-z_.-]++)"); private static final Pattern USERINFO_REGEX_SIMPLE = Pattern.compile("([0-9a-z_.:-]+)(?:@[0-9a-z_.-]++)"); /** - * @param urlPart + * Tests if the String argument looks like it could be a legitimate + * authority fragment of a URL, that is, is it an IP address, or, are the + * characters legal in an authority, and does the string end with a legal + * TLD. + * + * @param authString String representation of a fragment of a URL * @return boolean indicating whether urlPart might be an Authority. */ - public static boolean isAuthority(String urlPart) { - Matcher m = AUTHORITY_REGEX.matcher(urlPart); + public static boolean isAuthority(String authString) { + Matcher m = AUTHORITY_REGEX.matcher(authString); return (m != null) && m.matches(); } /** - * @param baseUrl - * @param url - * @return url resolved against baseUrl, unless it is absolute already + * Resolve a possibly relative url argument against a base URL. + * @param baseUrl the base URL against which the url should be resolved + * @param url the URL, possibly relative, to make absolute. + * @return url resolved against baseUrl, unless it is absolute already, and + * further transformed by whatever escaping normally takes place with a + * UURI. */ public static String resolveUrl(String baseUrl, String url) { for(final String scheme : ALL_SCHEMES) { @@ -144,6 +183,11 @@ return resolvedURI.getEscapedURI(); } + /** + * Attempt to find the scheme (http://, https://, etc) from a given URL. + * @param url URL String to parse for a scheme. + * @return the scheme, including trailing "://" if known, null otherwise. + */ public static String urlToScheme(final String url) { for(final String scheme : ALL_SCHEMES) { if(url.startsWith(scheme)) { @@ -153,6 +197,11 @@ return null; } + /** + * Return the default port for the scheme String argument, if known. + * @param scheme String scheme, including '://', as in, "http://", "ftp://" + * @return the default port for the scheme, or -1 if the scheme isn't known. + */ public static int schemeToDefaultPort(final String scheme) { if(scheme.equals(HTTP_SCHEME)) { return 80; @@ -172,6 +221,11 @@ return -1; } + /** + * Attempt to extract the path component of a url String argument. + * @param url the URL which may contain a path, sans scheme. + * @return the path component of the URL, or "" if it contains no path. + */ public static String getURLPath(String url) { int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); int pathIdx = url.indexOf(UrlOperations.PATH_START); @@ -191,6 +245,12 @@ } } + /** + * Attempt to extract the hostname component of an absolute URL argument. + * @param url the url String from which to extract the hostname + * @return the hostname within the URL, or the url argument if the host + * cannot be found. + */ public static String urlToHost(String url) { String lcUrl = url.toLowerCase(); if(lcUrl.startsWith("dns:")) { @@ -210,6 +270,13 @@ return url; } + /** + * Extract userinfo from the absolute URL argument, that is, "username@", or + * "username:password@" if present. + * @param url the URL from which to extract the userinfo + * @return the userinfo found, not including the "@", or null if no userinfo + * is found + */ public static String urlToUserInfo(String url) { String lcUrl = url.toLowerCase(); if(lcUrl.startsWith("dns:")) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |