From: <bra...@us...> - 2008-11-07 22:34:04
|
Revision: 2637 http://archive-access.svn.sourceforge.net/archive-access/?rev=2637&view=rev Author: bradtofel Date: 2008-11-07 22:34:00 +0000 (Fri, 07 Nov 2008) Log Message: ----------- FEATURE: added static methods urlToScheme() and getSchemeDefaultPort() Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2008-11-07 22:31:42 UTC (rev 2636) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2008-11-07 22:34:00 UTC (rev 2637) @@ -81,15 +81,16 @@ * @return url resolved against baseUrl, unless it is absolute already */ public static String resolveUrl(String baseUrl, String url) { - // TODO: this only works for http:// - if(url.startsWith("http://")) { - try { - return UURIFactory.getInstance(url).getEscapedURI(); - } catch (URIException e) { - e.printStackTrace(); - // can't let a space exist... send back close to whatever came - // in... - return url.replace(" ", "%20"); + for(final String scheme : ALL_SCHEMES) { + if(url.startsWith(scheme)) { + try { + return UURIFactory.getInstance(url).getEscapedURI(); + } catch (URIException e) { + e.printStackTrace(); + // can't let a space exist... send back close to whatever came + // in... 
+ return url.replace(" ", "%20"); + } } } UURI absBaseURI; @@ -99,11 +100,39 @@ resolvedURI = UURIFactory.getInstance(absBaseURI, url); } catch (URIException e) { e.printStackTrace(); - return url; + return url.replace(" ", "%20"); } return resolvedURI.getEscapedURI(); } + public static String urlToScheme(final String url) { + for(final String scheme : ALL_SCHEMES) { + if(url.startsWith(scheme)) { + return scheme; + } + } + return null; + } + + public static int schemeToDefaultPort(final String scheme) { + if(scheme.equals(HTTP_SCHEME)) { + return 80; + } + if(scheme.equals(HTTPS_SCHEME)) { + return 443; + } + if(scheme.equals(FTP_SCHEME)) { + return 21; + } + if(scheme.equals(RTSP_SCHEME)) { + return 554; + } + if(scheme.equals(MMS_SCHEME)) { + return 1755; + } + return -1; + } + public static String urlToHost(String url) { if(url.startsWith("dns:")) { return url.substring(4); Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2008-11-07 22:31:42 UTC (rev 2636) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2008-11-07 22:34:00 UTC (rev 2637) @@ -62,7 +62,33 @@ assertEquals("foo.com",UrlOperations.urlToHost("http://foo.com/path:/")); assertEquals("foo.com",UrlOperations.urlToHost("https://foo.com/path:/")); assertEquals("foo.com",UrlOperations.urlToHost("ftp://foo.com/path:/")); + } + + public void testResolveUrl() { + for(String scheme : UrlOperations.ALL_SCHEMES) { + + assertEquals(scheme + "a.org/1/2", + UrlOperations.resolveUrl(scheme + "a.org/3/","/1/2")); + + assertEquals(scheme + "b.org/1/2", + UrlOperations.resolveUrl(scheme + "a.org/3/", + scheme + "b.org/1/2")); + + assertEquals(scheme + "a.org/3/1/2", + 
UrlOperations.resolveUrl(scheme + "a.org/3/","1/2")); + + assertEquals(scheme + "a.org/1/2", + UrlOperations.resolveUrl(scheme + "a.org/3","1/2")); + } } + public void testUrlToScheme() { + assertEquals("http://",UrlOperations.urlToScheme("http://a.com/")); + assertEquals("https://",UrlOperations.urlToScheme("https://a.com/")); + assertEquals("ftp://",UrlOperations.urlToScheme("ftp://a.com/")); + assertEquals("rtsp://",UrlOperations.urlToScheme("rtsp://a.com/")); + assertEquals("mms://",UrlOperations.urlToScheme("mms://a.com/")); + assertNull(UrlOperations.urlToScheme("blah://a.com/")); + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-11-07 22:35:28
|
Revision: 2638 http://archive-access.svn.sourceforge.net/archive-access/?rev=2638&view=rev Author: bradtofel Date: 2008-11-07 22:35:24 +0000 (Fri, 07 Nov 2008) Log Message: ----------- FEATURE: Now supports canonicalization of some non-http:// schemes. TWEAK: removed unused commented out code Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2008-11-07 22:34:00 UTC (rev 2637) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizer.java 2008-11-07 22:35:24 UTC (rev 2638) @@ -206,25 +206,32 @@ return urlString; } String searchUrl = canonicalize(urlString); - - // TODO: force https into http for the moment... - if(searchUrl.startsWith("https://")) { - searchUrl = searchUrl.substring(8); + String scheme = UrlOperations.urlToScheme(searchUrl); + if(scheme != null) { + searchUrl = searchUrl.substring(scheme.length()); + } else { + scheme = UrlOperations.HTTP_SCHEME; } - - // TODO: this will only work with http:// scheme. should work with all? 
- // force add of scheme and possible add '/' with empty path: - if (searchUrl.startsWith("http://")) { - if (-1 == searchUrl.indexOf('/', 8)) { - searchUrl = searchUrl + "/"; - } + + if (-1 == searchUrl.indexOf("/")) { + searchUrl = scheme + searchUrl + "/"; } else { - if (-1 == searchUrl.indexOf("/")) { - searchUrl = searchUrl + "/"; - } - searchUrl = "http://" + searchUrl; + searchUrl = scheme + searchUrl; } + // TODO: this will only work with http:// scheme. should work with all? + // force add of scheme and possible add '/' with empty path: +// if (searchUrl.startsWith("http://")) { +// if (-1 == searchUrl.indexOf('/', 8)) { +// searchUrl = searchUrl + "/"; +// } +// } else { +// if (-1 == searchUrl.indexOf("/")) { +// searchUrl = searchUrl + "/"; +// } +// searchUrl = "http://" + searchUrl; +// } + // TODO: These next few lines look crazy -- need to be reworked.. This // was the only easy way I could find to get the correct unescaping // out of UURIs, possible a bug. Definitely needs some TLC in any case, @@ -250,23 +257,18 @@ // if((newPath.length() > 1) && newPath.endsWith("/")) { // newPath = newPath.substring(0,newPath.length()-1); // } -// searchURI.setEscapedPath(newPath); -// searchURI.setRawPath(newPath.toCharArray()); -// String query = searchURI.getEscapedQuery(); - // TODO: handle non HTTP port stripping, too. 
-// String portStr = ""; -// if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { -// portStr = ":" + searchURI.getPort(); -// } -// return searchURI.getHostBasename() + portStr + -// searchURI.getEscapedPathQuery(); - StringBuilder sb = new StringBuilder(searchUrl.length()); sb.append(searchURI.getHostBasename()); - if(searchURI.getPort() != 80 && searchURI.getPort() != -1) { + + // omit port if scheme default: + int defaultSchemePort = UrlOperations.schemeToDefaultPort(scheme); + if(searchURI.getPort() != defaultSchemePort + && searchURI.getPort() != -1) { + sb.append(":").append(searchURI.getPort()); } + sb.append(newPath); if(searchURI.getEscapedQuery() != null) { sb.append("?").append(searchURI.getEscapedQuery()); Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2008-11-07 22:34:00 UTC (rev 2637) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2008-11-07 22:35:24 UTC (rev 2638) @@ -45,16 +45,15 @@ // simple strip of http:// checkCanonicalization("http://foo.com/","foo.com/"); -// would be nice to handle other protocols... -// // simple strip of https:// -// checkCanonicalization("https://foo.com/","foo.com/"); -// -// // simple strip of ftp:// -// checkCanonicalization("ftp://foo.com/","foo.com/"); -// -// // simple strip of rtsp:// -// checkCanonicalization("rtsp://foo.com/","foo.com/"); + // simple strip of https:// + checkCanonicalization("https://foo.com/","foo.com/"); + // simple strip of ftp:// + checkCanonicalization("ftp://foo.com/","foo.com/"); + + // simple strip of rtsp:// + checkCanonicalization("rtsp://foo.com/","foo.com/"); + // strip leading 'www.' 
checkCanonicalization("http://www.foo.com/","foo.com/"); @@ -63,6 +62,9 @@ // strip leading 'www##.' checkCanonicalization("http://www12.foo.com/","foo.com/"); + + // strip leading 'www##.' with https + checkCanonicalization("https://www12.foo.com/","foo.com/"); // strip leading 'www##.' with no protocol checkCanonicalization("www12.foo.com/","foo.com/"); @@ -174,13 +176,53 @@ checkCanonicalization( "http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules", "legislature.mi.gov/mileg.aspx?page=sessionschedules"); + + + + + // default port stripping: + // FIRST the easy-on-the-eyes + // strip port 80 checkCanonicalization("http://www.chub.org:80/foo","chub.org/foo"); // but not other ports... checkCanonicalization("http://www.chub.org:8080/foo","chub.org:8080/foo"); + + // but not other ports... with "www#." massage + checkCanonicalization("http://www232.chub.org:8080/foo","chub.org:8080/foo"); + // default HTTP (:80) stripping without a scheme: + checkCanonicalization("www.chub.org:80/foo","chub.org/foo"); + + // no strip https port (443) without scheme: + checkCanonicalization("www.chub.org:443/foo","chub.org:443/foo"); + + // yes strip https port (443) with scheme: + checkCanonicalization("https://www.chub.org:443/foo","chub.org/foo"); + + // NEXT the exhaustive: + String origHost = "www.chub.org"; + String massagedHost = "chub.org"; + String path = "/foo"; + for(String scheme : UrlOperations.ALL_SCHEMES) { + + int defaultPort = UrlOperations.schemeToDefaultPort(scheme); + int nonDefaultPort = 19991; + + String origDefault = scheme + origHost + ":" + defaultPort + path; + String canonDefault = massagedHost + path; + + String origNonDefault = + scheme + origHost + ":" + nonDefaultPort + path; + String canonNonDefault = + massagedHost + ":" + nonDefaultPort + path; + + checkCanonicalization(origDefault,canonDefault); + checkCanonicalization(origNonDefault,canonNonDefault); + } + } 
private void checkCanonicalization(String orig, String want) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-11 00:33:39
|
Revision: 2926 http://archive-access.svn.sourceforge.net/archive-access/?rev=2926&view=rev Author: bradtofel Date: 2009-11-11 00:18:08 +0000 (Wed, 11 Nov 2009) Log Message: ----------- BUGFIX(unreported) URI actually does not do proper path canonicalization - resolving "../../foo.gif" against "http://base.com/" results in "http://base.com/../../foo.gif" not "http://base.com/foo.gif" as it should. UURI does the right thing, so now this uses UURI to perform the resolving, and now we have a trivial test case that demonstrates this. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseContext.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseContext.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseContext.java 2009-11-11 00:14:29 UTC (rev 2925) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/ReplayParseContext.java 2009-11-11 00:18:08 UTC (rev 2926) @@ -46,7 +46,7 @@ URL baseUrl, String datespec) { this.uriConverterFactory = uriConverterFactory; - this.baseUrl = baseUrl; + setBaseUrl(baseUrl); this.datespec = datespec; converters = new HashMap<String,ResultURIConverter>(); } @@ -143,4 +143,18 @@ public void setJspExec(JSPExecutor jspExec) { this.jspExec = jspExec; } + + /** + * @return the datespec + */ + public String getDatespec() { + return datespec; + } + + /** + * 
@param datespec the datespec to set + */ + public void setDatespec(String datespec) { + this.datespec = datespec; + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2009-11-11 00:14:29 UTC (rev 2925) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2009-11-11 00:18:08 UTC (rev 2926) @@ -28,6 +28,10 @@ import java.net.URL; import java.util.HashMap; +import org.apache.commons.httpclient.URIException; +import org.archive.net.UURI; +import org.archive.net.UURIFactory; + /** * Class which tracks the context and state involved with parsing an HTML * document via SAX events. @@ -43,7 +47,7 @@ */ public class ParseContext { - protected URL baseUrl = null; + protected UURI baseUrl = null; private boolean inCSS = false; private boolean inJS = false; @@ -60,11 +64,21 @@ return data.get(key); } public void setBaseUrl(URL url) { - baseUrl = url; + try { + baseUrl = UURIFactory.getInstance(url.toExternalForm()); + } catch (URIException e) { + e.printStackTrace(); + } } public String resolve(String url) throws MalformedURLException { - URL tmp = new URL(baseUrl,url); - return tmp.toString(); + try { + return baseUrl.resolve(url).toString(); + } catch (URIException e) { + e.printStackTrace(); + } + return url; +// URL tmp = new URL(baseUrl,url); +// return tmp.toString(); } public String contextualizeUrl(String url) { if(url.startsWith("javascript:")) { Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java 2009-11-11 00:18:08 UTC (rev 2926) @@ -0,0 +1,68 @@ +/* ParseContextTest + * + * $Id$: + * + * Created on Nov 10, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.util.htmllex; + +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URL; + +import org.archive.net.UURI; +import org.archive.net.UURIFactory; + +import junit.framework.TestCase; + +/** + * @author brad + * + */ +public class ParseContextTest extends TestCase { + + /** + * Test method for {@link org.archive.wayback.util.htmllex.ParseContext#contextualizeUrl(java.lang.String)}. 
+ */ + public void testContextualizeUrl() { + ParseContext pc = new ParseContext(); + try { + pc.setBaseUrl(new URL("http://base.com/")); + assertEquals("http://base.com/images.gif", + pc.contextualizeUrl("/images.gif")); + assertEquals("http://base.com/images.gif", + pc.contextualizeUrl("../images.gif")); + assertEquals("http://base.com/images.gif", + pc.contextualizeUrl("../../images.gif")); + assertEquals("http://base.com/image/1s.gif", + pc.contextualizeUrl("/image/1s.gif")); + assertEquals("http://base.com/image/1s.gif", + pc.contextualizeUrl("../../image/1s.gif")); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + fail(e.getLocalizedMessage()); + } + + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-11-12 22:22:41
|
Revision: 2930 http://archive-access.svn.sourceforge.net/archive-access/?rev=2930&view=rev Author: bradtofel Date: 2009-11-12 22:22:32 +0000 (Thu, 12 Nov 2009) Log Message: ----------- BUGFIX: now includes original named link (#name fragment at end of input URL) in resolved url Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2009-11-11 00:25:13 UTC (rev 2929) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/htmllex/ParseContext.java 2009-11-12 22:22:32 UTC (rev 2930) @@ -24,7 +24,7 @@ */ package org.archive.wayback.util.htmllex; -import java.net.MalformedURLException; +import java.net.URISyntaxException; import java.net.URL; import java.util.HashMap; @@ -54,15 +54,31 @@ private boolean inScriptText = false; private HashMap<String,String> data = null; + /** + * constructor + */ public ParseContext() { data = new HashMap<String, String>(); } + /** + * Stores arbitrary key value pairs in this ParseContext + * @param key for storage + * @param value for storage + */ public void putData(String key, String value) { data.put(key, value); } + /** + * Retrieves previously stored data for key key from this ParseContext + * @param key under which value was stored + * @return previously stored value for key or null, if nothing was stored + */ public String getData(String key) { return data.get(key); } + /** + * @param url against which relative URLs should be resolved for this parse + */ public void 
setBaseUrl(URL url) { try { baseUrl = UURIFactory.getInstance(url.toExternalForm()); @@ -70,23 +86,37 @@ e.printStackTrace(); } } - public String resolve(String url) throws MalformedURLException { + /** + * @param url which should be resolved against the baseUrl for this + * ParseContext. + * @return absolute form of url, resolved against baseUrl if relative. + * @throws URISyntaxException if the input URL is malformed + */ + public String resolve(String url) throws URISyntaxException { + int hashIdx = url.indexOf('#'); + String frag = ""; + if(hashIdx != -1) { + frag = url.substring(hashIdx); + url = url.substring(0,hashIdx); + } try { - return baseUrl.resolve(url).toString(); + return baseUrl.resolve(url).toString() + frag; } catch (URIException e) { e.printStackTrace(); } return url; -// URL tmp = new URL(baseUrl,url); -// return tmp.toString(); } + /** + * @param url which should be resolved. + * @return absolute form of input url, or url itself if javascript: + */ public String contextualizeUrl(String url) { if(url.startsWith("javascript:")) { return url; } try { return resolve(url); - } catch (MalformedURLException e) { + } catch (URISyntaxException e) { e.printStackTrace(); return url; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java 2009-11-11 00:25:13 UTC (rev 2929) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/htmllex/ParseContextTest.java 2009-11-12 22:22:32 UTC (rev 2930) @@ -25,13 +25,9 @@ package org.archive.wayback.util.htmllex; -import java.net.MalformedURLException; import java.net.URI; import java.net.URL; -import org.archive.net.UURI; -import org.archive.net.UURIFactory; - import junit.framework.TestCase; /** @@ -46,6 +42,14 @@ 
public void testContextualizeUrl() { ParseContext pc = new ParseContext(); try { + + URI tmp = new URI("http://base.com/foo.html#REF"); + String ref = tmp.getFragment(); + assertEquals("REF",ref); + tmp = new URI("http://base.com/foo.html"); + assertNull(tmp.getFragment()); + + pc.setBaseUrl(new URL("http://base.com/")); assertEquals("http://base.com/images.gif", pc.contextualizeUrl("/images.gif")); @@ -57,6 +61,12 @@ pc.contextualizeUrl("/image/1s.gif")); assertEquals("http://base.com/image/1s.gif", pc.contextualizeUrl("../../image/1s.gif")); + assertEquals("http://base.com/image/1s.gif", + pc.contextualizeUrl("/../../image/1s.gif")); + assertEquals("http://base.com/image/1.html#REF", + pc.contextualizeUrl("/../../image/1.html#REF")); + assertEquals("http://base.com/image/1.html#REF FOO", + pc.contextualizeUrl("/../../image/1.html#REF FOO")); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-03-23 23:45:15
|
Revision: 3000 http://archive-access.svn.sourceforge.net/archive-access/?rev=3000&view=rev Author: bradtofel Date: 2010-03-23 23:45:07 +0000 (Tue, 23 Mar 2010) Log Message: ----------- INTERFACE: now passing AccessPoint reference into PathRequestParsers Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDatePrefixQueryRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDateRangeQueryRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDatePrefixQueryRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDateRangeQueryRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDatePrefixQueryRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDatePrefixQueryRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDatePrefixQueryRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -31,6 +31,7 @@ import org.archive.wayback.requestparser.BaseRequestParser; import 
org.archive.wayback.requestparser.PathRequestParser; import org.archive.wayback.util.Timestamp; +import org.archive.wayback.webapp.AccessPoint; /** * RequestParser implementation that extracts request info from an Archival Url @@ -54,7 +55,7 @@ private final static Pattern WB_QUERY_REGEX = Pattern .compile("^(\\d{0,13})\\*/(.*[^*])$"); - public WaybackRequest parse(String requestPath) { + public WaybackRequest parse(String requestPath, AccessPoint ap) { WaybackRequest wbRequest = null; Matcher matcher = WB_QUERY_REGEX.matcher(requestPath); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDateRangeQueryRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDateRangeQueryRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathDateRangeQueryRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -31,6 +31,7 @@ import org.archive.wayback.requestparser.BaseRequestParser; import org.archive.wayback.requestparser.PathRequestParser; import org.archive.wayback.util.Timestamp; +import org.archive.wayback.webapp.AccessPoint; /** * RequestParser implementation that extracts request info from an Archival Url @@ -56,7 +57,7 @@ .compile("^(\\d{1,14})-(\\d{1,14})\\*/(.*[^*])$"); - public WaybackRequest parse(String requestPath) { + public WaybackRequest parse(String requestPath, AccessPoint ap) { WaybackRequest wbRequest = null; Matcher matcher = WB_QUERY2_REGEX.matcher(requestPath); if (matcher != null && matcher.matches()) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDatePrefixQueryRequestParser.java 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDatePrefixQueryRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDatePrefixQueryRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -31,6 +31,7 @@ import org.archive.wayback.requestparser.BaseRequestParser; import org.archive.wayback.requestparser.PathRequestParser; import org.archive.wayback.util.Timestamp; +import org.archive.wayback.webapp.AccessPoint; /** * RequestParser implementation that extracts request info from an Archival Url @@ -54,7 +55,7 @@ private final static Pattern WB_PATH_QUERY_REGEX = Pattern .compile("^(\\d{0,13})\\*/(.*)\\*$"); - public WaybackRequest parse(String requestPath) { + public WaybackRequest parse(String requestPath, AccessPoint ap) { WaybackRequest wbRequest = null; Matcher matcher = WB_PATH_QUERY_REGEX.matcher(requestPath); if (matcher != null && matcher.matches()) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDateRangeQueryRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDateRangeQueryRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/PathPrefixDateRangeQueryRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -31,6 +31,7 @@ import org.archive.wayback.requestparser.BaseRequestParser; import org.archive.wayback.requestparser.PathRequestParser; import org.archive.wayback.util.Timestamp; +import org.archive.wayback.webapp.AccessPoint; /** * 
RequestParser implementation that extracts request info from an Archival Url @@ -54,7 +55,7 @@ private final static Pattern WB_PATH_QUERY2_REGEX = Pattern .compile("^(\\d{1,14})-(\\d{1,14})\\*/(.*)\\*$"); - public WaybackRequest parse(String requestPath) { + public WaybackRequest parse(String requestPath, AccessPoint ap) { WaybackRequest wbRequest = null; Matcher matcher = WB_PATH_QUERY2_REGEX.matcher(requestPath); if (matcher != null && matcher.matches()) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -27,11 +27,17 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.commons.httpclient.URIException; +import org.archive.net.UURIFactory; +import org.archive.wayback.ResultURIConverter; import org.archive.wayback.archivalurl.ArchivalUrlRequestParser; import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BetterRequestException; import org.archive.wayback.requestparser.BaseRequestParser; import org.archive.wayback.requestparser.PathRequestParser; import org.archive.wayback.util.Timestamp; +import org.archive.wayback.util.url.UrlOperations; +import org.archive.wayback.webapp.AccessPoint; /** * RequestParser implementation that extracts request info from a Replay @@ -55,7 +61,8 @@ super(wrapped); } - public WaybackRequest parse(String requestPath) { + public WaybackRequest parse(String requestPath, AccessPoint ap) + throws BetterRequestException { WaybackRequest wbRequest = null; Matcher matcher 
= WB_REQUEST_REGEX.matcher(requestPath); String urlStr = null; @@ -105,6 +112,33 @@ wbRequest.setReplayRequest(); wbRequest.setRequestUrl(urlStr); + } else { + // see if the remainder looks like an URL: +// String scheme = UrlOperations.urlToScheme(requestPath); +// if(scheme != null) { +// // lets interpret this as a replay request missing the +// // timestamp: use "NOW" +// String nowTS = Timestamp.currentTimestamp().getDateStr(); +// ResultURIConverter conv = ap.getUriConverter(); +// +// String betterURI = conv.makeReplayURI(nowTS, requestPath); +// throw new BetterRequestException(betterURI); +// } else { +// // not obviously an URL... see if UURI can handle it: +// String httpUrl = UrlOperations.HTTP_SCHEME + requestPath; +// try { +// UURIFactory.getInstance(httpUrl); +// // that worked. use httpUrl: +// String nowTS = Timestamp.currentTimestamp().getDateStr(); +// ResultURIConverter conv = ap.getUriConverter(); +// +// String betterURI = conv.makeReplayURI(nowTS, requestPath); +// throw new BetterRequestException(betterURI); +// } catch (URIException e) { +// // oh well. 
lets just fail: +// } +// } + } return wbRequest; } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/requestparser/PathRequestParser.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -28,6 +28,7 @@ import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.BetterRequestException; import org.archive.wayback.webapp.AccessPoint; /** @@ -48,17 +49,21 @@ /** * @param requestPath + * @param acessPoint * @return WaybackRequest with information parsed from the requestPath, or * null if information could not be extracted. + * @throws BetterRequestException */ - public abstract WaybackRequest parse(String requestPath); + public abstract WaybackRequest parse(String requestPath, + AccessPoint acessPoint) throws BetterRequestException; /* (non-Javadoc) * @see org.archive.wayback.requestparser.BaseRequestParser#parse(javax.servlet.http.HttpServletRequest, org.archive.wayback.webapp.WaybackContext) */ @Override public WaybackRequest parse(HttpServletRequest httpRequest, - AccessPoint wbContext) throws BadQueryException { + AccessPoint acessPoint) + throws BadQueryException, BetterRequestException { String queryString = httpRequest.getQueryString(); String origRequestPath = httpRequest.getRequestURI(); @@ -66,13 +71,13 @@ if (queryString != null) { origRequestPath += "?" 
+ queryString; } - String contextPath = wbContext.getContextPath(httpRequest); + String contextPath = acessPoint.getContextPath(httpRequest); if (!origRequestPath.startsWith(contextPath)) { return null; } String requestPath = origRequestPath.substring(contextPath.length()); - WaybackRequest wbRequest = parse(requestPath); + WaybackRequest wbRequest = parse(requestPath, acessPoint); if(wbRequest != null) { wbRequest.setResultsPerPage(getMaxRecords()); } Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java 2010-03-23 23:40:36 UTC (rev 2999) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/archivalurl/requestparser/ReplayRequestParserTest.java 2010-03-23 23:45:07 UTC (rev 3000) @@ -26,7 +26,9 @@ import org.archive.wayback.archivalurl.ArchivalUrlRequestParser; import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BetterRequestException; import org.archive.wayback.requestparser.BaseRequestParser; +import org.archive.wayback.webapp.AccessPoint; import junit.framework.TestCase; @@ -41,87 +43,89 @@ /** * Test method for {@link org.archive.wayback.archivalurl.requestparser.ReplayRequestParser#parse(java.lang.String)}. 
+ * @throws BetterRequestException */ - public void testParseString() { + public void testParseString() throws BetterRequestException { BaseRequestParser wrapped = new ArchivalUrlRequestParser(); ReplayRequestParser p = new ReplayRequestParser(wrapped); WaybackRequest r; - r = p.parse(""); + AccessPoint ap = null; + r = p.parse("",ap); assertNull("Should not parse empty string", r); - r = p.parse("20070101000000/foo.com"); + r = p.parse("20070101000000/foo.com",ap); assertNotNull("Should parse legit request sans scheme", r); assertEquals("parsed request Url",r.getRequestUrl(),"http://foo.com"); assertEquals("Parsed timestamp","20070101000000",r.getReplayTimestamp()); - r = p.parse("20070101000000/foo.com/"); + r = p.parse("20070101000000/foo.com/",ap); assertEquals("parsed request Url, maintaining trailing slash", "http://foo.com/",r.getRequestUrl()); - r = p.parse("200701010000/foo.com"); + r = p.parse("200701010000/foo.com",ap); assertEquals("parsed partial date", "http://foo.com",r.getRequestUrl()); assertEquals("Parsed partial timestamp to earliest", "20070101000000",r.getReplayTimestamp()); - r = p.parse("20070101000000/http://foo.com"); + r = p.parse("20070101000000/http://foo.com",ap); assertEquals("parsed request Url with scheme", "http://foo.com",r.getRequestUrl()); - r = p.parse("20070101000000/http://foo.com/"); + r = p.parse("20070101000000/http://foo.com/",ap); assertEquals("parsed request Url with scheme and trailing slash", "http://foo.com/",r.getRequestUrl()); - r = p.parse("20070101000000/ftp://foo.com/"); + r = p.parse("20070101000000/ftp://foo.com/",ap); assertEquals("parsed request Url with ftp scheme", "ftp://foo.com/",r.getRequestUrl()); - r = p.parse("20070101000000/https://foo.com/"); + r = p.parse("20070101000000/https://foo.com/",ap); assertEquals("parsed request Url with https scheme", "https://foo.com/",r.getRequestUrl()); - r = p.parse("20070101000000js_/http://foo.com/"); + r = p.parse("20070101000000js_/http://foo.com/",ap); 
assertEquals("parsed request Url with js_ flag", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed js_ flag",r.isJSContext()); assertFalse("css not set",r.isCSSContext()); - r = p.parse("20070101000000cs_/http://foo.com/"); + r = p.parse("20070101000000cs_/http://foo.com/",ap); assertEquals("parsed request Url with cs_ flag", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed cs_ flag",r.isCSSContext()); assertFalse("js not set",r.isJSContext()); - r = p.parse("20070101000000cs_js_/http://foo.com/"); + r = p.parse("20070101000000cs_js_/http://foo.com/",ap); assertEquals("parsed request Url with cs_ and js_ flags", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed cs_ flag",r.isCSSContext()); assertTrue("parsed js_ flag",r.isJSContext()); - r = p.parse("20070101000000js_cs_/http://foo.com/"); + r = p.parse("20070101000000js_cs_/http://foo.com/",ap); assertEquals("parsed request Url with cs_ and js_ flags, backvards", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed cs_ flag",r.isCSSContext()); assertTrue("parsed js_ flag",r.isJSContext()); - r = p.parse("20070101000000un_/http://foo.com/"); + r = p.parse("20070101000000un_/http://foo.com/",ap); assertEquals("parsed request Url with unknown flag", "http://foo.com/",r.getRequestUrl()); assertFalse("no cs_ flag",r.isCSSContext()); assertFalse("no js_ flag",r.isJSContext()); - r = p.parse("20070101000000un_js_cs_/http://foo.com/"); + r = p.parse("20070101000000un_js_cs_/http://foo.com/",ap); assertEquals("parsed request Url with falgs and unknown flag", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed cs_ flag",r.isCSSContext()); assertTrue("parsed js_ flag",r.isJSContext()); - r = p.parse("20070101000000js_cs_un_/http://foo.com/"); + r = p.parse("20070101000000js_cs_un_/http://foo.com/",ap); assertEquals("parsed request Url with falgs and unknown flag at end", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed cs_ flag",r.isCSSContext()); assertTrue("parsed js_ 
flag",r.isJSContext()); - r = p.parse("20070101000000un_js_cs_un_/http://foo.com/"); + r = p.parse("20070101000000un_js_cs_un_/http://foo.com/",ap); assertEquals("parsed request Url with falgs and unknown flags", "http://foo.com/",r.getRequestUrl()); assertTrue("parsed cs_ flag",r.isCSSContext()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-04-14 21:33:25
|
Revision: 3042 http://archive-access.svn.sourceforge.net/archive-access/?rev=3042&view=rev Author: bradtofel Date: 2010-04-14 21:33:02 +0000 (Wed, 14 Apr 2010) Log Message: ----------- FEATURE: added getUrlParentDir() Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-04-14 21:28:35 UTC (rev 3041) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-04-14 21:33:02 UTC (rev 3042) @@ -28,6 +28,7 @@ import java.util.regex.Pattern; import org.apache.commons.httpclient.URIException; +import org.apache.log4j.Logger; import org.archive.net.UURI; import org.archive.net.UURIFactory; @@ -39,6 +40,8 @@ * @version $Date$, $Revision$ */ public class UrlOperations { + private static final Logger LOGGER = Logger.getLogger( + UrlOperations.class.getName()); public final static String DNS_SCHEME = "dns:"; public final static String HTTP_SCHEME = "http://"; @@ -116,7 +119,7 @@ try { return UURIFactory.getInstance(url).getEscapedURI(); } catch (URIException e) { - e.printStackTrace(); + LOGGER.warn(e.getLocalizedMessage() + ": " + url); // can't let a space exist... send back close to whatever came // in... 
return url.replace(" ", "%20"); @@ -129,7 +132,7 @@ absBaseURI = UURIFactory.getInstance(baseUrl); resolvedURI = UURIFactory.getInstance(absBaseURI, url); } catch (URIException e) { - e.printStackTrace(); + LOGGER.warn(e.getLocalizedMessage() + ": " + url); return url.replace(" ", "%20"); } return resolvedURI.getEscapedURI(); @@ -198,4 +201,33 @@ } return url; } + + /** + * Find and return the parent directory of the URL argument + * @param url to find the parent directory of + * @return parent directory of URL, or null, if either the url argument is + * invalid, or if the url is the root of the authority. + */ + public static String getUrlParentDir(String url) { + + try { + UURI uri = UURIFactory.getInstance(url); + String path = uri.getPath(); + if(path.length() > 1) { + int startIdx = path.length()-1; + if(path.charAt(path.length()-1) == '/') { + startIdx--; + } + int idx = path.lastIndexOf('/',startIdx); + if(idx >= 0) { + uri.setPath(path.substring(0,idx+1)); + uri.setQuery(null); + return uri.toString(); + } + } + } catch (URIException e) { + LOGGER.warn(e.getLocalizedMessage() + ": " + url); + } + return null; + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-04-14 21:28:35 UTC (rev 3041) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-04-14 21:33:02 UTC (rev 3042) @@ -9,6 +9,7 @@ * @version $Date$, $Revision$ */ public class UrlOperationsTest extends TestCase { + public void testIsAuthority() { checkAuthority("foo.com",true); checkAuthority("foo.con",false); @@ -93,5 +94,40 @@ assertEquals("rtsp://",UrlOperations.urlToScheme("rtsp://a.com/")); assertEquals("mms://",UrlOperations.urlToScheme("mms://a.com/")); 
assertNull(UrlOperations.urlToScheme("blah://a.com/")); - } + } + + public void testGetUrlParentDir() { + + assertEquals( "http://a.b/c/", + UrlOperations.getUrlParentDir("http://a.b/c/d")); + + assertEquals( "http://a.b/", + UrlOperations.getUrlParentDir("http://a.b/c/")); + + assertEquals( "http://a.b/", + UrlOperations.getUrlParentDir("http://a.b/c")); + + assertEquals( "http://a.b/c/d/e/", + UrlOperations.getUrlParentDir("http://a.b/c/d/e/f")); + + assertEquals( "http://a.b/", + UrlOperations.getUrlParentDir("http://a.b/c?d=e")); + + assertEquals( null, + UrlOperations.getUrlParentDir("http://a.b/")); + + assertEquals( null, + UrlOperations.getUrlParentDir("http//a.b/")); + + assertEquals( null, + UrlOperations.getUrlParentDir("http://")); + + assertEquals( null, + UrlOperations.getUrlParentDir("http://#4.8gifdijdf")); + + assertEquals( null, + UrlOperations.getUrlParentDir("http://#4.8gifdijdf/a/b")); + + + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-04-23 23:35:19
|
Revision: 3051 http://archive-access.svn.sourceforge.net/archive-access/?rev=3051&view=rev Author: bradtofel Date: 2010-04-23 23:35:12 +0000 (Fri, 23 Apr 2010) Log Message: ----------- BUGFIX: fixed(hopefully) problem when extracting hostname from URLs containing userinfo FEATURE: added urlToUserInfo() Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-04-15 00:23:54 UTC (rev 3050) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-04-23 23:35:12 UTC (rev 3051) @@ -24,6 +24,8 @@ */ package org.archive.wayback.util.url; +import java.net.MalformedURLException; +import java.net.URL; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -97,6 +99,10 @@ private static final Pattern AUTHORITY_REGEX_SIMPLE = Pattern.compile("([0-9a-z_.-]++)"); + private static final Pattern HOST_REGEX_SIMPLE = + Pattern.compile("(?:[0-9a-z_.:-]+@)?([0-9a-z_.-]++)"); + private static final Pattern USERINFO_REGEX_SIMPLE = + Pattern.compile("([0-9a-z_.:-]+)(?:@[0-9a-z_.-]++)"); /** * @param urlPart @@ -184,23 +190,44 @@ return url.substring(pathIdx); } } - + public static String urlToHost(String url) { - if(url.startsWith("dns:")) { - return url.substring(4); + String lcUrl = url.toLowerCase(); + if(lcUrl.startsWith("dns:")) { + return lcUrl.substring(4); } for(String scheme : ALL_SCHEMES) { - if(url.startsWith(scheme)) { - int hostIdx = scheme.length(); + if(lcUrl.startsWith(scheme)) { + int 
authorityIdx = scheme.length(); - Matcher m = AUTHORITY_REGEX_SIMPLE.matcher(url.substring(hostIdx)); + Matcher m = + HOST_REGEX_SIMPLE.matcher(lcUrl.substring(authorityIdx)); if(m.find()) { - return m.group(0); + return m.group(1); } } } return url; } + + public static String urlToUserInfo(String url) { + String lcUrl = url.toLowerCase(); + if(lcUrl.startsWith("dns:")) { + return null; + } + for(String scheme : ALL_SCHEMES) { + if(lcUrl.startsWith(scheme)) { + int authorityIdx = scheme.length(); + + Matcher m = + USERINFO_REGEX_SIMPLE.matcher(lcUrl.substring(authorityIdx)); + if(m.find()) { + return m.group(1); + } + } + } + return null; + } /** * Find and return the parent directory of the URL argument Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-04-15 00:23:54 UTC (rev 3050) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-04-23 23:35:12 UTC (rev 3051) @@ -35,7 +35,14 @@ assertEquals("foo.com",UrlOperations.urlToHost("http://foo.com")); assertEquals("foo.com",UrlOperations.urlToHost("https://foo.com")); assertEquals("foo.com",UrlOperations.urlToHost("ftp://foo.com")); - + + assertEquals("www.google.com",UrlOperations.urlToHost("http://www.GOOGLE.COM")); + assertEquals("google.com",UrlOperations.urlToHost("http://GOOGLE.COM/")); + assertEquals("google.com",UrlOperations.urlToHost("http://GOOGLE.COM")); + assertEquals("google.com",UrlOperations.urlToHost("http://GOOGLE.COM:80")); + assertEquals("google.com",UrlOperations.urlToHost("http://GOOGLE.COM:80/")); + assertEquals("google.com",UrlOperations.urlToHost("http://GOOGLE.COM:80/foo")); + assertEquals("foo.com",UrlOperations.urlToHost("http://foo.com/")); 
assertEquals("foo.com",UrlOperations.urlToHost("https://foo.com/")); assertEquals("foo.com",UrlOperations.urlToHost("ftp://foo.com/")); @@ -66,8 +73,34 @@ assertEquals("foo.com",UrlOperations.urlToHost("ftp://foo.com\\")); assertEquals("www.foo.com",UrlOperations.urlToHost("http://www.foo.com\\")); assertEquals("www.foo.com",UrlOperations.urlToHost("http://www.foo.com:80\\")); + + + assertEquals("foo.com",UrlOperations.urlToHost("http://us...@fo...")); + assertEquals("www.foo.com",UrlOperations.urlToHost("http://us...@ww...")); + assertEquals("www.foo.com",UrlOperations.urlToHost("http://user:pa...@ww...")); + + assertEquals("www.foo.com",UrlOperations.urlToHost("http://user:pa...@ww.../")); + assertEquals("www.foo.com",UrlOperations.urlToHost("http://user:pa...@ww.../boo@foo")); } + public void testUrlToUserInfo() { + assertEquals(null,UrlOperations.urlToUserInfo("dns:foo.com")); + assertEquals(null,UrlOperations.urlToUserInfo("http://foo.com")); + assertEquals(null,UrlOperations.urlToUserInfo("https://foo.com")); + assertEquals(null,UrlOperations.urlToUserInfo("ftp://foo.com")); + assertEquals(null,UrlOperations.urlToUserInfo("ftp://foo.com/")); + assertEquals(null,UrlOperations.urlToUserInfo("http://foo.com:80/")); + assertEquals(null,UrlOperations.urlToUserInfo("http://foo.com:80")); + assertEquals(null,UrlOperations.urlToUserInfo("http://www.foo.com:80\\")); + + assertEquals("user",UrlOperations.urlToUserInfo("http://us...@fo...")); + assertEquals("user",UrlOperations.urlToUserInfo("http://us...@ww...")); + assertEquals("user:pass",UrlOperations.urlToUserInfo("http://user:pa...@ww...")); + assertEquals("user:pass",UrlOperations.urlToUserInfo("http://user:pa...@ww...:8080")); + assertEquals("user:pass",UrlOperations.urlToUserInfo("http://user:pa...@ww...:8080/boo@arb")); + assertEquals("www.foo.com",UrlOperations.urlToHost("http://user:pa...@ww.../")); + assertEquals("www.foo.com",UrlOperations.urlToHost("http://user:pa...@ww.../boo@foo")); + } public void 
testResolveUrl() { for(String scheme : UrlOperations.ALL_SCHEMES) { @@ -83,9 +116,7 @@ assertEquals(scheme + "a.org/1/2", UrlOperations.resolveUrl(scheme + "a.org/3","1/2")); - } - } public void testUrlToScheme() { assertEquals("http://",UrlOperations.urlToScheme("http://a.com/")); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-05-13 18:34:44
|
Revision: 3090 http://archive-access.svn.sourceforge.net/archive-access/?rev=3090&view=rev Author: bradtofel Date: 2010-05-13 18:34:37 +0000 (Thu, 13 May 2010) Log Message: ----------- FEATURE: added new method stripDefaultPort() and tests for that TWEAK: Updated TLD list Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-05-07 23:11:24 UTC (rev 3089) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-05-13 18:34:37 UTC (rev 3090) @@ -101,31 +101,29 @@ */ public final static char PATH_START = '/'; - - private static final String CC_TLDS = "ac|ad|ae|af|ag|ai|al|am|an|ao|aq" + - "|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs" + - "|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cu|cv|cx" + - "|cy|cz|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo" + - "|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk" + - "|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg" + - "|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma" + - "|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz" + - "|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm" + - "|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj" + - "|sk|sl|sm|sn|so|sr|st|su|sv|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn" + - "|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|um|us|uy|uz|va|vc|ve|vg|vi|vn|vu" + - "|wf|ws|ye|yt|yu|za|zm|zw"; - - private static 
final String GEN_TLDS = "aero|biz|cat|com|coop|edu|gov" + - "|info|int|jobs|mil|mobi|museum|name|net|org|pro|travel"; - - - private static final String ALL_TLD_PATTERN = CC_TLDS + "|" + GEN_TLDS; + private static final String ALL_TLDS = "ac|ad|ae|aero|af|ag|ai|al|am|an" + + "|ao|aq|ar|arpa|as|asia|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi" + + "|biz|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cat|cc|cd|cf|cg|ch|ci" + + "|ck|cl|cm|cn|co|com|coop|cr|cu|cv|cx|cy|cz|de|dj|dk|dm|do|dz|ec" + + "|edu|ee|eg|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh" + + "|gi|gl|gm|gn|gov|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id" + + "|ie|il|im|in|info|int|io|iq|ir|is|it|je|jm|jo|jobs|jp|ke|kg|kh" + + "|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc" + + "|md|me|mg|mh|mil|mk|ml|mm|mn|mo|mobi|mp|mq|mr|ms|mt|mu|museum" + + "|mv|mw|mx|my|mz|na|name|nc|ne|net|nf|ng|ni|nl|no|np|nr|nu|nz" + + "|om|org|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|pro|ps|pt|pw|py|qa|re|ro" + + "|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|sk|sl|sm|sn|so|sr|st|su|sv" + + "|sy|sz|tc|td|tel|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|travel|tt|tv" + + "|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|xn--0zwm56d" + + "|xn--11b5bs3a9aj6g|xn--80akhbyknj4f|xn--9t4b11yi5a|xn--deba0ad" + + "|xn--g6w251d|xn--hgbk6aj7f53bba|xn--hlcj6aya9esc7a|xn--jxalpdlp" + + "|xn--kgbechtv|xn--mgbaam7a8h|xn--mgberp4a5d4ar|xn--p1ai" + + "|xn--wgbh1c|xn--zckzah|ye|yt|za|zm|zw"; private static final String IP_PATTERN = "[0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+"; private static final Pattern AUTHORITY_REGEX = - Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLD_PATTERN + "))|" + + Pattern.compile("(([0-9a-z_.-]+)\\.(" + ALL_TLDS + "))|" + "(" + IP_PATTERN + ")"); // private static final Pattern AUTHORITY_REGEX_SIMPLE = @@ -244,7 +242,48 @@ return url.substring(pathIdx); } } + + /** + * Attempt to strip default ports out of URL strings. 
+ * @param url the original URL possibly including a port + * @return the URL sans port, if the scheme was recognized and the default + * port was supplied, otherwise, the original URL. + */ + public static String stripDefaultPortFromUrl(String url) { + String scheme = urlToScheme(url); + if(scheme == null) { + return url; + } + int defaultPort = schemeToDefaultPort(scheme); + if(defaultPort == -1) { + return url; + } + String portStr = null; + // is there a slash after the scheme? + int slashIdx = url.indexOf('/', scheme.length()); + if(slashIdx == -1) { + portStr = String.format(":%d", defaultPort); + if(url.endsWith(portStr)) { + return url.substring(0,url.length() - portStr.length()); + } + } + portStr = String.format(":%d/", defaultPort); + int idx = url.indexOf(portStr); + if(idx == -1) { + return url; + } + // if that occurred before the first / (after the scheme) then strip it: + if(slashIdx < idx) { + return url; + } + // we want to strip out the portStr: + StringBuilder sb = new StringBuilder(url.length()); + sb.append(url.substring(0,idx)); + sb.append(url.substring(idx + (portStr.length()-1))); + return sb.toString(); + } + /** * Attempt to extract the hostname component of an absolute URL argument. 
* @param url the url String from which to extract the hostname Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-05-07 23:11:24 UTC (rev 3089) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-05-13 18:34:37 UTC (rev 3090) @@ -161,4 +161,35 @@ } + + public void testStripDefaultPort() { + assertSDP("http://foo.com/","http://foo.com/"); + assertSDP("http://foo.com","http://foo.com"); + assertSDP("http://foo.com","http://foo.com:80"); + assertSDP("foo.com:80/","foo.com:80/"); + assertSDP("http://foo.com:8080/","http://foo.com:8080/"); + assertSDP("http://foo.com:8081/","http://foo.com:8081/"); + assertSDP("https://foo.com:8081/","https://foo.com:8081/"); + assertSDP("https://foo.com/","https://foo.com:443/"); + assertSDP("https://foo.com","https://foo.com:443"); + assertSDP("ftp://foo.com/","ftp://foo.com/"); + assertSDP("ftp://foo.com","ftp://foo.com"); + assertSDP("ftp://foo.com:1234","ftp://foo.com:1234"); + assertSDP("ftp://foo.com","ftp://foo.com:21"); + assertSDP("ftp://foo.com/","ftp://foo.com:21/"); + assertSDP("ftp://foo.com/bla","ftp://foo.com:21/bla"); + assertSDP("s3://foo.com/","s3://foo.com/"); + assertSDP("s3://foo.com/bar","s3://foo.com/bar"); + assertSDP("s3://foo.com:80/bar","s3://foo.com:80/bar"); + assertSDP("http://b...@fo.../bar","http://b...@fo...:80/bar"); + assertSDP("http://b...@fo.../bar","http://b...@fo.../bar"); + assertSDP("http://b:80...@fo.../bar","http://b:80...@fo.../bar"); + assertSDP("http://b:80...@fo.../bar","http://b:80...@fo...:80/bar"); + assertSDP("http://b:80...@fo...:8080/ba","http://b:80...@fo...:8080/ba"); + } + private void assertSDP(String want, String orig) { + String got = 
UrlOperations.stripDefaultPortFromUrl(orig); + assertEquals(want,got); + } + } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-05-18 22:56:29
|
Revision: 3106 http://archive-access.svn.sourceforge.net/archive-access/?rev=3106&view=rev Author: bradtofel Date: 2010-05-18 22:56:23 +0000 (Tue, 18 May 2010) Log Message: ----------- INITIAL REV: simple class for manipulating bits within a byte array. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/BitArray.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/BitArrayTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/BitArray.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/BitArray.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/BitArray.java 2010-05-18 22:56:23 UTC (rev 3106) @@ -0,0 +1,109 @@ +/* BitArray + * + * $Id$: + * + * Created on Apr 27, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.util; + +/** + * @author brad + * + */ +public class BitArray { + private static final int MASKS[] = { + 0x01, + 0x02, + 0x04, + 0x08, + 0x10, + 0x20, + 0x40, + 0x80 + }; + private static final int MASKSR[] = { + ~0x01, + ~0x02, + ~0x04, + ~0x08, + ~0x10, + ~0x20, + ~0x40, + ~0x80 + }; + private byte array[] = null; + + /** + * Construct a new BitArray holding at least n bits + * @param n number of bits to hold + */ + public BitArray(int n) { + int bytes = n / 8; + int bits = n % 8; + if(bits > 0) { + bytes++; + } + this.array = new byte[bytes]; + } + /** + * Construct a new BitArray using argument as initial values. + * @param array byte array of initial values + */ + public BitArray(byte array[]) { + this.array = array; + } + /** + * @return the byte array backing this bit array. 
+ */ + public byte[] getBytes() { + return array; + } + /** + * @param i index of bit to test + * @return true if the i'th bit is set, false otherwise + */ + public boolean get(int i) { + int idx = i / 8; + if(idx >= array.length) { + throw new IndexOutOfBoundsException(); + } + int bit = 7 - (i % 8); + return ((array[idx] & MASKS[bit]) == MASKS[bit]); + } + /** + * set the i'th bit to 1 or 0 + * @param i bit number to set + * @param value if true, the bit is set to 1, otherwise it is set to 0 + */ + public void set(int i, boolean value) { + int idx = i / 8; + if(idx >= array.length) { + throw new IndexOutOfBoundsException(); + } + int bit = 7 - (i % 8); + if(value) { + array[idx] = (byte) (array[idx] | MASKS[bit]); + } else { + array[idx] = (byte) (array[idx] & MASKSR[bit]); + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/BitArray.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/BitArrayTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/BitArrayTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/BitArrayTest.java 2010-05-18 22:56:23 UTC (rev 3106) @@ -0,0 +1,84 @@ +/* BitArrayTest + * + * $Id$: + * + * Created on May 14, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.util; + +import junit.framework.TestCase; + +/** + * @author brad + * + */ +public class BitArrayTest extends TestCase { + + /** + * Test method for {@link org.archive.wayback.util.BitArray#get(int)}. + */ + public void testGet() { + byte bytes[] = "Here is some data!".getBytes(); + byte bytes2[] = "Here is some data!".getBytes(); + int bits = bytes.length * 8; + sun.security.util.BitArray sba = + new sun.security.util.BitArray(bits, bytes); + org.archive.wayback.util.BitArray wba = + new org.archive.wayback.util.BitArray(bytes2); + for(int i = 0; i < bits; i++) { + boolean want = sba.get(i); + boolean got = wba.get(i); + if(want != got) { + got = wba.get(i); + } + assertEquals(want,got); + } + } + + /** + * Test method for {@link org.archive.wayback.util.BitArray#set(int, boolean)}. 
+ */ + public void testSet() { + byte bytes[] = "Here is some data!".getBytes(); + byte bytes2[] = "Here is some data!".getBytes(); + int bits = bytes.length * 8; + sun.security.util.BitArray sba = + new sun.security.util.BitArray(bits, bytes); + org.archive.wayback.util.BitArray wba = + new org.archive.wayback.util.BitArray(bytes2); + for(int i = 0; i < bits; i++) { + boolean want = sba.get(i); + boolean got = wba.get(i); + boolean not = !want; + assertTrue(ByteOp.cmp(sba.toByteArray(), wba.getBytes())); + sba.set(i,not); + wba.set(i,not); + assertTrue(ByteOp.cmp(sba.toByteArray(), wba.getBytes())); + assertEquals(not,wba.get(i)); + sba.set(i,got); + wba.set(i,got); + assertEquals(sba.get(i),wba.get(i)); + assertTrue(ByteOp.cmp(sba.toByteArray(), wba.getBytes())); + } + + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/BitArrayTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-10 21:24:53
|
Revision: 3224 http://archive-access.svn.sourceforge.net/archive-access/?rev=3224&view=rev Author: bradtofel Date: 2010-08-10 21:24:46 +0000 (Tue, 10 Aug 2010) Log Message: ----------- BUGFIX(ARI-2473): fixed parse bug extracting userinfo from an url. TWEAK: replaced "dns:" with reference to existing static variable Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-08-10 20:41:33 UTC (rev 3223) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2010-08-10 21:24:46 UTC (rev 3224) @@ -131,7 +131,7 @@ private static final Pattern HOST_REGEX_SIMPLE = Pattern.compile("(?:[0-9a-z_.:-]+@)?([0-9a-z_.-]++)"); private static final Pattern USERINFO_REGEX_SIMPLE = - Pattern.compile("([0-9a-z_.:-]+)(?:@[0-9a-z_.-]++)"); + Pattern.compile("^([0-9a-z_.:-]+)(?:@[0-9a-z_.-]++)"); /** * Tests if the String argument looks like it could be a legitimate @@ -292,8 +292,8 @@ */ public static String urlToHost(String url) { String lcUrl = url.toLowerCase(); - if(lcUrl.startsWith("dns:")) { - return lcUrl.substring(4); + if(lcUrl.startsWith(DNS_SCHEME)) { + return lcUrl.substring(DNS_SCHEME.length()); } for(String scheme : ALL_SCHEMES) { if(lcUrl.startsWith(scheme)) { @@ -318,7 +318,7 @@ */ public static String urlToUserInfo(String url) { String lcUrl = url.toLowerCase(); - 
if(lcUrl.startsWith("dns:")) { + if(lcUrl.startsWith(DNS_SCHEME)) { return null; } for(String scheme : ALL_SCHEMES) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2010-08-10 20:41:33 UTC (rev 3223) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/AggressiveUrlCanonicalizerTest.java 2010-08-10 21:24:46 UTC (rev 3224) @@ -181,6 +181,10 @@ "http://legislature.mi.gov/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules", "legislature.mi.gov/mileg.aspx?page=sessionschedules"); + // '@' in path: + checkCanonicalization( + "http://www.flickr.com/photos/36050182@N05/", + "flickr.com/photos/36050182@n05/"); Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-08-10 20:41:33 UTC (rev 3223) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2010-08-10 21:24:46 UTC (rev 3224) @@ -91,7 +91,9 @@ assertEquals(null,UrlOperations.urlToUserInfo("http://foo.com:80/")); assertEquals(null,UrlOperations.urlToUserInfo("http://foo.com:80")); assertEquals(null,UrlOperations.urlToUserInfo("http://www.foo.com:80\\")); + assertEquals(null,UrlOperations.urlToUserInfo("http://www.flickr.com/photos/36050182@N05/")); + assertEquals("user",UrlOperations.urlToUserInfo("http://us...@fo...")); assertEquals("user",UrlOperations.urlToUserInfo("http://us...@ww...")); 
assertEquals("user:pass",UrlOperations.urlToUserInfo("http://user:pa...@ww...")); @@ -190,6 +192,8 @@ assertSDP("http://b:80...@fo.../bar","http://b:80...@fo.../bar"); assertSDP("http://b:80...@fo.../bar","http://b:80...@fo...:80/bar"); assertSDP("http://b:80...@fo...:8080/ba","http://b:80...@fo...:8080/ba"); + assertSDP("http://www.flickr.com/photos/36050182@N05/","http://www.flickr.com/photos/36050182@N05/"); + } private void assertSDP(String want, String orig) { String got = UrlOperations.stripDefaultPortFromUrl(orig); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-17 00:06:09
|
Revision: 3231 http://archive-access.svn.sourceforge.net/archive-access/?rev=3231&view=rev Author: bradtofel Date: 2010-08-17 00:06:02 +0000 (Tue, 17 Aug 2010) Log Message: ----------- BUGFIX: (ARI-2509) - now rewriting absolute URLs in javascript strings, with escaped '/'s. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/JSStringTransformer.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/JSStringTransformer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/JSStringTransformer.java 2010-08-16 23:00:36 UTC (rev 3230) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/html/transformer/JSStringTransformer.java 2010-08-17 00:06:02 UTC (rev 3231) @@ -38,7 +38,7 @@ */ public class JSStringTransformer implements StringTransformer { private final static Pattern httpPattern = Pattern - .compile("(http://[A-Za-z0-9:_@.-]+)"); + .compile("(http:\\\\?/\\\\?/[A-Za-z0-9:_@.-]+)"); public String transform(ReplayParseContext context, String input) { Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java 2010-08-16 23:00:36 UTC (rev 3230) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/replay/html/transformer/JSStringTransformerTest.java 2010-08-17 00:06:02 UTC (rev 
3231) @@ -57,8 +57,15 @@ jst.transform(rc, input); assertEquals(1,rc.got.size()); assertEquals("http://www.gavelgrab.org",rc.got.get(0)); + + input = "onloadRegister(function (){window.location.href=\"http:\\/\\/www.facebook.com\\/barrettforwisconsin?v=info\";});"; + rc = new RecordingReplayParseContext(null, new URL("http://foo.com/"), null); + jst.transform(rc, input); + assertEquals(1,rc.got.size()); + assertEquals("http:\\/\\/www.facebook.com",rc.got.get(0)); } + public class RecordingReplayParseContext extends ReplayParseContext { ArrayList<String> got = null; /** This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-12-31 00:19:07
|
Revision: 3356 http://archive-access.svn.sourceforge.net/archive-access/?rev=3356&view=rev Author: bradtofel Date: 2010-12-31 00:19:01 +0000 (Fri, 31 Dec 2010) Log Message: ----------- MOVED tests (that don't work now..) to /src/main/test/ Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/ziplines/ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java 2010-12-30 19:54:31 UTC (rev 3355) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java 2010-12-31 00:19:01 UTC (rev 3356) @@ -1,79 +0,0 @@ -/* - * This file is part of the Wayback archival access software - * (http://archive-access.sourceforge.net/projects/wayback/). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.archive.wayback.resourceindex.ziplines; - -import java.io.IOException; -import java.util.HashMap; - -import org.archive.wayback.exception.ResourceIndexNotAvailableException; -import org.archive.wayback.util.CloseableIterator; -import org.archive.wayback.util.flatfile.FlatFile; - -import junit.framework.TestCase; - -/** - * @author brad - * - */ -public class ZiplinedBlockStringSequenceTest extends TestCase { - private String indexPath = "/home/brad/os-cdx/CDX-201002-clean/ALL.count.summary"; - private String mapPath = "/home/brad/os-cdx/CDX-201002-clean/ALL.loc-workstation"; - - private ZiplinedBlockStringSequence getSequence() throws IOException { - HashMap<String, String> chunkMap = new HashMap<String, String>(); - FlatFile ff = new FlatFile(mapPath); - CloseableIterator<String> lines = ff.getSequentialIterator(); - while(lines.hasNext()) { - String line = lines.next(); - String[] parts = line.split("\\s"); - if(parts.length != 2) { - throw new IOException("Bad line(" + line +") in (" + - mapPath + ")"); - } - chunkMap.put(parts[0],parts[1]); - } - lines.close(); - FlatFile chunkIndex = new FlatFile(indexPath); - return new ZiplinedBlockStringSequence(chunkIndex, chunkMap); - } - /** - * Test method for {@link org.archive.wayback.resourceindex.ziplines.ZiplinedBlockStringSequence#getIterator(java.lang.String, long)}. 
- * @throws IOException - * @throws ResourceIndexNotAvailableException - */ - public void testGetIteratorStringLong() throws IOException, ResourceIndexNotAvailableException { - ZiplinedBlockStringSequence seq = getSequence(); - StringPrefixIterator itr = seq.getIterator("yahoo.com/", 1000000); - System.out.format("Total Matches %d\n",itr.getTotalMatches()); - for(int i = 0; i < 10; i++) { - if(itr.hasNext()) { - System.out.format("Line(%d): %s\n",i,itr.next()); - } - } - } - - /** - * Test method for {@link org.archive.wayback.resourceindex.ziplines.ZiplinedBlockStringSequence#getIterator(java.lang.String)}. - */ - public void testGetIteratorString() { -// fail("Not yet implemented"); - } - -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java 2010-12-30 19:54:31 UTC (rev 3355) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java 2010-12-31 00:19:01 UTC (rev 3356) @@ -1,58 +0,0 @@ -/* - * This file is part of the Wayback archival access software - * (http://archive-access.sourceforge.net/projects/wayback/). - * - * Licensed to the Internet Archive (IA) by one or more individual - * contributors. - * - * The IA licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.archive.wayback.resourceindex.ziplines; - -import java.util.Iterator; - -import org.archive.wayback.resourceindex.cdx.format.CDXFormat; -import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; - -import junit.framework.TestCase; - -/** - * @author brad - * - */ -public class ZiplinesSearchResultSourceTest extends TestCase { - - /** - * Test method for {@link org.archive.wayback.resourceindex.ziplines.ZiplinesSearchResultSource#getPrefixIterator(java.lang.String)}. - * @throws CDXFormatException - */ - public void testGetPrefixIterator() throws Exception { - CDXFormat format = new CDXFormat(" CDX N b a m s k r M V g"); - ZiplinesSearchResultSource zsrs = new ZiplinesSearchResultSource(format); -// zsrs.setChunkIndexPath("/home/brad/zipline-test/part-00005-frag.cdx.zlm"); -// zsrs.setChunkMapPath("/home/brad/zipline-test/manifest.txt"); - zsrs.setChunkIndexPath("/home/brad/ALL.summary"); - zsrs.setChunkMapPath("/home/brad/ALL.loc"); - zsrs.init(); - Iterator<String> i = zsrs.getStringPrefixIterator("krunch.com/ "); - int max = 100; - int done = 0; - while(i.hasNext()) { - System.out.println(i.next()); - if(done++ > max) { - break; - } - } - } - -} Copied: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java (from rev 3324, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java) 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java 2010-12-31 00:19:01 UTC (rev 3356) @@ -0,0 +1,79 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; +import java.util.HashMap; + +import org.archive.wayback.exception.ResourceIndexNotAvailableException; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.FlatFile; + +import junit.framework.TestCase; + +/** + * @author brad + * + */ +public class ZiplinedBlockStringSequenceTest extends TestCase { + private String indexPath = "/home/brad/os-cdx/CDX-201002-clean/ALL.count.summary"; + private String mapPath = "/home/brad/os-cdx/CDX-201002-clean/ALL.loc-workstation"; + +// private ZiplinedBlockStringSequence getSequence() throws IOException { +// HashMap<String, String> chunkMap = new HashMap<String, String>(); +// FlatFile ff = new FlatFile(mapPath); +// CloseableIterator<String> lines = ff.getSequentialIterator(); +// while(lines.hasNext()) { +// String line = lines.next(); +// String[] parts = line.split("\\s"); +// if(parts.length != 2) { +// throw new IOException("Bad line(" + line +") in (" + +// mapPath + ")"); +// } +// chunkMap.put(parts[0],parts[1]); +// } +// lines.close(); +// FlatFile chunkIndex = new FlatFile(indexPath); +// return new ZiplinedBlockStringSequence(chunkIndex, chunkMap); +// } + /** + * Test method for {@link org.archive.wayback.resourceindex.ziplines.ZiplinedBlockStringSequence#getIterator(java.lang.String, long)}. 
+ * @throws IOException + * @throws ResourceIndexNotAvailableException + */ + public void testGetIteratorStringLong() throws IOException, ResourceIndexNotAvailableException { +// ZiplinedBlockStringSequence seq = getSequence(); +// StringPrefixIterator itr = seq.getIterator("yahoo.com/", 1000000); +// System.out.format("Total Matches %d\n",itr.getTotalMatches()); +// for(int i = 0; i < 10; i++) { +// if(itr.hasNext()) { +// System.out.format("Line(%d): %s\n",i,itr.next()); +// } +// } + } + + /** + * Test method for {@link org.archive.wayback.resourceindex.ziplines.ZiplinedBlockStringSequence#getIterator(java.lang.String)}. + */ + public void testGetIteratorString() { +// fail("Not yet implemented"); + } + +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java (from rev 3324, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java 2010-12-31 00:19:01 UTC (rev 3356) @@ -0,0 +1,62 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.resourceindex.ziplines; + +import java.util.Iterator; + + +import org.archive.wayback.resourceindex.cdx.format.CDXFormat; +import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; + +import junit.framework.TestCase; + +/** + * @author brad + * + */ +public class ZiplinesSearchResultSourceTest extends TestCase { + +// /** +// * Test method for {@link org.archive.wayback.resourceindex.ziplines.ZiplinesSearchResultSource#getPrefixIterator(java.lang.String)}. +// * @throws CDXFormatException +// */ +// public void testGetPrefixIterator() throws Exception { +// CDXFormat format = new CDXFormat(" CDX N b a m s k r M V g"); +// ZiplinesSearchResultSource zsrs = new ZiplinesSearchResultSource(format); +//// zsrs.setChunkIndexPath("/home/brad/zipline-test/part-00005-frag.cdx.zlm"); +//// zsrs.setChunkMapPath("/home/brad/zipline-test/manifest.txt"); +// zsrs.setChunkIndexPath("/home/brad/ALL.summary"); +// zsrs.setChunkMapPath("/home/brad/ALL.loc"); +// zsrs.init(); +// Iterator<String> i = zsrs.getStringPrefixIterator("krunch.com/ "); +// int max = 100; +// int done = 0; +// while(i.hasNext()) { +// System.out.println(i.next()); +// if(done++ > max) { +// break; +// } +// } +// } + + public void testEmpty() throws Exception { + + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-02-06 14:34:49
|
Revision: 3391 http://archive-access.svn.sourceforge.net/archive-access/?rev=3391&view=rev Author: bradtofel Date: 2011-02-06 14:34:43 +0000 (Sun, 06 Feb 2011) Log Message: ----------- Added fixupHTTPUrlWithOneSlash() Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2011-02-06 14:33:44 UTC (rev 3390) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2011-02-06 14:34:43 UTC (rev 3391) @@ -279,6 +279,16 @@ return sb.toString(); } + public static String fixupHTTPUrlWithOneSlash(String orig) { + if(orig.startsWith("http:/") && ! orig.startsWith(HTTP_SCHEME)) { + // very likely the IE "you must have meant 1 slash, not 2 bug: + StringBuilder sb = new StringBuilder(orig.length()+1); + sb.append(HTTP_SCHEME); + return sb.append(orig.substring(6)).toString(); + } + return orig; + } + /** * Attempt to extract the hostname component of an absolute URL argument. 
* @param url the url String from which to extract the hostname Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2011-02-06 14:33:44 UTC (rev 3390) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2011-02-06 14:34:43 UTC (rev 3391) @@ -19,6 +19,9 @@ */ package org.archive.wayback.util.url; +import java.net.MalformedURLException; +import java.net.URL; + import junit.framework.TestCase; /** @@ -29,6 +32,22 @@ */ public class UrlOperationsTest extends TestCase { + public void testOneSlashUrl() throws MalformedURLException { + assertEquals("http://one.com/", + UrlOperations.fixupHTTPUrlWithOneSlash("http://one.com/")); + assertEquals("http://one.com", + UrlOperations.fixupHTTPUrlWithOneSlash("http://one.com")); + assertEquals("http://http://one.com", + UrlOperations.fixupHTTPUrlWithOneSlash("http://http://one.com")); + assertEquals("http://one.com", + UrlOperations.fixupHTTPUrlWithOneSlash("http:/one.com")); + assertEquals("http://one.com/", + UrlOperations.fixupHTTPUrlWithOneSlash("http:/one.com/")); + assertEquals("http://one.com/foo.html", + UrlOperations.fixupHTTPUrlWithOneSlash("http:/one.com/foo.html")); + + } + public void testIsAuthority() { checkAuthority("foo.com",true); checkAuthority("foo.con",false); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-05-25 01:08:44
|
Revision: 3442 http://archive-access.svn.sourceforge.net/archive-access/?rev=3442&view=rev Author: bradtofel Date: 2011-05-25 01:08:38 +0000 (Wed, 25 May 2011) Log Message: ----------- BUGFIX: urlToPath was not handling ports correctly Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2011-05-25 01:04:11 UTC (rev 3441) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/url/UrlOperations.java 2011-05-25 01:08:38 UTC (rev 3442) @@ -220,25 +220,28 @@ * @return the path component of the URL, or "" if it contains no path. */ public static String getURLPath(String url) { - int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); + url = stripURLScheme(url); int pathIdx = url.indexOf(UrlOperations.PATH_START); - if(portIdx == -1 && pathIdx == -1) { - return ""; - } - if(portIdx == -1) { - return url.substring(pathIdx); - } if(pathIdx == -1) { - return url.substring(portIdx); + return "/"; } - if(pathIdx > portIdx) { - return url.substring(portIdx); - } else { - return url.substring(pathIdx); - } + return url.substring(pathIdx); } - /** + * Attempt to extract the path component of a url String argument. + * @param url the URL which may contain a path, sans scheme. + * @return the path component of the URL, or "" if it contains no path. 
+ */ + public static String stripURLScheme(String url) { + String lcUrl = url.toLowerCase(); + for(String scheme : ALL_SCHEMES) { + if(lcUrl.startsWith(scheme)) { + return url.substring(scheme.length()); + } + } + return url; + } + /** * Attempt to strip default ports out of URL strings. * @param url the original URL possibly including a port * @return the URL sans port, if the scheme was recognized and the default @@ -279,6 +282,11 @@ return sb.toString(); } + /** + * @param orig String containing a URL, possibly beginning with "http:/". + * @return original string if orig begins with "http://", or a new String + * with the extra slash, if orig only had one slash. + */ public static String fixupHTTPUrlWithOneSlash(String orig) { if(orig.startsWith("http:/") && ! orig.startsWith(HTTP_SCHEME)) { // very likely the IE "you must have meant 1 slash, not 2 bug: Modified: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2011-05-25 01:04:11 UTC (rev 3441) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/url/UrlOperationsTest.java 2011-05-25 01:08:38 UTC (rev 3442) @@ -205,7 +205,29 @@ } + public void testUrlPath() { + assertEquals("/",UrlOperations.getURLPath("http://foo.com")); + assertEquals("/",UrlOperations.getURLPath("http://foo.com/")); + assertEquals("/",UrlOperations.getURLPath("http://foo.com:80/")); + assertEquals("/blue",UrlOperations.getURLPath("http://foo.com:80/blue")); + assertEquals("/blue/red",UrlOperations.getURLPath("http://foo.com:80/blue/red")); + assertEquals("/blue/red:colon",UrlOperations.getURLPath("http://foo.com:80/blue/red:colon")); + assertEquals("/",UrlOperations.getURLPath("foo.com")); + assertEquals("/",UrlOperations.getURLPath("foo.com:80")); 
+ assertEquals("/",UrlOperations.getURLPath("foo.com:8080")); + assertEquals("/",UrlOperations.getURLPath("foo.com/")); + assertEquals("/",UrlOperations.getURLPath("foo.com:80/")); + assertEquals("/",UrlOperations.getURLPath("foo.com:8080/")); + assertEquals("/bar",UrlOperations.getURLPath("foo.com/bar")); + assertEquals("/bar",UrlOperations.getURLPath("foo.com:80/bar")); + assertEquals("/bar",UrlOperations.getURLPath("foo.com:8080/bar")); + + assertEquals("/bar/baz",UrlOperations.getURLPath("foo.com/bar/baz")); + assertEquals("/bar/baz",UrlOperations.getURLPath("foo.com:80/bar/baz")); + assertEquals("/bar/baz",UrlOperations.getURLPath("foo.com:8080/bar/baz")); + + } public void testStripDefaultPort() { assertSDP("http://foo.com/","http://foo.com/"); assertSDP("http://foo.com","http://foo.com"); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-11-16 22:09:12
|
Revision: 3557 http://archive-access.svn.sourceforge.net/archive-access/?rev=3557&view=rev Author: bradtofel Date: 2011-11-16 22:09:06 +0000 (Wed, 16 Nov 2011) Log Message: ----------- INITIAL REV: class which monitors a set of files, and indicates when a file in the group has changed Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/MonitoredFileSet.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/MonitoredFileSetTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/MonitoredFileSet.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/MonitoredFileSet.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/util/MonitoredFileSet.java 2011-11-16 22:09:06 UTC (rev 3557) @@ -0,0 +1,50 @@ +package org.archive.wayback.util; + +import java.io.File; +import java.util.Date; +import java.util.HashMap; +import java.util.List; + +public class MonitoredFileSet { + List<String> files; + + public MonitoredFileSet(List<String> files) { + this.files = files; + } + public boolean isChanged(FileState fileState) { + FileState currentFileState = getFileState(); + return currentFileState.isChanged(fileState); + } + public FileState getFileState() { + FileState fileState = new FileState(); + + for(String path : files) { + File file = new File(path); + if(file.isFile()) { + fileState.put(path, new Date(file.lastModified())); + } else { + fileState.put(path, null); + } + } + return fileState; + } + + public class FileState extends HashMap<String,Date> { + public boolean isChanged(FileState other) { + for(String path : keySet()) { + if(other.containsKey(path)) { + Date otherDate = other.get(path); + Date thisDate = get(path); + if((otherDate == null) && (thisDate == 
null)) { + // treat both missing as the same.. + continue; + } + if(!otherDate.equals(thisDate)) { + return true; + } + } + } + return false; + } + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/MonitoredFileSetTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/MonitoredFileSetTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/util/MonitoredFileSetTest.java 2011-11-16 22:09:06 UTC (rev 3557) @@ -0,0 +1,43 @@ +package org.archive.wayback.util; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; + +import junit.framework.TestCase; + +public class MonitoredFileSetTest extends TestCase { + + public void testIsChanged() throws IOException, InterruptedException { + File f1 = new File("/tmp/file-set-1.tmp"); + File f2 = new File("/tmp/file-set-2.tmp"); + writeFile(f1,"one"); + writeFile(f2,"two"); + ArrayList<String> l = new ArrayList<String>(); + l.add(f1.getAbsolutePath()); + l.add(f2.getAbsolutePath()); + + MonitoredFileSet fs = new MonitoredFileSet(l); + MonitoredFileSet.FileState s1 = fs.getFileState(); + MonitoredFileSet.FileState s2 = fs.getFileState(); + assertFalse(fs.isChanged(s1)); + assertFalse(fs.isChanged(s2)); + Thread.sleep(1001); + writeFile(f2,"two2"); + MonitoredFileSet.FileState s3 = fs.getFileState(); + assertTrue(fs.isChanged(s2)); + assertTrue(s3.isChanged(s2)); + Thread.sleep(1001); + assertTrue(fs.isChanged(s2)); + assertFalse(fs.isChanged(s3)); + } + private void writeFile(File f, String stuff) throws IOException { + if(f.exists()) { + f.delete(); + } + FileOutputStream fos = new FileOutputStream(f,false); + fos.write(stuff.getBytes()); + fos.close(); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open 
Source development site. |
From: <bra...@us...> - 2011-11-16 22:18:05
|
Revision: 3559 http://archive-access.svn.sourceforge.net/archive-access/?rev=3559&view=rev Author: bradtofel Date: 2011-11-16 22:17:57 +0000 (Wed, 16 Nov 2011) Log Message: ----------- INITIAL REV: not fully tested but much improved robots.txt handling. Uses copy of current H3 robots handling - allows + disallow, more robust parsing, cleaner separation of responsibility to clean up the code Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/FixedRobotsDirectives.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,11 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +public class FixedRobotsDirectives extends RobotsDirectives { + private boolean result; + public FixedRobotsDirectives(boolean result) { + this.result = 
result; + } + public boolean allows(String path) { + return result; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/HRobotExclusionFilter.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,164 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.charset.Charset; +import java.util.List; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.Resource; +import org.archive.wayback.exception.LiveDocumentNotAvailableException; +import org.archive.wayback.exception.LiveWebCacheUnavailableException; +import org.archive.wayback.exception.LiveWebTimeoutException; +import org.archive.wayback.liveweb.LiveWebCache; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.UrlOperations; +import org.archive.wayback.webapp.PerformanceLogger; + +public class HRobotExclusionFilter extends ExclusionFilter { + + private final static String ROBOT_SUFFIX = "/robots.txt"; + private final static Logger LOGGER = + Logger.getLogger(HRobotExclusionFilter.class.getName()); + + // TODO: this is not the right thing! 
+ private Charset cs = Charset.forName("UTF-8"); + + private RobotsDirectiveAggregation aggregation = null; + private LiveWebCache webCache = null; + + private String userAgent = null; + private boolean notifiedSeen = false; + private boolean notifiedPassed = false; + private static final FixedRobotsDirectives ALLOW_ROBOT_DIRECTIVE = + new FixedRobotsDirectives(true); + + /** + * Construct a new HRobotExclusionFilter that uses webCache to pull + * robots.txt documents. filtering is based on userAgent, and cached + * documents newer than maxCacheMS in the webCache are considered valid. + * + * @param webCache LiveWebCache from which documents can be retrieved + * @param userAgent String user agent to use for requests to the live web. + * @param maxCacheMS long number of milliseconds to cache documents in the + * LiveWebCache + */ + public HRobotExclusionFilter(LiveWebCache webCache, String userAgent, + long maxCacheMS) { + aggregation = new RobotsDirectiveAggregation(); + this.webCache = webCache; + this.userAgent = userAgent; + } + + private void updateAggregation(String host) + throws LiveWebCacheUnavailableException, + LiveWebTimeoutException, MalformedURLException, IOException { + + List<String> missing = aggregation.getMissingRobotUrls(host); + for(String robotUrl : missing) { + long start = System.currentTimeMillis(); + Resource resource; + try { + resource = webCache.getCachedResource(new URL(robotUrl), + 0,true); + if(resource.getStatusCode() != 200) { + LOGGER.info("ROBOT: Non200("+robotUrl+")"); + // consider it an allow: + aggregation.addDirectives(robotUrl, ALLOW_ROBOT_DIRECTIVE); + } else { + InputStreamReader isr = new InputStreamReader(resource, cs); + BufferedReader br = new BufferedReader(isr); + Robotstxt robotsTxt = new Robotstxt(br); + RobotsDirectives directives = robotsTxt.getDirectivesFor(userAgent); + aggregation.addDirectives(robotUrl, directives); + } + } catch (LiveDocumentNotAvailableException e) { + if(LOGGER.isLoggable(Level.INFO)) { + 
LOGGER.info("ROBOT: LiveDocumentNotAvailableException(" + + robotUrl + ")"); + } + // consider it an allow: + aggregation.addDirectives(robotUrl, ALLOW_ROBOT_DIRECTIVE); + } + long elapsed = System.currentTimeMillis() - start; + PerformanceLogger.noteElapsed("RobotRequest", elapsed, robotUrl); + } + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultFilter#filterSearchResult(org.archive.wayback.core.SearchResult) + */ + public int filterObject(CaptureSearchResult r) { + if(!notifiedSeen) { + if(filterGroup != null) { + filterGroup.setSawRobots(); + } + notifiedSeen = true; + } + String originalURL = r.getOriginalUrl(); + String path = UrlOperations.getURLPath(originalURL); + if(path.equals(ROBOT_SUFFIX)) { + if(!notifiedPassed) { + if(filterGroup != null) { + filterGroup.setPassedRobots(); + } + notifiedPassed = true; + } + return ObjectFilter.FILTER_INCLUDE; + } + String host = UrlOperations.urlToHost(originalURL); + boolean updated = false; + try { + updateAggregation(host); + if(!aggregation.isBlocked(path)) { + if(LOGGER.isLoggable(Level.INFO)) { + LOGGER.fine("ROBOT: BLOCKED(" + originalURL + ")"); + } + if(LOGGER.isLoggable(Level.FINE)) { + LOGGER.finer("ROBOT: ALLOWED(" + originalURL + ")"); + } + if(!notifiedPassed) { + if(filterGroup != null) { + filterGroup.setPassedRobots(); + } + notifiedPassed = true; + } + return ObjectFilter.FILTER_INCLUDE; + } + +// } catch (LiveDocumentNotAvailableException e) { + } catch (LiveWebCacheUnavailableException e) { + LOGGER.severe("ROBOT: LiveWebCacheUnavailableException(" + + originalURL + ")"); + filterGroup.setLiveWebGone(); + + } catch (LiveWebTimeoutException e) { + LOGGER.severe("ROBOT: LiveDocumentTimedOutException(" + + originalURL + ")"); + filterGroup.setRobotTimedOut(); + + } catch (MalformedURLException e) { + + LOGGER.warning("ROBOT: MalformedURLException(" + + originalURL + ")"); + + } catch (IOException e) { + e.printStackTrace(); + return ObjectFilter.FILTER_EXCLUDE; + } + + 
if(filterGroup.getRobotTimedOut() || filterGroup.getLiveWebGone()) { + return ObjectFilter.FILTER_ABORT; + } + if(LOGGER.isLoggable(Level.INFO)) { + LOGGER.fine("ROBOT: BLOCKED(" + originalURL + ")"); + } + return ObjectFilter.FILTER_EXCLUDE; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregation.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,111 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.logging.Logger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Class which acts as an aggregation of RobotsDirectives. + * + * If given a host String, will return a list of additional robot URLs that + * need to be added to the current aggregation. + * + * Allows a user to then add new RobotsDirectives for one or more robot URLs. + * + * Finally, allows the aggregation to be queried to see if any of the + * directives block a particular path. 
+ * + * + * @author brad + * + */ +public class RobotsDirectiveAggregation { + private final static Logger LOGGER = + Logger.getLogger(RobotsDirectiveAggregation.class.getName()); + + private final static String HTTP_PREFIX = "http://"; + private final static String ROBOT_SUFFIX = "/robots.txt"; + + private static String WWWN_REGEX = "^www[0-9]+\\."; + private final static Pattern WWWN_PATTERN = Pattern.compile(WWWN_REGEX); + + private HashMap<String,RobotsDirectives> cache = + new HashMap<String, RobotsDirectives>(); + + private StringBuilder sb = new StringBuilder(); + + private String hostToRobotUrlString(final String host) { + sb.setLength(0); + sb.append(HTTP_PREFIX).append(host).append(ROBOT_SUFFIX); + String robotUrl = sb.toString(); + LOGGER.fine("Adding robot URL:" + robotUrl); + return robotUrl; + } + /* + */ + /** + * @param resultHost + * @return a List of all robots.txt urls to attempt for this HOST: + * If HOST starts with "www.DOMAIN": + * [ + * http://HOST/robots.txt, + * http://DOMAIN/robots.txt + * ] + * If HOST starts with "www[0-9]+.DOMAIN": + * [ + * http://HOST/robots.txt, + * http://www.DOMAIN/robots.txt, + * http://DOMAIN/robots.txt + * ] + * Otherwise: + * [ + * http://HOST/robots.txt, + * http://www.HOST/robots.txt + * ] + */ + List<String> hostToRobotUrlStrings(final String resultHost) { + ArrayList<String> list = new ArrayList<String>(); + list.add(hostToRobotUrlString(resultHost)); + + if(resultHost.startsWith("www")) { + if(resultHost.startsWith("www.")) { + list.add(hostToRobotUrlString(resultHost.substring(4))); + } else { + Matcher m = WWWN_PATTERN.matcher(resultHost); + if(m.find()) { + String massagedHost = resultHost.substring(m.end()); + list.add(hostToRobotUrlString("www." + massagedHost)); + list.add(hostToRobotUrlString(massagedHost)); + } + } + } else { + list.add(hostToRobotUrlString("www." 
+ resultHost)); + } + return list; + } + + public List<String> getMissingRobotUrls(String host) { + ArrayList<String> missing = new ArrayList<String>(); + List<String> needed = hostToRobotUrlStrings(host); + for(String need : needed) { + if(!cache.containsKey(need)) { + missing.add(need); + } + } + return missing; + } + public void addDirectives(String url, RobotsDirectives directives) { + cache.put(url, directives); + } + public boolean isBlocked(String path) { + for(RobotsDirectives directives : cache.values()) { + if(!directives.allows(path)) { + return true; + } + } + return false; + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectives.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,75 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.Serializable; +import java.util.concurrent.ConcurrentSkipListSet; + +/** + * Represents the directives that apply to a user-agent (or set of + * user-agents) + */ +public class RobotsDirectives implements Serializable { + private static final long serialVersionUID = 5386542759286155383L; + + ConcurrentSkipListSet<String> disallows = new ConcurrentSkipListSet<String>(); + ConcurrentSkipListSet<String> allows = new ConcurrentSkipListSet<String>(); + float crawlDelay = -1; + + public boolean allows(String path) { + return !(longestPrefixLength(disallows, path) > longestPrefixLength(allows, path)); + } + + /** + * @param prefixSet + * @param str + * @return length of longest entry in {@code prefixSet} that prefixes {@code str}, or zero + * if no entry prefixes {@code str} + */ + protected int longestPrefixLength(ConcurrentSkipListSet<String> prefixSet, + String str) { + String possiblePrefix = prefixSet.floor(str); + if (possiblePrefix != null && str.startsWith(possiblePrefix)) { + return possiblePrefix.length(); + } else { + return 0; + } + } + + public void addDisallow(String path) { + if(path.length()==0) { + // ignore empty-string disallows + // (they really mean allow, when alone) + return; + } + disallows.add(path); + } + + public void addAllow(String path) { + allows.add(path); + } + + public void setCrawlDelay(float i) { + crawlDelay=i; + } + + public float getCrawlDelay() { + return crawlDelay; + } +} \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/robotstxt/Robotstxt.java 
2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,234 @@ +/* + * This file is part of the Heritrix web crawler (crawler.archive.org). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Serializable; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.commons.io.IOUtils; +import org.archive.io.ReadSource; + +/** + * Utility class for parsing and representing 'robots.txt' format + * directives, into a list of named user-agents and map from user-agents + * to RobotsDirectives. 
+ */ +public class Robotstxt implements Serializable { + static final long serialVersionUID = 7025386509301303890L; + private static final Logger logger = + Logger.getLogger(Robotstxt.class.getName()); + + // all user agents contained in this robots.txt + // in order of declaration + // TODO: consider discarding irrelevant entries + LinkedList<String> namedUserAgents = new LinkedList<String>(); + // map user-agents to directives + Map<String,RobotsDirectives> agentsToDirectives = + new HashMap<String,RobotsDirectives>(); + RobotsDirectives wildcardDirectives = null; + + boolean hasErrors = false; + + static RobotsDirectives NO_DIRECTIVES = new RobotsDirectives(); + /** empty, reusable instance for all sites providing no rules */ + public static Robotstxt NO_ROBOTS = new Robotstxt(); + + public Robotstxt() { + } + + public Robotstxt(BufferedReader reader) throws IOException { + initializeFromReader(reader); + } + + public Robotstxt(ReadSource customRobots) { + BufferedReader reader = new BufferedReader(customRobots.obtainReader()); + try { + initializeFromReader(reader); + } catch (IOException e) { + logger.log(Level.SEVERE, + "robots ReadSource problem: potential for inadvertent overcrawling", + e); + } finally { + IOUtils.closeQuietly(reader); + } + } + + protected void initializeFromReader(BufferedReader reader) throws IOException { + String read; + // current is the disallowed paths for the preceding User-Agent(s) + RobotsDirectives current = null; + // whether a non-'User-Agent' directive has been encountered + boolean hasDirectivesYet = false; + while (reader != null) { + do { + read = reader.readLine(); + // Skip comments & blanks + } while ((read != null) && ((read = read.trim()).startsWith("#") || + read.length() == 0)); + if (read == null) { + reader.close(); + reader = null; + } else { + // remove any html markup + read = read.replaceAll("<[^>]+>",""); + int commentIndex = read.indexOf("#"); + if (commentIndex > -1) { + // Strip trailing comment + read = 
read.substring(0, commentIndex); + } + read = read.trim(); + if (read.matches("(?i)^User-agent:.*")) { + String ua = read.substring(11).trim().toLowerCase(); + if (current == null || hasDirectivesYet ) { + // only create new rules-list if necessary + // otherwise share with previous user-agent + current = new RobotsDirectives(); + hasDirectivesYet = false; + } + if (ua.equals("*")) { + wildcardDirectives = current; + } else { + namedUserAgents.addLast(ua); + agentsToDirectives.put(ua, current); + } + continue; + } + if (read.matches("(?i)Disallow:.*")) { + if (current == null) { + // buggy robots.txt + hasErrors = true; + continue; + } + String path = read.substring(9).trim(); + // tolerate common error of ending path with '*' character + // (not allowed by original spec; redundant but harmless with + // Google's wildcarding extensions -- which we don't yet fully + // support). + if(path.endsWith("*")) { + path = path.substring(0,path.length()-1); + } + current.addDisallow(path); + hasDirectivesYet = true; + continue; + } + if (read.matches("(?i)Crawl-delay:.*")) { + if (current == null) { + // buggy robots.txt + hasErrors = true; + continue; + } + // consider a crawl-delay, even though we don't + // yet understand it, as sufficient to end a + // grouping of User-Agent lines + hasDirectivesYet = true; + String val = read.substring(12).trim(); + val = val.split("[^\\d\\.]+")[0]; + try { + current.setCrawlDelay(Float.parseFloat(val)); + } catch (NumberFormatException nfe) { + // ignore + } + continue; + } + if (read.matches("(?i)Allow:.*")) { + if (current == null) { + // buggy robots.txt + hasErrors = true; + continue; + } + String path = read.substring(6).trim(); + // tolerate common error of ending path with '*' character + // (not allowed by original spec; redundant but harmless with + // Google's wildcarding extensions -- which we don't yet fully + // support). 
+ if(path.endsWith("*")) { + path = path.substring(0,path.length()-1); + } + current.addAllow(path); + hasDirectivesYet = true; + continue; + } + // unknown line; do nothing for now + } + } + } + + /** + * Does this policy effectively allow everything? (No + * disallows or timing (crawl-delay) directives?) + * @return + */ + public boolean allowsAll() { + // TODO: refine so directives that are all empty are also + // recognized as allowing all + return agentsToDirectives.isEmpty(); + } + + public List<String> getNamedUserAgents() { + return namedUserAgents; + } + + /** + * Return the RobotsDirectives, if any, appropriate for the given User-Agent + * string. If useFallbacks is true, a wildcard ('*') directives or the default + * of NO_DIRECTIVES will be returned, as appropriate, if there is no better + * match. If useFallbacks is false, a null will be returned if no declared + * directives targeted the given User-Agent. + * + * @param ua String User-Agent to lookup + * @param useFallbacks if true, fall-back to wildcard directives or + * default allow as needed + * @return directives to use, or null if useFallbacks is false and no + * non-wildcard directives match the supplied User-Agent + */ + public RobotsDirectives getDirectivesFor(String ua, boolean useFallbacks) { + // find matching ua + for(String uaListed : namedUserAgents) { + if(ua.indexOf(uaListed)>-1) { + return agentsToDirectives.get(uaListed); + } + } + if(useFallbacks==false) { + return null; + } + if (wildcardDirectives!=null) { + return wildcardDirectives; + } + // no applicable user-agents, so empty directives + return NO_DIRECTIVES; + } + + /** + * Return directives to use for the given User-Agent, resorting to wildcard + * rules or the default no-directives if necessary. 
+ * + * @param userAgent String User-Agent to lookup + * @return directives to use + */ + public RobotsDirectives getDirectivesFor(String userAgent) { + return getDirectivesFor(userAgent, true); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/robotstxt/RobotsDirectiveAggregationTest.java 2011-11-16 22:17:57 UTC (rev 3559) @@ -0,0 +1,109 @@ +package org.archive.wayback.accesscontrol.robotstxt; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.commons.lang.StringEscapeUtils; + +import com.google.common.base.Strings; +import com.google.common.collect.Lists; + +import junit.framework.TestCase; + +public class RobotsDirectiveAggregationTest extends TestCase { + + private String[] mapRobotUrls(String[] in ) { + String res[] = new String[in.length]; + for(int i = 0; i < in.length; i++) { + res[i] = "http://" + in[i] + "/robots.txt"; + } + return res; + } + + + /** + * + */ + public void testHostToRobotUrlStrings() { + RobotsDirectiveAggregation f = new RobotsDirectiveAggregation(); + String test1[] = {"www.foo.com","foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www.foo.com"),mapRobotUrls(test1)); + + String test2[] = {"foo.com","www.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("foo.com"),mapRobotUrls(test2)); + + String test3[] = {"fool.foo.com","www.fool.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("fool.foo.com"),mapRobotUrls(test3)); + + String test4[] = {"www4.foo.com","www.foo.com","foo.com"}; + 
compareListTo(f.hostToRobotUrlStrings("www4.foo.com"),mapRobotUrls(test4)); + + String test5[] = {"www4w.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www4w.foo.com"),mapRobotUrls(test5)); + + String test6[] = {"www.www.foo.com","www.foo.com"}; + compareListTo(f.hostToRobotUrlStrings("www.www.foo.com"),mapRobotUrls(test6)); + } + private String strJoin(Iterable<String> i, char del) { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for(String s : i) { + if(first) { + first = false; + } else { + sb.append(del); + } + sb.append(s); + } + return sb.toString(); + } + private List<String> sortA(String[] a) { + Arrays.sort(a); + return Lists.newArrayList(a); + } + private List<String> sortL(List<String> a) { + String[] Empty = new String[0]; + String[] tmp; + tmp = a.toArray(Empty); + Arrays.sort(tmp); + return Lists.newArrayList(tmp); + } + private void compareListTo(List<String> list, String strings[]) { + + boolean match = list.size() == strings.length; + List<String> ls = sortL(list); + List<String> ss = sortA(strings); + if(match) { + for(int i = 0; i < strings.length; i++) { + if(!ls.get(i).equals(ss.get(i))) { + match = false; + break; + } + } + } + if(!match) { + String a1 = strJoin(ls,','); + String a2 = strJoin(ss,','); + String msg = String.format("ArrayCMP (%s) != (%s)",a1,a2); + assertTrue(msg,false); + } + } + + public void testInteraction() { + RobotsDirectiveAggregation agg = new RobotsDirectiveAggregation(); + String test1[] = {"http://foo.com/robots.txt","http://www.foo.com/robots.txt"}; + compareListTo(agg.getMissingRobotUrls("foo.com"),test1); + compareListTo(agg.getMissingRobotUrls("www.foo.com"),test1); + agg.addDirectives("http://foo.com/robots.txt", new FixedRobotsDirectives(true)); + String test2[] = {"http://www.foo.com/robots.txt"}; + compareListTo(agg.getMissingRobotUrls("foo.com"),test2); + assertFalse(agg.isBlocked("/foo")); + + agg.addDirectives("http://www.foo.com/robots.txt", new FixedRobotsDirectives(false)); + 
String test3[] = {}; + compareListTo(agg.getMissingRobotUrls("foo.com"),test3); + assertTrue(agg.isBlocked("/foo")); + + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-11-16 22:19:55
|
Revision: 3560 http://archive-access.svn.sourceforge.net/archive-access/?rev=3560&view=rev Author: bradtofel Date: 2011-11-16 22:19:49 +0000 (Wed, 16 Nov 2011) Log Message: ----------- INITIAL REV: drop in replacement for StaticMapExclusionFilter*, which is much more performant, and has better test coverage Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterFactory.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilter.java 2011-11-16 22:19:49 UTC (rev 3560) @@ -0,0 +1,86 @@ +package org.archive.wayback.accesscontrol.staticmap; + +import java.util.Map; +import java.util.TreeSet; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.URIException; +import org.archive.util.SURT; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; +import org.archive.wayback.surt.SURTTokenizer; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +public class StaticListExclusionFilter extends ExclusionFilter { + private static final Logger LOGGER = 
Logger.getLogger( + StaticMapExclusionFilter.class.getName()); + + private String lastChecked = null; + private boolean lastCheckedExcluded = false; + private boolean notifiedSeen = false; + private boolean notifiedPassed = false; + TreeSet<String> exclusions = null; + UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + /** + * @param map where each String key is a SURT that is blocked. + */ + public StaticListExclusionFilter(TreeSet<String> exclusions, UrlCanonicalizer canonicalizer) { + this.exclusions = exclusions; + this.canonicalizer = canonicalizer; + } + + protected boolean isExcluded(String surt) { + String possiblePrefix = exclusions.floor(surt); + return (possiblePrefix != null && surt.startsWith(possiblePrefix)); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultFilter#filterSearchResult(org.archive.wayback.core.SearchResult) + */ + public int filterObject(CaptureSearchResult r) { + if(!notifiedSeen) { + if(filterGroup != null) { + filterGroup.setSawAdministrative(); + } + notifiedSeen = true; + } + String surt; + try { + String url = canonicalizer.urlStringToKey(r.getOriginalUrl()); + surt = SURT.fromPlain(url); +// surt = SURTTokenizer.prefixKey(url); + } catch (URIException e) { + + //e.printStackTrace(); + return FILTER_EXCLUDE; + } + if(lastChecked != null) { + if(lastChecked.equals(surt)) { + if(lastCheckedExcluded) { + return ObjectFilter.FILTER_EXCLUDE; + } else { + // don't need to: already did last time... 
+ //filterGroup.setPassedAdministrative(); + return ObjectFilter.FILTER_INCLUDE; + } + } + } + lastChecked = surt; + lastCheckedExcluded = isExcluded(surt); + if(lastCheckedExcluded) { + return ObjectFilter.FILTER_EXCLUDE; + } else { + if(!notifiedPassed) { + if(filterGroup != null) { + filterGroup.setPassedAdministrative(); + } + notifiedPassed = true; + } + return ObjectFilter.FILTER_INCLUDE; + } + + } + +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterFactory.java 2011-11-16 22:19:49 UTC (rev 3560) @@ -0,0 +1,186 @@ +package org.archive.wayback.accesscontrol.staticmap; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.TreeSet; +import java.util.logging.Logger; + +import org.archive.util.SURT; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.accesscontrol.ExclusionFilterFactory; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; +import org.archive.wayback.surt.SURTTokenizer; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.FlatFile; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +public class StaticListExclusionFilterFactory implements ExclusionFilterFactory { + private static final Logger LOGGER = + Logger.getLogger(StaticMapExclusionFilterFactory.class.getName()); + + private int checkInterval = 0; + private TreeSet<String> excludes = null; + private File file = null; + long lastUpdated = 0; + UrlCanonicalizer 
canonicalizer = new AggressiveUrlCanonicalizer(); + + /** + * Thread object of update thread -- also is flag indicating if the thread + * has already been started -- static, and access to it is synchronized. + */ + private static Thread updateThread = null; + + /** + * load exclusion file and startup polling thread to check for updates + * @throws IOException if the exclusion file could not be read. + */ + public void init() throws IOException { + reloadFile(); + if(checkInterval > 0) { + startUpdateThread(); + } + } + + protected void reloadFile() throws IOException { + long currentMod = file.lastModified(); + if(currentMod == lastUpdated) { + if(currentMod == 0) { + LOGGER.severe("No exclude file at " + file.getAbsolutePath()); + } + return; + } + LOGGER.info("Reloading exclusion file " + file.getAbsolutePath()); + try { + excludes = loadFile(file.getAbsolutePath()); + lastUpdated = currentMod; + LOGGER.info("Reload " + file.getAbsolutePath() + " OK"); + } catch(IOException e) { + lastUpdated = -1; + excludes = null; + e.printStackTrace(); + LOGGER.severe("Reload " + file.getAbsolutePath() + " FAILED:" + + e.getLocalizedMessage()); + } + } + protected TreeSet<String> loadFile(String path) throws IOException { + TreeSet<String> excludes = new TreeSet<String>(); + FlatFile ff = new FlatFile(path); + CloseableIterator<String> itr = ff.getSequentialIterator(); + while(itr.hasNext()) { + String line = (String) itr.next(); + line = line.trim(); + if(line.length() == 0) { + continue; + } + line = canonicalizer.urlStringToKey(line); + String surt = line.startsWith("(") ? line : SURT.fromPlain(line); +// SURTTokenizer.prefixKey(line); + LOGGER.fine("EXCLUSION-MAP: adding " + surt); + excludes.add(surt); + } + itr.close(); + return excludes; + } + + /** + * @return ObjectFilter which blocks CaptureSearchResults in the + * exclusion file. 
+ */ + public ExclusionFilter get() { + if(excludes == null) { + return null; + } + return new StaticListExclusionFilter(excludes, canonicalizer); + } + + private synchronized void startUpdateThread() { + if (updateThread != null) { + return; + } + updateThread = new CacheUpdaterThread(this,checkInterval); + updateThread.start(); + } + private synchronized void stopUpdateThread() { + if (updateThread == null) { + return; + } + updateThread.interrupt(); + } + + private class CacheUpdaterThread extends Thread { + /** + * object which merges CDX files with the BDBResourceIndex + */ + private StaticListExclusionFilterFactory service = null; + + private int runInterval; + + /** + * @param service ExclusionFactory which will be reloaded + * @param runInterval int number of seconds between reloads + */ + public CacheUpdaterThread(StaticListExclusionFilterFactory service, int runInterval) { + super("CacheUpdaterThread"); + super.setDaemon(true); + this.service = service; + this.runInterval = runInterval; + LOGGER.info("CacheUpdaterThread is alive."); + } + + public void run() { + int sleepInterval = runInterval; + while (true) { + try { + try { + service.reloadFile(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + Thread.sleep(sleepInterval * 1000); + } catch (InterruptedException e) { + e.printStackTrace(); + return; + } + } + } + } + + /** + * @return the checkInterval in seconds + */ + public int getCheckInterval() { + return checkInterval; + } + + /** + * @param checkInterval the checkInterval in seconds to set + */ + public void setCheckInterval(int checkInterval) { + this.checkInterval = checkInterval; + } + + /** + * @return the path + */ + public String getFile() { + return file.getAbsolutePath(); + } + + /** + * @param path the file to set + */ + public void setFile(String path) { + this.file = new File(path); + } + + /* (non-Javadoc) + * @see org.archive.wayback.accesscontrol.ExclusionFilterFactory#shutdown() + */ + 
public void shutdown() { + stopUpdateThread(); + } + +} Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterTest.java 2011-11-16 22:19:49 UTC (rev 3560) @@ -0,0 +1,164 @@ +package org.archive.wayback.accesscontrol.staticmap; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.TreeSet; + +import org.archive.util.SURT; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +import junit.framework.TestCase; + +public class StaticListExclusionFilterTest extends TestCase { + File tmpFile = null; + StaticListExclusionFilterFactory factory = null; + UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + + protected void setUp() throws Exception { + super.setUp(); + factory = new StaticListExclusionFilterFactory(); + tmpFile = File.createTempFile("static-map", ".tmp"); +// Properties p = new Properties(); +// p.put("resourceindex.exclusionpath", tmpFile.getAbsolutePath()); +// factory.init(p); + } + + /* + * @see TestCase#tearDown() + */ + protected void tearDown() throws Exception { + super.tearDown(); + if(tmpFile != null && tmpFile.exists()) { + tmpFile.delete(); + } + } + + /** + * @throws Exception + */ + public void testRealWorld() throws Exception { + String bases[] = { "pho-c.co.jp/~clever", + "sf.net/pop/Roger", + "www.eva-stu.vn", + "mins.com.br/", + "24.ne.jp", + "24.ne.jp/~nekko"}; +// 
setTmpContents(bases); + + + ObjectFilter<CaptureSearchResult> filter = getFilter(bases); + assertFalse("unmassaged",isBlocked(filter,"24.ne.jp.idpnt.com/robots.txt")); + assertTrue("massage",isBlocked(filter,"http://24.ne.jp:80/")); + assertTrue("unmassaged",isBlocked(filter,"http://www.pho-c.co.jp/~clever")); + assertTrue("massage",isBlocked(filter,"http://24.ne.jp")); + + + assertTrue("unmassaged",isBlocked(filter,"http://www.pho-c.co.jp/~clever")); + assertTrue("massaged",isBlocked(filter,"http://pho-c.co.jp/~clever")); + assertTrue("trailing-slash",isBlocked(filter,"http://pho-c.co.jp/~clever/")); + assertTrue("subpath",isBlocked(filter,"http://pho-c.co.jp/~clever/foo.txt")); + + assertTrue("full-port",isBlocked(filter,"http://www.mins.com.br:80")); + assertTrue("tail-slash-port",isBlocked(filter,"http://www.mins.com.br:80/")); + assertTrue("full",isBlocked(filter,"http://www.mins.com.br")); + assertTrue("tail-slash",isBlocked(filter,"http://www.mins.com.br/")); + assertTrue("full-massage",isBlocked(filter,"http://mins.com.br")); + assertTrue("tail-slash-massage",isBlocked(filter,"http://mins.com.br/")); + assertTrue("massage",isBlocked(filter,"http://mins.com.br/foo.txt")); + assertTrue("subpath",isBlocked(filter,"http://www13.mins.com.br/~clever/foo.txt")); + + assertTrue("massage",isBlocked(filter,"24.ne.jp")); + assertTrue("full",isBlocked(filter,"http://www.mins.com.br")); + assertTrue("subpath",isBlocked(filter,"www.24.ne.jp")); + assertTrue("tail-slash-massage",isBlocked(filter,"http://mins.com.br/")); + assertTrue("subpath",isBlocked(filter,"http://www.24.ne.jp:80/")); + + + + + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger")); + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger/")); + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger//")); + assertFalse(isBlocked(filter,"http://sf.net/pop/")); + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger/2")); + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger/23")); + 
assertTrue(isBlocked(filter,"http://www.sf.net/pop/Roger")); + assertTrue(isBlocked(filter,"http://www1.sf.net/pop/Roger")); + assertTrue(isBlocked(filter,"http://www23.sf.net/pop/Roger")); + + assertTrue(isBlocked(filter,"http://www23.eva-stu.vn/")); + assertTrue(isBlocked(filter,"http://www23.eva-stu.vn")); + assertTrue(isBlocked(filter,"http://eva-stu.vn")); + assertTrue(isBlocked(filter,"http://www.eva-stu.vn/")); + assertTrue(isBlocked(filter,"http://eva-stu.vn/")); + assertTrue(isBlocked(filter,"http://www.eva-stu.vn/foo.txt")); + assertTrue(isBlocked(filter,"http://www2.eva-stu.vn/foo/bar.txt")); + assertTrue(isBlocked(filter,"http://eva-stu.vn/foo/bar.txt")); + + } + + + /** + * @throws Exception + */ + public void testBaseNoPrefix() throws Exception { + + String str = "http://peagreenboat.com/"; +// String str = "http://(com,peagreenboat"; + System.out.format("(%s) -> [%s]\n", str,SURT.prefixFromPlain(str)); + + + String bases[] = {"http://www.peagreenboat.com/", + "http://peagreenboat.com/"}; +// setTmpContents(bases); + ObjectFilter<CaptureSearchResult> filter = getFilter(bases); + assertTrue("unmassaged",isBlocked(filter,"http://www.peagreenboat.com")); + assertTrue("unmassaged",isBlocked(filter,"http://peagreenboat.com")); + assertFalse("other1",isBlocked(filter,"http://peagreenboatt.com")); + assertFalse("other2",isBlocked(filter,"http://peagreenboat.org")); + assertFalse("other3",isBlocked(filter,"http://www.peagreenboat.org")); + // there is a problem with the SURTTokenizer... deal with ports! 
+// assertFalse("other4",isBlocked(filter,"http://www.peagreenboat.com:8080")); + assertTrue("subpath",isBlocked(filter,"http://www.peagreenboat.com/foo")); + assertTrue("emptypath",isBlocked(filter,"http://www.peagreenboat.com/")); + } + + private boolean isBlocked(ObjectFilter<CaptureSearchResult> filter, String url) { + CaptureSearchResult result = new CaptureSearchResult(); + result.setOriginalUrl(url); + int filterResult = filter.filterObject(result); + if(filterResult == ObjectFilter.FILTER_EXCLUDE) { + return true; + } + return false; + } + + private ObjectFilter<CaptureSearchResult> getFilter(String lines[]) + throws IOException { + + setTmpContents(lines); + TreeSet<String> excludes = factory.loadFile(tmpFile.getAbsolutePath()); + return new StaticListExclusionFilter(excludes,canonicalizer); + } + + private void setTmpContents(String[] lines) throws IOException { + if(tmpFile != null && tmpFile.exists()) { + tmpFile.delete(); + } +// tmpFile = File.createTempFile("range-map","tmp"); + FileWriter writer = new FileWriter(tmpFile); + StringBuilder sb = new StringBuilder(); + for(int i=0; i<lines.length; i++) { + sb.append(lines[i]).append("\n"); + } + String contents = sb.toString(); + writer.write(contents); + writer.close(); + //factory.reloadFile(); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |