From: <bra...@us...> - 2011-11-16 22:19:55
|
Revision: 3560 http://archive-access.svn.sourceforge.net/archive-access/?rev=3560&view=rev Author: bradtofel Date: 2011-11-16 22:19:49 +0000 (Wed, 16 Nov 2011) Log Message: ----------- INITIAL REV: drop in replacement for StaticMapExclusionFilter*, which is much more performant, and has better test coverage Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterFactory.java trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilter.java 2011-11-16 22:19:49 UTC (rev 3560) @@ -0,0 +1,86 @@ +package org.archive.wayback.accesscontrol.staticmap; + +import java.util.Map; +import java.util.TreeSet; +import java.util.logging.Logger; + +import org.apache.commons.httpclient.URIException; +import org.archive.util.SURT; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; +import org.archive.wayback.surt.SURTTokenizer; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +public class StaticListExclusionFilter extends ExclusionFilter { + private static final Logger LOGGER = 
Logger.getLogger( + StaticMapExclusionFilter.class.getName()); /* NOTE(review): the logger is named after StaticMapExclusionFilter rather than this class; confirm that is intended. */ + + private String lastChecked = null; + private boolean lastCheckedExcluded = false; + private boolean notifiedSeen = false; + private boolean notifiedPassed = false; + TreeSet<String> exclusions = null; + UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + /** + * @param exclusions sorted set of SURT strings; a capture is blocked when isExcluded finds one of them to be a prefix of the SURT of the capture. + * @param canonicalizer turns the original URL of each capture into a key before that key is converted to SURT form. + */ + public StaticListExclusionFilter(TreeSet<String> exclusions, UrlCanonicalizer canonicalizer) { + this.exclusions = exclusions; + this.canonicalizer = canonicalizer; + } + + /** Prefix test against the exclusion set: fetches the greatest entry that is not greater than surt (TreeSet.floor) and reports whether surt starts with it. NOTE(review): only that single floor entry is examined, so a shorter blocked prefix can be missed when another entry that is not itself a prefix of surt sorts between the two; confirm the list cannot contain such combinations. */ protected boolean isExcluded(String surt) { + String possiblePrefix = exclusions.floor(surt); + return (possiblePrefix != null && surt.startsWith(possiblePrefix)); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultFilter#filterSearchResult(org.archive.wayback.core.SearchResult) + * + * Canonicalizes the original URL of the capture, converts it to SURT form and returns FILTER_EXCLUDE when isExcluded matches, FILTER_INCLUDE otherwise; a URIException also yields FILTER_EXCLUDE. The previous SURT and its verdict are remembered so that consecutive captures with the same SURT skip the set lookup. The filterGroup, when not null, is notified once that an administrative filter was seen and once that a capture passed. + */ + public int filterObject(CaptureSearchResult r) { + if(!notifiedSeen) { + if(filterGroup != null) { + filterGroup.setSawAdministrative(); + } + notifiedSeen = true; + } + String surt; + try { + String url = canonicalizer.urlStringToKey(r.getOriginalUrl()); + surt = SURT.fromPlain(url); +// surt = SURTTokenizer.prefixKey(url); + } catch (URIException e) { + + //e.printStackTrace(); + return FILTER_EXCLUDE; + } + if(lastChecked != null) { + if(lastChecked.equals(surt)) { + if(lastCheckedExcluded) { + return ObjectFilter.FILTER_EXCLUDE; + } else { + // don't need to: already did last time... 
+ //filterGroup.setPassedAdministrative(); + return ObjectFilter.FILTER_INCLUDE; + } + } + } + lastChecked = surt; + lastCheckedExcluded = isExcluded(surt); + if(lastCheckedExcluded) { + return ObjectFilter.FILTER_EXCLUDE; + } else { + if(!notifiedPassed) { + if(filterGroup != null) { + filterGroup.setPassedAdministrative(); + } + notifiedPassed = true; + } + return ObjectFilter.FILTER_INCLUDE; + } + + } + +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterFactory.java 2011-11-16 22:19:49 UTC (rev 3560) @@ -0,0 +1,186 @@ +package org.archive.wayback.accesscontrol.staticmap; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.TreeSet; +import java.util.logging.Logger; + +import org.archive.util.SURT; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.accesscontrol.ExclusionFilterFactory; +import org.archive.wayback.resourceindex.filters.ExclusionFilter; +import org.archive.wayback.surt.SURTTokenizer; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.FlatFile; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +public class StaticListExclusionFilterFactory implements ExclusionFilterFactory { + private static final Logger LOGGER = + Logger.getLogger(StaticMapExclusionFilterFactory.class.getName()); /* NOTE(review): the logger is named after StaticMapExclusionFilterFactory rather than this class; confirm that is intended. */ + + private int checkInterval = 0; + private TreeSet<String> excludes = null; + private File file = null; + long lastUpdated = 0; + UrlCanonicalizer 
canonicalizer = new AggressiveUrlCanonicalizer(); + + /** + * Thread object of update thread -- also is flag indicating if the thread + * has already been started -- static, and access to it is synchronized. + * NOTE(review): the methods that touch this field are instance-synchronized although the field is static; confirm that only one factory instance ever starts the thread. + */ + private static Thread updateThread = null; + + /** + * load exclusion file and, when checkInterval is greater than 0, startup polling thread to check for updates + * @throws IOException declared because reloadFile declares it. NOTE(review): reloadFile catches the IOException from loadFile itself, so a failed read appears to be logged rather than thrown from here; confirm. + */ + public void init() throws IOException { + reloadFile(); + if(checkInterval > 0) { + startUpdateThread(); + } + } + + /** Re-reads the exclusion file when its last-modified time differs from lastUpdated. An unchanged time returns immediately, logging a severe message when that time is 0. When loadFile throws IOException, lastUpdated is set to -1, excludes is set to null and the failure is logged; get() returns null while excludes is null. */ protected void reloadFile() throws IOException { + long currentMod = file.lastModified(); + if(currentMod == lastUpdated) { + if(currentMod == 0) { + LOGGER.severe("No exclude file at " + file.getAbsolutePath()); + } + return; + } + LOGGER.info("Reloading exclusion file " + file.getAbsolutePath()); + try { + excludes = loadFile(file.getAbsolutePath()); + lastUpdated = currentMod; + LOGGER.info("Reload " + file.getAbsolutePath() + " OK"); + } catch(IOException e) { + lastUpdated = -1; + excludes = null; + e.printStackTrace(); + LOGGER.severe("Reload " + file.getAbsolutePath() + " FAILED:" + + e.getLocalizedMessage()); + } + } + /** Builds the exclusion set from the file at path: each line is trimmed, empty lines are skipped, the rest is passed through canonicalizer.urlStringToKey and then, unless the result starts with an open parenthesis, through SURT.fromPlain. NOTE(review): itr.close() is not in a finally block, so it is skipped if an exception is thrown while reading. */ protected TreeSet<String> loadFile(String path) throws IOException { + TreeSet<String> excludes = new TreeSet<String>(); + FlatFile ff = new FlatFile(path); + CloseableIterator<String> itr = ff.getSequentialIterator(); + while(itr.hasNext()) { + String line = (String) itr.next(); + line = line.trim(); + if(line.length() == 0) { + continue; + } + line = canonicalizer.urlStringToKey(line); + String surt = line.startsWith("(") ? line : SURT.fromPlain(line); +// SURTTokenizer.prefixKey(line); + LOGGER.fine("EXCLUSION-MAP: adding " + surt); + excludes.add(surt); + } + itr.close(); + return excludes; + } + + /** + * @return ObjectFilter which blocks CaptureSearchResults in the + * exclusion file, or null while excludes is null. 
+ */ + public ExclusionFilter get() { + if(excludes == null) { + return null; + } + return new StaticListExclusionFilter(excludes, canonicalizer); + } + + private synchronized void startUpdateThread() { + if (updateThread != null) { + return; + } + updateThread = new CacheUpdaterThread(this,checkInterval); + updateThread.start(); + } + /** Interrupts the update thread if one was started. NOTE(review): updateThread is not reset to null afterwards, so a later startUpdateThread() call returns without starting a new thread; confirm that is intended. */ private synchronized void stopUpdateThread() { + if (updateThread == null) { + return; + } + updateThread.interrupt(); + } + + private class CacheUpdaterThread extends Thread { + /** + * factory whose reloadFile() is called on every pass of run() + */ + private StaticListExclusionFilterFactory service = null; + + private int runInterval; + + /** + * @param service ExclusionFactory which will be reloaded + * @param runInterval int number of seconds between reloads + */ + public CacheUpdaterThread(StaticListExclusionFilterFactory service, int runInterval) { + super("CacheUpdaterThread"); + super.setDaemon(true); + this.service = service; + this.runInterval = runInterval; + LOGGER.info("CacheUpdaterThread is alive."); + } + + public void run() { + int sleepInterval = runInterval; + while (true) { + try { + try { + service.reloadFile(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + Thread.sleep(sleepInterval * 1000); + } catch (InterruptedException e) { + e.printStackTrace(); + return; + } + } + } + } + + /** + * @return the checkInterval in seconds + */ + public int getCheckInterval() { + return checkInterval; + } + + /** + * @param checkInterval the checkInterval in seconds to set + */ + public void setCheckInterval(int checkInterval) { + this.checkInterval = checkInterval; + } + + /** + * @return the path + */ + public String getFile() { + return file.getAbsolutePath(); + } + + /** + * @param path the file to set + */ + public void setFile(String path) { + this.file = new File(path); + } + + /* (non-Javadoc) + * @see org.archive.wayback.accesscontrol.ExclusionFilterFactory#shutdown() + */ + 
public void shutdown() { + stopUpdateThread(); + } + +} Added: trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/test/java/org/archive/wayback/accesscontrol/staticmap/StaticListExclusionFilterTest.java 2011-11-16 22:19:49 UTC (rev 3560) @@ -0,0 +1,164 @@ +package org.archive.wayback.accesscontrol.staticmap; + +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.TreeSet; + +import org.archive.util.SURT; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; + +import junit.framework.TestCase; + +public class StaticListExclusionFilterTest extends TestCase { + File tmpFile = null; + StaticListExclusionFilterFactory factory = null; + UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + + protected void setUp() throws Exception { + super.setUp(); + factory = new StaticListExclusionFilterFactory(); + tmpFile = File.createTempFile("static-map", ".tmp"); +// Properties p = new Properties(); +// p.put("resourceindex.exclusionpath", tmpFile.getAbsolutePath()); +// factory.init(p); + } + + /* + * @see TestCase#tearDown() + */ + protected void tearDown() throws Exception { + super.tearDown(); + if(tmpFile != null && tmpFile.exists()) { + tmpFile.delete(); + } + } + + /** + * @throws Exception + */ + public void testRealWorld() throws Exception { + String bases[] = { "pho-c.co.jp/~clever", + "sf.net/pop/Roger", + "www.eva-stu.vn", + "mins.com.br/", + "24.ne.jp", + "24.ne.jp/~nekko"}; +// 
setTmpContents(bases); + + + ObjectFilter<CaptureSearchResult> filter = getFilter(bases); + assertFalse("unmassaged",isBlocked(filter,"24.ne.jp.idpnt.com/robots.txt")); + assertTrue("massage",isBlocked(filter,"http://24.ne.jp:80/")); + assertTrue("unmassaged",isBlocked(filter,"http://www.pho-c.co.jp/~clever")); + assertTrue("massage",isBlocked(filter,"http://24.ne.jp")); + + + assertTrue("unmassaged",isBlocked(filter,"http://www.pho-c.co.jp/~clever")); + assertTrue("massaged",isBlocked(filter,"http://pho-c.co.jp/~clever")); + assertTrue("trailing-slash",isBlocked(filter,"http://pho-c.co.jp/~clever/")); + assertTrue("subpath",isBlocked(filter,"http://pho-c.co.jp/~clever/foo.txt")); + + assertTrue("full-port",isBlocked(filter,"http://www.mins.com.br:80")); + assertTrue("tail-slash-port",isBlocked(filter,"http://www.mins.com.br:80/")); + assertTrue("full",isBlocked(filter,"http://www.mins.com.br")); + assertTrue("tail-slash",isBlocked(filter,"http://www.mins.com.br/")); + assertTrue("full-massage",isBlocked(filter,"http://mins.com.br")); + assertTrue("tail-slash-massage",isBlocked(filter,"http://mins.com.br/")); + assertTrue("massage",isBlocked(filter,"http://mins.com.br/foo.txt")); + assertTrue("subpath",isBlocked(filter,"http://www13.mins.com.br/~clever/foo.txt")); + + assertTrue("massage",isBlocked(filter,"24.ne.jp")); + assertTrue("full",isBlocked(filter,"http://www.mins.com.br")); + assertTrue("subpath",isBlocked(filter,"www.24.ne.jp")); + assertTrue("tail-slash-massage",isBlocked(filter,"http://mins.com.br/")); + assertTrue("subpath",isBlocked(filter,"http://www.24.ne.jp:80/")); + + + + + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger")); + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger/")); + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger//")); + assertFalse(isBlocked(filter,"http://sf.net/pop/")); + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger/2")); + assertTrue(isBlocked(filter,"http://sf.net/pop/Roger/23")); + 
assertTrue(isBlocked(filter,"http://www.sf.net/pop/Roger")); + assertTrue(isBlocked(filter,"http://www1.sf.net/pop/Roger")); + assertTrue(isBlocked(filter,"http://www23.sf.net/pop/Roger")); + + assertTrue(isBlocked(filter,"http://www23.eva-stu.vn/")); + assertTrue(isBlocked(filter,"http://www23.eva-stu.vn")); + assertTrue(isBlocked(filter,"http://eva-stu.vn")); + assertTrue(isBlocked(filter,"http://www.eva-stu.vn/")); + assertTrue(isBlocked(filter,"http://eva-stu.vn/")); + assertTrue(isBlocked(filter,"http://www.eva-stu.vn/foo.txt")); + assertTrue(isBlocked(filter,"http://www2.eva-stu.vn/foo/bar.txt")); + assertTrue(isBlocked(filter,"http://eva-stu.vn/foo/bar.txt")); + + } + + + /** + * @throws Exception + */ + public void testBaseNoPrefix() throws Exception { + + String str = "http://peagreenboat.com/"; +// String str = "http://(com,peagreenboat"; + System.out.format("(%s) -> [%s]\n", str,SURT.prefixFromPlain(str)); + + + String bases[] = {"http://www.peagreenboat.com/", + "http://peagreenboat.com/"}; +// setTmpContents(bases); + ObjectFilter<CaptureSearchResult> filter = getFilter(bases); + assertTrue("unmassaged",isBlocked(filter,"http://www.peagreenboat.com")); + assertTrue("unmassaged",isBlocked(filter,"http://peagreenboat.com")); + assertFalse("other1",isBlocked(filter,"http://peagreenboatt.com")); + assertFalse("other2",isBlocked(filter,"http://peagreenboat.org")); + assertFalse("other3",isBlocked(filter,"http://www.peagreenboat.org")); + // there is a problem with the SURTTokenizer... deal with ports! 
+// assertFalse("other4",isBlocked(filter,"http://www.peagreenboat.com:8080")); + assertTrue("subpath",isBlocked(filter,"http://www.peagreenboat.com/foo")); + assertTrue("emptypath",isBlocked(filter,"http://www.peagreenboat.com/")); + } + + private boolean isBlocked(ObjectFilter<CaptureSearchResult> filter, String url) { + CaptureSearchResult result = new CaptureSearchResult(); + result.setOriginalUrl(url); + int filterResult = filter.filterObject(result); + if(filterResult == ObjectFilter.FILTER_EXCLUDE) { + return true; + } + return false; + } + + private ObjectFilter<CaptureSearchResult> getFilter(String lines[]) + throws IOException { + + setTmpContents(lines); + TreeSet<String> excludes = factory.loadFile(tmpFile.getAbsolutePath()); + return new StaticListExclusionFilter(excludes,canonicalizer); + } + + private void setTmpContents(String[] lines) throws IOException { + if(tmpFile != null && tmpFile.exists()) { + tmpFile.delete(); + } +// tmpFile = File.createTempFile("range-map","tmp"); + FileWriter writer = new FileWriter(tmpFile); + StringBuilder sb = new StringBuilder(); + for(int i=0; i<lines.length; i++) { + sb.append(lines[i]).append("\n"); + } + String contents = sb.toString(); + writer.write(contents); + writer.close(); + //factory.reloadFile(); + } + +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |