From: <bra...@us...> - 2007-12-15 02:01:46
|
Revision: 2116 http://archive-access.svn.sourceforge.net/archive-access/?rev=2116&view=rev Author: bradtofel Date: 2007-12-14 18:01:51 -0800 (Fri, 14 Dec 2007) Log Message: ----------- FEATURE: added DuplicateRecordFilter to standard filter chains, to omit identical records from result stream. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/DuplicateRecordFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2007-12-15 02:00:51 UTC (rev 2115) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2007-12-15 02:01:51 UTC (rev 2116) @@ -34,6 +34,7 @@ import org.archive.wayback.resourceindex.filters.CaptureToUrlResultFilter; import org.archive.wayback.resourceindex.filters.CounterFilter; import org.archive.wayback.resourceindex.filters.DateRangeFilter; +import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter; import org.archive.wayback.resourceindex.filters.EndDateFilter; import org.archive.wayback.resourceindex.filters.GuardRailFilter; import org.archive.wayback.resourceindex.filters.HostMatchFilter; @@ -224,7 +225,11 @@ // use the same guardrail for both: forwardFilters.addFilter(guardrail); reverseFilters.addFilter(guardrail); - + + // BUGBUG: won't work when closest is a dupe! 
+ forwardFilters.addFilter(new DuplicateRecordFilter()); + reverseFilters.addFilter(new DuplicateRecordFilter()); + // match URL key: forwardFilters.addFilter(new UrlMatchFilter(keyUrl)); reverseFilters.addFilter(new UrlMatchFilter(keyUrl)); @@ -298,6 +303,7 @@ ObjectFilterChain<SearchResult> filters = new ObjectFilterChain<SearchResult>(); filters.addFilter(guardrail); + filters.addFilter(new DuplicateRecordFilter()); filters.addFilter(new UrlMatchFilter(keyUrl)); if(hostMatchFilter != null) { @@ -331,6 +337,7 @@ ObjectFilterChain<SearchResult> filters = new ObjectFilterChain<SearchResult>(); filters.addFilter(guardrail); + filters.addFilter(new DuplicateRecordFilter()); filters.addFilter(new UrlPrefixMatchFilter(keyUrl)); if(hostMatchFilter != null) { Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/DuplicateRecordFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/DuplicateRecordFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/DuplicateRecordFilter.java 2007-12-15 02:01:51 UTC (rev 2116) @@ -0,0 +1,30 @@ +package org.archive.wayback.resourceindex.filters; + +import org.archive.wayback.core.SearchResult; +import org.archive.wayback.util.ObjectFilter; + +/** + * ObjectFilter which omits exact duplicate URL+date records from a stream + * of SearchResults. 
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class DuplicateRecordFilter implements ObjectFilter<SearchResult> {
+	private String lastUrl = null;
+	private String lastDate = null;
+
+	public int filterObject(SearchResult o) {
+		String thisUrl = o.getUrl();
+		String thisDate = o.getCaptureDate();
+		int result = ObjectFilter.FILTER_INCLUDE;
+		if(lastUrl != null) {
+			if(lastUrl.equals(thisUrl) && thisDate.equals(lastDate)) {
+				result = FILTER_EXCLUDE;
+			}
+		}
+		lastUrl = thisUrl;
+		lastDate = thisDate;
+		return result;
+	}
+}

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|