From: <bra...@us...> - 2007-11-28 00:41:32
|
Revision: 2078
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2078&view=rev
Author:   bradtofel
Date:     2007-11-27 16:41:32 -0800 (Tue, 27 Nov 2007)

Log Message:
-----------
FEATURE: LocalResourceIndex now has option to annotate dedupe SearchResult records with information from previous captured copies.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java	(rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java	2007-11-28 00:41:32 UTC (rev 2078)
@@ -0,0 +1,64 @@
+package org.archive.wayback.resourceindex;
+
+import java.util.HashMap;
+
+import org.archive.wayback.WaybackConstants;
+import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.util.Adapter;
+
+/**
+ * Adapter class that observes a stream of SearchResults tracking the last seen:
+ *    Arc/Warc Filename
+ *    Arc/Warc offset
+ *    HTTP Response
+ *    MIME-Type
+ *    Redirect URL
+ *
+ * for complete SearchResults. If subsequent SearchResults are missing these
+ * fields ("-") and the Digest field is the same, then the subsequent
+ * SearchResults are updated with the values from the kept copy, and an
+ * additional annotation field is added.
+ *
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class DeduplicationSearchResultAnnotationAdapter
+implements Adapter<SearchResult,SearchResult> {
+	private final static String EMPTY_VALUE = "-";
+	private final static String FIELDS[] = {
+		WaybackConstants.RESULT_ARC_FILE,
+		WaybackConstants.RESULT_OFFSET,
+		WaybackConstants.RESULT_HTTP_CODE,
+		WaybackConstants.RESULT_MIME_TYPE,
+		WaybackConstants.RESULT_REDIRECT_URL
+	};
+	private String lastDigest = null;
+	private HashMap<String,String> lastValues = new HashMap<String,String>();
+	private SearchResult annotate(SearchResult o) {
+		String thisDigest = o.get(WaybackConstants.RESULT_MD5_DIGEST);
+		if(!thisDigest.equals(lastDigest)) {
+			return null;
+		}
+		for(String field : FIELDS) {
+			o.put(field, lastValues.get(field));
+		}
+		o.put(WaybackConstants.RESULT_DUPLICATE_ANNOTATION,
+				WaybackConstants.RESULT_DUPLICATE_DIGEST);
+		return o;
+	}
+	private SearchResult remember(SearchResult o) {
+		lastDigest = o.get(WaybackConstants.RESULT_MD5_DIGEST);
+		for(String field : FIELDS) {
+			lastValues.put(field, o.get(field));
+		}
+		return o;
+	}
+	public SearchResult adapt(SearchResult o) {
+		if(o.get(FIELDS[0]).equals(EMPTY_VALUE)) {
+			return annotate(o);
+		}
+		return remember(o);
+	}
+
+}

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java	2007-11-28 00:39:53 UTC (rev 2077)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java	2007-11-28 00:41:32 UTC (rev 2078)
@@ -53,6 +53,7 @@
 import org.archive.wayback.exception.BadQueryException;
 import org.archive.wayback.exception.ResourceIndexNotAvailableException;
 import org.archive.wayback.exception.ResourceNotInArchiveException;
+import org.archive.wayback.util.AdaptedIterator;
 import org.archive.wayback.util.CloseableIterator;
 import org.archive.wayback.util.ObjectFilter;
 import org.archive.wayback.util.ObjectFilterChain;
@@ -75,12 +76,18 @@
 	protected SearchResultSource source;
-	private UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
+	private UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
+
+	private boolean dedupeRecords = false;
 	private void filterRecords(CloseableIterator<SearchResult> itr,
 			ObjectFilter<SearchResult> filter, SearchResults results,
 			boolean forwards) throws IOException {
+		if(dedupeRecords) {
+			itr = new AdaptedIterator<SearchResult, SearchResult>(itr,
+					new DeduplicationSearchResultAnnotationAdapter());
+		}
 		while (itr.hasNext()) {
 			SearchResult result = itr.next();
 			int ruling = filter.filterObject(result);
@@ -408,4 +415,12 @@
 	public void setSource(SearchResultSource source) {
 		this.source = source;
 	}
+
+	public boolean isDedupeRecords() {
+		return dedupeRecords;
+	}
+
+	public void setDedupeRecords(boolean dedupeRecords) {
+		this.dedupeRecords = dedupeRecords;
+	}
 }

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|