Revision: 2177 http://archive-access.svn.sourceforge.net/archive-access/?rev=2177&view=rev Author: bradtofel Date: 2008-02-06 16:09:12 -0800 (Wed, 06 Feb 2008) Log Message: ----------- BUGFIX (unreported): we now track all non-abbreviated records for matching against subsequent abbreviated records. Previously only the most recent non-abbreviated record was saved, which caused problems with massaging. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java 2008-02-06 02:01:26 UTC (rev 2176) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java 2008-02-07 00:09:12 UTC (rev 2177) @@ -7,17 +7,18 @@ import org.archive.wayback.util.Adapter; /** - * Adapter class that observes a stream of SearchResults tracking the last seen: + * Adapter class that observes a stream of SearchResults tracking for each + * complete record, a mapping of that records digest to: * Arc/Warc Filename * Arc/Warc offset * HTTP Response * MIME-Type * Redirect URL * - * for complete SearchResults. If subsequent SearchResults are missing these - * fields ("-") and the Digest field is the same, then the subsequent - * SearchResults are updated with the values from the kept copy, and an - * additional annotation field is added. 
+ * If subsequent SearchResults are missing these fields ("-") and the Digest + * field has been seen, then the subsequent SearchResults are updated with the + * values from the kept copy matching that digest, and an additional annotation + * field is added. * * * @author brad @@ -26,42 +27,46 @@ public class DeduplicationSearchResultAnnotationAdapter implements Adapter<SearchResult,SearchResult> { private final static String EMPTY_VALUE = "-"; + + // these fields are all copied to deduped records as-is: private final static String FIELDS[] = { WaybackConstants.RESULT_ARC_FILE, WaybackConstants.RESULT_OFFSET, WaybackConstants.RESULT_HTTP_CODE, WaybackConstants.RESULT_MIME_TYPE, - WaybackConstants.RESULT_REDIRECT_URL + WaybackConstants.RESULT_REDIRECT_URL, }; - private String lastDigest = null; - private String lastTimeStamp = null; - private HashMap<String,String> lastValues = new HashMap<String,String>(); + private HashMap<String,SearchResult> memory = null; + + public DeduplicationSearchResultAnnotationAdapter() { + memory = new HashMap<String,SearchResult>(); + } + private SearchResult annotate(SearchResult o) { String thisDigest = o.get(WaybackConstants.RESULT_MD5_DIGEST); - if(!thisDigest.equals(lastDigest)) { + SearchResult last = memory.get(thisDigest); + if(last == null) { return null; } for(String field : FIELDS) { - o.put(field, lastValues.get(field)); + o.put(field, last.get(field)); } o.put(WaybackConstants.RESULT_DUPLICATE_ANNOTATION, WaybackConstants.RESULT_DUPLICATE_DIGEST); - o.put(WaybackConstants.RESULT_DUPLICATE_STORED_DATE, lastTimeStamp); + o.put(WaybackConstants.RESULT_DUPLICATE_STORED_DATE, + last.get(WaybackConstants.RESULT_CAPTURE_DATE)); return o; } + private SearchResult remember(SearchResult o) { - lastDigest = o.get(WaybackConstants.RESULT_MD5_DIGEST); - lastTimeStamp = o.get(WaybackConstants.RESULT_CAPTURE_DATE); - for(String field : FIELDS) { - lastValues.put(field, o.get(field)); - } + 
memory.put(o.get(WaybackConstants.RESULT_MD5_DIGEST),o); return o; } + public SearchResult adapt(SearchResult o) { if(o.get(FIELDS[0]).equals(EMPTY_VALUE)) { return annotate(o); } return remember(o); } - -} +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |