Revision: 2177
http://archive-access.svn.sourceforge.net/archive-access/?rev=2177&view=rev
Author: bradtofel
Date: 2008-02-06 16:09:12 -0800 (Wed, 06 Feb 2008)
Log Message:
-----------
BUGFIX (unreported): we now track all non-abbreviated records for matching against subsequent abbreviated records. Previously only the most recent non-abbreviated record was saved, which caused problems with massaging.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java 2008-02-06 02:01:26 UTC (rev 2176)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java 2008-02-07 00:09:12 UTC (rev 2177)
@@ -7,17 +7,18 @@
import org.archive.wayback.util.Adapter;
/**
- * Adapter class that observes a stream of SearchResults tracking the last seen:
+ * Adapter class that observes a stream of SearchResults, tracking for each
+ * complete record a mapping of that record's digest to:
* Arc/Warc Filename
* Arc/Warc offset
* HTTP Response
* MIME-Type
* Redirect URL
*
- * for complete SearchResults. If subsequent SearchResults are missing these
- * fields ("-") and the Digest field is the same, then the subsequent
- * SearchResults are updated with the values from the kept copy, and an
- * additional annotation field is added.
+ * If subsequent SearchResults are missing these fields ("-") and the Digest
+ * field has been seen, then the subsequent SearchResults are updated with the
+ * values from the kept copy matching that digest, and an additional annotation
+ * field is added.
*
*
* @author brad
@@ -26,42 +27,46 @@
public class DeduplicationSearchResultAnnotationAdapter
implements Adapter<SearchResult,SearchResult> {
private final static String EMPTY_VALUE = "-";
+
+ // these fields are all copied to deduped records as-is:
private final static String FIELDS[] = {
WaybackConstants.RESULT_ARC_FILE,
WaybackConstants.RESULT_OFFSET,
WaybackConstants.RESULT_HTTP_CODE,
WaybackConstants.RESULT_MIME_TYPE,
- WaybackConstants.RESULT_REDIRECT_URL
+ WaybackConstants.RESULT_REDIRECT_URL,
};
- private String lastDigest = null;
- private String lastTimeStamp = null;
- private HashMap<String,String> lastValues = new HashMap<String,String>();
+ private HashMap<String,SearchResult> memory = null;
+
+ public DeduplicationSearchResultAnnotationAdapter() {
+ memory = new HashMap<String,SearchResult>();
+ }
+
private SearchResult annotate(SearchResult o) {
String thisDigest = o.get(WaybackConstants.RESULT_MD5_DIGEST);
- if(!thisDigest.equals(lastDigest)) {
+ SearchResult last = memory.get(thisDigest);
+ if(last == null) {
return null;
}
for(String field : FIELDS) {
- o.put(field, lastValues.get(field));
+ o.put(field, last.get(field));
}
o.put(WaybackConstants.RESULT_DUPLICATE_ANNOTATION,
WaybackConstants.RESULT_DUPLICATE_DIGEST);
- o.put(WaybackConstants.RESULT_DUPLICATE_STORED_DATE, lastTimeStamp);
+ o.put(WaybackConstants.RESULT_DUPLICATE_STORED_DATE,
+ last.get(WaybackConstants.RESULT_CAPTURE_DATE));
return o;
}
+
private SearchResult remember(SearchResult o) {
- lastDigest = o.get(WaybackConstants.RESULT_MD5_DIGEST);
- lastTimeStamp = o.get(WaybackConstants.RESULT_CAPTURE_DATE);
- for(String field : FIELDS) {
- lastValues.put(field, o.get(field));
- }
+ memory.put(o.get(WaybackConstants.RESULT_MD5_DIGEST),o);
return o;
}
+
public SearchResult adapt(SearchResult o) {
if(o.get(FIELDS[0]).equals(EMPTY_VALUE)) {
return annotate(o);
}
return remember(o);
}
-
-}
+}
\ No newline at end of file
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|