From: <bra...@us...> - 2009-10-23 00:35:20
|
Revision: 2822 http://archive-access.svn.sourceforge.net/archive-access/?rev=2822&view=rev Author: bradtofel Date: 2009-10-23 00:35:10 +0000 (Fri, 23 Oct 2009) Log Message: ----------- REFACTOR: moved functionality from adapters to filters. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ConditionalGetAnnotationFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ConditionalGetAnnotationSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ConditionalGetAnnotationSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ConditionalGetAnnotationSearchResultAdapter.java 2009-10-23 00:34:19 UTC (rev 2821) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ConditionalGetAnnotationSearchResultAdapter.java 2009-10-23 00:35:10 UTC (rev 2822) @@ -1,99 +0,0 @@ -/* ConditionalGetAnnotationSearchResultAdapter - * - * $Id$ - * - * Created on 6:09:05 PM Mar 12, 2009. - * - * Copyright (C) 2009 Internet Archive. - * - * This file is part of wayback. - * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourceindex.adapters; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.util.Adapter; - -/** - * WARC file allows 2 forms of deduplication. The first actually downloads - * documents and compares their digest with a database of previous values. When - * a new capture of a document exactly matches the previous digest, an - * abbreviated record is stored in the WARC file. The second form uses an HTTP - * conditional GET request, sending previous values returned for a given URL - * (etag, last-modified, etc). In this case, the remote server either sends a - * new document (200) which is stored normally, or the server will return a - * 304 (Not Modified) response, which is stored in the WARC file. - * - * For the first record type, the wayback indexer will output a placeholder - * record that includes the digest of the last-stored record. For 304 responses, - * the indexer outputs a normal looking record, but the record will have a - * SHA1 digest which is easily distinguishable as an "empty" document. The SHA1 - * is always: - * - * 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - * - * This class will observe a stream of SearchResults, storing the values for - * the last seen non-empty SHA1 field. Any subsequent SearchResults with an - * empty SHA1 will be annotated, copying the values from the last non-empty - * record. - * - * This is highly experimental. - * - * @author brad - * @version $Date$, $Revision$ - */ - -public class ConditionalGetAnnotationSearchResultAdapter -implements Adapter<CaptureSearchResult,CaptureSearchResult> { - - private final static String EMPTY_VALUE = "-"; - private final static String EMPTY_SHA1 = "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"; - - private CaptureSearchResult lastSeen = null; - - public ConditionalGetAnnotationSearchResultAdapter() { - } - - private CaptureSearchResult annotate(CaptureSearchResult o) { - if(lastSeen == null) { - // TODO: log missing record digest reference - return null; - } - o.setFile(lastSeen.getFile()); - o.setOffset(lastSeen.getOffset()); - o.setDigest(lastSeen.getDigest()); - o.setHttpCode(lastSeen.getHttpCode()); - o.setMimeType(lastSeen.getMimeType()); - o.setRedirectUrl(lastSeen.getRedirectUrl()); - o.flagDuplicateHTTP(lastSeen.getCaptureTimestamp()); - return o; - } - - private CaptureSearchResult remember(CaptureSearchResult o) { - lastSeen = o; - return o; - } - - public CaptureSearchResult adapt(CaptureSearchResult o) { - if(o.getFile().equals(EMPTY_VALUE)) { - if(o.getDigest().equals(EMPTY_SHA1)) { - return annotate(o); - } - return o; - } - return remember(o); - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java 2009-10-23 00:34:19 UTC (rev 2821) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java 2009-10-23 00:35:10 UTC (rev 2822) @@ -1,65 +0,0 @@ -package org.archive.wayback.resourceindex.adapters; - -import java.util.HashMap; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.util.Adapter; - -/** - * Adapter class that observes a stream of SearchResults tracking for each - * complete record, a mapping of that records digest to: - * Arc/Warc Filename - * Arc/Warc offset - * HTTP Response - * MIME-Type - * Redirect URL - * - * If subsequent SearchResults are missing these fields ("-") and the Digest - * field has been seen, then the subsequent SearchResults are updated with the - * values from the kept copy matching that digest, and an additional annotation - * field is added. - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class DeduplicationSearchResultAnnotationAdapter -implements Adapter<CaptureSearchResult,CaptureSearchResult> { - private final static String EMPTY_VALUE = "-"; - private final static String REVISIT_VALUE = "warc/revisit"; - - private HashMap<String,CaptureSearchResult> memory = null; - - public DeduplicationSearchResultAnnotationAdapter() { - memory = new HashMap<String,CaptureSearchResult>(); - } - - private CaptureSearchResult annotate(CaptureSearchResult o) { - String thisDigest = o.getDigest(); - CaptureSearchResult last = memory.get(thisDigest); - if(last == null) { - // TODO: log missing record digest reference - return null; - } - o.setFile(last.getFile()); - o.setOffset(last.getOffset()); - o.setHttpCode(last.getHttpCode()); - o.setMimeType(last.getMimeType()); - o.setRedirectUrl(last.getRedirectUrl()); - o.flagDuplicateDigest(last.getCaptureTimestamp()); - return o; - } - - private CaptureSearchResult remember(CaptureSearchResult o) { - memory.put(o.getDigest(),o); - return o; - } - - public CaptureSearchResult adapt(CaptureSearchResult o) { - if(o.getFile().equals(EMPTY_VALUE) - || o.getMimeType().equals(REVISIT_VALUE)) { - return annotate(o); - } - return remember(o); - } -} \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ConditionalGetAnnotationFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ConditionalGetAnnotationFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ConditionalGetAnnotationFilter.java 2009-10-23 00:35:10 UTC (rev 2822) @@ -0,0 +1,72 @@ +package org.archive.wayback.resourceindex.filters; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; + +/** + * WARC file allows 2 forms of deduplication. The first actually downloads + * documents and compares their digest with a database of previous values. When + * a new capture of a document exactly matches the previous digest, an + * abbreviated record is stored in the WARC file. The second form uses an HTTP + * conditional GET request, sending previous values returned for a given URL + * (etag, last-modified, etc). In this case, the remote server either sends a + * new document (200) which is stored normally, or the server will return a + * 304 (Not Modified) response, which is stored in the WARC file. + * + * For the first record type, the wayback indexer will output a placeholder + * record that includes the digest of the last-stored record. For 304 responses, + * the indexer outputs a normal looking record, but the record will have a + * SHA1 digest which is easily distinguishable as an "empty" document. The SHA1 + * is always: + * + * 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ + * + * This class will observe a stream of SearchResults, storing the values for + * the last seen non-empty SHA1 field. Any subsequent SearchResults with an + * empty SHA1 will be annotated, copying the values from the last non-empty + * record. + * + * This is highly experimental. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class ConditionalGetAnnotationFilter +implements ObjectFilter<CaptureSearchResult> { + + private final static String EMPTY_VALUE = "-"; + private final static String EMPTY_SHA1 = "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"; + + private CaptureSearchResult lastSeen = null; + + private int annotate(CaptureSearchResult o) { + if(lastSeen == null) { + // TODO: log missing record digest reference + return FILTER_EXCLUDE; + } + o.setFile(lastSeen.getFile()); + o.setOffset(lastSeen.getOffset()); + o.setDigest(lastSeen.getDigest()); + o.setHttpCode(lastSeen.getHttpCode()); + o.setMimeType(lastSeen.getMimeType()); + o.setRedirectUrl(lastSeen.getRedirectUrl()); + o.flagDuplicateHTTP(lastSeen.getCaptureTimestamp()); + return FILTER_INCLUDE; + } + + private int remember(CaptureSearchResult o) { + lastSeen = o; + return FILTER_INCLUDE; + } + + public int filterObject(CaptureSearchResult o) { + if(o.getFile().equals(EMPTY_VALUE)) { + if(o.getDigest().equals(EMPTY_SHA1)) { + return annotate(o); + } + return FILTER_INCLUDE; + } + return remember(o); + } + +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java 2009-10-23 00:35:10 UTC (rev 2822) @@ -0,0 +1,73 @@ +package org.archive.wayback.resourceindex.filters; + +import java.util.HashMap; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; + +/** + * Filter class that observes a stream of SearchResults tracking for each + * complete record, a mapping of that records Digest to: + * Arc/Warc Filename + * Arc/Warc offset + * HTTP Response + * MIME-Type + * Redirect URL + * + * If subsequent SearchResults are missing these fields ("-") and the Digest + * field is in the map, then the SearchResults missing fields are replaced with + * the values from the previously seen record with the same digest, and an + * additional annotation field is added. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class WARCRevisitAnnotationFilter +implements ObjectFilter<CaptureSearchResult> { + + private final static String EMPTY_VALUE = "-"; + private final static String REVISIT_VALUE = "warc/revisit"; + + private HashMap<String,CaptureSearchResult> memory = null; + + public WARCRevisitAnnotationFilter() { + memory = new HashMap<String,CaptureSearchResult>(); + } + + private int annotate(CaptureSearchResult o) { + String thisDigest = o.getDigest(); + CaptureSearchResult last = memory.get(thisDigest); + if(last == null) { + // TODO: log missing record digest reference? + return FILTER_EXCLUDE; + } + o.setFile(last.getFile()); + o.setOffset(last.getOffset()); + o.setHttpCode(last.getHttpCode()); + o.setMimeType(last.getMimeType()); + o.setRedirectUrl(last.getRedirectUrl()); + o.flagDuplicateDigest(last.getCaptureTimestamp()); + return FILTER_INCLUDE; + } + + private int remember(CaptureSearchResult o) { + memory.put(o.getDigest(),o); + return FILTER_INCLUDE; + } + +// public CaptureSearchResult adapt(CaptureSearchResult o) { +// if(o.getFile().equals(EMPTY_VALUE) +// || o.getMimeType().equals(REVISIT_VALUE)) { +// return annotate(o); +// } +// return remember(o); +// } + + public int filterObject(CaptureSearchResult o) { + if(o.getFile().equals(EMPTY_VALUE) + || o.getMimeType().equals(REVISIT_VALUE)) { + return annotate(o); + } + return remember(o); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |