From: <bra...@us...> - 2008-07-23 01:06:22
|
Revision: 2487 http://archive-access.svn.sourceforge.net/archive-access/?rev=2487&view=rev Author: bradtofel Date: 2008-07-23 01:06:29 +0000 (Wed, 23 Jul 2008) Log Message: ----------- REFACTOR: moved various Adapter<*SearchResult> into org.archive.wayback.resourceindex.adapters Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java 2008-07-23 01:04:09 UTC (rev 2486) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java 2008-07-23 01:06:29 UTC (rev 2487) @@ -1,113 +0,0 @@ -/* CaptureToUrlSearchResultAdapter - * - * $Id$ - * - * Created on 4:45:55 PM Jun 28, 2008. - * - * Copyright (C) 2008 Internet Archive. - * - * This file is part of wayback. - * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.core; - -import java.util.HashMap; - -import org.archive.wayback.util.AdaptedIterator; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.CloseableIterator; - -/** - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class CaptureToUrlSearchResultAdapter - implements Adapter<CaptureSearchResult, UrlSearchResult> { - - private String currentUrl; - private String originalUrl; - private String firstCapture; - private String lastCapture; - private int numCaptures; - private HashMap<String,Object> digests; - private UrlSearchResult resultRef = null; - public CaptureToUrlSearchResultAdapter() { - - } - private UrlSearchResult makeUrlSearchResult(CaptureSearchResult result) { - currentUrl = result.getUrlKey(); - originalUrl = result.getOriginalUrl(); - firstCapture = result.getCaptureTimestamp(); - lastCapture = firstCapture; - digests = new HashMap<String,Object>(); - digests.put(result.getDigest(),null); - numCaptures = 1; - - resultRef = new UrlSearchResult(); - resultRef.setUrlKey(currentUrl); - resultRef.setOriginalUrl(originalUrl); - resultRef.setFirstCapture(firstCapture); - resultRef.setLastCapture(lastCapture); - resultRef.setNumCaptures(1); - resultRef.setNumVersions(1); - return resultRef; - } - - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public UrlSearchResult adapt(CaptureSearchResult c) { - String urlKey = c.getUrlKey(); - if(resultRef == null || !currentUrl.equals(urlKey)) { - return makeUrlSearchResult(c); - } - - // same url -- accumulate into the last one we returned: - String captureDate = c.getCaptureTimestamp(); - if(captureDate.compareTo(firstCapture) < 0) { - firstCapture = captureDate; - resultRef.setFirstCapture(firstCapture); - } - if(captureDate.compareTo(lastCapture) > 0) { - lastCapture = captureDate; - resultRef.setLastCapture(lastCapture); - } - numCaptures++; - digests.put(c.getDigest(), null); - resultRef.setNumCaptures(numCaptures); - resultRef.setNumVersions(digests.size()); - return null; - } - public static CloseableIterator<UrlSearchResult> adaptCaptureIterator( - CloseableIterator<CaptureSearchResult> itr) { - - // HACKHACK: this is pretty lame. We return an UrlSearchResult the - // first time we see a new urlKey, and cache a reference to the returned - // UrlSearchResult, updating it as we see subsequent CaptureSearchResult - // objects with the same urlKey. - // This means that users of the returned UrlSearchResult need to wait - // until they've got the *next* returned UrlSearchResult before using - // the *previous* UrlSearchResult. - // At the moment, this all happens inside a LocalResourceIndex, so - // none of the UrlSearchResult objects should be seen/used in any - // significant way before they've all be accumulated into an - // UrlSearchResults object.. - return new AdaptedIterator<CaptureSearchResult,UrlSearchResult>(itr, - new CaptureToUrlSearchResultAdapter()); - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java 2008-07-23 01:04:09 UTC (rev 2486) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java 2008-07-23 01:06:29 UTC (rev 2487) @@ -1,63 +0,0 @@ -package org.archive.wayback.resourceindex; - -import java.util.HashMap; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.util.Adapter; - -/** - * Adapter class that observes a stream of SearchResults tracking for each - * complete record, a mapping of that records digest to: - * Arc/Warc Filename - * Arc/Warc offset - * HTTP Response - * MIME-Type - * Redirect URL - * - * If subsequent SearchResults are missing these fields ("-") and the Digest - * field has been seen, then the subsequent SearchResults are updated with the - * values from the kept copy matching that digest, and an additional annotation - * field is added. - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class DeduplicationSearchResultAnnotationAdapter -implements Adapter<CaptureSearchResult,CaptureSearchResult> { - private final static String EMPTY_VALUE = "-"; - - private HashMap<String,CaptureSearchResult> memory = null; - - public DeduplicationSearchResultAnnotationAdapter() { - memory = new HashMap<String,CaptureSearchResult>(); - } - - private CaptureSearchResult annotate(CaptureSearchResult o) { - String thisDigest = o.getDigest(); - CaptureSearchResult last = memory.get(thisDigest); - if(last == null) { - // TODO: log missing record digest reference - return null; - } - o.setFile(last.getFile()); - o.setOffset(last.getOffset()); - o.setHttpCode(last.getHttpCode()); - o.setMimeType(last.getMimeType()); - o.setRedirectUrl(last.getRedirectUrl()); - o.flagDuplicateDigest(last.getCaptureTimestamp()); - return o; - } - - private CaptureSearchResult remember(CaptureSearchResult o) { - memory.put(o.getDigest(),o); - return o; - } - - public CaptureSearchResult adapt(CaptureSearchResult o) { - if(o.getFile().equals(EMPTY_VALUE)) { - return annotate(o); - } - return remember(o); - } -} \ No newline at end of file Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2008-07-23 01:04:09 UTC (rev 2486) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2008-07-23 01:06:29 UTC (rev 2487) @@ -34,7 +34,6 @@ import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.CaptureSearchResults; -import org.archive.wayback.core.CaptureToUrlSearchResultAdapter; import org.archive.wayback.core.SearchResult; import org.archive.wayback.core.SearchResults; import org.archive.wayback.core.UrlSearchResult; @@ -44,6 +43,8 @@ import org.archive.wayback.exception.BadQueryException; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultAdapter; +import org.archive.wayback.resourceindex.adapters.DeduplicationSearchResultAnnotationAdapter; import org.archive.wayback.resourceindex.filters.CounterFilter; import org.archive.wayback.resourceindex.filters.DateRangeFilter; import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter; Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java (from rev 2448, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/core/CaptureToUrlSearchResultAdapter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java 2008-07-23 01:06:29 UTC (rev 2487) @@ -0,0 +1,115 @@ +/* CaptureToUrlSearchResultAdapter + * + * $Id$ + * + * Created on 4:45:55 PM Jun 28, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.adapters; + +import java.util.HashMap; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.UrlSearchResult; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.Adapter; +import org.archive.wayback.util.CloseableIterator; + +/** + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class CaptureToUrlSearchResultAdapter + implements Adapter<CaptureSearchResult, UrlSearchResult> { + + private String currentUrl; + private String originalUrl; + private String firstCapture; + private String lastCapture; + private int numCaptures; + private HashMap<String,Object> digests; + private UrlSearchResult resultRef = null; + public CaptureToUrlSearchResultAdapter() { + + } + private UrlSearchResult makeUrlSearchResult(CaptureSearchResult result) { + currentUrl = result.getUrlKey(); + originalUrl = result.getOriginalUrl(); + firstCapture = result.getCaptureTimestamp(); + lastCapture = firstCapture; + digests = new HashMap<String,Object>(); + digests.put(result.getDigest(),null); + numCaptures = 1; + + resultRef = new UrlSearchResult(); + resultRef.setUrlKey(currentUrl); + resultRef.setOriginalUrl(originalUrl); + resultRef.setFirstCapture(firstCapture); + resultRef.setLastCapture(lastCapture); + resultRef.setNumCaptures(1); + resultRef.setNumVersions(1); + return resultRef; + } + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public UrlSearchResult adapt(CaptureSearchResult c) { + String urlKey = c.getUrlKey(); + if(resultRef == null || !currentUrl.equals(urlKey)) { + return makeUrlSearchResult(c); + } + + // same url -- accumulate into the last one we returned: + String captureDate = c.getCaptureTimestamp(); + if(captureDate.compareTo(firstCapture) < 0) { + firstCapture = captureDate; + resultRef.setFirstCapture(firstCapture); + } + if(captureDate.compareTo(lastCapture) > 0) { + lastCapture = captureDate; + resultRef.setLastCapture(lastCapture); + } + numCaptures++; + digests.put(c.getDigest(), null); + resultRef.setNumCaptures(numCaptures); + resultRef.setNumVersions(digests.size()); + return null; + } + public static CloseableIterator<UrlSearchResult> adaptCaptureIterator( + CloseableIterator<CaptureSearchResult> itr) { + + // HACKHACK: this is pretty lame. We return an UrlSearchResult the + // first time we see a new urlKey, and cache a reference to the returned + // UrlSearchResult, updating it as we see subsequent CaptureSearchResult + // objects with the same urlKey. + // This means that users of the returned UrlSearchResult need to wait + // until they've got the *next* returned UrlSearchResult before using + // the *previous* UrlSearchResult. + // At the moment, this all happens inside a LocalResourceIndex, so + // none of the UrlSearchResult objects should be seen/used in any + // significant way before they've all be accumulated into an + // UrlSearchResults object.. + return new AdaptedIterator<CaptureSearchResult,UrlSearchResult>(itr, + new CaptureToUrlSearchResultAdapter()); + } +} Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java (from rev 2448, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/DeduplicationSearchResultAnnotationAdapter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java 2008-07-23 01:06:29 UTC (rev 2487) @@ -0,0 +1,63 @@ +package org.archive.wayback.resourceindex.adapters; + +import java.util.HashMap; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.Adapter; + +/** + * Adapter class that observes a stream of SearchResults tracking for each + * complete record, a mapping of that records digest to: + * Arc/Warc Filename + * Arc/Warc offset + * HTTP Response + * MIME-Type + * Redirect URL + * + * If subsequent SearchResults are missing these fields ("-") and the Digest + * field has been seen, then the subsequent SearchResults are updated with the + * values from the kept copy matching that digest, and an additional annotation + * field is added. + * + * + * @author brad + * @version $Date$, $Revision$ + */ +public class DeduplicationSearchResultAnnotationAdapter +implements Adapter<CaptureSearchResult,CaptureSearchResult> { + private final static String EMPTY_VALUE = "-"; + + private HashMap<String,CaptureSearchResult> memory = null; + + public DeduplicationSearchResultAnnotationAdapter() { + memory = new HashMap<String,CaptureSearchResult>(); + } + + private CaptureSearchResult annotate(CaptureSearchResult o) { + String thisDigest = o.getDigest(); + CaptureSearchResult last = memory.get(thisDigest); + if(last == null) { + // TODO: log missing record digest reference + return null; + } + o.setFile(last.getFile()); + o.setOffset(last.getOffset()); + o.setHttpCode(last.getHttpCode()); + o.setMimeType(last.getMimeType()); + o.setRedirectUrl(last.getRedirectUrl()); + o.flagDuplicateDigest(last.getCaptureTimestamp()); + return o; + } + + private CaptureSearchResult remember(CaptureSearchResult o) { + memory.put(o.getDigest(),o); + return o; + } + + public CaptureSearchResult adapt(CaptureSearchResult o) { + if(o.getFile().equals(EMPTY_VALUE)) { + return annotate(o); + } + return remember(o); + } +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |