From: <bra...@us...> - 2008-07-23 01:07:26
|
Revision: 2488 http://archive-access.svn.sourceforge.net/archive-access/?rev=2488&view=rev Author: bradtofel Date: 2008-07-23 01:07:34 +0000 (Wed, 23 Jul 2008) Log Message: ----------- REFACTOR: moved various Adapter<*SearchResult> into org.archive.wayback.resourceindex.adapters Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java (from rev 2483, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java) =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/LegacyToIdentityFilter.java 2008-07-23 01:07:34 UTC (rev 2488) @@ -0,0 +1,74 @@ +/* LegacyToIdentityFilter + * + * $Id$ + * + * Created on 11:48:56 AM Jul 10, 2008. + * + * Copyright (C) 2008 Internet Archive. + * + * This file is part of wayback. + * + * wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +package org.archive.wayback.resourceindex.adapters; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.url.UrlOperations; + +/** + * CaptureSearchResult ObjectFilter which passes through all inputs, modifying + * each to construct a corrected original URL to comply with new Identity + * format. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class LegacyToIdentityFilter implements ObjectFilter<CaptureSearchResult> { + private final static String DEFAULT_SCHEME = "http://"; + + private int getEndOfHostIndex(String url) { + int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); + int pathIdx = url.indexOf(UrlOperations.PATH_START); + if(portIdx == -1 && pathIdx == -1) { + return url.length(); + } + if(portIdx == -1) { + return pathIdx; + } + if(pathIdx == -1) { + return portIdx; + } + if(pathIdx > portIdx) { + return portIdx; + } else { + return pathIdx; + } + } + + /* (non-Javadoc) + * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object) + */ + public int filterObject(CaptureSearchResult o) { + String urlKey = o.getUrlKey(); + StringBuilder sb = new StringBuilder(urlKey.length()); + sb.append(DEFAULT_SCHEME); + sb.append(o.getOriginalUrl()); + sb.append(urlKey.substring(getEndOfHostIndex(urlKey))); + o.setOriginalUrl(sb.toString()); + return FILTER_INCLUDE; + } + +} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java 2008-07-23 01:06:29 
UTC (rev 2487) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/LegacyToIdentityFilter.java 2008-07-23 01:07:34 UTC (rev 2488) @@ -1,74 +0,0 @@ -/* LegacyToIdentityFilter - * - * $Id$ - * - * Created on 11:48:56 AM Jul 10, 2008. - * - * Copyright (C) 2008 Internet Archive. - * - * This file is part of wayback. - * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourceindex.filters; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.util.ObjectFilter; -import org.archive.wayback.util.url.UrlOperations; - -/** - * CaptureSearchResult ObjectFilter which passes through all inputs, modifying - * each to construct a corrected original URL to comply with new Identity - * format. 
- * - * @author brad - * @version $Date$, $Revision$ - */ -public class LegacyToIdentityFilter implements ObjectFilter<CaptureSearchResult> { - private final static String DEFAULT_SCHEME = "http://"; - - private int getEndOfHostIndex(String url) { - int portIdx = url.indexOf(UrlOperations.PORT_SEPARATOR); - int pathIdx = url.indexOf(UrlOperations.PATH_START); - if(portIdx == -1 && pathIdx == -1) { - return url.length(); - } - if(portIdx == -1) { - return pathIdx; - } - if(pathIdx == -1) { - return portIdx; - } - if(pathIdx > portIdx) { - return portIdx; - } else { - return pathIdx; - } - } - - /* (non-Javadoc) - * @see org.archive.wayback.util.ObjectFilter#filterObject(java.lang.Object) - */ - public int filterObject(CaptureSearchResult o) { - String urlKey = o.getUrlKey(); - StringBuilder sb = new StringBuilder(urlKey.length()); - sb.append(DEFAULT_SCHEME); - sb.append(o.getOriginalUrl()); - sb.append(urlKey.substring(getEndOfHostIndex(urlKey))); - o.setOriginalUrl(sb.toString()); - return FILTER_INCLUDE; - } - -} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2008-10-11 02:00:04
|
Revision: 2606 http://archive-access.svn.sourceforge.net/archive-access/?rev=2606&view=rev Author: bradtofel Date: 2008-10-11 01:59:57 +0000 (Sat, 11 Oct 2008) Log Message: ----------- NEW FEATURE (ACC-43): Allow adding a generic ObjectFilter<CaptureSearchResult> on a LocalResourceIndex, implemented 2 new ObjectFilter<CaptureSearchResult>, one which allows include/exclude lists of HTTP response codes, and one highly experimental BeanShellFilter, which is too slow to use in any but small installations, but may provide the escape hatch needed for some installations where performance is not the crucial problem. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/BeanShellFilter.java Property Changed: ---------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/HttpCodeFilter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2008-10-11 01:50:43 UTC (rev 2605) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2008-10-11 01:59:57 UTC (rev 2606) @@ -86,6 +86,8 @@ private boolean dedupeRecords = false; private ObjectFilter<CaptureSearchResult> annotater = null; + + private ObjectFilter<CaptureSearchResult> filter = null; public LocalResourceIndex() { canonicalizer = new AggressiveUrlCanonicalizer(); @@ -122,7 +124,7 @@ CaptureSearchResults results = new CaptureSearchResults(); CaptureQueryFilterState filterState = - new 
CaptureQueryFilterState(wbRequest,canonicalizer, type); + new CaptureQueryFilterState(wbRequest,canonicalizer, type, filter); String keyUrl = filterState.getKeyUrl(); CloseableIterator<CaptureSearchResult> itr = getCaptureIterator(keyUrl); @@ -159,7 +161,7 @@ CaptureQueryFilterState filterState = new CaptureQueryFilterState(wbRequest,canonicalizer, - CaptureQueryFilterState.TYPE_URL); + CaptureQueryFilterState.TYPE_URL, filter); String keyUrl = filterState.getKeyUrl(); CloseableIterator<CaptureSearchResult> citr = getCaptureIterator(keyUrl); @@ -287,6 +289,14 @@ public void setAnnotater(ObjectFilter<CaptureSearchResult> annotater) { this.annotater = annotater; } + + public ObjectFilter<CaptureSearchResult> getFilter() { + return filter; + } + + public void setFilter(ObjectFilter<CaptureSearchResult> filter) { + this.filter = filter; + } private class CaptureQueryFilterState { public final static int TYPE_REPLAY = 0; @@ -302,7 +312,8 @@ private String exactDate; public CaptureQueryFilterState(WaybackRequest request, - UrlCanonicalizer canonicalizer, int type) + UrlCanonicalizer canonicalizer, int type, + ObjectFilter<CaptureSearchResult> genericFilter) throws BadQueryException { String searchUrl = request.getRequestUrl(); @@ -333,6 +344,9 @@ preExclusionCounter = new CounterFilter(); DateRangeFilter drFilter = new DateRangeFilter(startDate,endDate); + if(genericFilter != null) { + filter.addFilter(genericFilter); + } // has the user asked for only results on the exact host specified? 
ObjectFilter<CaptureSearchResult> exactHost = getExactHostFilter(request); Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/BeanShellFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/BeanShellFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/BeanShellFilter.java 2008-10-11 01:59:57 UTC (rev 2606) @@ -0,0 +1,80 @@ +package org.archive.wayback.resourceindex.filters; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; + +import bsh.EvalError; +import bsh.Interpreter; + +public class BeanShellFilter implements ObjectFilter<CaptureSearchResult> { + + private String expression = null; + private String method = null; + private String scriptPath = null; + + @SuppressWarnings("unchecked") + private final ThreadLocal tl = new ThreadLocal() { + protected synchronized Object initialValue() { + return new Interpreter(); + } + }; + private Interpreter getInterpreter() { + Interpreter i = (Interpreter) tl.get(); + if(method != null) { + + } + return i; + } + + public BeanShellFilter() { + } + + public int filterObject(CaptureSearchResult o) { + int result = FILTER_EXCLUDE; + try { + boolean bResult = false; + Interpreter interpreter = getInterpreter(); + interpreter.set("result", o); + + if(expression != null) { + bResult = (Boolean) interpreter.eval(expression); + } else if(method != null) { + bResult = (Boolean) interpreter.eval("matches(result)"); + } else if(scriptPath != null) { + bResult = (Boolean) interpreter.eval("matches(result)"); + } + + if(bResult) { + result = FILTER_INCLUDE; + } + + } catch (EvalError e) { + e.printStackTrace(); + } + return result; + } + + public String getExpression() { + return expression; + } + + public void 
setExpression(String expression) { + this.expression = expression; + } + + public String getMethod() { + return method; + } + + public void setMethod(String method) { + this.method = method; + } + + public String getScriptPath() { + return scriptPath; + } + + public void setScriptPath(String scriptPath) { + this.scriptPath = scriptPath; + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/HttpCodeFilter.java ___________________________________________________________________ Added: svn:keywords + Date Rev This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-23 00:35:20
|
Revision: 2822 http://archive-access.svn.sourceforge.net/archive-access/?rev=2822&view=rev Author: bradtofel Date: 2009-10-23 00:35:10 +0000 (Fri, 23 Oct 2009) Log Message: ----------- REFACTOR: moved functionality from adapters to filters. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ConditionalGetAnnotationFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ConditionalGetAnnotationSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ConditionalGetAnnotationSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ConditionalGetAnnotationSearchResultAdapter.java 2009-10-23 00:34:19 UTC (rev 2821) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/ConditionalGetAnnotationSearchResultAdapter.java 2009-10-23 00:35:10 UTC (rev 2822) @@ -1,99 +0,0 @@ -/* ConditionalGetAnnotationSearchResultAdapter - * - * $Id$ - * - * Created on 6:09:05 PM Mar 12, 2009. - * - * Copyright (C) 2009 Internet Archive. - * - * This file is part of wayback. - * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. 
- * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. - * - * You should have received a copy of the GNU Lesser Public License - * along with wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourceindex.adapters; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.util.Adapter; - -/** - * WARC file allows 2 forms of deduplication. The first actually downloads - * documents and compares their digest with a database of previous values. When - * a new capture of a document exactly matches the previous digest, an - * abbreviated record is stored in the WARC file. The second form uses an HTTP - * conditional GET request, sending previous values returned for a given URL - * (etag, last-modified, etc). In this case, the remote server either sends a - * new document (200) which is stored normally, or the server will return a - * 304 (Not Modified) response, which is stored in the WARC file. - * - * For the first record type, the wayback indexer will output a placeholder - * record that includes the digest of the last-stored record. For 304 responses, - * the indexer outputs a normal looking record, but the record will have a - * SHA1 digest which is easily distinguishable as an "empty" document. The SHA1 - * is always: - * - * 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - * - * This class will observe a stream of SearchResults, storing the values for - * the last seen non-empty SHA1 field. Any subsequent SearchResults with an - * empty SHA1 will be annotated, copying the values from the last non-empty - * record. - * - * This is highly experimental. 
- * - * @author brad - * @version $Date$, $Revision$ - */ - -public class ConditionalGetAnnotationSearchResultAdapter -implements Adapter<CaptureSearchResult,CaptureSearchResult> { - - private final static String EMPTY_VALUE = "-"; - private final static String EMPTY_SHA1 = "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"; - - private CaptureSearchResult lastSeen = null; - - public ConditionalGetAnnotationSearchResultAdapter() { - } - - private CaptureSearchResult annotate(CaptureSearchResult o) { - if(lastSeen == null) { - // TODO: log missing record digest reference - return null; - } - o.setFile(lastSeen.getFile()); - o.setOffset(lastSeen.getOffset()); - o.setDigest(lastSeen.getDigest()); - o.setHttpCode(lastSeen.getHttpCode()); - o.setMimeType(lastSeen.getMimeType()); - o.setRedirectUrl(lastSeen.getRedirectUrl()); - o.flagDuplicateHTTP(lastSeen.getCaptureTimestamp()); - return o; - } - - private CaptureSearchResult remember(CaptureSearchResult o) { - lastSeen = o; - return o; - } - - public CaptureSearchResult adapt(CaptureSearchResult o) { - if(o.getFile().equals(EMPTY_VALUE)) { - if(o.getDigest().equals(EMPTY_SHA1)) { - return annotate(o); - } - return o; - } - return remember(o); - } -} Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java 2009-10-23 00:34:19 UTC (rev 2821) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/DeduplicationSearchResultAnnotationAdapter.java 2009-10-23 00:35:10 UTC (rev 2822) @@ -1,65 +0,0 @@ -package org.archive.wayback.resourceindex.adapters; - -import java.util.HashMap; - -import org.archive.wayback.core.CaptureSearchResult; -import 
org.archive.wayback.util.Adapter; - -/** - * Adapter class that observes a stream of SearchResults tracking for each - * complete record, a mapping of that records digest to: - * Arc/Warc Filename - * Arc/Warc offset - * HTTP Response - * MIME-Type - * Redirect URL - * - * If subsequent SearchResults are missing these fields ("-") and the Digest - * field has been seen, then the subsequent SearchResults are updated with the - * values from the kept copy matching that digest, and an additional annotation - * field is added. - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class DeduplicationSearchResultAnnotationAdapter -implements Adapter<CaptureSearchResult,CaptureSearchResult> { - private final static String EMPTY_VALUE = "-"; - private final static String REVISIT_VALUE = "warc/revisit"; - - private HashMap<String,CaptureSearchResult> memory = null; - - public DeduplicationSearchResultAnnotationAdapter() { - memory = new HashMap<String,CaptureSearchResult>(); - } - - private CaptureSearchResult annotate(CaptureSearchResult o) { - String thisDigest = o.getDigest(); - CaptureSearchResult last = memory.get(thisDigest); - if(last == null) { - // TODO: log missing record digest reference - return null; - } - o.setFile(last.getFile()); - o.setOffset(last.getOffset()); - o.setHttpCode(last.getHttpCode()); - o.setMimeType(last.getMimeType()); - o.setRedirectUrl(last.getRedirectUrl()); - o.flagDuplicateDigest(last.getCaptureTimestamp()); - return o; - } - - private CaptureSearchResult remember(CaptureSearchResult o) { - memory.put(o.getDigest(),o); - return o; - } - - public CaptureSearchResult adapt(CaptureSearchResult o) { - if(o.getFile().equals(EMPTY_VALUE) - || o.getMimeType().equals(REVISIT_VALUE)) { - return annotate(o); - } - return remember(o); - } -} \ No newline at end of file Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ConditionalGetAnnotationFilter.java 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ConditionalGetAnnotationFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/ConditionalGetAnnotationFilter.java 2009-10-23 00:35:10 UTC (rev 2822) @@ -0,0 +1,72 @@ +package org.archive.wayback.resourceindex.filters; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; + +/** + * WARC file allows 2 forms of deduplication. The first actually downloads + * documents and compares their digest with a database of previous values. When + * a new capture of a document exactly matches the previous digest, an + * abbreviated record is stored in the WARC file. The second form uses an HTTP + * conditional GET request, sending previous values returned for a given URL + * (etag, last-modified, etc). In this case, the remote server either sends a + * new document (200) which is stored normally, or the server will return a + * 304 (Not Modified) response, which is stored in the WARC file. + * + * For the first record type, the wayback indexer will output a placeholder + * record that includes the digest of the last-stored record. For 304 responses, + * the indexer outputs a normal looking record, but the record will have a + * SHA1 digest which is easily distinguishable as an "empty" document. The SHA1 + * is always: + * + * 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ + * + * This class will observe a stream of SearchResults, storing the values for + * the last seen non-empty SHA1 field. Any subsequent SearchResults with an + * empty SHA1 will be annotated, copying the values from the last non-empty + * record. + * + * This is highly experimental. 
+ * + * @author brad + * @version $Date$, $Revision$ + */ +public class ConditionalGetAnnotationFilter +implements ObjectFilter<CaptureSearchResult> { + + private final static String EMPTY_VALUE = "-"; + private final static String EMPTY_SHA1 = "3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"; + + private CaptureSearchResult lastSeen = null; + + private int annotate(CaptureSearchResult o) { + if(lastSeen == null) { + // TODO: log missing record digest reference + return FILTER_EXCLUDE; + } + o.setFile(lastSeen.getFile()); + o.setOffset(lastSeen.getOffset()); + o.setDigest(lastSeen.getDigest()); + o.setHttpCode(lastSeen.getHttpCode()); + o.setMimeType(lastSeen.getMimeType()); + o.setRedirectUrl(lastSeen.getRedirectUrl()); + o.flagDuplicateHTTP(lastSeen.getCaptureTimestamp()); + return FILTER_INCLUDE; + } + + private int remember(CaptureSearchResult o) { + lastSeen = o; + return FILTER_INCLUDE; + } + + public int filterObject(CaptureSearchResult o) { + if(o.getFile().equals(EMPTY_VALUE)) { + if(o.getDigest().equals(EMPTY_SHA1)) { + return annotate(o); + } + return FILTER_INCLUDE; + } + return remember(o); + } + +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java 2009-10-23 00:35:10 UTC (rev 2822) @@ -0,0 +1,73 @@ +package org.archive.wayback.resourceindex.filters; + +import java.util.HashMap; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.util.ObjectFilter; + +/** + * Filter class that observes a stream of SearchResults tracking for each + * complete record, a mapping of that records Digest 
to: + * Arc/Warc Filename + * Arc/Warc offset + * HTTP Response + * MIME-Type + * Redirect URL + * + * If subsequent SearchResults are missing these fields ("-") and the Digest + * field is in the map, then the SearchResults missing fields are replaced with + * the values from the previously seen record with the same digest, and an + * additional annotation field is added. + * + * @author brad + * @version $Date$, $Revision$ + */ +public class WARCRevisitAnnotationFilter +implements ObjectFilter<CaptureSearchResult> { + + private final static String EMPTY_VALUE = "-"; + private final static String REVISIT_VALUE = "warc/revisit"; + + private HashMap<String,CaptureSearchResult> memory = null; + + public WARCRevisitAnnotationFilter() { + memory = new HashMap<String,CaptureSearchResult>(); + } + + private int annotate(CaptureSearchResult o) { + String thisDigest = o.getDigest(); + CaptureSearchResult last = memory.get(thisDigest); + if(last == null) { + // TODO: log missing record digest reference? + return FILTER_EXCLUDE; + } + o.setFile(last.getFile()); + o.setOffset(last.getOffset()); + o.setHttpCode(last.getHttpCode()); + o.setMimeType(last.getMimeType()); + o.setRedirectUrl(last.getRedirectUrl()); + o.flagDuplicateDigest(last.getCaptureTimestamp()); + return FILTER_INCLUDE; + } + + private int remember(CaptureSearchResult o) { + memory.put(o.getDigest(),o); + return FILTER_INCLUDE; + } + +// public CaptureSearchResult adapt(CaptureSearchResult o) { +// if(o.getFile().equals(EMPTY_VALUE) +// || o.getMimeType().equals(REVISIT_VALUE)) { +// return annotate(o); +// } +// return remember(o); +// } + + public int filterObject(CaptureSearchResult o) { + if(o.getFile().equals(EMPTY_VALUE) + || o.getMimeType().equals(REVISIT_VALUE)) { + return annotate(o); + } + return remember(o); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-10-28 00:17:15
|
Revision: 2851 http://archive-access.svn.sourceforge.net/archive-access/?rev=2851&view=rev Author: bradtofel Date: 2009-10-28 00:14:40 +0000 (Wed, 28 Oct 2009) Log Message: ----------- REFACTOR: major overhaul of resource index query filtering, moving much of the logic out of LocalResourceIndex into ...wayback.resourceindex.filterfactory.* Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/FilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroupFactory.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/WindowFilterGroup.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2009-10-28 00:08:00 UTC (rev 2850) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -25,7 +25,9 @@ package org.archive.wayback.resourceindex; import java.io.IOException; +import java.util.ArrayList; import java.util.Iterator; +import java.util.List; import org.apache.commons.httpclient.URIException; import org.archive.wayback.ResourceIndex; @@ -41,36 +43,59 @@ import org.archive.wayback.exception.BadQueryException; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; -import org.archive.wayback.resourceindex.adapters.ConditionalGetAnnotationSearchResultAdapter; import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultAdapter; -import org.archive.wayback.resourceindex.adapters.DeduplicationSearchResultAnnotationAdapter; -import org.archive.wayback.resourceindex.filters.CounterFilter; -import org.archive.wayback.resourceindex.filters.DateRangeFilter; -import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter; -import org.archive.wayback.resourceindex.filters.GuardRailFilter; -import org.archive.wayback.resourceindex.filters.HostMatchFilter; -import org.archive.wayback.resourceindex.filters.SchemeMatchFilter; -import 
org.archive.wayback.resourceindex.filters.SelfRedirectFilter; -import org.archive.wayback.resourceindex.filters.UrlMatchFilter; -import org.archive.wayback.resourceindex.filters.UrlPrefixMatchFilter; -import org.archive.wayback.resourceindex.filters.WindowEndFilter; -import org.archive.wayback.resourceindex.filters.WindowStartFilter; -import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.resourceindex.filterfactory.AccessPointCaptureFilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.CaptureFilterGroup; +import org.archive.wayback.resourceindex.filterfactory.CoreCaptureFilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.ExclusionCaptureFilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.FilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.QueryCaptureFilterGroupFactory; +import org.archive.wayback.resourceindex.filterfactory.WindowFilterGroup; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.ObjectFilter; import org.archive.wayback.util.ObjectFilterChain; import org.archive.wayback.util.ObjectFilterIterator; -import org.archive.wayback.util.Timestamp; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.UrlOperations; /** + * ResourceIndex implementation which assumes a "local" SearchResultSource. + * + * Extracting SearchResults from the source involves several layered steps: + * + * 1) extraction of results based on a prefix into the index + * 2) passing each result through a series of adapters + * these adapters can create new fields based on existing fields, or can + * annotate fields as they are scanned in order + * 3) filtering results based on request filters, which may come from + * * WaybackRequest-specific parameters. + * Ex. exact host match only, exact scheme match only, ... + * * AccessPoint-specific configuration + * Ex. 
only return records with (ARC/WARC) filename prefixed with XXX + * Ex. block any dates not older than 6 months + * 4) filtering based on AccessControl configurations + * Ex. block any urls with prefixes in file X + * 5) windowing filters, which provide pagination of the results, allowing + * requests to specify "show results between 10 and 20" + * 6) post filter adapters, which may annotate final results with other + * information + * Ex. for each result, consult DB to see if user-contributed messages + * apply to the results + * + * After all results have been processed, we annotate the final SearchResultS + * object with summary information about the results included. As we set up the + * chain of filters, we instrument the chain with counters that observe the + * number of results that went into, and came out of the Exclusion filters. + * + * If there were results presented to the Exclusion filter, but none were + * emitted from it, an AccessControlException is thrown. * - * * @author brad * @version $Date$, $Revision$ */ public class LocalResourceIndex implements ResourceIndex { + public final static int TYPE_REPLAY = 0; + public final static int TYPE_CAPTURE = 1; + public final static int TYPE_URL = 2; /** * maximum number of records to return @@ -89,25 +114,18 @@ private ObjectFilter<CaptureSearchResult> filter = null; + + List<FilterGroupFactory> fgFactories = null; + public LocalResourceIndex() { canonicalizer = new AggressiveUrlCanonicalizer(); + fgFactories = new ArrayList<FilterGroupFactory>(); + fgFactories.add(new CoreCaptureFilterGroupFactory()); + fgFactories.add(new QueryCaptureFilterGroupFactory()); + fgFactories.add(new ExclusionCaptureFilterGroupFactory()); + fgFactories.add(new AccessPointCaptureFilterGroupFactory()); } - private CloseableIterator<CaptureSearchResult> getCaptureIterator(String k) - throws ResourceIndexNotAvailableException { - - CloseableIterator<CaptureSearchResult> captures = - source.getPrefixIterator(k); - if(dedupeRecords) 
{ - // hack hack!!! - captures = new AdaptedIterator<CaptureSearchResult, CaptureSearchResult> - (captures, new ConditionalGetAnnotationSearchResultAdapter()); - captures = new AdaptedIterator<CaptureSearchResult, CaptureSearchResult> - (captures, new DeduplicationSearchResultAnnotationAdapter()); - } - return captures; - } - private void cleanupIterator(CloseableIterator<? extends SearchResult> itr) throws ResourceIndexNotAvailableException { try { @@ -119,42 +137,64 @@ } } + private List<CaptureFilterGroup> getRequestFilterGroups(WaybackRequest r) + throws BadQueryException { + + ArrayList<CaptureFilterGroup> groups = + new ArrayList<CaptureFilterGroup>(); + for(FilterGroupFactory f : fgFactories) { + groups.add(f.getGroup(r, canonicalizer, this)); + } + return groups; + } + + public CaptureSearchResults doCaptureQuery(WaybackRequest wbRequest, int type) throws ResourceIndexNotAvailableException, ResourceNotInArchiveException, BadQueryException, AccessControlException { - - CaptureSearchResults results = new CaptureSearchResults(); - CaptureQueryFilterState filterState = - new CaptureQueryFilterState(wbRequest, canonicalizer, type, - getUserFilters(wbRequest)); - String keyUrl = filterState.getKeyUrl(); + String urlKey; + try { + urlKey = canonicalizer.urlStringToKey(wbRequest.getRequestUrl()); + } catch (URIException e) { + throw new BadQueryException("Bad URL(" + + wbRequest.getRequestUrl() + ")"); + } - CloseableIterator<CaptureSearchResult> itr = getCaptureIterator(keyUrl); - // set up the common Filters: - ObjectFilter<CaptureSearchResult> filter = filterState.getFilter(); - itr = new ObjectFilterIterator<CaptureSearchResult>(itr,filter); + // the CaptureSearchResults we are about to return: + CaptureSearchResults results = new CaptureSearchResults(); + // the various filters to apply to the results: + ObjectFilterChain<CaptureSearchResult> filters = + new ObjectFilterChain<CaptureSearchResult>(); - // Windowing: - WindowFilterState<CaptureSearchResult> 
window = - new WindowFilterState<CaptureSearchResult>(wbRequest); - ObjectFilter<CaptureSearchResult> windowFilter = window.getFilter(); - itr = new ObjectFilterIterator<CaptureSearchResult>(itr,windowFilter); - - - if(annotater != null) { - itr = new ObjectFilterIterator<CaptureSearchResult>(itr,annotater); + // Groupings of filters for... sanity and summary annotation of results: + // Windows: + WindowFilterGroup<CaptureSearchResult> window = + new WindowFilterGroup<CaptureSearchResult>(wbRequest,this); + List<CaptureFilterGroup> groups = getRequestFilterGroups(wbRequest); + + for(CaptureFilterGroup cfg : groups) { + filters.addFilters(cfg.getFilters()); } + filters.addFilters(window.getFilters()); + CloseableIterator<CaptureSearchResult> itr = + new ObjectFilterIterator<CaptureSearchResult>( + source.getPrefixIterator(urlKey),filters); + while(itr.hasNext()) { results.addSearchResult(itr.next()); } - filterState.annotateResults(results); + for(CaptureFilterGroup cfg : groups) { + cfg.annotateResults(results); + } window.annotateResults(results); + cleanupIterator(itr); - return results; + + return results; } public UrlSearchResults doUrlQuery(WaybackRequest wbRequest) @@ -162,38 +202,61 @@ ResourceNotInArchiveException, BadQueryException, AccessControlException { + String urlKey; + try { + urlKey = canonicalizer.urlStringToKey(wbRequest.getRequestUrl()); + } catch (URIException e) { + throw new BadQueryException("Bad URL(" + + wbRequest.getRequestUrl() + ")"); + } + UrlSearchResults results = new UrlSearchResults(); - CaptureQueryFilterState filterState = - new CaptureQueryFilterState(wbRequest,canonicalizer, - CaptureQueryFilterState.TYPE_URL, getUserFilters(wbRequest)); - String keyUrl = filterState.getKeyUrl(); + // the various CAPTURE filters to apply to the results: + ObjectFilterChain<CaptureSearchResult> cFilters = + new ObjectFilterChain<CaptureSearchResult>(); - CloseableIterator<CaptureSearchResult> citr = getCaptureIterator(keyUrl); - // set up the 
common Filters: - ObjectFilter<CaptureSearchResult> filter = filterState.getFilter(); - citr = new ObjectFilterIterator<CaptureSearchResult>(citr,filter); - // adapt into UrlSearchResult: + // Groupings of filters for clarity(?) and summary annotation of + // results: + List<CaptureFilterGroup> groups = getRequestFilterGroups(wbRequest); + for(CaptureFilterGroup cfg : groups) { + cFilters.addFilters(cfg.getFilters()); + } - CloseableIterator<UrlSearchResult> itr = - CaptureToUrlSearchResultAdapter.adaptCaptureIterator(citr); + CloseableIterator<CaptureSearchResult> itrC = + new ObjectFilterIterator<CaptureSearchResult>( + source.getPrefixIterator(urlKey),cFilters); + + // we've filtered the appropriate CaptureResult objects within the + // iterator, now we're going to convert whatever records make it past + // the filters into UrlSearchResults, and then do further window + // filtering on those results: + // Windows: + // the window URL filters to apply to the results, once they're + // UrlSearchResult objects + ObjectFilterChain<UrlSearchResult> uFilters = + new ObjectFilterChain<UrlSearchResult>(); + WindowFilterGroup<UrlSearchResult> window = + new WindowFilterGroup<UrlSearchResult>(wbRequest,this); + uFilters.addFilters(window.getFilters()); + CloseableIterator<UrlSearchResult> itrU = + new ObjectFilterIterator<UrlSearchResult>( + CaptureToUrlSearchResultAdapter.adaptCaptureIterator(itrC), + uFilters); - // Windowing: - WindowFilterState<UrlSearchResult> window = - new WindowFilterState<UrlSearchResult>(wbRequest); - ObjectFilter<UrlSearchResult> windowFilter = window.getFilter(); - itr = new ObjectFilterIterator<UrlSearchResult>(itr,windowFilter); - - while(itr.hasNext()) { - results.addSearchResult(itr.next()); + while(itrU.hasNext()) { + results.addSearchResult(itrU.next()); } - filterState.annotateResults(results); + for(CaptureFilterGroup cfg : groups) { + cfg.annotateResults(results); + } window.annotateResults(results); - cleanupIterator(itr); - - return 
results; + + cleanupIterator(itrU); + + return results; } /* @@ -209,15 +272,13 @@ if (wbRequest.isReplayRequest()) { - results = doCaptureQuery(wbRequest, - CaptureQueryFilterState.TYPE_REPLAY); + results = doCaptureQuery(wbRequest, TYPE_REPLAY); results.putFilter(WaybackRequest.REQUEST_TYPE, WaybackRequest.REQUEST_REPLAY_QUERY); } else if (wbRequest.isCaptureQueryRequest()) { - results = doCaptureQuery(wbRequest, - CaptureQueryFilterState.TYPE_CAPTURE); + results = doCaptureQuery(wbRequest, TYPE_CAPTURE); results.putFilter(WaybackRequest.REQUEST_TYPE, WaybackRequest.REQUEST_CAPTURE_QUERY); @@ -259,7 +320,11 @@ public void setMaxRecords(int maxRecords) { this.maxRecords = maxRecords; } + public int getMaxRecords() { + return maxRecords; + } + /** * @param source the source to set */ @@ -302,190 +367,4 @@ public void setFilter(ObjectFilter<CaptureSearchResult> filter) { this.filter = filter; } - - public ObjectFilterChain<CaptureSearchResult> getUserFilters(WaybackRequest request) { - ObjectFilterChain<CaptureSearchResult> userFilters = - new ObjectFilterChain<CaptureSearchResult>(); - - // has the user asked for only results on the exact host specified? 
- if(request.isExactHost()) { - userFilters.addFilter(new HostMatchFilter( - UrlOperations.urlToHost(request.getRequestUrl()))); - } - - if(request.isExactScheme()) { - userFilters.addFilter(new SchemeMatchFilter( - UrlOperations.urlToScheme(request.getRequestUrl()))); - } - if(filter != null) { - userFilters.addFilter(filter); - } - - return userFilters; - } - - private class CaptureQueryFilterState { - public final static int TYPE_REPLAY = 0; - public final static int TYPE_CAPTURE = 1; - public final static int TYPE_URL = 2; - - private ObjectFilterChain<CaptureSearchResult> filter = null; - private CounterFilter finalCounter = null; - private CounterFilter preExclusionCounter = null; - private String keyUrl = null; - private String startDate; - private String endDate; - private String exactDate; - - public CaptureQueryFilterState(WaybackRequest request, - UrlCanonicalizer canonicalizer, int type, - ObjectFilterChain<CaptureSearchResult> userFilter) - throws BadQueryException { - - String searchUrl = request.getRequestUrl(); - try { - keyUrl = canonicalizer.urlStringToKey(searchUrl); - } catch (URIException e) { - throw new BadQueryException("invalid " - + WaybackRequest.REQUEST_URL + " " + searchUrl); - } - - filter = new ObjectFilterChain<CaptureSearchResult>(); - startDate = request.getStartTimestamp(); - if(startDate == null) { - startDate = Timestamp.earliestTimestamp().getDateStr(); - } - endDate = request.getEndTimestamp(); - if(endDate == null) { - endDate = Timestamp.latestTimestamp().getDateStr(); - } - if(type == TYPE_REPLAY) { - exactDate = request.getReplayTimestamp(); - if(exactDate == null) { - exactDate = Timestamp.latestTimestamp().getDateStr(); - } - } - - finalCounter = new CounterFilter(); - preExclusionCounter = new CounterFilter(); - DateRangeFilter drFilter = new DateRangeFilter(startDate,endDate); - - // checks an exclusion service for every matching record - ObjectFilter<CaptureSearchResult> exclusion = - request.getExclusionFilter(); - - 
- // makes sure we don't inspect too many records: prevents DOS - filter.addFilter(new GuardRailFilter(maxRecords)); - filter.addFilter(new DuplicateRecordFilter()); - - if(type == TYPE_REPLAY) { - filter.addFilter(new UrlMatchFilter(keyUrl)); - filter.addFilter(drFilter); - SelfRedirectFilter selfRedirectFilter= new SelfRedirectFilter(); - selfRedirectFilter.setCanonicalizer(canonicalizer); - filter.addFilter(selfRedirectFilter); - } else if(type == TYPE_CAPTURE){ - filter.addFilter(new UrlMatchFilter(keyUrl)); - filter.addFilter(drFilter); - } else if(type == TYPE_URL) { - filter.addFilter(new UrlPrefixMatchFilter(keyUrl)); - filter.addFilter(drFilter); - } else { - throw new BadQueryException("Unknown type"); - } - - if(userFilter != null) { - filter.addFilters(userFilter.getFilters()); - } - - // count how many results got to the ExclusionFilter: - filter.addFilter(preExclusionCounter); - - if(exclusion != null) { - filter.addFilter(exclusion); - } - - // count how many results got past the ExclusionFilter, or how - // many total matched, if there was no ExclusionFilter: - filter.addFilter(finalCounter); - } - public String getKeyUrl() { - return keyUrl; - } - public ObjectFilter<CaptureSearchResult> getFilter() { - return filter; - } - public void annotateResults(SearchResults results) - throws AccessControlException, ResourceNotInArchiveException { - - int matched = finalCounter.getNumMatched(); - if (matched == 0) { - if (preExclusionCounter != null) { - if(preExclusionCounter.getNumMatched() > 0) { - throw new AccessControlException("All results Excluded"); - } - } - throw new ResourceNotInArchiveException("the URL " + keyUrl - + " is not in the archive."); - } - // now we need to set some filter properties on the results: - results.putFilter(WaybackRequest.REQUEST_URL, keyUrl); - results.putFilter(WaybackRequest.REQUEST_START_DATE, startDate); - results.putFilter(WaybackRequest.REQUEST_END_DATE, endDate); - if(exactDate != null) { - 
results.putFilter(WaybackRequest.REQUEST_EXACT_DATE, exactDate); - } - } - } - - private class WindowFilterState<T> { - int startResult; // calculated based on hits/page * pagenum - int resultsPerPage; - int pageNum; - ObjectFilterChain<T> windowFilters; - WindowStartFilter<T> startFilter; - WindowEndFilter<T> endFilter; - public WindowFilterState(WaybackRequest request) - throws BadQueryException { - - windowFilters = new ObjectFilterChain<T>(); - // first grab all the info from the WaybackRequest, and validate it: - resultsPerPage = request.getResultsPerPage(); - pageNum = request.getPageNum(); - - if (resultsPerPage < 1) { - throw new BadQueryException("resultsPerPage cannot be < 1"); - } - if (resultsPerPage > maxRecords) { - throw new BadQueryException("resultsPerPage cannot be > " - + maxRecords); - } - if (pageNum < 1) { - throw new BadQueryException("pageNum must be > 0"); - } - startResult = (pageNum - 1) * resultsPerPage; - startFilter = new WindowStartFilter<T>(startResult); - endFilter = new WindowEndFilter<T>(resultsPerPage); - windowFilters.addFilter(startFilter); - windowFilters.addFilter(endFilter); - } - public ObjectFilter<T> getFilter() { - return windowFilters; - } - public void annotateResults(SearchResults results) - throws BadQueryException { - results.setFirstReturned(startResult); - results.setNumRequested(resultsPerPage); - int numSeen = endFilter.getNumSeen(); - if(numSeen == 0) { - throw new BadQueryException("No results in requested window"); - } - // how many went by the filters: - results.setMatchingCount(startFilter.getNumSeen()); - - // how many were actually returned: - results.setReturnedCount(endFilter.getNumReturned()); - } - } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroup.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,48 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.resourceindex.filters.FilePrefixFilter; +import org.archive.wayback.resourceindex.filters.FileRegexFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class AccessPointCaptureFilterGroup implements CaptureFilterGroup { + private ObjectFilterChain<CaptureSearchResult> chain = null; + private final static String[] sA = null; + + public AccessPointCaptureFilterGroup(WaybackRequest request) { + chain = new ObjectFilterChain<CaptureSearchResult>(); + List<String> prefixes = null; + if(request.getAccessPoint() != null) { + prefixes = request.getAccessPoint().getFilePrefixes(); + if(prefixes != null && prefixes.size() > 0) { + FilePrefixFilter f = new FilePrefixFilter(); + f.setPrefixes(prefixes.toArray(sA)); + chain.addFilter(f); + } + List<String> patterns = request.getAccessPoint().getFilePatterns(); + if(patterns != null && patterns.size() > 0) { + FileRegexFilter f = new FileRegexFilter(); + f.setPatterns(patterns); + chain.addFilter(f); + } + } + } + + public void annotateResults(SearchResults results) + throws ResourceNotInArchiveException, BadQueryException, + AccessControlException { + + } + + public 
List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AccessPointCaptureFilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,15 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class AccessPointCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new AccessPointCaptureFilterGroup(request); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,18 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import 
org.archive.wayback.core.SearchResults; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.util.ObjectFilter; + +public interface CaptureFilterGroup { + public List<ObjectFilter<CaptureSearchResult>> getFilters(); + + public void annotateResults(SearchResults results) + throws ResourceNotInArchiveException, BadQueryException, + AccessControlException; +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,35 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.resourceindex.LocalResourceIndex; +import org.archive.wayback.resourceindex.filters.ConditionalGetAnnotationFilter; +import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter; +import org.archive.wayback.resourceindex.filters.GuardRailFilter; +import org.archive.wayback.resourceindex.filters.WARCRevisitAnnotationFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class CoreCaptureFilterGroup implements CaptureFilterGroup { + private ObjectFilterChain<CaptureSearchResult> chain = null; + + public CoreCaptureFilterGroup(LocalResourceIndex index) { + chain = new ObjectFilterChain<CaptureSearchResult>(); + 
chain.addFilter(new GuardRailFilter(index.getMaxRecords())); + chain.addFilter(new DuplicateRecordFilter()); + if(index.isDedupeRecords()) { + chain.addFilter(new WARCRevisitAnnotationFilter()); + chain.addFilter(new ConditionalGetAnnotationFilter()); + } + } + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } + + public void annotateResults(SearchResults results) { + // TODO: ask guardRailFilter if it aborted processing (too many records) + // and annotate the results with info about how to continue the request? + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,15 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class CoreCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new CoreCaptureFilterGroup(index); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,61 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.resourceindex.filters.CounterFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class ExclusionCaptureFilterGroup implements CaptureFilterGroup { + + private ObjectFilterChain<CaptureSearchResult> chain = null; + private CounterFilter preCounter = null; + private CounterFilter postCounter = null; + String requestUrl = null; + + public ExclusionCaptureFilterGroup(WaybackRequest request) { + + // checks an exclusion service for every matching record + ObjectFilter<CaptureSearchResult> exclusion = + request.getExclusionFilter(); + chain = new ObjectFilterChain<CaptureSearchResult>(); + if(exclusion != null) { + preCounter = new CounterFilter(); + // count how many results got to the ExclusionFilter: + chain.addFilter(preCounter); + chain.addFilter(exclusion); + // count how many results got past the ExclusionFilter: + requestUrl = request.getRequestUrl(); + } + postCounter = new CounterFilter(); + chain.addFilter(postCounter); + } + + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } + + public void annotateResults(SearchResults results) + throws AccessControlException, ResourceNotInArchiveException { + 
if(postCounter.getNumMatched() == 0) { + + // nothing got to the counter after exclusions. If we have + // exclusions (detected by preCounter being non-null, and the + // preCounter passed any results, then they were all filtered by + // the exclusions filter. + if(preCounter != null && preCounter.getNumMatched() > 0) { + throw new AccessControlException("All results Excluded"); + } + ResourceNotInArchiveException e = + new ResourceNotInArchiveException("the URL " + requestUrl + + " is not in the archive."); + e.setCloseMatches(results.getCloseMatches()); + throw e; + } + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ExclusionCaptureFilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,15 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class ExclusionCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new ExclusionCaptureFilterGroup(request); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/FilterGroupFactory.java =================================================================== --- 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/FilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/FilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,12 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public interface FilterGroupFactory { + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException; +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,120 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.httpclient.URIException; +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.filters.DateRangeFilter; +import org.archive.wayback.resourceindex.filters.HostMatchFilter; +import 
org.archive.wayback.resourceindex.filters.SchemeMatchFilter; +import org.archive.wayback.resourceindex.filters.SelfRedirectFilter; +import org.archive.wayback.resourceindex.filters.UrlMatchFilter; +import org.archive.wayback.resourceindex.filters.UrlPrefixMatchFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; +import org.archive.wayback.util.Timestamp; +import org.archive.wayback.util.url.UrlOperations; + +public class QueryCaptureFilterGroup implements CaptureFilterGroup { +// private ObjectFilter<CaptureSearchResult> prefixFilter = null; +// private ObjectFilter<CaptureSearchResult> dateFilter = null; +// private ObjectFilter<CaptureSearchResult> selfRedirectFilter = null; +// private ObjectFilter<CaptureSearchResult> exactHost = null; +// private ObjectFilter<CaptureSearchResult> exactScheme = null; + private ObjectFilterChain<CaptureSearchResult> chain = null; + private String requestType = null; + private String keyUrl = null; + private String startDate; + private String endDate; + private String exactDate; + /** + * List of URL Strings that are "close" to the current request, but not + * included in the current CaptureSearchResults. 
+ */ + private Map<String,String> closeMatches = new HashMap<String,String>(); + + + public QueryCaptureFilterGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer) + throws BadQueryException { + + requestType = request.get(WaybackRequest.REQUEST_TYPE); + + // URL-Filters: + chain = new ObjectFilterChain<CaptureSearchResult>(); + try { + keyUrl = canonicalizer.urlStringToKey(request.getRequestUrl()); + } catch (URIException e) { + throw new BadQueryException("Bad request URL(" + + request.getRequestUrl() +")"); + } + if(request.isReplayRequest()) { + exactDate = request.getReplayTimestamp(); + if(exactDate == null) { + exactDate = Timestamp.latestTimestamp().getDateStr(); + } + chain.addFilter(new UrlMatchFilter(keyUrl)); + chain.addFilter(new SelfRedirectFilter(canonicalizer)); + + } else if(request.isCaptureQueryRequest()) { + chain.addFilter(new UrlMatchFilter(keyUrl)); + } else if(request.isUrlQueryRequest()) { + chain.addFilter(new UrlPrefixMatchFilter(keyUrl)); + } + + // Date-Filters: + startDate = request.getStartTimestamp(); + if(startDate == null) { + startDate = Timestamp.earliestTimestamp().getDateStr(); + } + endDate = request.getEndTimestamp(); + if(endDate == null) { + endDate = Timestamp.latestTimestamp().getDateStr(); + } + chain.addFilter(new DateRangeFilter(startDate, endDate)); + + // Other Filters: + if(request.isExactHost()) { + chain.addFilter( + new HostMatchFilter( + UrlOperations.urlToHost(request.getRequestUrl()), + this) + ); + } + + if(request.isExactScheme()) { + chain.addFilter(new SchemeMatchFilter( + UrlOperations.urlToScheme(request.getRequestUrl()),this)); + } + } + + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } + + public void annotateResults(SearchResults results) { + + // set the filter properties on the results: + results.putFilter(WaybackRequest.REQUEST_URL, keyUrl); + results.putFilter(WaybackRequest.REQUEST_START_DATE, startDate); + 
results.putFilter(WaybackRequest.REQUEST_END_DATE, endDate); + if(exactDate != null) { + results.putFilter(WaybackRequest.REQUEST_EXACT_DATE, exactDate); + } + results.putFilter(WaybackRequest.REQUEST_TYPE, requestType); + if(!closeMatches.isEmpty()) { + results.setCloseMatches(new ArrayList<String>(closeMatches.values())); + } + } + + public void addCloseMatch(String host, String closeMatch) { + closeMatches.put(host, closeMatch); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroupFactory.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,15 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class QueryCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new QueryCaptureFilterGroup(request,canonicalizer); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/WindowFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/WindowFilterGroup.java (rev 0) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/WindowFilterGroup.java 2009-10-28 00:14:40 UTC (rev 2851) @@ -0,0 +1,63 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; +import org.archive.wayback.resourceindex.filters.WindowEndFilter; +import org.archive.wayback.resourceindex.filters.WindowStartFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class WindowFilterGroup<T> { + int startResult; // calculated based on hits/page * pagenum + int resultsPerPage; + int pageNum; + ObjectFilterChain<T> windowFilters; + WindowStartFilter<T> startFilter; + WindowEndFilter<T> endFilter; + public WindowFilterGroup(WaybackRequest request, LocalResourceIndex index) + throws BadQueryException { + + windowFilters = new ObjectFilterChain<T>(); + // first grab all the info from the WaybackRequest, and validate it: + resultsPerPage = request.getResultsPerPage(); + pageNum = request.getPageNum(); + + if (resultsPerPage < 1) { + throw new BadQueryException("resultsPerPage cannot be < 1"); + } + if (resultsPerPage > index.getMaxRecords()) { + throw new BadQueryException("resultsPerPage cannot be > " + + index.getMaxRecords()); + } + if (pageNum < 1) { + throw new BadQueryException("pageNum must be > 0"); + } + startResult = (pageNum - 1) * resultsPerPage; + startFilter = new WindowStartFilter<T>(startResult); + endFilter = new WindowEndFilter<T>(resultsPerPage); + windowFilters.addFilter(startFilter); + windowFilters.addFilter(endFilter); + } + public List<ObjectFilter<T>> getFilters() { + return windowFilters.getFilters(); + } + + public void annotateResults(SearchResults results) + throws BadQueryException 
{ + results.setFirstReturned(startResult); + results.setNumRequested(resultsPerPage); + int numSeen = endFilter.getNumSeen(); + if(numSeen == 0) { + throw new BadQueryException("No results in requested window"); + } + // how many went by the filters: + results.setMatchingCount(startFilter.getNumSeen()); + + // how many were actually returned: + results.setReturnedCount(endFilter.getNumReturned()); + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2009-12-09 06:50:16
|
Revision: 2938 http://archive-access.svn.sourceforge.net/archive-access/?rev=2938&view=rev Author: bradtofel Date: 2009-12-09 06:50:07 +0000 (Wed, 09 Dec 2009) Log Message: ----------- INITIAL REV: SearchResultSource composed of a series of alphabetically partitioned ziplined CDX files. Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java 2009-12-09 06:50:07 UTC (rev 2938) @@ -0,0 +1,90 @@ +/* StringPrefixIterator + * + * $Id$: + * + * Created on Nov 23, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. 
+ * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; +import java.util.Iterator; + +import org.archive.wayback.util.CloseableIterator; + +/** + * @author brad + * + */ +public class StringPrefixIterator implements CloseableIterator<String> { + private String prefix = null; + Iterator<String> inner = null; + private String cachedNext = null; + private boolean done = false; + public StringPrefixIterator(Iterator<String> inner, String prefix) { + this.prefix = prefix; + this.inner = inner; + } + /* (non-Javadoc) + * @see java.util.Iterator#hasNext() + */ + public boolean hasNext() { + if(done) return false; + if(cachedNext != null) { + return true; + } + while(inner.hasNext()) { + String tmp = inner.next(); + if(tmp.startsWith(prefix)) { + cachedNext = tmp; + return true; + } else if(tmp.compareTo(prefix) > 0) { + done = true; + return false; + } + } + return false; + } + /* (non-Javadoc) + * @see java.util.Iterator#next() + */ + public String next() { + String tmp = cachedNext; + cachedNext = null; + return tmp; + } + /* (non-Javadoc) + * @see java.util.Iterator#remove() + */ + public void remove() { + // TODO Auto-generated method stub + + } + /* (non-Javadoc) + * @see java.io.Closeable#close() + */ + public void close() throws IOException { + 
if(inner instanceof CloseableIterator) { + CloseableIterator<String> toBeClosed = (CloseableIterator<String>) inner; + toBeClosed.close(); + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2009-12-09 06:50:07 UTC (rev 2938) @@ -0,0 +1,68 @@ +/* ZiplinedBlock + * + * $Id$: + * + * Created on Nov 23, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URL; +import java.net.URLConnection; +import java.util.zip.GZIPInputStream; + +/** + * @author brad + * + */ +public class ZiplinedBlock { + String urlOrPath = null; + long offset = -1; + public final static int BLOCK_SIZE = 128 * 1024; + private final static String RANGE_HEADER = "Range"; + private final static String BYTES_HEADER = "bytes="; + private final static String BYTES_MINUS = "-"; + /** + * @param urlOrPath URL where this file can be downloaded + * @param offset start of 128K block boundary. + */ + public ZiplinedBlock(String urlOrPath, long offset) { + this.urlOrPath = urlOrPath; + this.offset = offset; + } + /** + * @return a BufferedReader of the underlying compressed data in this block + * @throws IOException for usual reasons + */ + public BufferedReader readBlock() throws IOException { + URL u = new URL(urlOrPath); + URLConnection uc = u.openConnection(); + StringBuilder sb = new StringBuilder(16); + sb.append(BYTES_HEADER).append(offset).append(BYTES_MINUS); + sb.append((offset + BLOCK_SIZE)-1); + uc.setRequestProperty(RANGE_HEADER, sb.toString()); + return new BufferedReader(new InputStreamReader( + new GZIPInputStream(uc.getInputStream()))); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java 
=================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java 2009-12-09 06:50:07 UTC (rev 2938) @@ -0,0 +1,151 @@ +/* ZiplinesChunkIterator + * + * $Id$: + * + * Created on Nov 23, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.RandomAccessFile; +import java.util.Iterator; +import java.util.List; +import java.util.RandomAccess; +import java.util.zip.GZIPInputStream; + +import org.archive.wayback.util.CloseableIterator; + +/** + * @author brad + * + */ +public class ZiplinesChunkIterator implements CloseableIterator<String> { + private BufferedReader br = null; + private Iterator<ZiplinedBlock> blockItr = null; + private String cachedNext = null; + /** + * @param blocks which should be fetched and unzipped, one after another + */ + public ZiplinesChunkIterator(List<ZiplinedBlock> blocks) { + blockItr = blocks.iterator(); + } + /* (non-Javadoc) + * @see java.util.Iterator#hasNext() + */ + public boolean hasNext() { + if(cachedNext != null) { + return true; + } + while(cachedNext == null) { + if(br != null) { + // attempt to read the next line from this: + try { + cachedNext = br.readLine(); + if(cachedNext == null) { + br = null; + // next loop: + } else { + return true; + } + } catch (IOException e) { + e.printStackTrace(); + br = null; + } + } else { + // do we have more blocks to use? 
+ if(blockItr.hasNext()) { + try { + br = blockItr.next().readBlock(); + } catch (IOException e) { + e.printStackTrace(); + } + } else { + return false; + } + } + } + + return false; + } + + /* (non-Javadoc) + * @see java.util.Iterator#next() + */ + public String next() { + String tmp = cachedNext; + cachedNext = null; + return tmp; + } + + /* (non-Javadoc) + * @see java.util.Iterator#remove() + */ + public void remove() { + throw new UnsupportedOperationException(); + } + + /* (non-Javadoc) + * @see java.io.Closeable#close() + */ + public void close() throws IOException { + if(br != null) { + br.close(); + } + } + public static void main(String[] args) { + if(args.length != 1) { + System.err.println("Usage: ZIPLINES_PATH"); + System.exit(1); + } + File f = new File(args[0]); + long size = f.length(); + long numBlocks = (long) (size / ZiplinedBlock.BLOCK_SIZE); + long size2 = numBlocks * ZiplinedBlock.BLOCK_SIZE; + if(size != size2) { + System.err.println("File size of " + args[0] + " is not a mulitple" + + " of " + ZiplinedBlock.BLOCK_SIZE); + } + try { + RandomAccessFile raf = new RandomAccessFile(f, "r"); + for(int i = 0; i < numBlocks; i++) { + long offset = i * ZiplinedBlock.BLOCK_SIZE; + raf.seek(offset); + BufferedReader br = new BufferedReader(new InputStreamReader( + new GZIPInputStream(new FileInputStream(raf.getFD())))); + String line = br.readLine(); + if(line == null) { + System.err.println("Bad block at " + offset + " in " + args[0]); + System.exit(1); + } + System.out.println(args[0] + " " + offset + " " + line); + } + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesChunkIterator.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2009-12-09 06:50:07 UTC (rev 2938) @@ -0,0 +1,218 @@ +/* ZiplinesSearchResultSource + * + * $Id$: + * + * Created on Nov 23, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import it.unimi.dsi.mg4j.util.FrontCodedStringList; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.exception.ResourceIndexNotAvailableException; +import org.archive.wayback.resourceindex.SearchResultSource; +import org.archive.wayback.resourceindex.cdx.CDXFormatToSearchResultAdapter; +import org.archive.wayback.resourceindex.cdx.format.CDXFormat; +import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; +import org.archive.wayback.util.AdaptedIterator; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.FlatFile; + +/** + * A set of Ziplines files, which are CDX files specially compressed into a + * series of GZipMembers such that: + * + * 1) each member is exactly 128K, padded using a GZip comment header + * 2) each member contains complete lines: no line spans two GZip members + * + * If the data put into these files is sorted, then the data within the files + * can be uncompressed when needed, minimizing the total data to be uncompressed + * + * This SearchResultSource assumes a set of alphabetically partitioned Ziplined + * CDX files, so that each file is sorted, and no regions overlap. + * + * This class takes 2 files as input: + * 1) a specially constructed map of the first N bytes of data from each GZip + * member, and the filename and offset of that GZip member. 
+ * 2) a mapping of filenames to URLs + * + * Data from #1 is actually stored in a serialized + * + * + * + * @author brad + * + */ +public class ZiplinesSearchResultSource implements SearchResultSource { + + /** + * Local path containing map of URL,TIMESTAMP,CHUNK,OFFSET for each 128K chunk + */ + private String chunkIndexPath = null; + private FlatFile chunkIndex = null; + /** + * Local path containing URL for each CHUNK + */ + private String chunkMapPath = null; + private HashMap<String,String> chunkMap = null; + private CDXFormat format = null; + + public ZiplinesSearchResultSource() { + } + public ZiplinesSearchResultSource(CDXFormat format) { + this.format = format; + } + public void init() throws IOException { + chunkMap = new HashMap<String, String>(); + FlatFile ff = new FlatFile(chunkMapPath); + Iterator<String> lines = ff.getSequentialIterator(); + while(lines.hasNext()) { + String line = lines.next(); + String[] parts = line.split("\\s"); + if(parts.length != 2) { + throw new IOException("Bad line(" + line +") in (" + + chunkMapPath + ")"); + } + chunkMap.put(parts[0],parts[1]); + } + chunkIndex = new FlatFile(chunkIndexPath); + } + protected CloseableIterator<CaptureSearchResult> adaptIterator(Iterator<String> itr) + throws IOException { + return new AdaptedIterator<String,CaptureSearchResult>(itr, + new CDXFormatToSearchResultAdapter(format)); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultSource#cleanup(org.archive.wayback.util.CloseableIterator) + */ + public void cleanup(CloseableIterator<CaptureSearchResult> c) + throws IOException { + c.close(); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultSource#getPrefixIterator(java.lang.String) + */ + public CloseableIterator<CaptureSearchResult> getPrefixIterator( + String prefix) throws ResourceIndexNotAvailableException { + try { + return adaptIterator(getStringPrefixIterator(prefix)); + } catch (IOException e) { + throw new 
ResourceIndexNotAvailableException(e.getMessage()); + } + } + + public Iterator<String> getStringPrefixIterator(String prefix) throws ResourceIndexNotAvailableException, IOException { + Iterator<String> itr = chunkIndex.getRecordIteratorLT(prefix); + ArrayList<ZiplinedBlock> blocks = new ArrayList<ZiplinedBlock>(); + boolean first = true; + while(itr.hasNext()) { + String blockDescriptor = itr.next(); + String parts[] = blockDescriptor.split("\t"); + if(parts.length != 3) { + throw new ResourceIndexNotAvailableException("Bad line(" + + blockDescriptor + ")"); + } + // only compare the correct length: + String prefCmp = prefix; + String blockCmp = parts[0]; +// if(prefCmp.length() < blockCmp.length()) { +// blockCmp = blockCmp.substring(0,prefCmp.length()); +// } else { +// prefCmp = prefCmp.substring(0,blockCmp.length()); +// } + if(first) { + // always add first: + first = false; +// } else if(blockCmp.compareTo(prefCmp) > 0) { + } else if(!blockCmp.startsWith(prefCmp)) { + // all done; + break; + } + // add this and keep lookin... + String url = chunkMap.get(parts[1]); + long offset = Long.parseLong(parts[2]); + blocks.add(new ZiplinedBlock(url, offset)); + } + return new StringPrefixIterator(new ZiplinesChunkIterator(blocks),prefix); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultSource#getPrefixReverseIterator(java.lang.String) + */ + public CloseableIterator<CaptureSearchResult> getPrefixReverseIterator( + String prefix) throws ResourceIndexNotAvailableException { + throw new ResourceIndexNotAvailableException("unsupported op"); + } + + /* (non-Javadoc) + * @see org.archive.wayback.resourceindex.SearchResultSource#shutdown() + */ + public void shutdown() throws IOException { + // no-op.. 
+ } + /** + * @return the format + */ + public CDXFormat getFormat() { + return format; + } + /** + * @param format the format to set + */ + public void setFormat(CDXFormat format) { + this.format = format; + } + /** + * @return the chunkIndexPath + */ + public String getChunkIndexPath() { + return chunkIndexPath; + } + /** + * @param chunkIndexPath the chunkIndexPath to set + */ + public void setChunkIndexPath(String chunkIndexPath) { + this.chunkIndexPath = chunkIndexPath; + } + /** + * @return the chunkMapPath + */ + public String getChunkMapPath() { + return chunkMapPath; + } + /** + * @param chunkMapPath the chunkMapPath to set + */ + public void setChunkMapPath(String chunkMapPath) { + this.chunkMapPath = chunkMapPath; + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java 2009-12-09 06:50:07 UTC (rev 2938) @@ -0,0 +1,64 @@ +/* ZiplinesSearchResultSourceTest + * + * $Id$: + * + * Created on Nov 23, 2009. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.util.Iterator; + +import org.archive.wayback.resourceindex.cdx.format.CDXFormat; +import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; + +import junit.framework.TestCase; + +/** + * @author brad + * + */ +public class ZiplinesSearchResultSourceTest extends TestCase { + + /** + * Test method for {@link org.archive.wayback.resourceindex.ziplines.ZiplinesSearchResultSource#getPrefixIterator(java.lang.String)}. + * @throws CDXFormatException + */ + public void testGetPrefixIterator() throws Exception { + CDXFormat format = new CDXFormat(" CDX N b a m s k r M V g"); + ZiplinesSearchResultSource zsrs = new ZiplinesSearchResultSource(format); +// zsrs.setChunkIndexPath("/home/brad/zipline-test/part-00005-frag.cdx.zlm"); +// zsrs.setChunkMapPath("/home/brad/zipline-test/manifest.txt"); + zsrs.setChunkIndexPath("/home/brad/ALL.summary"); + zsrs.setChunkMapPath("/home/brad/ALL.loc"); + zsrs.init(); + Iterator<String> i = zsrs.getStringPrefixIterator("krunch.com/ "); + int max = 100; + int done = 0; + while(i.hasNext()) { + System.out.println(i.next()); + if(done++ > max) { + break; + } + } + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSourceTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development 
platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-04-02 03:28:34
|
Revision: 3020 http://archive-access.svn.sourceforge.net/archive-access/?rev=3020&view=rev Author: bradtofel Date: 2010-04-02 03:28:28 +0000 (Fri, 02 Apr 2010) Log Message: ----------- BUGFIX(unreported): AdaptedIterator implementation which converted an Iterator<CaptureSearchResult> to an Iterator<UrlSearchResult> was not returning the last record... Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultIterator.java Removed Paths: ------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2010-04-02 03:19:58 UTC (rev 3019) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2010-04-02 03:28:28 UTC (rev 3020) @@ -43,7 +43,7 @@ import org.archive.wayback.exception.BadQueryException; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; -import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultAdapter; +import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultIterator; import org.archive.wayback.resourceindex.filterfactory.AccessPointCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.CaptureFilterGroup; import 
org.archive.wayback.resourceindex.filterfactory.CoreCaptureFilterGroupFactory; @@ -242,7 +242,7 @@ uFilters.addFilters(window.getFilters()); CloseableIterator<UrlSearchResult> itrU = new ObjectFilterIterator<UrlSearchResult>( - CaptureToUrlSearchResultAdapter.adaptCaptureIterator(itrC), + new CaptureToUrlSearchResultIterator(itrC), uFilters); while(itrU.hasNext()) { Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java 2010-04-02 03:19:58 UTC (rev 3019) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultAdapter.java 2010-04-02 03:28:28 UTC (rev 3020) @@ -1,115 +0,0 @@ -/* CaptureToUrlSearchResultAdapter - * - * $Id$ - * - * Created on 4:45:55 PM Jun 28, 2008. - * - * Copyright (C) 2008 Internet Archive. - * - * This file is part of wayback. - * - * wayback is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser Public License as published by - * the Free Software Foundation; either version 2.1 of the License, or - * any later version. - * - * wayback is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser Public License for more details. 
- * - * You should have received a copy of the GNU Lesser Public License - * along with wayback; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ -package org.archive.wayback.resourceindex.adapters; - -import java.util.HashMap; - -import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.core.UrlSearchResult; -import org.archive.wayback.util.AdaptedIterator; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.CloseableIterator; - -/** - * - * - * @author brad - * @version $Date$, $Revision$ - */ -public class CaptureToUrlSearchResultAdapter - implements Adapter<CaptureSearchResult, UrlSearchResult> { - - private String currentUrl; - private String originalUrl; - private String firstCapture; - private String lastCapture; - private int numCaptures; - private HashMap<String,Object> digests; - private UrlSearchResult resultRef = null; - public CaptureToUrlSearchResultAdapter() { - - } - private UrlSearchResult makeUrlSearchResult(CaptureSearchResult result) { - currentUrl = result.getUrlKey(); - originalUrl = result.getOriginalUrl(); - firstCapture = result.getCaptureTimestamp(); - lastCapture = firstCapture; - digests = new HashMap<String,Object>(); - digests.put(result.getDigest(),null); - numCaptures = 1; - - resultRef = new UrlSearchResult(); - resultRef.setUrlKey(currentUrl); - resultRef.setOriginalUrl(originalUrl); - resultRef.setFirstCapture(firstCapture); - resultRef.setLastCapture(lastCapture); - resultRef.setNumCaptures(1); - resultRef.setNumVersions(1); - return resultRef; - } - - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public UrlSearchResult adapt(CaptureSearchResult c) { - String urlKey = c.getUrlKey(); - if(resultRef == null || !currentUrl.equals(urlKey)) { - return makeUrlSearchResult(c); - } - - // same url -- accumulate into the last one we returned: - String captureDate = 
c.getCaptureTimestamp(); - if(captureDate.compareTo(firstCapture) < 0) { - firstCapture = captureDate; - resultRef.setFirstCapture(firstCapture); - } - if(captureDate.compareTo(lastCapture) > 0) { - lastCapture = captureDate; - resultRef.setLastCapture(lastCapture); - } - numCaptures++; - digests.put(c.getDigest(), null); - resultRef.setNumCaptures(numCaptures); - resultRef.setNumVersions(digests.size()); - return null; - } - public static CloseableIterator<UrlSearchResult> adaptCaptureIterator( - CloseableIterator<CaptureSearchResult> itr) { - - // HACKHACK: this is pretty lame. We return an UrlSearchResult the - // first time we see a new urlKey, and cache a reference to the returned - // UrlSearchResult, updating it as we see subsequent CaptureSearchResult - // objects with the same urlKey. - // This means that users of the returned UrlSearchResult need to wait - // until they've got the *next* returned UrlSearchResult before using - // the *previous* UrlSearchResult. - // At the moment, this all happens inside a LocalResourceIndex, so - // none of the UrlSearchResult objects should be seen/used in any - // significant way before they've all be accumulated into an - // UrlSearchResults object.. 
- return new AdaptedIterator<CaptureSearchResult,UrlSearchResult>(itr, - new CaptureToUrlSearchResultAdapter()); - } -} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultIterator.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultIterator.java 2010-04-02 03:28:28 UTC (rev 3020) @@ -0,0 +1,138 @@ +/* CaptureToUrlSearchResultIterator + * + * $Id$: + * + * Created on Mar 31, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.adapters; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.NoSuchElementException; + +import org.apache.log4j.Logger; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.UrlSearchResult; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.PeekableIterator; + +/** + * @author brad + * + */ +public class CaptureToUrlSearchResultIterator implements CloseableIterator<UrlSearchResult> { + private static final Logger LOGGER = Logger.getLogger( + CaptureToUrlSearchResultIterator.class.getName()); + private PeekableIterator<CaptureSearchResult> peek = null; + UrlSearchResult cachedNext = null; + /** + * @param itr possibly closeable iterator of CaptureSearchResult objects + */ + public CaptureToUrlSearchResultIterator(Iterator<CaptureSearchResult> itr) { + peek = new PeekableIterator<CaptureSearchResult>(itr); + } + /* (non-Javadoc) + * @see java.util.Iterator#hasNext() + */ + public boolean hasNext() { + createNext(); + return (cachedNext != null); + } + + private void createNext() { + if(cachedNext == null) { + if(peek.hasNext()) { + // populate + CaptureSearchResult captureResult = peek.next(); + String currentKey = captureResult.getUrlKey(); + String originalUrl = captureResult.getOriginalUrl(); + String firstCapture = captureResult.getCaptureTimestamp(); + LOGGER.info("Creating new UrlResult:" + currentKey + " " + + firstCapture); + String lastCapture = firstCapture; + HashMap<String,Object> digests = new HashMap<String,Object>(); + digests.put(captureResult.getDigest(),null); + int numCaptures = 1; + + cachedNext = new UrlSearchResult(); + cachedNext.setUrlKey(currentKey); + 
cachedNext.setOriginalUrl(originalUrl); + + // now rip through the rest until we find either the last + // in the iterator, or the first having a different urlKey: + while((captureResult = peek.peekNext()) != null) { + String urlKey = captureResult.getUrlKey(); + if(currentKey.equals(urlKey)) { + // remove from iterator, and accumulate: + peek.next(); + numCaptures++; + digests.put(captureResult.getDigest(), null); + + String captureTS = captureResult.getCaptureTimestamp(); + if(captureTS.compareTo(firstCapture) < 0) { + firstCapture = captureTS; + } + if(captureTS.compareTo(lastCapture) > 0) { + lastCapture = captureTS; + } + + } else { + // all done. leave the next result and stop processing: + LOGGER.info("Hit next urlKey. Cur("+currentKey+") new(" + + urlKey + ")"); + break; + } + } + cachedNext.setFirstCapture(firstCapture); + cachedNext.setLastCapture(lastCapture); + cachedNext.setNumCaptures(numCaptures); + cachedNext.setNumVersions(digests.size()); + } + } + } + /* (non-Javadoc) + * @see java.util.Iterator#next() + */ + public UrlSearchResult next() { + if(cachedNext == null) { + throw new NoSuchElementException("use hasNext!"); + } + UrlSearchResult tmp = cachedNext; + cachedNext = null; + return tmp; + } + + /* (non-Javadoc) + * @see java.util.Iterator#remove() + */ + public void remove() { + throw new UnsupportedOperationException(); + } + + /* (non-Javadoc) + * @see java.io.Closeable#close() + */ + public void close() throws IOException { + peek.close(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/adapters/CaptureToUrlSearchResultIterator.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-05-18 22:44:29
|
Revision: 3103 http://archive-access.svn.sourceforge.net/archive-access/?rev=3103&view=rev Author: bradtofel Date: 2010-05-18 22:44:22 +0000 (Tue, 18 May 2010) Log Message: ----------- Experimental: changes to make a SearchResultSource that can be directly indexed by ordinal position. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/SequencedSearchResultSource.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/SkippingStringPrefixIterator.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplineBlockMatches.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/SequencedSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/SequencedSearchResultSource.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/SequencedSearchResultSource.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -0,0 +1,40 @@ +/* 
SequencedSearchResultSource + * + * $Id$: + * + * Created on May 14, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.exception.ResourceIndexNotAvailableException; +import org.archive.wayback.util.CloseableIterator; + +/** + * @author brad + * + */ +public interface SequencedSearchResultSource extends SearchResultSource { + public CloseableIterator<CaptureSearchResult> + getPrefixIterator(final String prefix, int startIdx) + throws ResourceIndexNotAvailableException; +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/SequencedSearchResultSource.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/SkippingStringPrefixIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/SkippingStringPrefixIterator.java (rev 0) +++ 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/SkippingStringPrefixIterator.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -0,0 +1,63 @@ +/* SkippingStringPrefixIterator + * + * $Id$: + * + * Created on May 14, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.util.Iterator; + +/** + * @author brad + * + */ +public class SkippingStringPrefixIterator extends StringPrefixIterator { + private long skipCount = 0; + private long totalMatches = -1; + + public SkippingStringPrefixIterator(Iterator<String> inner, String prefix, + long skipCount) { + super(inner,prefix); + this.skipCount = skipCount; + } + public SkippingStringPrefixIterator(Iterator<String> inner, String prefix) { + super(inner,prefix); + } + public long getTotalMatches() { + return totalMatches; + } + public void setTotalMatches(long totalMatches) { + this.totalMatches = totalMatches; + } + public boolean hasNext() { + while(skipCount > 0) { + if(super.hasNext()) { + next(); + skipCount--; + } else { + return false; + } + } + return super.hasNext(); + } +} Property changes on: 
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/SkippingStringPrefixIterator.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java 2010-05-18 22:38:59 UTC (rev 3102) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/StringPrefixIterator.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -47,6 +47,9 @@ truncated = ((ZiplinesChunkIterator)inner).isTruncated(); } } + public long getTotalMatches() { + return 0 ; + } public boolean isTruncated() { return truncated; } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplineBlockMatches.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplineBlockMatches.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplineBlockMatches.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -0,0 +1,141 @@ +/* ZiplineBlockMatches + * + * $Id$: + * + * Created on May 14, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; + +/** + * @author brad + * + */ +public class ZiplineBlockMatches { + private ArrayList<ZiplinedBlock> blocks = null; + private String prefix = null; + private int cachedFirstCount = -1; + private int cachedLastCount = -1; + public ZiplineBlockMatches(ArrayList<ZiplinedBlock> blocks, String prefix) { + this.blocks = blocks; + this.prefix = prefix; + cachedFirstCount = -1; + cachedLastCount = -1; + } + + public StringPrefixIterator getIterator() { + ZiplinesChunkIterator zci = new ZiplinesChunkIterator(blocks); + zci.setTruncated(false); + return new StringPrefixIterator(zci,prefix); + } + + public StringPrefixIterator getIteratorAt(long skip) throws IOException { + SkippingStringPrefixIterator itr = null; + ArrayList<ZiplinedBlock> matchingBlocked = + new ArrayList<ZiplinedBlock>(); + long total = getTotalMatching(); + if(skip > total) { + // TODO: should return empty itr... 
+ return null; + } + long firstBlockMatches = + countMatchesInStartBlock(blocks.get(0), prefix); + if(skip < firstBlockMatches) { + ZiplinesChunkIterator zci = new ZiplinesChunkIterator(blocks); + itr = new SkippingStringPrefixIterator(zci,prefix,skip); + itr.setTotalMatches(total); + return itr; + } + skip -= firstBlockMatches; + int size = blocks.size(); + for(int i = 1; i < size; i++) { + ZiplinedBlock block = blocks.get(i); + if(block.count > skip) { + // this is the block to start: + ZiplinesChunkIterator zci = + new ZiplinesChunkIterator(blocks.subList(i, size)); + itr = new SkippingStringPrefixIterator(zci,prefix,skip); + itr.setTotalMatches(total); + return itr; + } + skip -= block.count; + } + // should never get here... + return null; + } + + public long getTotalMatching() throws IOException { + if(blocks == null) { + return 0; + } + int size = blocks.size(); + if(size == 0) { + return 0; + } + long count = countMatchesInStartBlock(blocks.get(0),prefix); + if(size == 1) { + return count; + } + for(int i = 1; i < size-1; i++) { + count += blocks.get(i).count; + } + count += countMatchesInLastBlock(blocks.get(size-1), prefix); + return count; + } + private long countMatchesInStartBlock(ZiplinedBlock block, String prefix) + throws IOException { + if(cachedFirstCount == -1) { + BufferedReader r = block.readBlock(); + int matches = block.count; + while(true) { + String nextLine = r.readLine(); + if((nextLine == null) || nextLine.startsWith(prefix)) { + r.close(); + cachedFirstCount = matches; + break; + } + matches--; + } + } + return cachedFirstCount; + } + private long countMatchesInLastBlock(ZiplinedBlock block, String prefix) + throws IOException { + if(cachedLastCount == -1) { + BufferedReader r = block.readBlock(); + int matches = 0; + while(true) { + String nextLine = r.readLine(); + if((nextLine == null) || !nextLine.startsWith(prefix)) { + r.close(); + cachedLastCount = matches; + break; + } + matches++; + } + } + return cachedLastCount; + } +} 
Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplineBlockMatches.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2010-05-18 22:38:59 UTC (rev 3102) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlock.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -44,6 +44,7 @@ String urlOrPath = null; long offset = -1; + int count = 0; public final static int BLOCK_SIZE = 128 * 1024; private final static String RANGE_HEADER = "Range"; private final static String BYTES_HEADER = "bytes="; @@ -53,8 +54,17 @@ * @param offset start of 128K block boundary. */ public ZiplinedBlock(String urlOrPath, long offset) { + this(urlOrPath,offset,0); + } + /** + * @param urlOrPath URL where this file can be downloaded + * @param offset start of 128K block boundary. 
+ * @param count number of records in this block + */ + public ZiplinedBlock(String urlOrPath, long offset, int count) { this.urlOrPath = urlOrPath; this.offset = offset; + this.count = count; } /** * @return a BufferedReader of the underlying compressed data in this block Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -0,0 +1,107 @@ +/* ZiplinedBlockIndex + * + * $Id$: + * + * Created on May 14, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. + * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; + +import org.archive.wayback.exception.ResourceIndexNotAvailableException; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.FlatFile; + +/** + * @author brad + * + */ +public class ZiplinedBlockStringSequence { + private FlatFile chunkIndex = null; + private HashMap<String,String> chunkMap = null; + private int maxBlocks = 10000; + + public ZiplinedBlockStringSequence(FlatFile chunkIndex, + HashMap<String,String> chunkMap) { + this.chunkIndex = chunkIndex; + this.chunkMap = chunkMap; + } + + private ZiplineBlockMatches getBlockMatches(String prefix) + throws IOException, ResourceIndexNotAvailableException { + ArrayList<ZiplinedBlock> blocks = new ArrayList<ZiplinedBlock>(); + boolean first = true; + int numBlocks = 0; + boolean truncated = false; + CloseableIterator<String> itr = null; + try { + itr = chunkIndex.getRecordIteratorLT(prefix); + while(itr.hasNext()) { + if(numBlocks >= maxBlocks) { + truncated = true; + break; + } + String blockDescriptor = itr.next(); + numBlocks++; + String parts[] = blockDescriptor.split("\t"); + if(parts.length != 4) { + throw new ResourceIndexNotAvailableException("Bad line(" + + blockDescriptor + ")"); + } + // only compare the correct length: + String prefCmp = prefix; + String blockCmp = parts[0]; + if(first) { + // always add first: + first = false; + } else if(!blockCmp.startsWith(prefCmp)) { + // all done; + break; + } + // add this and keep lookin... 
+ String url = chunkMap.get(parts[1]); + long offset = Long.parseLong(parts[2]); + int count = Integer.parseInt(parts[3]); + + blocks.add(new ZiplinedBlock(url, offset, count)); + } + } finally { + if(itr != null) { + itr.close(); + } + } + return new ZiplineBlockMatches(blocks,prefix); + } + + public StringPrefixIterator getIterator(String prefix, long skip) + throws ResourceIndexNotAvailableException, IOException { + ZiplineBlockMatches matches = getBlockMatches(prefix); + return matches.getIteratorAt(skip); + } + public StringPrefixIterator getIterator(String prefix) + throws ResourceIndexNotAvailableException, IOException { + ZiplineBlockMatches matches = getBlockMatches(prefix); + return matches.getIterator(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequence.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -0,0 +1,85 @@ +/* ZiplinedBlockStringSequenceTest + * + * $Id$: + * + * Created on May 14, 2010. + * + * Copyright (C) 2006 Internet Archive. + * + * This file is part of Wayback. + * + * Wayback is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser Public License as published by + * the Free Software Foundation; either version 2.1 of the License, or + * any later version. 
+ * + * Wayback is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser Public License for more details. + * + * You should have received a copy of the GNU Lesser Public License + * along with Wayback; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +package org.archive.wayback.resourceindex.ziplines; + +import java.io.IOException; +import java.util.HashMap; + +import org.archive.wayback.exception.ResourceIndexNotAvailableException; +import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.flatfile.FlatFile; + +import junit.framework.TestCase; + +/** + * @author brad + * + */ +public class ZiplinedBlockStringSequenceTest extends TestCase { + private String indexPath = "/home/brad/os-cdx/CDX-201002-clean/ALL.count.summary"; + private String mapPath = "/home/brad/os-cdx/CDX-201002-clean/ALL.loc-workstation"; + + private ZiplinedBlockStringSequence getSequence() throws IOException { + HashMap<String, String> chunkMap = new HashMap<String, String>(); + FlatFile ff = new FlatFile(mapPath); + CloseableIterator<String> lines = ff.getSequentialIterator(); + while(lines.hasNext()) { + String line = lines.next(); + String[] parts = line.split("\\s"); + if(parts.length != 2) { + throw new IOException("Bad line(" + line +") in (" + + mapPath + ")"); + } + chunkMap.put(parts[0],parts[1]); + } + lines.close(); + FlatFile chunkIndex = new FlatFile(indexPath); + return new ZiplinedBlockStringSequence(chunkIndex, chunkMap); + } + /** + * Test method for {@link org.archive.wayback.resourceindex.ziplines.ZiplinedBlockStringSequence#getIterator(java.lang.String, long)}. 
+ * @throws IOException + * @throws ResourceIndexNotAvailableException + */ + public void testGetIteratorStringLong() throws IOException, ResourceIndexNotAvailableException { + ZiplinedBlockStringSequence seq = getSequence(); + StringPrefixIterator itr = seq.getIterator("yahoo.com/", 1000000); + System.out.format("Total Matches %d\n",itr.getTotalMatches()); + for(int i = 0; i < 10; i++) { + if(itr.hasNext()) { + System.out.format("Line(%d): %s\n",i,itr.next()); + } + } + } + + /** + * Test method for {@link org.archive.wayback.resourceindex.ziplines.ZiplinedBlockStringSequence#getIterator(java.lang.String)}. + */ + public void testGetIteratorString() { +// fail("Not yet implemented"); + } + +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinedBlockStringSequenceTest.java ___________________________________________________________________ Added: svn:keywords + Author Date Revision Id Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-05-18 22:38:59 UTC (rev 3102) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/ziplines/ZiplinesSearchResultSource.java 2010-05-18 22:44:22 UTC (rev 3103) @@ -39,6 +39,7 @@ import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.resourceindex.SearchResultSource; +import org.archive.wayback.resourceindex.SequencedSearchResultSource; import org.archive.wayback.resourceindex.cdx.CDXFormatToSearchResultAdapter; import org.archive.wayback.resourceindex.cdx.format.CDXFormat; import 
org.archive.wayback.resourceindex.cdx.format.CDXFormatException; @@ -132,10 +133,9 @@ throw new ResourceIndexNotAvailableException(e.getMessage()); } } - - public Iterator<String> getStringPrefixIterator(String prefix) - throws ResourceIndexNotAvailableException, IOException { + private ArrayList<ZiplinedBlock> getBlockListForPrefix(String prefix) + throws IOException, ResourceIndexNotAvailableException { ArrayList<ZiplinedBlock> blocks = new ArrayList<ZiplinedBlock>(); boolean first = true; int numBlocks = 0; @@ -175,8 +175,15 @@ itr.close(); } } + return blocks; + } + + public Iterator<String> getStringPrefixIterator(String prefix) + throws ResourceIndexNotAvailableException, IOException { + + ArrayList<ZiplinedBlock> blocks = getBlockListForPrefix(prefix); ZiplinesChunkIterator zci = new ZiplinesChunkIterator(blocks); - zci.setTruncated(truncated); + zci.setTruncated(false); return new StringPrefixIterator(zci,prefix); } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2010-08-10 19:52:46
|
Revision: 3218 http://archive-access.svn.sourceforge.net/archive-access/?rev=3218&view=rev Author: bradtofel Date: 2010-08-10 19:52:40 +0000 (Tue, 10 Aug 2010) Log Message: ----------- removed getTextContent() methods... Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/RemoteResourceIndex.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 2010-08-10 19:00:31 UTC (rev 3217) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/NutchResourceIndex.java 2010-08-10 19:52:40 UTC (rev 3218) @@ -49,6 +49,7 @@ import org.archive.wayback.util.Timestamp; import org.w3c.dom.Document; import org.w3c.dom.Element; +import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; @@ -215,7 +216,7 @@ results = new ArrayList<CaptureSearchResult>(); for(int i = 0; i < numDates; i++) { - String captureDate = nodes.item(i).getTextContent(); + String captureDate = getNodeTextValue(nodes.item(i)); CaptureSearchResult result = new CaptureSearchResult(); result.setFile(fileName); result.setCaptureTimestamp(captureDate); @@ -313,7 +314,7 @@ NodeList nodes = e.getElementsByTagNameNS(NUTCH_NS, key); String result = null; if (nodes != null && nodes.getLength() > 0) { - result = nodes.item(0).getTextContent(); + result = getNodeTextValue(nodes.item(0)); } return (result == null || result.length() == 0)? 
null: result; } @@ -323,10 +324,18 @@ NodeList nodes = e.getElementsByTagName(key); String result = null; if (nodes != null && nodes.getLength() > 0) { - result = nodes.item(0).getTextContent(); + result = getNodeTextValue(nodes.item(0)); } return (result == null || result.length() == 0)? null: result; } + private String getNodeTextValue(Node n) { + if(n.hasChildNodes()) { + if(n.getFirstChild().getNodeName().equals("#text")) { + return n.getFirstChild().getNodeValue(); + } + } + return ""; + } // do an HTTP request, plus parse the result into an XML DOM protected synchronized Document getHttpDocument(String url) Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/RemoteResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/RemoteResourceIndex.java 2010-08-10 19:00:31 UTC (rev 3217) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/RemoteResourceIndex.java 2010-08-10 19:52:40 UTC (rev 3218) @@ -190,11 +190,19 @@ NodeList list = document.getElementsByTagName( SearchResults.RESULTS_TYPE); if(list.getLength() == 1) { - return list.item(0).getTextContent(); + return getNodeTextValue(list.item(0)); } else { return SearchResults.RESULTS_TYPE_CAPTURE; } } + private String getNodeTextValue(Node n) { + if(n.hasChildNodes()) { + if(n.getFirstChild().getNodeName().equals("#text")) { + return n.getFirstChild().getNodeValue(); + } + } + return ""; + } protected ObjectFilter<CaptureSearchResult> getSearchResultFilters( WaybackRequest wbRequest) { @@ -224,7 +232,7 @@ } for(int i = 0; i < filters.getLength(); i++) { String key = filters.item(i).getNodeName(); - String value = filters.item(i).getTextContent(); + String value = getNodeTextValue(filters.item(i)); if(!key.equals("#text")) { results.putFilter(key,value); } @@ -288,7 +296,7 @@ 
NodeList chitlens = e.getChildNodes(); for(int i = 0; i < chitlens.getLength(); i++) { String key = chitlens.item(i).getNodeName(); - String value = chitlens.item(i).getTextContent(); + String value = getNodeTextValue(chitlens.item(i)); if(!key.equals("#text")) { result.put(key,value); } @@ -330,7 +338,7 @@ NodeList nodes = e.getElementsByTagName(key); String result = null; if (nodes != null && nodes.getLength() > 0) { - result = nodes.item(0).getTextContent(); + result = getNodeTextValue(nodes.item(0)); } return (result == null || result.length() == 0) ? null : result; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <bra...@us...> - 2011-06-16 16:50:19
|
Revision: 3468 http://archive-access.svn.sourceforge.net/archive-access/?rev=3468&view=rev Author: bradtofel Date: 2011-06-16 16:50:12 +0000 (Thu, 16 Jun 2011) Log Message: ----------- BUGFIX: the closest tracking filter was part of the QueryCaptureFilterGroup, forcing this group to be last. In fact, we definitely want to do date and URL filtering before exclusions, and probably want to do it as early as possible. Moved ClosestTrackingFilter into its own FilterGroup, which now is installed last. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroupFactory.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2011-06-16 16:41:19 UTC (rev 3467) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2011-06-16 16:50:12 UTC (rev 3468) @@ -42,6 +42,7 @@ import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultIterator; import org.archive.wayback.resourceindex.filterfactory.AccessPointCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.CaptureFilterGroup; +import 
org.archive.wayback.resourceindex.filterfactory.ClosestTrackingCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.CoreCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.ExclusionCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.FilterGroupFactory; @@ -118,8 +119,9 @@ fgFactories = new ArrayList<FilterGroupFactory>(); fgFactories.add(new CoreCaptureFilterGroupFactory()); fgFactories.add(new AccessPointCaptureFilterGroupFactory()); + fgFactories.add(new QueryCaptureFilterGroupFactory()); fgFactories.add(new ExclusionCaptureFilterGroupFactory()); - fgFactories.add(new QueryCaptureFilterGroupFactory()); + fgFactories.add(new ClosestTrackingCaptureFilterGroupFactory()); } private void cleanupIterator(CloseableIterator<? extends SearchResult> itr) Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroup.java 2011-06-16 16:50:12 UTC (rev 3468) @@ -0,0 +1,45 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.CaptureSearchResults; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import 
org.archive.wayback.resourceindex.filters.ClosestResultTrackingFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class ClosestTrackingCaptureFilterGroup implements CaptureFilterGroup { + private ObjectFilterChain<CaptureSearchResult> chain = null; + private ClosestResultTrackingFilter closestTracker = null; + public ClosestTrackingCaptureFilterGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer) { + chain = new ObjectFilterChain<CaptureSearchResult>(); + if(request.isCaptureQueryRequest() || + request.isReplayRequest()) { + closestTracker = + new ClosestResultTrackingFilter(request.getReplayDate().getTime()); + chain.addFilter(closestTracker); + } + } + + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } + + public void annotateResults(SearchResults results) + throws ResourceNotInArchiveException, BadQueryException, + AccessControlException { + if(closestTracker != null) { + if(results instanceof CaptureSearchResults) { + CaptureSearchResults cResults = (CaptureSearchResults) results; + cResults.setClosest(closestTracker.getClosest()); + } + } + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/ClosestTrackingCaptureFilterGroupFactory.java 2011-06-16 16:50:12 UTC (rev 3468) @@ -0,0 +1,16 @@ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import 
org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class ClosestTrackingCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new ClosestTrackingCaptureFilterGroup(request,canonicalizer); + } + +} Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java 2011-06-16 16:41:19 UTC (rev 3467) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/QueryCaptureFilterGroup.java 2011-06-16 16:50:12 UTC (rev 3468) @@ -27,11 +27,9 @@ import org.apache.commons.httpclient.URIException; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.SearchResults; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadQueryException; -import org.archive.wayback.resourceindex.filters.ClosestResultTrackingFilter; import org.archive.wayback.resourceindex.filters.DateRangeFilter; import org.archive.wayback.resourceindex.filters.HostMatchFilter; import org.archive.wayback.resourceindex.filters.SchemeMatchFilter; @@ -44,12 +42,7 @@ import org.archive.wayback.util.url.UrlOperations; public class QueryCaptureFilterGroup implements CaptureFilterGroup { -// private ObjectFilter<CaptureSearchResult> prefixFilter = null; -// private ObjectFilter<CaptureSearchResult> dateFilter = null; -// private ObjectFilter<CaptureSearchResult> 
selfRedirectFilter = null; -// private ObjectFilter<CaptureSearchResult> exactHost = null; -// private ObjectFilter<CaptureSearchResult> exactScheme = null; - private ClosestResultTrackingFilter closestTracker = null; + private ObjectFilterChain<CaptureSearchResult> chain = null; private String requestType = null; private String keyUrl = null; @@ -94,14 +87,9 @@ Timestamp.parseBefore(anchorTS).getDate().getTime(); } } - - closestTracker = new ClosestResultTrackingFilter( - request.getReplayDate().getTime()); } else if(request.isCaptureQueryRequest()) { chain.addFilter(new UrlMatchFilter(keyUrl)); - closestTracker = new ClosestResultTrackingFilter( - request.getReplayDate().getTime()); } else if(request.isUrlQueryRequest()) { chain.addFilter(new UrlPrefixMatchFilter(keyUrl)); } @@ -130,9 +118,6 @@ chain.addFilter(new SchemeMatchFilter( UrlOperations.urlToScheme(request.getRequestUrl()),this)); } - if(closestTracker != null) { - chain.addFilter(closestTracker); - } } public List<ObjectFilter<CaptureSearchResult>> getFilters() { @@ -152,12 +137,6 @@ if(!closeMatches.isEmpty()) { results.setCloseMatches(new ArrayList<String>(closeMatches.values())); } - if(closestTracker != null) { - if(results instanceof CaptureSearchResults) { - CaptureSearchResults cResults = (CaptureSearchResults) results; - cResults.setClosest(closestTracker.getClosest()); - } - } } public void addCloseMatch(String host, String closeMatch) { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <ikr...@us...> - 2011-11-29 06:04:06
|
Revision: 3574 http://archive-access.svn.sourceforge.net/archive-access/?rev=3574&view=rev Author: ikreymer Date: 2011-11-29 06:03:59 +0000 (Tue, 29 Nov 2011) Log Message: ----------- FIX: Better fix for LocalResourceIndex chain, restoring previous order, but splitting the annotating filters into a separate filter group that gets added after the FilePrefix filter group LOGGING: Add logging to WARCRevisitAnnotationFilter when the base warc record is missing Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroupFactory.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2011-11-29 04:40:21 UTC (rev 3573) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2011-11-29 06:03:59 UTC (rev 3574) @@ -41,6 +41,7 @@ import org.archive.wayback.exception.RuntimeIOException; import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultIterator; import org.archive.wayback.resourceindex.filterfactory.AccessPointCaptureFilterGroupFactory; 
+import org.archive.wayback.resourceindex.filterfactory.AnnotatingCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.CaptureFilterGroup; import org.archive.wayback.resourceindex.filterfactory.ClosestTrackingCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.CoreCaptureFilterGroupFactory; @@ -117,9 +118,10 @@ public LocalResourceIndex() { canonicalizer = new AggressiveUrlCanonicalizer(); fgFactories = new ArrayList<FilterGroupFactory>(); + fgFactories.add(new CoreCaptureFilterGroupFactory()); fgFactories.add(new QueryCaptureFilterGroupFactory()); fgFactories.add(new AccessPointCaptureFilterGroupFactory()); - fgFactories.add(new CoreCaptureFilterGroupFactory()); + fgFactories.add(new AnnotatingCaptureFilterGroupFactory()); fgFactories.add(new ExclusionCaptureFilterGroupFactory()); fgFactories.add(new ClosestTrackingCaptureFilterGroupFactory()); } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroup.java 2011-11-29 06:03:59 UTC (rev 3574) @@ -0,0 +1,56 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.resourceindex.LocalResourceIndex; +import org.archive.wayback.resourceindex.filters.ConditionalGetAnnotationFilter; +import org.archive.wayback.resourceindex.filters.WARCRevisitAnnotationFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class AnnotatingCaptureFilterGroup implements CaptureFilterGroup { + + private ObjectFilterChain<CaptureSearchResult> chain; + + public AnnotatingCaptureFilterGroup(LocalResourceIndex index) { + chain = new ObjectFilterChain<CaptureSearchResult>(); + if(index.isDedupeRecords()) { + chain.addFilter(new WARCRevisitAnnotationFilter()); + chain.addFilter(new ConditionalGetAnnotationFilter()); + } + } + + public void annotateResults(SearchResults results) + throws ResourceNotInArchiveException, BadQueryException, + AccessControlException { + + } + + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroup.java 
___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroupFactory.java 2011-11-29 06:03:59 UTC (rev 3574) @@ -0,0 +1,34 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class AnnotatingCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new AnnotatingCaptureFilterGroup(index); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroupFactory.java ___________________________________________________________________ Added: svn:mime-type + text/plain Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java 2011-11-29 04:40:21 UTC (rev 3573) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java 2011-11-29 06:03:59 UTC (rev 3574) @@ -24,12 +24,10 @@ import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.SearchResults; import org.archive.wayback.resourceindex.LocalResourceIndex; -import org.archive.wayback.resourceindex.filters.ConditionalGetAnnotationFilter; import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter; import org.archive.wayback.resourceindex.filters.GuardRailFilter; import org.archive.wayback.resourceindex.filters.MimeTypeFilter; import org.archive.wayback.resourceindex.filters.UserInfoInAuthorityFilter; -import 
org.archive.wayback.resourceindex.filters.WARCRevisitAnnotationFilter; import org.archive.wayback.util.ObjectFilter; import org.archive.wayback.util.ObjectFilterChain; @@ -42,10 +40,7 @@ chain = new ObjectFilterChain<CaptureSearchResult>(); chain.addFilter(new GuardRailFilter(index.getMaxRecords())); chain.addFilter(new DuplicateRecordFilter()); - if(index.isDedupeRecords()) { - chain.addFilter(new WARCRevisitAnnotationFilter()); - chain.addFilter(new ConditionalGetAnnotationFilter()); - } + MimeTypeFilter mimeExcludeFilter = new MimeTypeFilter(); mimeExcludeFilter.addMime(ALEXA_DAT_MIME); mimeExcludeFilter.setIncludeIfContains(false); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java 2011-11-29 04:40:21 UTC (rev 3573) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java 2011-11-29 06:03:59 UTC (rev 3574) @@ -20,6 +20,7 @@ package org.archive.wayback.resourceindex.filters; import java.util.HashMap; +import java.util.logging.Logger; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.ObjectFilter; @@ -46,6 +47,9 @@ private final static String EMPTY_VALUE = "-"; private final static String REVISIT_VALUE = "warc/revisit"; + + private static final Logger LOGGER = Logger.getLogger( + WARCRevisitAnnotationFilter.class.getName()); private HashMap<String,CaptureSearchResult> memory = null; @@ -57,7 +61,7 @@ String thisDigest = o.getDigest(); CaptureSearchResult last = memory.get(thisDigest); if(last == null) { - // TODO: log missing record digest reference? 
+ LOGGER.warning("Missing revisit base warc for digest: " + o.getDigest() + " url: " + o.getOriginalUrl()); return FILTER_EXCLUDE; } o.setFile(last.getFile()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |