From: <ikr...@us...> - 2011-11-29 06:04:06
|
Revision: 3574 http://archive-access.svn.sourceforge.net/archive-access/?rev=3574&view=rev Author: ikreymer Date: 2011-11-29 06:03:59 +0000 (Tue, 29 Nov 2011) Log Message: ----------- FIX: Better fix for LocalResourceIndex chain, restoring previous order, but splitting the annotating filters into a separate filter group that gets added after the FilePrefix filter group LOGGING: Add logging to WARCRevisitAnnotationFilter when the base warc record is missing Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroup.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroupFactory.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2011-11-29 04:40:21 UTC (rev 3573) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2011-11-29 06:03:59 UTC (rev 3574) @@ -41,6 +41,7 @@ import org.archive.wayback.exception.RuntimeIOException; import org.archive.wayback.resourceindex.adapters.CaptureToUrlSearchResultIterator; import org.archive.wayback.resourceindex.filterfactory.AccessPointCaptureFilterGroupFactory; 
+import org.archive.wayback.resourceindex.filterfactory.AnnotatingCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.CaptureFilterGroup; import org.archive.wayback.resourceindex.filterfactory.ClosestTrackingCaptureFilterGroupFactory; import org.archive.wayback.resourceindex.filterfactory.CoreCaptureFilterGroupFactory; @@ -117,9 +118,10 @@ public LocalResourceIndex() { canonicalizer = new AggressiveUrlCanonicalizer(); fgFactories = new ArrayList<FilterGroupFactory>(); + fgFactories.add(new CoreCaptureFilterGroupFactory()); fgFactories.add(new QueryCaptureFilterGroupFactory()); fgFactories.add(new AccessPointCaptureFilterGroupFactory()); - fgFactories.add(new CoreCaptureFilterGroupFactory()); + fgFactories.add(new AnnotatingCaptureFilterGroupFactory()); fgFactories.add(new ExclusionCaptureFilterGroupFactory()); fgFactories.add(new ClosestTrackingCaptureFilterGroupFactory()); } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroup.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroup.java 2011-11-29 06:03:59 UTC (rev 3574) @@ -0,0 +1,56 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.archive.wayback.resourceindex.filterfactory; + +import java.util.List; + +import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.core.SearchResults; +import org.archive.wayback.exception.AccessControlException; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.exception.ResourceNotInArchiveException; +import org.archive.wayback.resourceindex.LocalResourceIndex; +import org.archive.wayback.resourceindex.filters.ConditionalGetAnnotationFilter; +import org.archive.wayback.resourceindex.filters.WARCRevisitAnnotationFilter; +import org.archive.wayback.util.ObjectFilter; +import org.archive.wayback.util.ObjectFilterChain; + +public class AnnotatingCaptureFilterGroup implements CaptureFilterGroup { + + private ObjectFilterChain<CaptureSearchResult> chain; + + public AnnotatingCaptureFilterGroup(LocalResourceIndex index) { + chain = new ObjectFilterChain<CaptureSearchResult>(); + if(index.isDedupeRecords()) { + chain.addFilter(new WARCRevisitAnnotationFilter()); + chain.addFilter(new ConditionalGetAnnotationFilter()); + } + } + + public void annotateResults(SearchResults results) + throws ResourceNotInArchiveException, BadQueryException, + AccessControlException { + + } + + public List<ObjectFilter<CaptureSearchResult>> getFilters() { + return chain.getFilters(); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroup.java 
___________________________________________________________________ Added: svn:mime-type + text/plain Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroupFactory.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroupFactory.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroupFactory.java 2011-11-29 06:03:59 UTC (rev 3574) @@ -0,0 +1,34 @@ +/* + * This file is part of the Wayback archival access software + * (http://archive-access.sourceforge.net/projects/wayback/). + * + * Licensed to the Internet Archive (IA) by one or more individual + * contributors. + * + * The IA licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.archive.wayback.resourceindex.filterfactory; + +import org.archive.wayback.UrlCanonicalizer; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.exception.BadQueryException; +import org.archive.wayback.resourceindex.LocalResourceIndex; + +public class AnnotatingCaptureFilterGroupFactory implements FilterGroupFactory { + + public CaptureFilterGroup getGroup(WaybackRequest request, + UrlCanonicalizer canonicalizer, LocalResourceIndex index) + throws BadQueryException { + return new AnnotatingCaptureFilterGroup(index); + } +} Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/AnnotatingCaptureFilterGroupFactory.java ___________________________________________________________________ Added: svn:mime-type + text/plain Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java 2011-11-29 04:40:21 UTC (rev 3573) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filterfactory/CoreCaptureFilterGroup.java 2011-11-29 06:03:59 UTC (rev 3574) @@ -24,12 +24,10 @@ import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.core.SearchResults; import org.archive.wayback.resourceindex.LocalResourceIndex; -import org.archive.wayback.resourceindex.filters.ConditionalGetAnnotationFilter; import org.archive.wayback.resourceindex.filters.DuplicateRecordFilter; import org.archive.wayback.resourceindex.filters.GuardRailFilter; import org.archive.wayback.resourceindex.filters.MimeTypeFilter; import org.archive.wayback.resourceindex.filters.UserInfoInAuthorityFilter; -import 
org.archive.wayback.resourceindex.filters.WARCRevisitAnnotationFilter; import org.archive.wayback.util.ObjectFilter; import org.archive.wayback.util.ObjectFilterChain; @@ -42,10 +40,7 @@ chain = new ObjectFilterChain<CaptureSearchResult>(); chain.addFilter(new GuardRailFilter(index.getMaxRecords())); chain.addFilter(new DuplicateRecordFilter()); - if(index.isDedupeRecords()) { - chain.addFilter(new WARCRevisitAnnotationFilter()); - chain.addFilter(new ConditionalGetAnnotationFilter()); - } + MimeTypeFilter mimeExcludeFilter = new MimeTypeFilter(); mimeExcludeFilter.addMime(ALEXA_DAT_MIME); mimeExcludeFilter.setIncludeIfContains(false); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java 2011-11-29 04:40:21 UTC (rev 3573) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/WARCRevisitAnnotationFilter.java 2011-11-29 06:03:59 UTC (rev 3574) @@ -20,6 +20,7 @@ package org.archive.wayback.resourceindex.filters; import java.util.HashMap; +import java.util.logging.Logger; import org.archive.wayback.core.CaptureSearchResult; import org.archive.wayback.util.ObjectFilter; @@ -46,6 +47,9 @@ private final static String EMPTY_VALUE = "-"; private final static String REVISIT_VALUE = "warc/revisit"; + + private static final Logger LOGGER = Logger.getLogger( + WARCRevisitAnnotationFilter.class.getName()); private HashMap<String,CaptureSearchResult> memory = null; @@ -57,7 +61,7 @@ String thisDigest = o.getDigest(); CaptureSearchResult last = memory.get(thisDigest); if(last == null) { - // TODO: log missing record digest reference? 
+ LOGGER.warning("Missing revisit base warc for digest: " + o.getDigest() + " url: " + o.getOriginalUrl()); return FILTER_EXCLUDE; } o.setFile(last.getFile()); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |