From: <bra...@us...> - 2008-01-15 03:03:16
|
Revision: 2138 http://archive-access.svn.sourceforge.net/archive-access/?rev=2138&view=rev Author: bradtofel Date: 2008-01-14 19:03:21 -0800 (Mon, 14 Jan 2008) Log Message: ----------- FEATURE: UrlCanonicalizer customization capabilities. Previous hard-coded UrlCanonicalizer is now AggressiveUrlCanonicalizer, which is the default, but now it can be overridden with another UrlCanonicalizer implementation. main() of WarcIndexer and ArcIndexer now include parsing of "-identity" option, which causes the IdentityUrlCanonicalizer to be used -- passing through URLs to the CDX as they appear in the ARC file. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SelfRedirectFilter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java 2008-01-15 03:00:16 UTC (rev 2137) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/liveweb/LiveWebCache.java 2008-01-15 03:03:21 UTC (rev 2138) @@ -32,6 +32,7 @@ import org.apache.commons.httpclient.URIException; import org.archive.io.arc.ARCLocation; import org.archive.io.arc.ARCRecord; +import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.Resource; @@ -44,8 +45,7 @@ import org.archive.wayback.exception.WaybackException; import org.archive.wayback.resourcestore.ARCRecordToSearchResultAdapter; import org.archive.wayback.resourcestore.ArcResource; -import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.UrlCanonicalizer; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; /** * @@ -61,10 +61,15 @@ private ARCCacheDirectory arcCacheDir = null; private URLCacher cacher = null; private LiveWebLocalResourceIndex index = null; - static UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); - private static Adapter<ARCRecord,SearchResult> adapter = - new ARCRecordToSearchResultAdapter(); + private UrlCanonicalizer canonicalizer = null; + private ARCRecordToSearchResultAdapter adapter = null; + public LiveWebCache() { + canonicalizer = new AggressiveUrlCanonicalizer(); + adapter = new ARCRecordToSearchResultAdapter(); + adapter.setCanonicalizer(canonicalizer); + } + /** * closes all resources */ @@ -330,4 +335,13 @@ public void setIndex(LiveWebLocalResourceIndex index) { this.index = index; } + + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + adapter.setCanonicalizer(canonicalizer); + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2008-01-15 03:00:16 UTC (rev 2137) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/LocalResourceIndex.java 2008-01-15 03:03:21 UTC (rev 2138) @@ -25,11 +25,13 @@ package org.archive.wayback.resourceindex; import java.io.IOException; +import java.util.Iterator; import org.apache.commons.httpclient.URIException; import org.archive.net.UURI; import org.archive.net.UURIFactory; import org.archive.wayback.ResourceIndex; +import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.WaybackConstants; import org.archive.wayback.resourceindex.filters.CaptureToUrlResultFilter; import org.archive.wayback.resourceindex.filters.CounterFilter; @@ -39,7 +41,6 @@ import org.archive.wayback.resourceindex.filters.GuardRailFilter; import org.archive.wayback.resourceindex.filters.HostMatchFilter; import org.archive.wayback.resourceindex.filters.SelfRedirectFilter; -import org.archive.wayback.resourceindex.filters.StartDateFilter; import org.archive.wayback.resourceindex.filters.UrlMatchFilter; import org.archive.wayback.resourceindex.filters.UrlPrefixMatchFilter; import org.archive.wayback.resourceindex.filters.WindowEndFilter; @@ -58,7 +59,7 @@ import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.ObjectFilter; import org.archive.wayback.util.ObjectFilterChain; -import org.archive.wayback.util.UrlCanonicalizer; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; /** * @@ -77,11 +78,15 @@ protected SearchResultSource source; - private UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); + private UrlCanonicalizer canonicalizer = null; private boolean dedupeRecords = false; - private void filterRecords(CloseableIterator<SearchResult> itr, + public LocalResourceIndex() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + + private void filterRecords(Iterator<SearchResult> itr, ObjectFilter<SearchResult> filter, SearchResults results, boolean forwards) throws IOException { @@ -98,7 +103,11 @@ results.addSearchResult(result, forwards); } } - source.cleanup(itr); + if(itr instanceof CloseableIterator) { + CloseableIterator<SearchResult> citr = + (CloseableIterator<SearchResult>) itr; + source.cleanup(citr); + } } private String getRequired(WaybackRequest wbRequest, String field, @@ -216,27 +225,27 @@ if (searchType.equals(WaybackConstants.REQUEST_REPLAY_QUERY) || searchType.equals(WaybackConstants.REQUEST_CLOSEST_QUERY)) { - results = new CaptureSearchResults(); + results = new CaptureSearchResults(); + ObjectFilterChain<SearchResult> forwardFilters = new ObjectFilterChain<SearchResult>(); - ObjectFilterChain<SearchResult> reverseFilters = - new ObjectFilterChain<SearchResult>(); +// ObjectFilterChain<SearchResult> reverseFilters = +// new ObjectFilterChain<SearchResult>(); + // use the same guardrail for both: forwardFilters.addFilter(guardrail); - reverseFilters.addFilter(guardrail); +// reverseFilters.addFilter(guardrail); - // BUGBUG: won't work when closest is a dupe! forwardFilters.addFilter(new DuplicateRecordFilter()); - reverseFilters.addFilter(new DuplicateRecordFilter()); // match URL key: forwardFilters.addFilter(new UrlMatchFilter(keyUrl)); - reverseFilters.addFilter(new UrlMatchFilter(keyUrl)); +// reverseFilters.addFilter(new UrlMatchFilter(keyUrl)); if(hostMatchFilter != null) { forwardFilters.addFilter(hostMatchFilter); - reverseFilters.addFilter(hostMatchFilter); +// reverseFilters.addFilter(hostMatchFilter); } // be sure to only include records within the date range we want: @@ -246,11 +255,11 @@ // requested range. DateRangeFilter drFilter = new DateRangeFilter(startDate,endDate); forwardFilters.addFilter(drFilter); - reverseFilters.addFilter(drFilter); +// reverseFilters.addFilter(drFilter); // abort processing if we hit a date outside the search range: forwardFilters.addFilter(new EndDateFilter(endDate)); - reverseFilters.addFilter(new StartDateFilter(startDate)); +// reverseFilters.addFilter(new StartDateFilter(startDate)); // for replay, do not include records that redirect to // themselves.. We'll leave this for both closest and replays, @@ -258,39 +267,54 @@ // timeline in which case, we don't want to show captures that // redirect to themselves in the timeline if they are not viewable. SelfRedirectFilter selfRedirectFilter = new SelfRedirectFilter(); + selfRedirectFilter.setCanonicalizer(canonicalizer); forwardFilters.addFilter(selfRedirectFilter); - reverseFilters.addFilter(selfRedirectFilter); +// reverseFilters.addFilter(selfRedirectFilter); // possibly filter via exclusions: if(exclusion != null) { forwardFilters.addFilter(preExCounter); forwardFilters.addFilter(exclusion); - reverseFilters.addFilter(preExCounter); - reverseFilters.addFilter(exclusion); +// reverseFilters.addFilter(preExCounter); +// reverseFilters.addFilter(exclusion); } forwardFilters.addFilter(finalCounter); - reverseFilters.addFilter(finalCounter); +// reverseFilters.addFilter(finalCounter); - int resultsPerDirection = (int) Math.floor(resultsPerPage / 2); - if (resultsPerDirection * 2 == resultsPerPage) { - forwardFilters.addFilter(new WindowEndFilter( - resultsPerDirection)); - } else { - forwardFilters.addFilter(new WindowEndFilter( - resultsPerDirection + 1)); - } - reverseFilters.addFilter(new WindowEndFilter(resultsPerDirection)); + forwardFilters.addFilter(new WindowEndFilter(resultsPerPage)); +// int resultsPerDirection = (int) Math.floor(resultsPerPage / 2); +// reverseFilters.addFilter(new WindowEndFilter(resultsPerDirection)); - startKey = keyUrl + " " + exactDate; + startKey = keyUrl; - // first the reverse search: try { - filterRecords(source.getPrefixIterator(startKey), reverseFilters, - results, true); - // then the forwards: - filterRecords(source.getPrefixReverseIterator(startKey), - forwardFilters, results, false); +// CloseableIterator<SearchResult> reverse = +// new AdaptedObjectFilterIterator<SearchResult>( +// source.getPrefixReverseIterator(startKey), +// reverseFilters); + +// // reverse the reverseResults: +// ArrayList<SearchResult> reverseResults = +// new ArrayList<SearchResult>(); +// while(reverse.hasNext()) { +// reverseResults.add(0, reverse.next()); +// } + + // now make a composite of the reverse and forwards: + + CloseableIterator<SearchResult> forward = + source.getPrefixIterator(startKey); +// +// CompositeIterator<SearchResult> resultsItr = +// new CompositeIterator<SearchResult>(); +// resultsItr.addComponent(reverseResults.iterator()); +// resultsItr.addComponent(forward); + + // and filter: +// filterRecords(resultsItr, forwardFilters, results, true); + filterRecords(forward, forwardFilters, results, true); + } catch (IOException e) { throw new ResourceIndexNotAvailableException( e.getLocalizedMessage()); @@ -345,13 +369,11 @@ } filters.addFilter(new DateRangeFilter(startDate, endDate)); // possibly filter via exclusions: - if (exclusion == null) { - filters.addFilter(new CaptureToUrlResultFilter()); - } else { + if (exclusion != null) { filters.addFilter(preExCounter); filters.addFilter(exclusion); - filters.addFilter(new CaptureToUrlResultFilter()); } + filters.addFilter(new CaptureToUrlResultFilter()); filters.addFilter(finalCounter); startKey = keyUrl; @@ -430,4 +452,12 @@ public void setDedupeRecords(boolean dedupeRecords) { this.dedupeRecords = dedupeRecords; } + + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java 2008-01-15 03:00:16 UTC (rev 2137) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/distributed/AlphaPartitionedIndex.java 2008-01-15 03:03:21 UTC (rev 2138) @@ -35,6 +35,7 @@ import org.apache.commons.httpclient.URIException; import org.archive.wayback.ResourceIndex; +import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.SearchResults; import org.archive.wayback.core.WaybackRequest; @@ -42,8 +43,8 @@ import org.archive.wayback.exception.BadQueryException; import org.archive.wayback.exception.ResourceIndexNotAvailableException; import org.archive.wayback.exception.ResourceNotInArchiveException; -import org.archive.wayback.util.UrlCanonicalizer; import org.archive.wayback.util.flatfile.FlatFile; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; /** * @@ -75,8 +76,12 @@ private String mapPath; private static Comparator<RangeGroup> comparator = RangeGroup.getComparator(); - private UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); + private UrlCanonicalizer canonicalizer = null; + public AlphaPartitionedIndex() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + @SuppressWarnings("unchecked") private void reloadMapFile() throws IOException { FlatFile ff = new FlatFile(mapPath); @@ -235,4 +240,12 @@ public void setMapPath(String mapPath) { this.mapPath = mapPath; } + + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SelfRedirectFilter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SelfRedirectFilter.java 2008-01-15 03:00:16 UTC (rev 2137) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourceindex/filters/SelfRedirectFilter.java 2008-01-15 03:03:21 UTC (rev 2138) @@ -25,10 +25,11 @@ package org.archive.wayback.resourceindex.filters; import org.apache.commons.httpclient.URIException; +import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.SearchResult; import org.archive.wayback.util.ObjectFilter; -import org.archive.wayback.util.UrlCanonicalizer; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; /** * SearchResultFilter which INCLUDEs all records, unless they redirect to @@ -39,7 +40,10 @@ */ public class SelfRedirectFilter implements ObjectFilter<SearchResult> { - private UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); + private UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + public SelfRedirectFilter() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } /* (non-Javadoc) * @see org.archive.wayback.resourceindex.SearchResultFilter#filterSearchResult(org.archive.wayback.core.SearchResult) */ @@ -63,4 +67,10 @@ } return FILTER_INCLUDE; } + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java 2008-01-15 03:00:16 UTC (rev 2137) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java 2008-01-15 03:03:21 UTC (rev 2138) @@ -34,10 +34,11 @@ import org.archive.io.arc.ARCRecordMetaData; import org.archive.net.UURI; import org.archive.net.UURIFactory; +import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.SearchResult; import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.UrlCanonicalizer; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; /** * @@ -51,9 +52,11 @@ private static final Logger LOGGER = Logger.getLogger( ARCRecordToSearchResultAdapter.class.getName()); - // TODO: make this configurable based on the ResourceIndex - private static UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); - + private UrlCanonicalizer canonicalizer = null; + + public ARCRecordToSearchResultAdapter() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } // public static SearchResult arcRecordToSearchResult(final ARCRecord rec) // throws IOException, ParseException { /* (non-Javadoc) @@ -161,4 +164,10 @@ } return result; } + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2008-01-15 03:00:16 UTC (rev 2137) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java 2008-01-15 03:03:21 UTC (rev 2138) @@ -33,11 +33,14 @@ import org.archive.io.arc.ARCReader; import org.archive.io.arc.ARCReaderFactory; import org.archive.io.arc.ARCRecord; +import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.SearchResult; import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; /** * Transforms an ARC file into Iterator<SearchResult>. @@ -51,7 +54,12 @@ * CDX Header line for these fields. not very configurable.. */ public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; - + private UrlCanonicalizer canonicalizer = null; + + public ArcIndexer() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + /** * @param arc * @return Iterator of SearchResults for input arc File @@ -65,8 +73,9 @@ Adapter<ArchiveRecord,ARCRecord> adapter1 = new ArchiveRecordToARCRecordAdapter(); - Adapter<ARCRecord,SearchResult> adapter2 = + ARCRecordToSearchResultAdapter adapter2 = new ARCRecordToSearchResultAdapter(); + adapter2.setCanonicalizer(canonicalizer); Iterator<ArchiveRecord> itr1 = arcReader.iterator(); @@ -76,29 +85,22 @@ return new AdaptedIterator<ARCRecord,SearchResult>(itr2,adapter2); } - - private class ArchiveRecordToARCRecordAdapter - implements Adapter<ArchiveRecord,ARCRecord> { + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public ARCRecord adapt(ArchiveRecord o) { - ARCRecord rec = null; - if(o instanceof ARCRecord) { - rec = (ARCRecord) o; - } - return rec; - } + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; } private static void USAGE() { System.err.println("USAGE:"); System.err.println(""); - System.err.println("arc-indexer ARCFILE"); - System.err.println("arc-indexer ARCFILE CDXFILE"); + System.err.println("arc-indexer [-identity] ARCFILE"); + System.err.println("arc-indexer [-identity] ARCFILE CDXFILE"); System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); + System.err.println("Create a CDX format index at CDXFILE or to STDOUT."); + System.err.println("With -identity, perform no url canonicalization."); System.exit(1); } @@ -107,14 +109,20 @@ */ public static void main(String[] args) { ArcIndexer indexer = new ArcIndexer(); - File arc = new File(args[0]); + int idx = 0; + if(args[0] != null && args[0].equals("-identity")) { + indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); + idx++; + } + File arc = new File(args[idx]); + idx++; PrintWriter pw = null; try { - if(args.length == 1) { + if(args.length == idx) { // dump to STDOUT: pw = new PrintWriter(System.out); - } else if(args.length == 2) { - pw = new PrintWriter(args[1]); + } else if(args.length == (idx + 1)) { + pw = new PrintWriter(args[idx]); } else { USAGE(); } @@ -126,6 +134,22 @@ pw.close(); } catch (Exception e) { e.printStackTrace(); + System.exit(1); } } + + private class ArchiveRecordToARCRecordAdapter + implements Adapter<ArchiveRecord,ARCRecord> { + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public ARCRecord adapt(ArchiveRecord o) { + ARCRecord rec = null; + if(o instanceof ARCRecord) { + rec = (ARCRecord) o; + } + return rec; + } + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java 2008-01-15 03:00:16 UTC (rev 2137) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java 2008-01-15 03:03:21 UTC (rev 2138) @@ -16,10 +16,11 @@ import org.archive.io.warc.WARCRecord; import org.archive.net.UURI; import org.archive.net.UURIFactory; +import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.WaybackConstants; import org.archive.wayback.core.SearchResult; import org.archive.wayback.util.Adapter; -import org.archive.wayback.util.UrlCanonicalizer; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; /** * Adapts certain WARCRecords into SearchResults. DNS and response records are @@ -52,9 +53,12 @@ private static final Logger LOGGER = Logger.getLogger( WARCRecordToSearchResultAdapter.class.getName()); - // TODO: make this configurable based on the ResourceIndex - private static UrlCanonicalizer canonicalizer = new UrlCanonicalizer(); + private UrlCanonicalizer canonicalizer = null; + public WARCRecordToSearchResultAdapter() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + /* (non-Javadoc) * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) */ @@ -303,4 +307,12 @@ return result; } + + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } + + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; + } } Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2008-01-15 03:00:16 UTC (rev 2137) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2008-01-15 03:03:21 UTC (rev 2138) @@ -9,11 +9,14 @@ import org.archive.io.warc.WARCReader; import org.archive.io.warc.WARCReaderFactory; import org.archive.io.warc.WARCRecord; +import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.SearchResult; import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.CloseableIterator; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.IdentityUrlCanonicalizer; public class WarcIndexer { @@ -22,6 +25,11 @@ */ public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; + private UrlCanonicalizer canonicalizer = null; + public WarcIndexer() { + canonicalizer = new AggressiveUrlCanonicalizer(); + } + /** * @param arc * @return Iterator of SearchResults for input arc File @@ -32,7 +40,10 @@ Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter(); - Adapter<WARCRecord, SearchResult> adapter2 = new WARCRecordToSearchResultAdapter(); + WARCRecordToSearchResultAdapter adapter2 = + new WARCRecordToSearchResultAdapter(); + adapter2.setCanonicalizer(canonicalizer); + WARCReader reader = WARCReaderFactory.get(warc); Iterator<ArchiveRecord> itr1 = reader.iterator(); @@ -43,28 +54,22 @@ return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2); } - private class ArchiveRecordToWARCRecordAdapter implements - Adapter<ArchiveRecord, WARCRecord> { + public UrlCanonicalizer getCanonicalizer() { + return canonicalizer; + } - /* (non-Javadoc) - * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) - */ - public WARCRecord adapt(ArchiveRecord o) { - WARCRecord rec = null; - if (o instanceof WARCRecord) { - rec = (WARCRecord) o; - } - return rec; - } + public void setCanonicalizer(UrlCanonicalizer canonicalizer) { + this.canonicalizer = canonicalizer; } - + private static void USAGE() { System.err.println("USAGE:"); System.err.println(""); - System.err.println("warc-indexer WARCFILE"); - System.err.println("warc-indexer WARCFILE CDXFILE"); + System.err.println("warc-indexer [-identity] WARCFILE"); + System.err.println("warc-indexer [-identity] WARCFILE CDXFILE"); System.err.println(""); System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); + System.err.println("With -identity, perform no url canonicalization."); System.exit(1); } @@ -73,13 +78,19 @@ */ public static void main(String[] args) { WarcIndexer indexer = new WarcIndexer(); - File arc = new File(args[0]); + int idx = 0; + if(args[0] != null && args[0].equals("-identity")) { + indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); + idx++; + } + File arc = new File(args[idx]); + idx++; PrintWriter pw = null; try { - if (args.length == 1) { + if (args.length == idx) { // dump to STDOUT: pw = new PrintWriter(System.out); - } else if (args.length == 2) { + } else if (args.length == (idx+1)) { pw = new PrintWriter(args[1]); } else { USAGE(); @@ -94,4 +105,19 @@ e.printStackTrace(); } } + + private class ArchiveRecordToWARCRecordAdapter implements + Adapter<ArchiveRecord, WARCRecord> { + + /* (non-Javadoc) + * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object) + */ + public WARCRecord adapt(ArchiveRecord o) { + WARCRecord rec = null; + if (o instanceof WARCRecord) { + rec = (WARCRecord) o; + } + return rec; + } + } } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |