From: <bra...@us...> - 2009-11-06 01:49:43
|
Revision: 2885 http://archive-access.svn.sourceforge.net/archive-access/?rev=2885&view=rev Author: bradtofel Date: 2009-11-06 01:49:32 +0000 (Fri, 06 Nov 2009) Log Message: ----------- REFACTOR: moved main() from ArcIndexer and WarcIndexer into IndexWorker Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2009-11-06 01:42:28 UTC (rev 2884) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java 2009-11-06 01:49:32 UTC (rev 2885) @@ -25,9 +25,7 @@ package org.archive.wayback.resourcestore.indexer; import java.io.File; -import java.io.PrintWriter; import java.io.IOException; -import java.util.Iterator; import org.archive.io.ArchiveRecord; import org.archive.io.arc.ARCReader; @@ -35,12 +33,10 @@ import org.archive.io.arc.ARCRecord; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.IdentityUrlCanonicalizer; /** * Transforms an ARC file into Iterator<CaptureSearchResult>. @@ -50,10 +46,6 @@ */ public class ArcIndexer { - /** - * CDX Header line for these fields. not very configurable.. - */ - public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; private UrlCanonicalizer canonicalizer = null; public ArcIndexer() { @@ -113,51 +105,6 @@ this.canonicalizer = canonicalizer; } - private static void USAGE() { - System.err.println("USAGE:"); - System.err.println(""); - System.err.println("arc-indexer [-identity] ARCFILE"); - System.err.println("arc-indexer [-identity] ARCFILE CDXFILE"); - System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT."); - System.err.println("With -identity, perform no url canonicalization."); - System.exit(1); - } - - /** - * @param args - */ - public static void main(String[] args) { - ArcIndexer indexer = new ArcIndexer(); - int idx = 0; - if(args[0] != null && args[0].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); - idx++; - } - File arc = new File(args[idx]); - idx++; - PrintWriter pw = null; - try { - if(args.length == idx) { - // dump to STDOUT: - pw = new PrintWriter(System.out); - } else if(args.length == (idx + 1)) { - pw = new PrintWriter(args[idx]); - } else { - USAGE(); - } - Iterator<CaptureSearchResult> res = indexer.iterator(arc); - Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); - while(lines.hasNext()) { - pw.println(lines.next()); - } - pw.close(); - } catch (Exception e) { - e.printStackTrace(); - System.exit(1); - } - } - private class ArchiveRecordToARCRecordAdapter implements Adapter<ArchiveRecord,ARCRecord> { Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2009-11-06 01:42:28 UTC (rev 2884) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/IndexWorker.java 2009-11-06 01:49:32 UTC (rev 2885) @@ -24,16 +24,23 @@ */ package org.archive.wayback.resourcestore.indexer; +import java.io.FileNotFoundException; import java.io.IOException; +import java.io.PrintWriter; +import java.util.Iterator; import java.util.logging.Logger; import org.archive.wayback.Shutdownable; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; +import org.archive.wayback.resourceindex.cdx.CDXFormatIndex; +import org.archive.wayback.resourceindex.cdx.SearchResultToCDXFormatAdapter; +import org.archive.wayback.resourceindex.cdx.format.CDXFormat; +import org.archive.wayback.resourceindex.cdx.format.CDXFormatException; import org.archive.wayback.resourceindex.updater.IndexClient; import org.archive.wayback.resourcestore.locationdb.ResourceFileLocationDB; import org.archive.wayback.util.CloseableIterator; -//import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; +import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; import org.archive.wayback.util.url.IdentityUrlCanonicalizer; /** @@ -112,6 +119,7 @@ } } catch(IOException e) { LOGGER.severe("FAILED to index or upload (" + name + ")"); + e.printStackTrace(); } } return worked; @@ -133,7 +141,86 @@ } return itr; } + + private static void USAGE() { + System.err.println("USAGE:"); + System.err.println(""); + System.err.println("cdx-indexer [-format FORMAT|-identity] FILE"); + System.err.println("cdx-indexer [-format FORMAT|-identity] FILE CDXFILE"); + System.err.println(""); + System.err.println("Create a CDX format index from ARC or WARC file"); + System.err.println("FILE at CDXFILE or to STDOUT."); + System.err.println("With -identity, perform no url canonicalization."); + System.err.println("With -format, output CDX in format FORMAT."); + System.exit(1); + } + /** + * @param args + */ + public static void main(String[] args) { + String cdxSpec = CDXFormatIndex.CDX_HEADER_MAGIC; + PrintWriter pw = new PrintWriter(System.out); + UrlCanonicalizer canonicalizer = new AggressiveUrlCanonicalizer(); + boolean setFormat = false; + boolean isIdentity = false; + String path = null; + for(int idx = 0; idx < args.length; idx++) { + if(args[idx].equals("-identity")) { + canonicalizer = new IdentityUrlCanonicalizer(); + isIdentity = true; + } else if(args[idx].equals("-format")) { + idx++; + if(idx >= args.length) { + USAGE(); + } + cdxSpec = args[idx]; + setFormat = true; + } else { + // either input filename: + if(path == null) { + path = args[idx]; + } else { + // or if that's already been specified, then target file: + if(idx+1 != args.length){ + USAGE(); + } + try { + pw = new PrintWriter(args[idx]); + } catch (FileNotFoundException e) { + e.printStackTrace(); + System.exit(1); + } + break; + } + } + } + if(!setFormat && isIdentity) { + cdxSpec = cdxSpec.replace(" N ", " a "); + } + IndexWorker worker = new IndexWorker(); + worker.canonicalizer = canonicalizer; + worker.interval = 0; + worker.init(); + try { + CloseableIterator<CaptureSearchResult> itr = worker.indexFile(path); + CDXFormat cdxFormat = new CDXFormat(cdxSpec); + Iterator<String> lines = + SearchResultToCDXFormatAdapter.adapt(itr, cdxFormat); + pw.println(cdxSpec); + while(lines.hasNext()) { + pw.println(lines.next()); + } + pw.close(); + } catch (IOException e) { + e.printStackTrace(); + System.exit(1); + } catch (CDXFormatException e) { + e.printStackTrace(); + System.exit(1); + } + + } private class WorkerThread extends Thread { private long runInterval = 120000; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2009-11-06 01:42:28 UTC (rev 2884) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2009-11-06 01:49:32 UTC (rev 2885) @@ -2,8 +2,6 @@ import java.io.File; import java.io.IOException; -import java.io.PrintWriter; -import java.util.Iterator; import org.archive.io.ArchiveRecord; import org.archive.io.warc.WARCReader; @@ -11,20 +9,13 @@ import org.archive.io.warc.WARCRecord; import org.archive.wayback.UrlCanonicalizer; import org.archive.wayback.core.CaptureSearchResult; -import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter; import org.archive.wayback.util.AdaptedIterator; import org.archive.wayback.util.Adapter; import org.archive.wayback.util.CloseableIterator; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; -import org.archive.wayback.util.url.IdentityUrlCanonicalizer; public class WarcIndexer { - /** - * CDX Header line for these fields. not very configurable.. - */ - public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; - private UrlCanonicalizer canonicalizer = null; private boolean processAll = false; public WarcIndexer() { @@ -89,60 +80,7 @@ public void setCanonicalizer(UrlCanonicalizer canonicalizer) { this.canonicalizer = canonicalizer; } - - private static void USAGE() { - System.err.println("USAGE:"); - System.err.println(""); - System.err.println("warc-indexer [-identity] [-all] WARCFILE"); - System.err.println("warc-indexer [-identity] [-all] WARCFILE CDXFILE"); - System.err.println(""); - System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); - System.err.println("With -identity, perform no url canonicalization."); - System.err.println("With -all, output request and metadata records."); - System.exit(1); - } - /** - * @param args - */ - public static void main(String[] args) { - WarcIndexer indexer = new WarcIndexer(); - int idx = 0; - while(args[idx] != null) { - if(args[idx].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); - } else if(args[idx].equals("-all")) { - indexer.setProcessAll(true); - } else { - break; - } - idx++; - } - File arc = new File(args[idx]); - idx++; - PrintWriter pw = null; - try { - if (args.length == idx) { - // dump to STDOUT: - pw = new PrintWriter(System.out); - } else if (args.length == (idx+1)) { - pw = new PrintWriter(args[1]); - } else { - USAGE(); - } - Iterator<CaptureSearchResult> res = indexer.iterator(arc); - Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res); - while (lines.hasNext()) { - pw.println(lines.next()); - } - pw.close(); - - } catch (Exception e) { - e.printStackTrace(); - System.exit(1); - } - } - private class ArchiveRecordToWARCRecordAdapter implements Adapter<ArchiveRecord, WARCRecord> { This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |