Revision: 2085
http://archive-access.svn.sourceforge.net/archive-access/?rev=2085&view=rev
Author: bradtofel
Date: 2007-11-27 18:08:02 -0800 (Tue, 27 Nov 2007)
Log Message:
-----------
INITIAL REV: class to transform a WARC file into an Iterator<SearchResult>. Includes main() to support command line conversion.
Added Paths:
-----------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java
Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java 2007-11-28 02:08:02 UTC (rev 2085)
@@ -0,0 +1,97 @@
+package org.archive.wayback.resourcestore;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Iterator;
+
+import org.archive.io.ArchiveRecord;
+import org.archive.io.warc.WARCReader;
+import org.archive.io.warc.WARCReaderFactory;
+import org.archive.io.warc.WARCRecord;
+import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;
+import org.archive.wayback.util.AdaptedIterator;
+import org.archive.wayback.util.Adapter;
+import org.archive.wayback.util.CloseableIterator;
+
+public class WarcIndexer {
+
+    /** CDX Header line for these fields; not configurable. */
+    public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g";
+
+    /**
+     * Opens a WARC file and adapts its records into SearchResults.
+     *
+     * @param warc WARC file to index
+     * @return Iterator of SearchResults for input WARC File
+     * @throws IOException if a reader for the WARC file cannot be obtained
+     */
+    public CloseableIterator<SearchResult> iterator(File warc)
+            throws IOException {
+        Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter();
+        Adapter<WARCRecord, SearchResult> adapter2 = new WARCRecordToSearchResultAdapter();
+        WARCReader reader = WARCReaderFactory.get(warc);
+        Iterator<ArchiveRecord> itr1 = reader.iterator();
+        // Chain: ArchiveRecord -> WARCRecord -> SearchResult.
+        CloseableIterator<WARCRecord> itr2 = new AdaptedIterator<ArchiveRecord, WARCRecord>(
+                itr1, adapter1);
+        return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2);
+    }
+
+    /** Casts an ArchiveRecord to WARCRecord; any other type yields null. */
+    private static class ArchiveRecordToWARCRecordAdapter implements
+            Adapter<ArchiveRecord, WARCRecord> {
+
+        /* (non-Javadoc)
+         * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+         */
+        public WARCRecord adapt(ArchiveRecord o) {
+            return (o instanceof WARCRecord) ? (WARCRecord) o : null;
+        }
+    }
+
+    /** Prints usage to STDERR, then terminates the JVM with status 1. */
+    private static void USAGE() {
+        System.err.println("USAGE:");
+        System.err.println("");
+        System.err.println("warc-indexer WARCFILE");
+        System.err.println("warc-indexer WARCFILE CDXFILE");
+        System.err.println("");
+        System.err.println("Create a CDX format index at CDXFILE or to STDOUT");
+        System.exit(1);
+    }
+
+    /**
+     * Writes one CDX line per record of WARCFILE to CDXFILE, or to STDOUT
+     * when CDXFILE is omitted.
+     *
+     * @param args WARCFILE [CDXFILE]
+     */
+    public static void main(String[] args) {
+        // Check the count before reading args[0] so a bare call prints usage.
+        if (args.length < 1 || args.length > 2) {
+            USAGE();
+            return;
+        }
+        WarcIndexer indexer = new WarcIndexer();
+        File arc = new File(args[0]);
+        PrintWriter pw = null;
+        try {
+            pw = (args.length == 1) ? new PrintWriter(System.out)
+                    : new PrintWriter(args[1]);
+            // NOTE(review): res is never closed here; confirm whether it should be.
+            Iterator<SearchResult> res = indexer.iterator(arc);
+            Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res);
+            while (lines.hasNext()) {
+                pw.println(lines.next());
+            }
+        } catch (Exception e) {
+            e.printStackTrace();
+        } finally {
+            // Close on every path so buffered lines are flushed on failure.
+            if (pw != null) {
+                pw.close();
+            }
+        }
+    }
+}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|