From: <bra...@us...> - 2008-11-07 22:38:52
|
Revision: 2639 http://archive-access.svn.sourceforge.net/archive-access/?rev=2639&view=rev Author: bradtofel Date: 2008-11-07 22:38:48 +0000 (Fri, 07 Nov 2008) Log Message: ----------- FEATURE: added -all option to warc-indexer command line tool, causing the tool to output records for request and metadata records as well as duplicate, capture, and dns records. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2008-11-07 22:35:24 UTC (rev 2638) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java 2008-11-07 22:38:48 UTC (rev 2639) @@ -2,7 +2,7 @@ import java.io.File; import java.io.IOException; -//import java.util.logging.Logger; +import java.util.logging.Logger; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpParser; @@ -33,14 +33,23 @@ */ public class WARCRecordToSearchResultAdapter implements Adapter<WARCRecord,CaptureSearchResult>{ + private static final Logger LOGGER = + Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName()); private final static String DEFAULT_VALUE = "-"; -// private static final Logger LOGGER = Logger.getLogger( -// WARCRecordToSearchResultAdapter.class.getName()); - private UrlCanonicalizer canonicalizer = null; + + private boolean processAll = false; + public boolean isProcessAll() { + return processAll; + } + + public void setProcessAll(boolean processAll) { + this.processAll = processAll; + } + public WARCRecordToSearchResultAdapter() { canonicalizer = new AggressiveUrlCanonicalizer(); } @@ -75,12 +84,19 @@ return output.toString(); } - private static String transformHTTPMime(final String input) { + private static String escapeSpaces(final String input) { + if(input.contains(" ")) { + return input.replace(" ", "%20"); + } + return input; + } + + private static String transformHTTPMime(String input) { int semiIdx = input.indexOf(";"); if(semiIdx > 0) { - return input.substring(0,semiIdx).trim(); + return escapeSpaces(input.substring(0,semiIdx).trim()); } - return input.trim(); + return escapeSpaces(input.trim()); } private String transformWarcFilename(String readerIdentifier) { @@ -148,16 +164,21 @@ return result; } - private CaptureSearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) + private CaptureSearchResult adaptGeneric(ArchiveRecordHeader header, + WARCRecord rec, String mime) throws IOException { CaptureSearchResult result = getBlankSearchResult(); result.setCaptureTimestamp(transformDate(header.getDate())); + result.setFile(transformWarcFilename(header.getReaderIdentifier())); + result.setOffset(header.getOffset()); result.setDigest(transformDigest(header.getHeaderValue( - WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); + WARCRecord.HEADER_KEY_PAYLOAD_DIGEST))); addUrlDataToSearchResult(result,header.getUrl()); + + result.setMimeType(mime); return result; } @@ -243,7 +264,7 @@ } return result; } - + private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException { CaptureSearchResult result = null; @@ -257,7 +278,17 @@ result = adaptResponse(header,rec); } } else if(type.equals(WARCConstants.REVISIT)) { - result = adaptRevisit(header,rec); + result = adaptGeneric(header,rec,"warc/revisit"); + } else if(type.equals(WARCConstants.REQUEST)) { + if(processAll) { + result = adaptGeneric(header,rec,"warc/request"); + } + } else if(type.equals(WARCConstants.METADATA)) { + if(processAll) { + result = adaptGeneric(header,rec,"warc/metadata"); + } + } else { + LOGGER.info("Skipping record type : " + type); } return result; Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2008-11-07 22:35:24 UTC (rev 2638) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java 2008-11-07 22:38:48 UTC (rev 2639) @@ -26,9 +26,19 @@ public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g"; private UrlCanonicalizer canonicalizer = null; + private boolean processAll = false; public WarcIndexer() { canonicalizer = new AggressiveUrlCanonicalizer(); } + + public boolean isProcessAll() { + return processAll; + } + + public void setProcessAll(boolean processAll) { + this.processAll = processAll; + } + /** * @param warc @@ -61,6 +71,7 @@ WARCRecordToSearchResultAdapter adapter2 = new WARCRecordToSearchResultAdapter(); adapter2.setCanonicalizer(canonicalizer); + adapter2.setProcessAll(processAll); ArchiveReaderCloseableIterator itr1 = new ArchiveReaderCloseableIterator(reader,reader.iterator()); @@ -82,11 +93,12 @@ private static void USAGE() { System.err.println("USAGE:"); System.err.println(""); - System.err.println("warc-indexer [-identity] WARCFILE"); - System.err.println("warc-indexer [-identity] WARCFILE CDXFILE"); + System.err.println("warc-indexer [-identity] [-all] WARCFILE"); + System.err.println("warc-indexer [-identity] [-all] WARCFILE CDXFILE"); System.err.println(""); System.err.println("Create a CDX format index at CDXFILE or to STDOUT"); System.err.println("With -identity, perform no url canonicalization."); + System.err.println("With -all, output request and metadata records."); System.exit(1); } @@ -96,8 +108,14 @@ public static void main(String[] args) { WarcIndexer indexer = new WarcIndexer(); int idx = 0; - if(args[0] != null && args[0].equals("-identity")) { - indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); + while(args[idx] != null) { + if(args[idx].equals("-identity")) { + indexer.setCanonicalizer(new IdentityUrlCanonicalizer()); + } else if(args[idx].equals("-all")) { + indexer.setProcessAll(true); + } else { + break; + } idx++; } File arc = new File(args[idx]); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |