[Archive-access-cvs] SF.net SVN: archive-access: [2321] trunk/archive-access/projects/wayback/ way

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2321
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2321&view=rev
Author:   bradtofel
Date:     2008-06-24 18:30:18 -0700 (Tue, 24 Jun 2008)

Log Message:
-----------
REFACTOR: moved indexing-related code into indexer package

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java
===================================================================

--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java	2008-06-25 00:32:57 UTC (rev 2320)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java	2008-06-25 01:30:18 UTC (rev 2321)
@@ -1,173 +0,0 @@
-/* ArcRecordToSearchResultAdapter
- *
- * $Id$
- *
- * Created on 3:27:03 PM Jul 26, 2007.
- *
- * Copyright (C) 2007 Internet Archive.
- *
- * This file is part of wayback-core.
- *
- * wayback-core is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * any later version.
- *
- * wayback-core is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser Public License for more details.
- *
- * You should have received a copy of the GNU Lesser Public License
- * along with wayback-core; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-package org.archive.wayback.resourcestore;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.logging.Logger;
-
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.URIException;
-import org.archive.io.arc.ARCRecord;
-import org.archive.io.arc.ARCRecordMetaData;
-import org.archive.net.UURI;
-import org.archive.net.UURIFactory;
-import org.archive.wayback.UrlCanonicalizer;
-import org.archive.wayback.WaybackConstants;
-import org.archive.wayback.core.SearchResult;
-import org.archive.wayback.util.Adapter;
-import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
-
-/**
- *
- *
- * @author brad
- * @version $Date$, $Revision$
- */
-public class ARCRecordToSearchResultAdapter 
-implements Adapter<ARCRecord,SearchResult>{
-
-	private static final Logger LOGGER = Logger.getLogger(
-			ARCRecordToSearchResultAdapter.class.getName());
-
-	private UrlCanonicalizer canonicalizer = null;
-	
-	public ARCRecordToSearchResultAdapter() {
-		canonicalizer = new AggressiveUrlCanonicalizer();
-	}
-//	public static SearchResult arcRecordToSearchResult(final ARCRecord rec)
-//	throws IOException, ParseException {
-	/* (non-Javadoc)
-	 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
-	 */
-	public SearchResult adapt(ARCRecord rec) {
-		try {
-			return adaptInner(rec);
-		} catch (IOException e) {
-			e.printStackTrace();
-			return null;
-		}
-	}
-	
-	private SearchResult adaptInner(ARCRecord rec) throws IOException {
-		rec.close();
-		ARCRecordMetaData meta = rec.getMetaData();
-		
-		SearchResult result = new SearchResult();
-		String arcName = meta.getArc(); 
-		int index = arcName.lastIndexOf(File.separator);
-		if (index > 0 && (index + 1) < arcName.length()) {
-		    arcName = arcName.substring(index + 1);
-		}
-		result.put(WaybackConstants.RESULT_ARC_FILE, arcName);
-		result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta
-				.getOffset()));
-		
-		// initialize with default HTTP code...
-		result.put(WaybackConstants.RESULT_HTTP_CODE, "-");
-		
-		result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());
-		result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype());
-		result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate());
-		
-		String uriStr = meta.getUrl();
-		if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) {
-			// skip filedesc record altogether...
-			return null;
-		}
-		if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
-			// skip URL + HTTP header processing for dns records...
-		
-			String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX
-					.length());
-			result.put(WaybackConstants.RESULT_ORIG_HOST, origHost);
-			result.put(WaybackConstants.RESULT_REDIRECT_URL, "-");
-			result.put(WaybackConstants.RESULT_URL, uriStr);
-			result.put(WaybackConstants.RESULT_URL_KEY, uriStr);
-		
-		} else {
-		
-			UURI uri = UURIFactory.getInstance(uriStr);
-			result.put(WaybackConstants.RESULT_URL, uriStr);
-		
-			String uriHost = uri.getHost();
-			if (uriHost == null) {
-				LOGGER.info("No host in " + uriStr + " in " + meta.getArc());
-			} else {
-				result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost);
-		
-				String statusCode = (meta.getStatusCode() == null) ? "-" : meta
-						.getStatusCode();
-				result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode);
-		
-				String redirectUrl = "-";
-				Header[] headers = rec.getHttpHeaders();
-				if (headers != null) {
-		
-					for (int i = 0; i < headers.length; i++) {
-						if (headers[i].getName().equals(
-								WaybackConstants.LOCATION_HTTP_HEADER)) {
-
-							String locationStr = headers[i].getValue();
-							// TODO: "Location" is supposed to be absolute:
-							// (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
-							// (section 14.30) but Content-Location can be
-							// relative.
-							// is it correct to resolve a relative Location, as
-							// we are?
-							// it's also possible to have both in the HTTP
-							// headers...
-							// should we prefer one over the other?
-							// right now, we're ignoring "Content-Location"
-							try {
-								UURI uriRedirect = UURIFactory.getInstance(uri,
-										locationStr);
-								redirectUrl = uriRedirect.getEscapedURI();
-		
-							} catch (URIException e) {
-								LOGGER.info("Bad Location: " + locationStr
-										+ " for " + uriStr + " in "
-										+ meta.getArc() + " Skipped");
-							}
-							break;
-						}
-					}
-				}
-				result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl);
-		
-				String indexUrl = canonicalizer.urlStringToKey(meta.getUrl());
-				result.put(WaybackConstants.RESULT_URL_KEY, indexUrl);
-			}
-		
-		}
-		return result;
-	}
-	public UrlCanonicalizer getCanonicalizer() {
-		return canonicalizer;
-	}
-	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
-		this.canonicalizer = canonicalizer;
-	}
-}

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java	2008-06-25 00:32:57 UTC (rev 2320)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java	2008-06-25 01:30:18 UTC (rev 2321)
@@ -1,175 +0,0 @@
-/* ArcIndexer
- *
- * $Id$
- *
- * Created on 2:33:29 PM Oct 11, 2006.
- *
- * Copyright (C) 2006 Internet Archive.
- *
- * This file is part of Wayback.
- *
- * Wayback is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * any later version.
- *
- * Wayback is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser Public License for more details.
- *
- * You should have received a copy of the GNU Lesser Public License
- * along with Wayback; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-package org.archive.wayback.resourcestore;
-
-import java.io.File;
-import java.io.PrintWriter;
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.archive.io.ArchiveRecord;
-import org.archive.io.arc.ARCReader;
-import org.archive.io.arc.ARCReaderFactory;
-import org.archive.io.arc.ARCRecord;
-import org.archive.wayback.UrlCanonicalizer;
-import org.archive.wayback.core.SearchResult;
-import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;
-import org.archive.wayback.util.AdaptedIterator;
-import org.archive.wayback.util.Adapter;
-import org.archive.wayback.util.CloseableIterator;
-import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
-import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
-
-/**
- * Transforms an ARC file into Iterator<SearchResult>.
- * 
- * @author brad
- * @version $Date$, $Revision$
- */
-public class ArcIndexer {
-
-	/**
-	 * CDX Header line for these fields. not very configurable..
-	 */
-	public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g";
-	private UrlCanonicalizer canonicalizer = null;
-	
-	public ArcIndexer() {
-		canonicalizer = new AggressiveUrlCanonicalizer();
-	}
-
-	/**
-	 * @param arc
-	 * @return Iterator of SearchResults for input arc File
-	 * @throws IOException
-	 */
-	public CloseableIterator<SearchResult> iterator(File arc)
-	throws IOException {
-		return iterator(ARCReaderFactory.get(arc));
-	}
-
-	/**
-	 * @param pathOrUrl
-	 * @return Iterator of SearchResults for input pathOrUrl
-	 * @throws IOException
-	 */
-	public CloseableIterator<SearchResult> iterator(String pathOrUrl)
-	throws IOException {
-		return iterator(ARCReaderFactory.get(pathOrUrl));
-	}
-	
-	/**
-	 * @param arcReader
-	 * @return Iterator of SearchResults for input ARCReader
-	 * @throws IOException
-	 */
-	public CloseableIterator<SearchResult> iterator(ARCReader arcReader)
-	throws IOException {
-		arcReader.setParseHttpHeaders(true);
-
-		Adapter<ArchiveRecord,ARCRecord> adapter1 =
-			new ArchiveRecordToARCRecordAdapter();
-
-		ARCRecordToSearchResultAdapter adapter2 =
-			new ARCRecordToSearchResultAdapter();
-		adapter2.setCanonicalizer(canonicalizer);
-		
-		ArchiveReaderCloseableIterator itr1 = 
-			new ArchiveReaderCloseableIterator(arcReader,arcReader.iterator());
-
-		CloseableIterator<ARCRecord> itr2 = 
-			new AdaptedIterator<ArchiveRecord,ARCRecord>(itr1,adapter1);
-		
-		return new AdaptedIterator<ARCRecord,SearchResult>(itr2,adapter2);
-	}
-	
-	public UrlCanonicalizer getCanonicalizer() {
-		return canonicalizer;
-	}
-
-	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
-		this.canonicalizer = canonicalizer;
-	}
-
-	private static void USAGE() {
-		System.err.println("USAGE:");
-		System.err.println("");
-		System.err.println("arc-indexer [-identity] ARCFILE");
-		System.err.println("arc-indexer [-identity] ARCFILE CDXFILE");
-		System.err.println("");
-		System.err.println("Create a CDX format index at CDXFILE or to STDOUT.");
-		System.err.println("With -identity, perform no url canonicalization.");
-		System.exit(1);
-	}
-	
-	/**
-	 * @param args
-	 */
-	public static void main(String[] args) {
-		ArcIndexer indexer = new ArcIndexer();
-		int idx = 0;
-		if(args[0] != null && args[0].equals("-identity")) {
-			indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
-			idx++;
-		}
-		File arc = new File(args[idx]);
-		idx++;
-		PrintWriter pw = null;
-		try {
-			if(args.length == idx) {
-				// dump to STDOUT:
-				pw = new PrintWriter(System.out);
-			} else if(args.length == (idx + 1)) {
-				pw = new PrintWriter(args[idx]);
-			} else {
-				USAGE();
-			}
-			Iterator<SearchResult> res = indexer.iterator(arc);
-			Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res);
-			while(lines.hasNext()) {
-				pw.println(lines.next());
-			}
-			pw.close();
-		} catch (Exception e) {
-			e.printStackTrace();
-			System.exit(1);
-		}
-	}
-	
-	private class ArchiveRecordToARCRecordAdapter 
-	implements Adapter<ArchiveRecord,ARCRecord> {
-
-		/* (non-Javadoc)
-		 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
-		 */
-		public ARCRecord adapt(ArchiveRecord o) {
-			ARCRecord rec = null;
-			if(o instanceof ARCRecord) {
-				rec = (ARCRecord) o;
-			}
-			return rec;
-		}
-	}
-}

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java	2008-06-25 00:32:57 UTC (rev 2320)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java	2008-06-25 01:30:18 UTC (rev 2321)
@@ -1,29 +0,0 @@
-package org.archive.wayback.resourcestore;
-
-import java.io.IOException;
-import java.util.Iterator;
-
-import org.archive.io.ArchiveReader;
-import org.archive.io.ArchiveRecord;
-import org.archive.wayback.util.CloseableIterator;
-
-public class ArchiveReaderCloseableIterator implements CloseableIterator<ArchiveRecord> {
-	private ArchiveReader reader = null;
-	private Iterator<ArchiveRecord> itr = null;
-	public ArchiveReaderCloseableIterator(ArchiveReader reader, Iterator<ArchiveRecord> itr) {
-		this.reader = reader;
-		this.itr = itr;
-	}
-	public boolean hasNext() {
-		return itr.hasNext();
-	}
-	public ArchiveRecord next() {
-		return itr.next();
-	}
-	public void remove() {
-		itr.remove();
-	}
-	public void close() throws IOException {
-		reader.close();
-	}
-}

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java	2008-06-25 00:32:57 UTC (rev 2320)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java	2008-06-25 01:30:18 UTC (rev 2321)
@@ -1,318 +0,0 @@
-package org.archive.wayback.resourcestore;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.logging.Logger;
-
-import org.apache.commons.httpclient.Header;
-import org.apache.commons.httpclient.HttpParser;
-import org.apache.commons.httpclient.StatusLine;
-import org.apache.commons.httpclient.URIException;
-import org.apache.commons.httpclient.util.EncodingUtil;
-import org.archive.io.ArchiveRecordHeader;
-import org.archive.io.RecoverableIOException;
-import org.archive.io.arc.ARCConstants;
-import org.archive.io.warc.WARCConstants;
-import org.archive.io.warc.WARCRecord;
-import org.archive.net.UURI;
-import org.archive.net.UURIFactory;
-import org.archive.wayback.UrlCanonicalizer;
-import org.archive.wayback.WaybackConstants;
-import org.archive.wayback.core.SearchResult;
-import org.archive.wayback.util.Adapter;
-import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
-
-/**
- * Adapts certain WARCRecords into SearchResults. DNS and response records are
- * mostly straightforward, but SearchResult objects generated from revisit 
- * records contain lots of "placeholder" fields, which are expected to be
- * understood by later processes traversing a stream of SearchResult objects.
- * 
- * See org.archive.wayback.resourceindex.DeduplicateSearchResultAnnotationAdapter.
- *
- * @author brad
- * @version $Date$, $Revision$
- */
-public class WARCRecordToSearchResultAdapter
-implements Adapter<WARCRecord,SearchResult>{
-	
-	private final static String DEFAULT_VALUE = "-"; 
-	private final static String SEARCH_FIELDS[] = {
-			WaybackConstants.RESULT_URL,
-			WaybackConstants.RESULT_URL_KEY,
-			WaybackConstants.RESULT_ORIG_HOST,
-			WaybackConstants.RESULT_CAPTURE_DATE,
-			WaybackConstants.RESULT_MD5_DIGEST,
-			WaybackConstants.RESULT_MIME_TYPE,
-			WaybackConstants.RESULT_HTTP_CODE,
-			WaybackConstants.RESULT_REDIRECT_URL,
-			WaybackConstants.RESULT_ARC_FILE,
-			WaybackConstants.RESULT_OFFSET,
-	};
-
-	private static final Logger LOGGER = Logger.getLogger(
-			WARCRecordToSearchResultAdapter.class.getName());
-
-	private UrlCanonicalizer canonicalizer = null;
-
-	public WARCRecordToSearchResultAdapter() {
-		canonicalizer = new AggressiveUrlCanonicalizer();
-	}
-
-	/* (non-Javadoc)
-	 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
-	 */
-	public SearchResult adapt(WARCRecord rec) {
-		try {
-			return adaptInner(rec);
-		} catch (IOException e) {
-			e.printStackTrace();
-			return null;
-		}
-	}
-	
-	/*
-	 * Transform input date to 14-digit timestamp:
-	 * 2007-08-29T18:00:26Z => 20070829180026
-	 */
-	private static String transformDate(final String input) {
-		
-		StringBuilder output = new StringBuilder(14);
-		
-		output.append(input.substring(0,4));
-		output.append(input.substring(5,7));
-		output.append(input.substring(8,10));
-		output.append(input.substring(11,13));
-		output.append(input.substring(14,16));
-		output.append(input.substring(17,19));
-		
-		return output.toString();
-	}
-	
-	private static String transformHTTPMime(final String input) {
-		int semiIdx = input.indexOf(";");
-		if(semiIdx > 0) {
-			return input.substring(0,semiIdx).trim();
-		}
-		return input.trim();
-	}
-
-	private String transformWarcFilename(String readerIdentifier) {
-		String warcName = readerIdentifier;
-		int index = warcName.lastIndexOf(File.separator);
-		if (index > 0 && (index + 1) < warcName.length()) {
-		    warcName = warcName.substring(index + 1);
-		}
-		return warcName;
-	}
-
-	private String transformDigest(final Object o) {
-		if(o == null) {
-			return DEFAULT_VALUE;
-		}
-		String orig = o.toString();
-		if(orig.startsWith("sha1:")) {
-			return orig.substring(5);
-		}
-		return orig;
-	}
-
-	private SearchResult getBlankSearchResult() {
-		SearchResult result = new SearchResult();
-		for(String field : SEARCH_FIELDS) {
-			result.put(field, DEFAULT_VALUE);
-		}
-		return result;
-	}
-	
-	private UURI addUrlDataToSearchResult(SearchResult result, String urlStr)
-	throws IOException {
-
-		result.put(WaybackConstants.RESULT_URL, urlStr);
-		result.put(WaybackConstants.RESULT_URL_KEY, urlStr);
-
-	
-		UURI uri = UURIFactory.getInstance(urlStr);
-		String uriHost = uri.getHost();
-		if (uriHost == null) {
-
-			LOGGER.info("No host in " + urlStr);
-
-		} else {
-
-			result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost);
-		}
-
-		String urlKey = canonicalizer.urlStringToKey(urlStr);
-		result.put(WaybackConstants.RESULT_URL_KEY, urlKey);
-
-		return uri;
-	}
-
-	private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) 
-	throws IOException {
-
-		SearchResult result = getBlankSearchResult();
-
-		result.put(WaybackConstants.RESULT_CAPTURE_DATE, 
-				transformDate(header.getDate()));
-		result.put(WaybackConstants.RESULT_ARC_FILE,
-				transformWarcFilename(header.getReaderIdentifier()));
-		result.put(WaybackConstants.RESULT_OFFSET, 
-				String.valueOf(header.getOffset()));
-		
-		String uriStr = header.getUrl();
-		
-		String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX
-				.length());
-		result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype());
-
-		result.put(WaybackConstants.RESULT_ORIG_HOST, origHost);
-		result.put(WaybackConstants.RESULT_URL, uriStr);
-		result.put(WaybackConstants.RESULT_URL_KEY, uriStr);
-
-		rec.close();
-		result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());
-
-		return result;
-	}
-
-	private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) 
-	throws IOException {
-
-		SearchResult result = getBlankSearchResult();
-
-		result.put(WaybackConstants.RESULT_CAPTURE_DATE, 
-				transformDate(header.getDate()));
-		result.put(WaybackConstants.RESULT_MD5_DIGEST, 
-				transformDigest(header.getHeaderValue(
-						WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
-		
-		addUrlDataToSearchResult(result,header.getUrl());
-
-		return result;
-	}
-
-    /**
-     * borrowed(copied) from org.archive.io.arc.ARCRecord...
-     * 
-     * @param bytes Array of bytes to examine for an EOL.
-     * @return Count of end-of-line characters or zero if none.
-     */
-    private int getEolCharsCount(byte [] bytes) {
-        int count = 0;
-        if (bytes != null && bytes.length >=1 &&
-                bytes[bytes.length - 1] == '\n') {
-            count++;
-            if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
-                count++;
-            }
-        }
-        return count;
-    }
-	
-	private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) 
-	throws IOException {
-
-		SearchResult result = getBlankSearchResult();
-
-		result.put(WaybackConstants.RESULT_CAPTURE_DATE, 
-				transformDate(header.getDate()));
-		result.put(WaybackConstants.RESULT_ARC_FILE,
-				transformWarcFilename(header.getReaderIdentifier()));
-		result.put(WaybackConstants.RESULT_OFFSET, 
-				String.valueOf(header.getOffset()));
-		
-		String origUrl = header.getUrl();
-		UURI uri = addUrlDataToSearchResult(result,origUrl);
-
-		// need to parse the documents HTTP message and headers here: WARCReader
-		// does not implement this... yet..
-		
-        byte [] statusBytes = HttpParser.readRawLine(rec);
-        int eolCharCount = getEolCharsCount(statusBytes);
-        if (eolCharCount <= 0) {
-            throw new RecoverableIOException("Failed to read http status where one " +
-                " was expected: " + new String(statusBytes));
-        }
-        String statusLine = EncodingUtil.getString(statusBytes, 0,
-            statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
-        if ((statusLine == null) ||
-                !StatusLine.startsWithHTTP(statusLine)) {
-           throw new RecoverableIOException("Failed parse of http status line.");
-        }
-        StatusLine status = new StatusLine(statusLine);
-		result.put(WaybackConstants.RESULT_HTTP_CODE, 
-				String.valueOf(status.getStatusCode()));
-        
-		Header[] headers = HttpParser.parseHeaders(rec,
-                ARCConstants.DEFAULT_ENCODING);
-
-		rec.close();
-		result.put(WaybackConstants.RESULT_MD5_DIGEST, 
-				transformDigest(header.getHeaderValue(
-						WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
-
-		if (headers != null) {
-	
-			for (Header httpHeader : headers) {
-				if (httpHeader.getName().equals(
-						WaybackConstants.LOCATION_HTTP_HEADER)) {
-	
-					String locationStr = httpHeader.getValue();
-					// TODO: "Location" is supposed to be absolute:
-					// (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
-					// (section 14.30) but Content-Location can be
-					// relative.
-					// is it correct to resolve a relative Location, as
-					// we are?
-					// it's also possible to have both in the HTTP
-					// headers...
-					// should we prefer one over the other?
-					// right now, we're ignoring "Content-Location"
-					try {
-						UURI uriRedirect = UURIFactory.getInstance(uri,
-								locationStr);
-						result.put(WaybackConstants.RESULT_REDIRECT_URL,
-								uriRedirect.getEscapedURI());
-					} catch (URIException e) {
-						LOGGER.info("Bad Location: " + locationStr
-								+ " for " + origUrl + " in "
-								+ header.getReaderIdentifier() + " Skipped");
-					}
-				} else if(httpHeader.getName().toLowerCase().equals("content-type")) {
-					result.put(WaybackConstants.RESULT_MIME_TYPE, 
-							transformHTTPMime(httpHeader.getValue()));
-				}
-			}
-		}
-		return result;
-	}
-	
-	private SearchResult adaptInner(WARCRecord rec) throws IOException {
-		
-		SearchResult result = null;
-		ArchiveRecordHeader header = rec.getHeader();
-		String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
-		if(type.equals(WARCConstants.RESPONSE)) {
-			String mime = header.getMimetype();
-			if(mime.equals("text/dns")) {
-				result = adaptDNS(header,rec);
-			} else {
-				result = adaptResponse(header,rec);
-			}
-		} else if(type.equals(WARCConstants.REVISIT)) {
-			result = adaptRevisit(header,rec);
-		}
-
-		return result;
-	}
-
-	public UrlCanonicalizer getCanonicalizer() {
-		return canonicalizer;
-	}
-
-	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
-		this.canonicalizer = canonicalizer;
-	}
-}

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java	2008-06-25 00:32:57 UTC (rev 2320)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java	2008-06-25 01:30:18 UTC (rev 2321)
@@ -1,140 +0,0 @@
-package org.archive.wayback.resourcestore;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.util.Iterator;
-
-import org.archive.io.ArchiveRecord;
-import org.archive.io.warc.WARCReader;
-import org.archive.io.warc.WARCReaderFactory;
-import org.archive.io.warc.WARCRecord;
-import org.archive.wayback.UrlCanonicalizer;
-import org.archive.wayback.core.SearchResult;
-import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;
-import org.archive.wayback.util.AdaptedIterator;
-import org.archive.wayback.util.Adapter;
-import org.archive.wayback.util.CloseableIterator;
-import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
-import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
-
-public class WarcIndexer {
-
-	/**
-	 * CDX Header line for these fields. not very configurable..
-	 */
-	public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g";
-
-	private UrlCanonicalizer canonicalizer = null;
-	public WarcIndexer() {
-		canonicalizer = new AggressiveUrlCanonicalizer();
-	}
-	
-	/**
-	 * @param warc
-	 * @return Iterator of SearchResults for input arc File
-	 * @throws IOException
-	 */
-	public CloseableIterator<SearchResult> iterator(File warc)
-			throws IOException {
-		return iterator(WARCReaderFactory.get(warc));
-	}
-	/**
-	 * @param pathOrUrl
-	 * @return Iterator of SearchResults for input pathOrUrl
-	 * @throws IOException
-	 */
-	public CloseableIterator<SearchResult> iterator(String pathOrUrl)
-			throws IOException {
-		return iterator(WARCReaderFactory.get(pathOrUrl));
-	}
-	/**
-	 * @param arc
-	 * @return Iterator of SearchResults for input arc File
-	 * @throws IOException
-	 */
-	public CloseableIterator<SearchResult> iterator(WARCReader reader)
-			throws IOException {
-
-		Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter();
-
-		WARCRecordToSearchResultAdapter adapter2 = 
-			new WARCRecordToSearchResultAdapter();
-		adapter2.setCanonicalizer(canonicalizer);
-
-		ArchiveReaderCloseableIterator itr1 = 
-			new ArchiveReaderCloseableIterator(reader,reader.iterator());
-
-		CloseableIterator<WARCRecord> itr2 = 
-			new AdaptedIterator<ArchiveRecord, WARCRecord>(itr1, adapter1);
-
-		return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2);
-	}
-
-	public UrlCanonicalizer getCanonicalizer() {
-		return canonicalizer;
-	}
-
-	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
-		this.canonicalizer = canonicalizer;
-	}
-	
-	private static void USAGE() {
-		System.err.println("USAGE:");
-		System.err.println("");
-		System.err.println("warc-indexer [-identity] WARCFILE");
-		System.err.println("warc-indexer [-identity] WARCFILE CDXFILE");
-		System.err.println("");
-		System.err.println("Create a CDX format index at CDXFILE or to STDOUT");
-		System.err.println("With -identity, perform no url canonicalization.");
-		System.exit(1);
-	}
-
-	/**
-	 * @param args
-	 */
-	public static void main(String[] args) {
-		WarcIndexer indexer = new WarcIndexer();
-		int idx = 0;
-		if(args[0] != null && args[0].equals("-identity")) {
-			indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
-			idx++;
-		}
-		File arc = new File(args[idx]);
-		idx++;
-		PrintWriter pw = null;
-		try {
-			if (args.length == idx) {
-				// dump to STDOUT:
-				pw = new PrintWriter(System.out);
-			} else if (args.length == (idx+1)) {
-				pw = new PrintWriter(args[1]);
-			} else {
-				USAGE();
-			}
-			Iterator<SearchResult> res = indexer.iterator(arc);
-			Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res);
-			while (lines.hasNext()) {
-				pw.println(lines.next());
-			}
-			pw.close();
-		} catch (Exception e) {
-			e.printStackTrace();
-		}
-	}
-
-	private class ArchiveRecordToWARCRecordAdapter implements
-			Adapter<ArchiveRecord, WARCRecord> {
-
-		/* (non-Javadoc)
-		 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
-		 */
-		public WARCRecord adapt(ArchiveRecord o) {
-			WARCRecord rec = null;
-			if (o instanceof WARCRecord) {
-				rec = (WARCRecord) o;
-			}
-			return rec;
-		}
-	}
-}

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java (from rev 2138, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ARCRecordToSearchResultAdapter.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java	2008-06-25 01:30:18 UTC (rev 2321)
@@ -0,0 +1,173 @@
+/* ArcRecordToSearchResultAdapter
+ *
+ * $Id$
+ *
+ * Created on 3:27:03 PM Jul 26, 2007.
+ *
+ * Copyright (C) 2007 Internet Archive.
+ *
+ * This file is part of wayback-core.
+ *
+ * wayback-core is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback-core is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback-core; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.resourcestore.indexer;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.URIException;
+import org.archive.io.arc.ARCRecord;
+import org.archive.io.arc.ARCRecordMetaData;
+import org.archive.net.UURI;
+import org.archive.net.UURIFactory;
+import org.archive.wayback.UrlCanonicalizer;
+import org.archive.wayback.WaybackConstants;
+import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.util.Adapter;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
+
+/**
+ *
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class ARCRecordToSearchResultAdapter 
+implements Adapter<ARCRecord,SearchResult>{
+
+	private static final Logger LOGGER = Logger.getLogger(
+			ARCRecordToSearchResultAdapter.class.getName());
+
+	private UrlCanonicalizer canonicalizer = null;
+	
+	public ARCRecordToSearchResultAdapter() {
+		canonicalizer = new AggressiveUrlCanonicalizer();
+	}
+//	public static SearchResult arcRecordToSearchResult(final ARCRecord rec)
+//	throws IOException, ParseException {
+	/* (non-Javadoc)
+	 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+	 */
+	public SearchResult adapt(ARCRecord rec) {
+		try {
+			return adaptInner(rec);
+		} catch (IOException e) {
+			e.printStackTrace();
+			return null;
+		}
+	}
+	
+	private SearchResult adaptInner(ARCRecord rec) throws IOException {
+		rec.close(); // NOTE(review): closed before use, presumably so getDigestStr() is complete - confirm
+		ARCRecordMetaData meta = rec.getMetaData();
+		
+		SearchResult result = new SearchResult();
+		String arcName = meta.getArc(); 
+		int index = arcName.lastIndexOf(File.separator);
+		if (index > 0 && (index + 1) < arcName.length()) {
+		    arcName = arcName.substring(index + 1);
+		}
+		result.put(WaybackConstants.RESULT_ARC_FILE, arcName);
+		result.put(WaybackConstants.RESULT_OFFSET, String.valueOf(meta
+				.getOffset()));
+		
+		// initialize with default HTTP code...
+		result.put(WaybackConstants.RESULT_HTTP_CODE, "-");
+		
+		result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());
+		result.put(WaybackConstants.RESULT_MIME_TYPE, meta.getMimetype());
+		result.put(WaybackConstants.RESULT_CAPTURE_DATE, meta.getDate());
+		
+		String uriStr = meta.getUrl();
+		if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) {
+			// skip filedesc record altogether...
+			return null;
+		}
+		if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
+			// skip URL + HTTP header processing for dns records...
+		
+			String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX
+					.length());
+			result.put(WaybackConstants.RESULT_ORIG_HOST, origHost);
+			result.put(WaybackConstants.RESULT_REDIRECT_URL, "-");
+			result.put(WaybackConstants.RESULT_URL, uriStr);
+			result.put(WaybackConstants.RESULT_URL_KEY, uriStr);
+		
+		} else {
+		
+			UURI uri = UURIFactory.getInstance(uriStr);
+			result.put(WaybackConstants.RESULT_URL, uriStr);
+		
+			String uriHost = uri.getHost();
+			if (uriHost == null) {
+				LOGGER.info("No host in " + uriStr + " in " + meta.getArc());
+			} else {
+				result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost);
+		
+				String statusCode = (meta.getStatusCode() == null) ? "-" : meta
+						.getStatusCode();
+				result.put(WaybackConstants.RESULT_HTTP_CODE, statusCode);
+		
+				String redirectUrl = "-";
+				Header[] headers = rec.getHttpHeaders();
+				if (headers != null) {
+					// HTTP header names are case-insensitive, so compare ignoring case:
+					for (int i = 0; i < headers.length; i++) {
+						if (headers[i].getName().equalsIgnoreCase(
+								WaybackConstants.LOCATION_HTTP_HEADER)) {
+
+							String locationStr = headers[i].getValue();
+							// TODO: "Location" is supposed to be absolute:
+							// (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
+							// (section 14.30) but Content-Location can be
+							// relative.
+							// is it correct to resolve a relative Location, as
+							// we are?
+							// it's also possible to have both in the HTTP
+							// headers...
+							// should we prefer one over the other?
+							// right now, we're ignoring "Content-Location"
+							try {
+								UURI uriRedirect = UURIFactory.getInstance(uri,
+										locationStr);
+								redirectUrl = uriRedirect.getEscapedURI();
+		
+							} catch (URIException e) {
+								LOGGER.info("Bad Location: " + locationStr
+										+ " for " + uriStr + " in "
+										+ meta.getArc() + " Skipped");
+							}
+							break;
+						}
+					}
+				}
+				result.put(WaybackConstants.RESULT_REDIRECT_URL, redirectUrl);
+		
+				String indexUrl = canonicalizer.urlStringToKey(meta.getUrl());
+				result.put(WaybackConstants.RESULT_URL_KEY, indexUrl);
+			}
+		
+		}
+		return result;
+	}
+	public UrlCanonicalizer getCanonicalizer() {
+		return canonicalizer;
+	}
+	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
+		this.canonicalizer = canonicalizer;
+	}
+}

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java (from rev 2280, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArcIndexer.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArcIndexer.java	2008-06-25 01:30:18 UTC (rev 2321)
@@ -0,0 +1,175 @@
+/* ArcIndexer
+ *
+ * $Id$
+ *
+ * Created on 2:33:29 PM Oct 11, 2006.
+ *
+ * Copyright (C) 2006 Internet Archive.
+ *
+ * This file is part of Wayback.
+ *
+ * Wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * Wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with Wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.resourcestore.indexer;
+
+import java.io.File;
+import java.io.PrintWriter;
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.archive.io.ArchiveRecord;
+import org.archive.io.arc.ARCReader;
+import org.archive.io.arc.ARCReaderFactory;
+import org.archive.io.arc.ARCRecord;
+import org.archive.wayback.UrlCanonicalizer;
+import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;
+import org.archive.wayback.util.AdaptedIterator;
+import org.archive.wayback.util.Adapter;
+import org.archive.wayback.util.CloseableIterator;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
+import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
+
+/**
+ * Transforms an ARC file into Iterator<SearchResult>.
+ * 
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class ArcIndexer {
+
+	/**
+	 * CDX Header line for these fields. not very configurable..
+	 */
+	public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g";
+	private UrlCanonicalizer canonicalizer = null;
+	
+	public ArcIndexer() {
+		canonicalizer = new AggressiveUrlCanonicalizer();
+	}
+
+	/**
+	 * @param arc
+	 * @return Iterator of SearchResults for input arc File
+	 * @throws IOException
+	 */
+	public CloseableIterator<SearchResult> iterator(File arc)
+	throws IOException {
+		return iterator(ARCReaderFactory.get(arc));
+	}
+
+	/**
+	 * @param pathOrUrl
+	 * @return Iterator of SearchResults for input pathOrUrl
+	 * @throws IOException
+	 */
+	public CloseableIterator<SearchResult> iterator(String pathOrUrl)
+	throws IOException {
+		return iterator(ARCReaderFactory.get(pathOrUrl));
+	}
+	
+	/**
+	 * @param arcReader
+	 * @return Iterator of SearchResults for input ARCReader
+	 * @throws IOException
+	 */
+	public CloseableIterator<SearchResult> iterator(ARCReader arcReader)
+	throws IOException {
+		arcReader.setParseHttpHeaders(true);
+
+		Adapter<ArchiveRecord,ARCRecord> adapter1 =
+			new ArchiveRecordToARCRecordAdapter();
+
+		ARCRecordToSearchResultAdapter adapter2 =
+			new ARCRecordToSearchResultAdapter();
+		adapter2.setCanonicalizer(canonicalizer);
+		
+		ArchiveReaderCloseableIterator itr1 = 
+			new ArchiveReaderCloseableIterator(arcReader,arcReader.iterator());
+
+		CloseableIterator<ARCRecord> itr2 = 
+			new AdaptedIterator<ArchiveRecord,ARCRecord>(itr1,adapter1);
+		
+		return new AdaptedIterator<ARCRecord,SearchResult>(itr2,adapter2);
+	}
+	
+	public UrlCanonicalizer getCanonicalizer() {
+		return canonicalizer;
+	}
+
+	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
+		this.canonicalizer = canonicalizer;
+	}
+
+	private static void USAGE() {
+		System.err.println("USAGE:");
+		System.err.println("");
+		System.err.println("arc-indexer [-identity] ARCFILE");
+		System.err.println("arc-indexer [-identity] ARCFILE CDXFILE");
+		System.err.println("");
+		System.err.println("Create a CDX format index at CDXFILE or to STDOUT.");
+		System.err.println("With -identity, perform no url canonicalization.");
+		System.exit(1);
+	}
+	
+	/**
+	 * @param args command line: [-identity] ARCFILE [CDXFILE]
+	 */
+	public static void main(String[] args) {
+		ArcIndexer indexer = new ArcIndexer();
+		int idx = 0;
+		if(args.length > 0 && args[0].equals("-identity")) {
+			indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
+			idx++;
+		}
+		// args[idx] is ARCFILE, args[idx + 1] the optional CDXFILE:
+		PrintWriter pw = null;
+		try {
+			if(args.length == (idx + 1)) {
+				// dump to STDOUT:
+				pw = new PrintWriter(System.out);
+			} else if(args.length == (idx + 2)) {
+				pw = new PrintWriter(args[idx + 1]);
+			} else {
+				USAGE(); // missing ARCFILE or extra arguments: prints help and exits
+			}
+			File arc = new File(args[idx]);
+			Iterator<SearchResult> res = indexer.iterator(arc);
+			Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res);
+			while(lines.hasNext()) {
+				pw.println(lines.next());
+			}
+			pw.close();
+		} catch (Exception e) {
+			e.printStackTrace();
+			System.exit(1);
+		}
+	}
+	
+	private class ArchiveRecordToARCRecordAdapter 
+	implements Adapter<ArchiveRecord,ARCRecord> {
+
+		/* (non-Javadoc)
+		 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+		 */
+		public ARCRecord adapt(ArchiveRecord o) {
+			ARCRecord rec = null;
+			if(o instanceof ARCRecord) {
+				rec = (ARCRecord) o;
+			}
+			return rec;
+		}
+	}
+}

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java (from rev 2209, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/ArchiveReaderCloseableIterator.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ArchiveReaderCloseableIterator.java	2008-06-25 01:30:18 UTC (rev 2321)
@@ -0,0 +1,29 @@
+package org.archive.wayback.resourcestore.indexer;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.archive.io.ArchiveReader;
+import org.archive.io.ArchiveRecord;
+import org.archive.wayback.util.CloseableIterator;
+
+public class ArchiveReaderCloseableIterator implements CloseableIterator<ArchiveRecord> {
+	private ArchiveReader reader = null;
+	private Iterator<ArchiveRecord> itr = null;
+	public ArchiveReaderCloseableIterator(ArchiveReader reader, Iterator<ArchiveRecord> itr) {
+		this.reader = reader;
+		this.itr = itr;
+	}
+	public boolean hasNext() {
+		return itr.hasNext();
+	}
+	public ArchiveRecord next() {
+		return itr.next();
+	}
+	public void remove() {
+		itr.remove();
+	}
+	public void close() throws IOException {
+		reader.close();
+	}
+}

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java (from rev 2138, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java	2008-06-25 01:30:18 UTC (rev 2321)
@@ -0,0 +1,318 @@
+package org.archive.wayback.resourcestore.indexer;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpParser;
+import org.apache.commons.httpclient.StatusLine;
+import org.apache.commons.httpclient.URIException;
+import org.apache.commons.httpclient.util.EncodingUtil;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.RecoverableIOException;
+import org.archive.io.arc.ARCConstants;
+import org.archive.io.warc.WARCConstants;
+import org.archive.io.warc.WARCRecord;
+import org.archive.net.UURI;
+import org.archive.net.UURIFactory;
+import org.archive.wayback.UrlCanonicalizer;
+import org.archive.wayback.WaybackConstants;
+import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.util.Adapter;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
+
+/**
+ * Adapts certain WARCRecords into SearchResults. DNS and response records are
+ * mostly straightforward, but SearchResult objects generated from revisit 
+ * records contain lots of "placeholder" fields, which are expected to be
+ * understood by later processes traversing a stream of SearchResult objects.
+ * 
+ * See org.archive.wayback.resourceindex.DeduplicateSearchResultAnnotationAdapter.
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class WARCRecordToSearchResultAdapter
+implements Adapter<WARCRecord,SearchResult>{
+	
+	private final static String DEFAULT_VALUE = "-"; 
+	private final static String SEARCH_FIELDS[] = {
+			WaybackConstants.RESULT_URL,
+			WaybackConstants.RESULT_URL_KEY,
+			WaybackConstants.RESULT_ORIG_HOST,
+			WaybackConstants.RESULT_CAPTURE_DATE,
+			WaybackConstants.RESULT_MD5_DIGEST,
+			WaybackConstants.RESULT_MIME_TYPE,
+			WaybackConstants.RESULT_HTTP_CODE,
+			WaybackConstants.RESULT_REDIRECT_URL,
+			WaybackConstants.RESULT_ARC_FILE,
+			WaybackConstants.RESULT_OFFSET,
+	};
+
+	private static final Logger LOGGER = Logger.getLogger(
+			WARCRecordToSearchResultAdapter.class.getName());
+
+	private UrlCanonicalizer canonicalizer = null;
+
+	public WARCRecordToSearchResultAdapter() {
+		canonicalizer = new AggressiveUrlCanonicalizer();
+	}
+
+	/* (non-Javadoc)
+	 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+	 */
+	public SearchResult adapt(WARCRecord rec) {
+		try {
+			return adaptInner(rec);
+		} catch (IOException e) {
+			e.printStackTrace();
+			return null;
+		}
+	}
+	
+	/*
+	 * Transform input date to 14-digit timestamp:
+	 * 2007-08-29T18:00:26Z => 20070829180026
+	 */
+	private static String transformDate(final String input) {
+		
+		StringBuilder output = new StringBuilder(14);
+		
+		output.append(input.substring(0,4));
+		output.append(input.substring(5,7));
+		output.append(input.substring(8,10));
+		output.append(input.substring(11,13));
+		output.append(input.substring(14,16));
+		output.append(input.substring(17,19));
+		
+		return output.toString();
+	}
+	
+	private static String transformHTTPMime(final String input) {
+		int semiIdx = input.indexOf(";");
+		if(semiIdx > 0) {
+			return input.substring(0,semiIdx).trim();
+		}
+		return input.trim();
+	}
+
+	private String transformWarcFilename(String readerIdentifier) {
+		String warcName = readerIdentifier;
+		int index = warcName.lastIndexOf(File.separator);
+		if (index > 0 && (index + 1) < warcName.length()) {
+		    warcName = warcName.substring(index + 1);
+		}
+		return warcName;
+	}
+
+	private String transformDigest(final Object o) {
+		if(o == null) {
+			return DEFAULT_VALUE;
+		}
+		String orig = o.toString();
+		if(orig.startsWith("sha1:")) {
+			return orig.substring(5);
+		}
+		return orig;
+	}
+
+	private SearchResult getBlankSearchResult() {
+		SearchResult result = new SearchResult();
+		for(String field : SEARCH_FIELDS) {
+			result.put(field, DEFAULT_VALUE);
+		}
+		return result;
+	}
+	
+	private UURI addUrlDataToSearchResult(SearchResult result, String urlStr)
+	throws IOException {
+
+		result.put(WaybackConstants.RESULT_URL, urlStr);
+		result.put(WaybackConstants.RESULT_URL_KEY, urlStr);
+
+	
+		UURI uri = UURIFactory.getInstance(urlStr);
+		String uriHost = uri.getHost();
+		if (uriHost == null) {
+
+			LOGGER.info("No host in " + urlStr);
+
+		} else {
+
+			result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost);
+		}
+
+		String urlKey = canonicalizer.urlStringToKey(urlStr);
+		result.put(WaybackConstants.RESULT_URL_KEY, urlKey);
+
+		return uri;
+	}
+
+	private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) 
+	throws IOException {
+
+		SearchResult result = getBlankSearchResult();
+
+		result.put(WaybackConstants.RESULT_CAPTURE_DATE, 
+				transformDate(header.getDate()));
+		result.put(WaybackConstants.RESULT_ARC_FILE,
+				transformWarcFilename(header.getReaderIdentifier()));
+		result.put(WaybackConstants.RESULT_OFFSET, 
+				String.valueOf(header.getOffset()));
+		
+		String uriStr = header.getUrl();
+		
+		String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX
+				.length());
+		result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype());
+
+		result.put(WaybackConstants.RESULT_ORIG_HOST, origHost);
+		result.put(WaybackConstants.RESULT_URL, uriStr);
+		result.put(WaybackConstants.RESULT_URL_KEY, uriStr);
+
+		rec.close();
+		result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());
+
+		return result;
+	}
+
+	private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec) 
+	throws IOException {
+
+		SearchResult result = getBlankSearchResult();
+
+		result.put(WaybackConstants.RESULT_CAPTURE_DATE, 
+				transformDate(header.getDate()));
+		result.put(WaybackConstants.RESULT_MD5_DIGEST, 
+				transformDigest(header.getHeaderValue(
+						WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
+		
+		addUrlDataToSearchResult(result,header.getUrl());
+
+		return result;
+	}
+
+    /**
+     * borrowed(copied) from org.archive.io.arc.ARCRecord...
+     * 
+     * @param bytes Array of bytes to examine for an EOL.
+     * @return Count of end-of-line characters or zero if none.
+     */
+    private int getEolCharsCount(byte [] bytes) {
+        int count = 0;
+        if (bytes != null && bytes.length >=1 &&
+                bytes[bytes.length - 1] == '\n') {
+            count++;
+            if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
+                count++;
+            }
+        }
+        return count;
+    }
+	
+	private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) 
+	throws IOException {
+
+		SearchResult result = getBlankSearchResult();
+
+		result.put(WaybackConstants.RESULT_CAPTURE_DATE, 
+				transformDate(header.getDate()));
+		result.put(WaybackConstants.RESULT_ARC_FILE,
+				transformWarcFilename(header.getReaderIdentifier()));
+		result.put(WaybackConstants.RESULT_OFFSET, 
+				String.valueOf(header.getOffset()));
+		
+		String origUrl = header.getUrl();
+		UURI uri = addUrlDataToSearchResult(result,origUrl);
+
+		// need to parse the documents HTTP message and headers here: WARCReader
+		// does not implement this... yet..
+		
+        byte [] statusBytes = HttpParser.readRawLine(rec);
+        int eolCharCount = getEolCharsCount(statusBytes);
+        if (eolCharCount <= 0) {
+            throw new RecoverableIOException("Failed to read http status where one " +
+                " was expected: " + (statusBytes == null ? "(no data)" : new String(statusBytes)));
+        }
+        String statusLine = EncodingUtil.getString(statusBytes, 0,
+            statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
+        if ((statusLine == null) ||
+                !StatusLine.startsWithHTTP(statusLine)) {
+           throw new RecoverableIOException("Failed parse of http status line.");
+        }
+        StatusLine status = new StatusLine(statusLine);
+		result.put(WaybackConstants.RESULT_HTTP_CODE, 
+				String.valueOf(status.getStatusCode()));
+        
+		Header[] headers = HttpParser.parseHeaders(rec,
+                ARCConstants.DEFAULT_ENCODING);
+
+		rec.close();
+		result.put(WaybackConstants.RESULT_MD5_DIGEST, 
+				transformDigest(header.getHeaderValue(
+						WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
+
+		if (headers != null) {
+			// HTTP header names are case-insensitive, so compare ignoring case:
+			for (Header httpHeader : headers) {
+				if (httpHeader.getName().equalsIgnoreCase(
+						WaybackConstants.LOCATION_HTTP_HEADER)) {
+	
+					String locationStr = httpHeader.getValue();
+					// TODO: "Location" is supposed to be absolute:
+					// (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
+					// (section 14.30) but Content-Location can be
+					// relative.
+					// is it correct to resolve a relative Location, as
+					// we are?
+					// it's also possible to have both in the HTTP
+					// headers...
+					// should we prefer one over the other?
+					// right now, we're ignoring "Content-Location"
+					try {
+						UURI uriRedirect = UURIFactory.getInstance(uri,
+								locationStr);
+						result.put(WaybackConstants.RESULT_REDIRECT_URL,
+								uriRedirect.getEscapedURI());
+					} catch (URIException e) {
+						LOGGER.info("Bad Location: " + locationStr
+								+ " for " + origUrl + " in "
+								+ header.getReaderIdentifier() + " Skipped");
+					}
+				} else if(httpHeader.getName().equalsIgnoreCase("content-type")) {
+					result.put(WaybackConstants.RESULT_MIME_TYPE, 
+							transformHTTPMime(httpHeader.getValue()));
+				}
+			}
+		}
+		return result;
+	}
+	
+	private SearchResult adaptInner(WARCRecord rec) throws IOException {
+		// returns null for any record type other than response or revisit:
+		SearchResult result = null;
+		ArchiveRecordHeader header = rec.getHeader();
+		String type = String.valueOf(header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE)); // null-safe
+		if(type.equals(WARCConstants.RESPONSE)) {
+			String mime = header.getMimetype();
+			if("text/dns".equals(mime)) { // constant first: mime may be null
+				result = adaptDNS(header,rec);
+			} else {
+				result = adaptResponse(header,rec);
+			}
+		} else if(type.equals(WARCConstants.REVISIT)) {
+			result = adaptRevisit(header,rec);
+		}
+
+		return result;
+	}
+
+	public UrlCanonicalizer getCanonicalizer() {
+		return canonicalizer;
+	}
+
+	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
+		this.canonicalizer = canonicalizer;
+	}
+}

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java (from rev 2280, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WarcIndexer.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WarcIndexer.java	2008-06-25 01:30:18 UTC (rev 2321)
@@ -0,0 +1,140 @@
+package org.archive.wayback.resourcestore.indexer;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Iterator;
+
+import org.archive.io.ArchiveRecord;
+import org.archive.io.warc.WARCReader;
+import org.archive.io.warc.WARCReaderFactory;
+import org.archive.io.warc.WARCRecord;
+import org.archive.wayback.UrlCanonicalizer;
+import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.resourceindex.cdx.SearchResultToCDXLineAdapter;
+import org.archive.wayback.util.AdaptedIterator;
+import org.archive.wayback.util.Adapter;
+import org.archive.wayback.util.CloseableIterator;
+import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
+import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
+
+public class WarcIndexer {
+
+	/**
+	 * CDX Header line for these fields. not very configurable..
+	 */
+	public final static String CDX_HEADER_MAGIC = " CDX N b h m s k r V g";
+
+	private UrlCanonicalizer canonicalizer = null;
+	public WarcIndexer() {
+		canonicalizer = new AggressiveUrlCanonicalizer();
+	}
+	
+	/**
+	 * @param warc
+	 * @return Iterator of SearchResults for input warc File
+	 * @throws IOException
+	 */
+	public CloseableIterator<SearchResult> iterator(File warc)
+			throws IOException {
+		return iterator(WARCReaderFactory.get(warc));
+	}
+	/**
+	 * @param pathOrUrl
+	 * @return Iterator of SearchResults for input pathOrUrl
+	 * @throws IOException
+	 */
+	public CloseableIterator<SearchResult> iterator(String pathOrUrl)
+			throws IOException {
+		return iterator(WARCReaderFactory.get(pathOrUrl));
+	}
+	/**
+	 * @param reader WARCReader whose records are adapted into SearchResults
+	 * @return Iterator of SearchResults for input WARCReader
+	 * @throws IOException
+	 */
+	public CloseableIterator<SearchResult> iterator(WARCReader reader)
+			throws IOException {
+
+		Adapter<ArchiveRecord, WARCRecord> adapter1 = new ArchiveRecordToWARCRecordAdapter();
+
+		WARCRecordToSearchResultAdapter adapter2 = 
+			new WARCRecordToSearchResultAdapter();
+		adapter2.setCanonicalizer(canonicalizer);
+
+		ArchiveReaderCloseableIterator itr1 = 
+			new ArchiveReaderCloseableIterator(reader,reader.iterator());
+
+		CloseableIterator<WARCRecord> itr2 = 
+			new AdaptedIterator<ArchiveRecord, WARCRecord>(itr1, adapter1);
+
+		return new AdaptedIterator<WARCRecord, SearchResult>(itr2, adapter2);
+	}
+
+	public UrlCanonicalizer getCanonicalizer() {
+		return canonicalizer;
+	}
+
+	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
+		this.canonicalizer = canonicalizer;
+	}
+	
+	private static void USAGE() {
+		System.err.println("USAGE:");
+		System.err.println("");
+		System.err.println("warc-indexer [-identity] WARCFILE");
+		System.err.println("warc-indexer [-identity] WARCFILE CDXFILE");
+		System.err.println("");
+		System.err.println("Create a CDX format index at CDXFILE or to STDOUT");
+		System.err.println("With -identity, perform no url canonicalization.");
+		System.exit(1);
+	}
+
+	/**
+	 * @param args command line: [-identity] WARCFILE [CDXFILE]
+	 */
+	public static void main(String[] args) {
+		WarcIndexer indexer = new WarcIndexer();
+		int idx = 0;
+		if(args.length > 0 && args[0].equals("-identity")) {
+			indexer.setCanonicalizer(new IdentityUrlCanonicalizer());
+			idx++;
+		}
+		PrintWriter pw = null;
+		try {
+			if (args.length == (idx+1)) {
+				// dump to STDOUT:
+				pw = new PrintWriter(System.out);
+			} else if (args.length == (idx+2)) {
+				pw = new PrintWriter(args[idx+1]); // CDXFILE follows WARCFILE, also after -identity
+			} else {
+				USAGE(); // missing WARCFILE or extra arguments: prints help and exits
+			}
+			File arc = new File(args[idx]);
+			Iterator<SearchResult> res = indexer.iterator(arc);
+			Iterator<String> lines = SearchResultToCDXLineAdapter.adapt(res);
+			while (lines.hasNext()) {
+				pw.println(lines.next());
+			}
+			pw.close();
+		} catch (Exception e) {
+			e.printStackTrace();
+			System.exit(1); // report failure to the caller, as arc-indexer does
+		}
+	}
+
+	private class ArchiveRecordToWARCRecordAdapter implements
+			Adapter<ArchiveRecord, WARCRecord> {
+
+		/* (non-Javadoc)
+		 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+		 */
+		public WARCRecord adapt(ArchiveRecord o) {
+			WARCRecord rec = null;
+			if (o instanceof WARCRecord) {
+				rec = (WARCRecord) o;
+			}
+			return rec;
+		}
+	}
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Archive-access-cvs] SF.net SVN: archive-access: [2321] trunk/archive-access/projects/wayback/ way

[Archive-access-cvs] SF.net SVN: archive-access: [2321] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourcestore