Revision: 2084
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2084&view=rev
Author:   bradtofel
Date:     2007-11-27 18:06:27 -0800 (Tue, 27 Nov 2007)

Log Message:
-----------
INITIAL REV: class which adapts (some) WARCRecords into SearchResult objects.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java	(rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/WARCRecordToSearchResultAdapter.java	2007-11-28 02:06:27 UTC (rev 2084)
@@ -0,0 +1,302 @@
+package org.archive.wayback.resourcestore;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.Header;
+import org.apache.commons.httpclient.HttpParser;
+import org.apache.commons.httpclient.StatusLine;
+import org.apache.commons.httpclient.URIException;
+import org.apache.commons.httpclient.util.EncodingUtil;
+import org.archive.io.ArchiveRecordHeader;
+import org.archive.io.RecoverableIOException;
+import org.archive.io.arc.ARCConstants;
+import org.archive.io.warc.WARCConstants;
+import org.archive.io.warc.WARCRecord;
+import org.archive.net.UURI;
+import org.archive.net.UURIFactory;
+import org.archive.wayback.WaybackConstants;
+import org.archive.wayback.core.SearchResult;
+import org.archive.wayback.util.Adapter;
+import org.archive.wayback.util.UrlCanonicalizer;
+
+/**
+ * Adapts certain WARCRecords into SearchResults. DNS and response records are
+ * mostly straightforward, but SearchResult objects generated from revisit
+ * records contain lots of "placeholder" fields, which are expected to be
+ * understood by later processes traversing a stream of SearchResult objects.
+ *
+ * See org.archive.wayback.resourceindex.DeduplicateSearchResultAnnotationAdapter.
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class WARCRecordToSearchResultAdapter
+implements Adapter<WARCRecord,SearchResult>{
+
+    private final static String DEFAULT_VALUE = "-";
+    private final static String SEARCH_FIELDS[] = {
+        WaybackConstants.RESULT_URL,
+        WaybackConstants.RESULT_URL_KEY,
+        WaybackConstants.RESULT_ORIG_HOST,
+        WaybackConstants.RESULT_CAPTURE_DATE,
+        WaybackConstants.RESULT_MD5_DIGEST,
+        WaybackConstants.RESULT_MIME_TYPE,
+        WaybackConstants.RESULT_HTTP_CODE,
+        WaybackConstants.RESULT_REDIRECT_URL,
+        WaybackConstants.RESULT_ARC_FILE,
+        WaybackConstants.RESULT_OFFSET,
+    };
+
+    private static final Logger LOGGER = Logger.getLogger(
+            WARCRecordToSearchResultAdapter.class.getName());
+
+    // TODO: make this configurable based on the ResourceIndex
+    private static UrlCanonicalizer canonicalizer = new UrlCanonicalizer();
+
+    /* (non-Javadoc)
+     * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+     */
+    public SearchResult adapt(WARCRecord rec) {
+        try {
+            return adaptInner(rec);
+        } catch (IOException e) {
+            e.printStackTrace();
+            return null;
+        }
+    }
+
+    /*
+     * Transform input date to 14-digit timestamp:
+     * 2007-08-29T18:00:26Z => 20070829180026
+     */
+    private static String transformDate(final String input) {
+
+        StringBuilder output = new StringBuilder(14);
+
+        output.append(input.substring(0,4));
+        output.append(input.substring(5,7));
+        output.append(input.substring(8,10));
+        output.append(input.substring(11,13));
+        output.append(input.substring(14,16));
+        output.append(input.substring(17,19));
+
+        return output.toString();
+    }
+
+    private static String transformHTTPMime(final String input) {
+        int semiIdx = input.indexOf(";");
+        if(semiIdx > 0) {
+            return input.substring(0,semiIdx).trim();
+        }
+        return input.trim();
+    }
+
+    private String transformWarcFilename(String readerIdentifier) {
+        String warcName = readerIdentifier;
+        int index = warcName.lastIndexOf(File.separator);
+        if (index > 0 && (index + 1) < warcName.length()) {
+            warcName = warcName.substring(index + 1);
+        }
+        return warcName;
+    }
+
+    private String transformDigest(final Object o) {
+        if(o == null) {
+            return DEFAULT_VALUE;
+        }
+        String orig = o.toString();
+        if(orig.startsWith("sha1:")) {
+            return orig.substring(5);
+        }
+        return orig;
+    }
+
+    private SearchResult getBlankSearchResult() {
+        SearchResult result = new SearchResult();
+        for(String field : SEARCH_FIELDS) {
+            result.put(field, DEFAULT_VALUE);
+        }
+        return result;
+    }
+
+    private void addUrlDataToSearchResult(SearchResult result, String urlStr)
+            throws IOException {
+
+        result.put(WaybackConstants.RESULT_URL, urlStr);
+        result.put(WaybackConstants.RESULT_URL_KEY, urlStr);
+
+
+        UURI uri = UURIFactory.getInstance(urlStr);
+        String uriHost = uri.getHost();
+        if (uriHost == null) {
+
+            LOGGER.info("No host in " + urlStr);
+
+        } else {
+
+            result.put(WaybackConstants.RESULT_ORIG_HOST, uriHost);
+        }
+
+        String urlKey = canonicalizer.urlStringToKey(urlStr);
+        result.put(WaybackConstants.RESULT_URL_KEY, urlKey);
+    }
+
+    private SearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec)
+            throws IOException {
+
+        SearchResult result = getBlankSearchResult();
+
+        result.put(WaybackConstants.RESULT_CAPTURE_DATE,
+                transformDate(header.getDate()));
+        result.put(WaybackConstants.RESULT_ARC_FILE,
+                transformWarcFilename(header.getReaderIdentifier()));
+        result.put(WaybackConstants.RESULT_OFFSET,
+                String.valueOf(header.getOffset()));
+
+        String uriStr = header.getUrl();
+
+        String origHost = uriStr.substring(WaybackConstants.DNS_URL_PREFIX
+                .length());
+        result.put(WaybackConstants.RESULT_MIME_TYPE, header.getMimetype());
+
+        result.put(WaybackConstants.RESULT_ORIG_HOST, origHost);
+        result.put(WaybackConstants.RESULT_URL, uriStr);
+        result.put(WaybackConstants.RESULT_URL_KEY, uriStr);
+
+        rec.close();
+        result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());
+
+        return result;
+    }
+
+    private SearchResult adaptRevisit(ArchiveRecordHeader header, WARCRecord rec)
+            throws IOException {
+
+        SearchResult result = getBlankSearchResult();
+
+        result.put(WaybackConstants.RESULT_CAPTURE_DATE,
+                transformDate(header.getDate()));
+        result.put(WaybackConstants.RESULT_MD5_DIGEST,
+                transformDigest(header.getHeaderValue(
+                        WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
+
+        addUrlDataToSearchResult(result,header.getUrl());
+
+        return result;
+    }
+
+    /**
+     * borrowed(copied) from org.archive.io.arc.ARCRecord...
+     *
+     * @param bytes Array of bytes to examine for an EOL.
+     * @return Count of end-of-line characters or zero if none.
+     */
+    private int getEolCharsCount(byte [] bytes) {
+        int count = 0;
+        if (bytes != null && bytes.length >=1 &&
+                bytes[bytes.length - 1] == '\n') {
+            count++;
+            if (bytes.length >=2 && bytes[bytes.length -2] == '\r') {
+                count++;
+            }
+        }
+        return count;
+    }
+
+    private SearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec)
+            throws IOException {
+
+        SearchResult result = getBlankSearchResult();
+
+        result.put(WaybackConstants.RESULT_CAPTURE_DATE,
+                transformDate(header.getDate()));
+        result.put(WaybackConstants.RESULT_ARC_FILE,
+                transformWarcFilename(header.getReaderIdentifier()));
+        result.put(WaybackConstants.RESULT_OFFSET,
+                String.valueOf(header.getOffset()));
+
+        String origUrl = header.getUrl();
+        addUrlDataToSearchResult(result,origUrl);
+
+        // need to parse the documents HTTP message and headers here: WARCReader
+        // does not implement this... yet..
+
+        byte [] statusBytes = HttpParser.readRawLine(rec);
+        int eolCharCount = getEolCharsCount(statusBytes);
+        if (eolCharCount <= 0) {
+            throw new RecoverableIOException("Failed to read http status where one " +
+                    " was expected: " + new String(statusBytes));
+        }
+        String statusLine = EncodingUtil.getString(statusBytes, 0,
+                statusBytes.length - eolCharCount, ARCConstants.DEFAULT_ENCODING);
+        if ((statusLine == null) ||
+                !StatusLine.startsWithHTTP(statusLine)) {
+            throw new RecoverableIOException("Failed parse of http status line.");
+        }
+        StatusLine status = new StatusLine(statusLine);
+        result.put(WaybackConstants.RESULT_HTTP_CODE,
+                String.valueOf(status.getStatusCode()));
+
+        Header[] headers = HttpParser.parseHeaders(rec,
+                ARCConstants.DEFAULT_ENCODING);
+
+        rec.close();
+        result.put(WaybackConstants.RESULT_MD5_DIGEST, rec.getDigestStr());
+
+        if (headers != null) {
+
+            for (Header httpHeader : headers) {
+                if (httpHeader.getName().equals(
+                        WaybackConstants.LOCATION_HTTP_HEADER)) {
+
+                    String locationStr = httpHeader.getValue();
+                    // TODO: "Location" is supposed to be absolute:
+                    // (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
+                    // (section 14.30) but Content-Location can be
+                    // relative.
+                    // is it correct to resolve a relative Location, as
+                    // we are?
+                    // it's also possible to have both in the HTTP
+                    // headers...
+                    // should we prefer one over the other?
+                    // right now, we're ignoring "Content-Location"
+                    try {
+                        UURI uriRedirect = UURIFactory.getInstance(origUrl,
+                                locationStr);
+                        result.put(WaybackConstants.RESULT_REDIRECT_URL,
+                                uriRedirect.getEscapedURI());
+                    } catch (URIException e) {
+                        LOGGER.info("Bad Location: " + locationStr +
+                                " for " + origUrl + " in " +
+                                header.getReaderIdentifier() + " Skipped");
+                    }
+                } else if(httpHeader.getName().toLowerCase().equals("content-type")) {
+                    result.put(WaybackConstants.RESULT_MIME_TYPE,
+                            transformHTTPMime(httpHeader.getValue()));
+                }
+            }
+        }
+        return result;
+    }
+
+    private SearchResult adaptInner(WARCRecord rec) throws IOException {
+
+        SearchResult result = null;
+        ArchiveRecordHeader header = rec.getHeader();
+        String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
+        if(type.equals(WARCConstants.RESPONSE)) {
+            String mime = header.getMimetype();
+            if(mime.equals("text/dns")) {
+                result = adaptDNS(header,rec);
+            } else {
+                result = adaptResponse(header,rec);
+            }
+        } else if(type.equals(WARCConstants.REVISIT)) {
+            result = adaptRevisit(header,rec);
+        }
+
+        return result;
+    }
+}
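
For context, a minimal driver sketch (not part of rev 2084) showing how this adapter might be fed from a WARC file. It assumes the archive-commons WARCReaderFactory/ArchiveReader iteration API of the same era, and that SearchResult exposes a get(String) accessor mirroring the put() calls in the class above; the class name and field choices here are illustrative only.

    // Hypothetical driver, a sketch only -- not part of this commit.
    package org.archive.wayback.resourcestore;

    import java.io.File;
    import java.util.Iterator;

    import org.archive.io.ArchiveReader;
    import org.archive.io.ArchiveRecord;
    import org.archive.io.warc.WARCReaderFactory;
    import org.archive.io.warc.WARCRecord;
    import org.archive.wayback.WaybackConstants;
    import org.archive.wayback.core.SearchResult;

    public class WarcToSearchResultExample {
        public static void main(String[] args) throws Exception {
            WARCRecordToSearchResultAdapter adapter =
                    new WARCRecordToSearchResultAdapter();

            // Assumes WARCReaderFactory.get(File) opens a (possibly gzipped)
            // WARC and iterates its records in file order.
            ArchiveReader reader = WARCReaderFactory.get(new File(args[0]));
            Iterator<ArchiveRecord> itr = reader.iterator();
            while (itr.hasNext()) {
                WARCRecord record = (WARCRecord) itr.next();

                // adapt() returns null for record types the adapter ignores
                // (warcinfo, request, metadata) and when parsing fails.
                SearchResult result = adapter.adapt(record);
                if (result != null) {
                    // Assumes SearchResult.get(String) mirrors the put() calls above.
                    System.out.println(result.get(WaybackConstants.RESULT_URL_KEY)
                            + " " + result.get(WaybackConstants.RESULT_CAPTURE_DATE));
                }
            }
            reader.close();
        }
    }

Because adapt() catches IOException and returns null, a caller like this simply skips damaged records; stricter error handling would need access to adaptInner().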