[Archive-access-cvs] SF.net SVN: archive-access:[2887] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2887
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2887&view=rev
Author:   bradtofel
Date:     2009-11-06 01:53:23 +0000 (Fri, 06 Nov 2009)

Log Message:
-----------
REFACTOR: Moved common HTTP header parsing code into HTTPRecordAnnotater
FEATURE: HTML content is now parsed using the SAX parser, to search for META robots tags
FEATURE: Now HTTP headers are inspected for Robot related instructions

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java
===================================================================

--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java	2009-11-06 01:50:20 UTC (rev 2886)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/ARCRecordToSearchResultAdapter.java	2009-11-06 01:53:23 UTC (rev 2887)
@@ -36,7 +36,6 @@
 import org.archive.wayback.core.CaptureSearchResult;
 import org.archive.wayback.util.Adapter;
 import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
-import org.archive.wayback.util.url.UrlOperations;
 
 /**
  *
@@ -50,13 +49,14 @@
 //	private static final Logger LOGGER = Logger.getLogger(
 //			ARCRecordToSearchResultAdapter.class.getName());
 
+	private HTTPRecordAnnotater annotater = null;
 	private UrlCanonicalizer canonicalizer = null;
 	
 	public ARCRecordToSearchResultAdapter() {
 		canonicalizer = new IdentityUrlCanonicalizer();
+		annotater = new HTTPRecordAnnotater();
 	}
-//	public static SearchResult arcRecordToSearchResult(final ARCRecord rec)
-//	throws IOException, ParseException {
+
 	/* (non-Javadoc)
 	 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
 	 */
@@ -68,7 +68,7 @@
 			return null;
 		}
 	}
-	
+
 	private CaptureSearchResult adaptInner(ARCRecord rec) throws IOException {
 		rec.close();
 		ARCRecordMetaData meta = rec.getMetaData();
@@ -84,12 +84,14 @@
 		
 		// initialize with default HTTP code...
 		result.setHttpCode("-");
+		result.setRedirectUrl("-");
 		
 		result.setDigest(rec.getDigestStr());
-		result.setMimeType(meta.getMimetype());
 		result.setCaptureTimestamp(meta.getDate());
-		
 		String uriStr = meta.getUrl();
+		result.setOriginalUrl(uriStr);
+		
+		
 		if (uriStr.startsWith(ARCRecord.ARC_MAGIC_NUMBER)) {
 			// skip filedesc record altogether...
 			return null;
@@ -97,49 +99,20 @@
 		if (uriStr.startsWith(WaybackConstants.DNS_URL_PREFIX)) {
 			// skip URL + HTTP header processing for dns records...
 		
-			result.setOriginalUrl(uriStr);
-			result.setRedirectUrl("-");
 			result.setUrlKey(uriStr);
-		
+			result.setMimeType("text/dns");
+			result.setEndOffset(rec.compressedBytes);
+
 		} else {
 		
-			result.setOriginalUrl(uriStr);
+			result.setUrlKey(canonicalizer.urlStringToKey(uriStr));
 		
-		
 			String statusCode = (meta.getStatusCode() == null) ? "-" : meta
 					.getStatusCode();
 			result.setHttpCode(statusCode);
 	
-			String redirectUrl = "-";
 			Header[] headers = rec.getHttpHeaders();
-			if (headers != null) {
-	
-				for (int i = 0; i < headers.length; i++) {
-					if (headers[i].getName().equals(
-							WaybackConstants.LOCATION_HTTP_HEADER)) {
-
-						String locationStr = headers[i].getValue();
-						// TODO: "Location" is supposed to be absolute:
-						// (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
-						// (section 14.30) but Content-Location can be
-						// relative.
-						// is it correct to resolve a relative Location, as
-						// we are?
-						// it's also possible to have both in the HTTP
-						// headers...
-						// should we prefer one over the other?
-						// right now, we're ignoring "Content-Location"
-						redirectUrl = UrlOperations.resolveUrl(uriStr, 
-								locationStr);
-	
-						break;
-					}
-				}
-				result.setRedirectUrl(redirectUrl);
-		
-				String urlKey = canonicalizer.urlStringToKey(meta.getUrl());
-				result.setUrlKey(urlKey);
-			}
+			annotater.annotateHTTPContent(result, rec, headers, meta.getMimetype());
 		}
 		return result;
 	}
@@ -149,4 +122,18 @@
 	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
 		this.canonicalizer = canonicalizer;
 	}
+
+	/**
+	 * @return the annotater
+	 */
+	public HTTPRecordAnnotater getAnnotater() {
+		return annotater;
+	}
+
+	/**
+	 * @param annotater the annotater to set
+	 */
+	public void setAnnotater(HTTPRecordAnnotater annotater) {
+		this.annotater = annotater;
+	}
 }

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java	2009-11-06 01:53:23 UTC (rev 2887)
@@ -0,0 +1,144 @@
+package org.archive.wayback.resourcestore.indexer;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.logging.Logger;
+
+import org.apache.commons.httpclient.Header;
+import org.archive.wayback.WaybackConstants;
+import org.archive.wayback.core.CaptureSearchResult;
+import org.archive.wayback.util.htmllex.ContextAwareLexer;
+import org.archive.wayback.util.htmllex.ParseEventDelegator;
+import org.archive.wayback.util.htmllex.ParseContext;
+import org.archive.wayback.util.url.UrlOperations;
+import org.htmlparser.Node;
+import org.htmlparser.lexer.Lexer;
+import org.htmlparser.lexer.Page;
+import org.htmlparser.util.ParserException;
+
+public class HTTPRecordAnnotater {
+	private RobotMetaRule rule = null;
+	private ParseEventDelegator rules = null;
+	private RobotMetaFlags robotFlags;
+	private static final Logger LOGGER =
+        Logger.getLogger(HTTPRecordAnnotater.class.getName());
+	
+	private final static String[] mimes = {
+		"html"
+	};
+	public HTTPRecordAnnotater() {
+		rules = new ParseEventDelegator();
+		rules.init();
+		rule = new RobotMetaRule();
+		robotFlags = new RobotMetaFlags();
+		rule.setRobotFlags(robotFlags);
+		rule.visit(rules);
+	}
+	public boolean isHTML(String mimeType) {
+		String mimeLower = mimeType.toLowerCase();
+		for(String mime : mimes) {
+			if(mimeLower.contains(mime)) {
+				return true;
+			}
+		}
+		return false;
+	}
+
+	private String escapeSpaces(final String input) {
+		if(input.contains(" ")) {
+			return input.replace(" ", "%20");
+		}
+		return input;
+	}
+	
+	public String transformHTTPMime(String input) {
+		int semiIdx = input.indexOf(";");
+		if(semiIdx > 0) {
+			return escapeSpaces(input.substring(0,semiIdx).trim());
+		}
+		return escapeSpaces(input.trim());
+	}
+	
+	public void annotateHTTPContent(CaptureSearchResult result, 
+    		InputStream is, Header[] headers, String mimeGuess) {
+		robotFlags.reset();
+		String mimeType = null;
+		if (headers != null) {
+	
+			for (Header httpHeader : headers) {
+				if (httpHeader.getName().equals(
+						WaybackConstants.LOCATION_HTTP_HEADER)) {
+	
+					String locationStr = httpHeader.getValue();
+					// TODO: "Location" is supposed to be absolute:
+					// (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
+					// (section 14.30) but Content-Location can be
+					// relative.
+					// is it correct to resolve a relative Location, as
+					// we are?
+					// it's also possible to have both in the HTTP
+					// headers...
+					// should we prefer one over the other?
+					// right now, we're ignoring "Content-Location"
+					result.setRedirectUrl(
+							UrlOperations.resolveUrl(result.getOriginalUrl(),
+									locationStr));
+
+				} else if(httpHeader.getName().toLowerCase().equals("content-type")) {
+					mimeType = transformHTTPMime(httpHeader.getValue());
+				} else if(httpHeader.getName().toLowerCase().equals(
+						WaybackConstants.X_ROBOTS_HTTP_HEADER)) {
+
+					robotFlags.parse(httpHeader.getValue());
+				}
+			}
+		}
+		
+		// TODO: get the encoding:
+		String encoding = "utf-8";
+		if(mimeType == null) {
+			// nothing present in the HTTP headers.. Use the WARC field:
+			mimeType = transformHTTPMime(mimeGuess);
+		}
+		result.setMimeType(mimeType);
+		// Now the sticky part: If it looks like an HTML document, look for
+		// robot meta tags:
+		if(isHTML(mimeType)) {
+			String fileContext = result.getFile() + ":" + result.getOffset();
+			annotateHTMLContent(is, encoding, fileContext, result);
+		}
+		robotFlags.apply(result);
+		
+	}
+	
+	public void annotateHTMLContent(InputStream is, String charSet, String fileContext, 
+			CaptureSearchResult result) {
+
+		ParseContext context = new ParseContext();
+   	
+    	Node node;
+    	try {
+        	ContextAwareLexer lex = new ContextAwareLexer(
+        			new Lexer(new Page(is,charSet)),context);
+			while((node = lex.nextNode()) != null) {
+//				System.err.println("\nDEBUG-Node:js("+context.isInJS()+")css("+context.isInCSS()+"):");
+//				System.err.println("-------------------/START");
+//				System.err.println(node.toHtml(true));
+//				System.err.println("-------------------/END");
+				rules.handleNode(context, node);
+			}
+			rules.handleParseComplete(context);
+		} catch (ParserException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+			LOGGER.warning(fileContext + " " + e.getLocalizedMessage());
+		} catch (UnsupportedEncodingException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+			LOGGER.warning(fileContext + " " + e.getLocalizedMessage());
+		} catch (IOException e) {
+			LOGGER.warning(fileContext + " " + e.getLocalizedMessage());
+		}
+	}
+}


Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/HTTPRecordAnnotater.java
___________________________________________________________________
Added: svn:keywords
   + Author Date Revision Id

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java	2009-11-06 01:53:23 UTC (rev 2887)
@@ -0,0 +1,44 @@
+package org.archive.wayback.resourcestore.indexer;
+
+import org.archive.wayback.core.CaptureSearchResult;
+
+public class RobotMetaFlags {
+	private static String NO_NOTHIN_MATCH = "NONE";
+	private static String NO_FOLLOW_MATCH = "NOFOLLOW";
+	private static String NO_INDEX_MATCH = "NOINDEX";
+	private static String NO_ARCHIVE_MATCH = "NOARCHIVE";
+	
+	private boolean noArchive = false;
+	private boolean noIndex = false;
+	private boolean noFollow = false;
+	public void reset() {
+		noArchive = false;
+		noIndex = false;
+		noFollow = false;
+	}
+	public void parse(String content) {
+		if(content == null) {
+			return;
+		}
+		String up = content.replaceAll("-", "").toUpperCase();
+		if(up.contains(NO_FOLLOW_MATCH)) {
+			noFollow = true;
+		}
+		if(up.contains(NO_ARCHIVE_MATCH)) {
+			noArchive = true;
+		}
+		if(up.contains(NO_INDEX_MATCH)) {
+			noIndex = true;
+		}
+		if(up.contains(NO_NOTHIN_MATCH)) {
+			noFollow = true;
+			noArchive = true;
+			noIndex = true;
+		}
+	}
+	public void apply(CaptureSearchResult result) {
+		if(noFollow) result.setRobotNoFollow();
+		if(noIndex) result.setRobotNoIndex();
+		if(noArchive) result.setRobotNoArchive();
+	}
+}


Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaFlags.java
___________________________________________________________________
Added: svn:keywords
   + Author Date Revision Id

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java	2009-11-06 01:53:23 UTC (rev 2887)
@@ -0,0 +1,47 @@
+package org.archive.wayback.resourcestore.indexer;
+
+import java.io.IOException;
+
+import org.archive.wayback.util.htmllex.ParseEventDelegator;
+import org.archive.wayback.util.htmllex.ParseEventDelegatorVisitor;
+import org.archive.wayback.util.htmllex.ParseContext;
+import org.archive.wayback.util.htmllex.handlers.OpenTagHandler;
+import org.htmlparser.nodes.TagNode;
+
+public class RobotMetaRule implements ParseEventDelegatorVisitor, OpenTagHandler {
+
+	private RobotMetaFlags robotFlags = null;
+	
+	public void visit(ParseEventDelegator rules) {
+		// register for <META> Start tags:
+		rules.addOpenTagHandler(this, "META");
+	}
+
+	public void handleOpenTagNode(ParseContext context, TagNode node)
+			throws IOException {
+		String nameVal = node.getAttribute("name");
+		if(nameVal != null) {
+			if(nameVal.toUpperCase().equals("ROBOTS")) {
+				String content = node.getAttribute("content");
+				if(content != null) {
+					robotFlags.parse(content);
+				}
+			}
+		}
+	}
+
+	/**
+	 * @return the robotFlags
+	 */
+	public RobotMetaFlags getRobotFlags() {
+		return robotFlags;
+	}
+
+	/**
+	 * @param robotFlags the robotFlags to set
+	 */
+	public void setRobotFlags(RobotMetaFlags robotFlags) {
+		this.robotFlags = robotFlags;
+	}
+
+}


Property changes on: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/RobotMetaRule.java
___________________________________________________________________
Added: svn:keywords
   + Author Date Revision Id

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java	2009-11-06 01:50:20 UTC (rev 2886)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer/WARCRecordToSearchResultAdapter.java	2009-11-06 01:53:23 UTC (rev 2887)
@@ -2,23 +2,23 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.logging.Logger;
+//import java.util.logging.Logger;
 
 import org.apache.commons.httpclient.Header;
 import org.apache.commons.httpclient.HttpParser;
 import org.apache.commons.httpclient.StatusLine;
+import org.apache.commons.httpclient.URIException;
 import org.apache.commons.httpclient.util.EncodingUtil;
+import org.apache.log4j.Logger;
 import org.archive.io.ArchiveRecordHeader;
 import org.archive.io.RecoverableIOException;
 import org.archive.io.arc.ARCConstants;
 import org.archive.io.warc.WARCConstants;
 import org.archive.io.warc.WARCRecord;
 import org.archive.wayback.UrlCanonicalizer;
-import org.archive.wayback.WaybackConstants;
 import org.archive.wayback.core.CaptureSearchResult;
 import org.archive.wayback.util.Adapter;
-import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;
-import org.archive.wayback.util.url.UrlOperations;
+import org.archive.wayback.util.url.IdentityUrlCanonicalizer;
 
 /**
  * Adapts certain WARCRecords into SearchResults. DNS and response records are
@@ -33,29 +33,23 @@
  */
 public class WARCRecordToSearchResultAdapter
 implements Adapter<WARCRecord,CaptureSearchResult>{
+	
 	private static final Logger LOGGER =
         Logger.getLogger(WARCRecordToSearchResultAdapter.class.getName());
 	
 	private final static String DEFAULT_VALUE = "-"; 
-
 	private UrlCanonicalizer canonicalizer = null;
+	private HTTPRecordAnnotater annotater = null;
 	
 	private boolean processAll = false;
 
-	public boolean isProcessAll() {
-		return processAll;
-	}
-
-	public void setProcessAll(boolean processAll) {
-		this.processAll = processAll;
-	}
-
 	public WARCRecordToSearchResultAdapter() {
-		canonicalizer = new AggressiveUrlCanonicalizer();
+		canonicalizer = new IdentityUrlCanonicalizer();
+		annotater = new HTTPRecordAnnotater();
 	}
 
-	/* (non-Javadoc)
-	 * @see org.archive.wayback.util.Adapter#adapt(java.lang.Object)
+	/* 
+	 * This just calls adaptInner, returning null if an Exception is thrown:
 	 */
 	public CaptureSearchResult adapt(WARCRecord rec) {
 		try {
@@ -65,121 +59,94 @@
 			return null;
 		}
 	}
-	
-	/*
-	 * Transform input date to 14-digit timestamp:
-	 * 2007-08-29T18:00:26Z => 20070829180026
-	 */
-	private static String transformDate(final String input) {
+
+	private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException {
 		
-		StringBuilder output = new StringBuilder(14);
-		
-		output.append(input.substring(0,4));
-		output.append(input.substring(5,7));
-		output.append(input.substring(8,10));
-		output.append(input.substring(11,13));
-		output.append(input.substring(14,16));
-		output.append(input.substring(17,19));
-		
-		return output.toString();
-	}
-	
-	private static String escapeSpaces(final String input) {
-		if(input.contains(" ")) {
-			return input.replace(" ", "%20");
-		}
-		return input;
-	}
-	
-	private static String transformHTTPMime(String input) {
-		int semiIdx = input.indexOf(";");
-		if(semiIdx > 0) {
-			return escapeSpaces(input.substring(0,semiIdx).trim());
-		}
-		return escapeSpaces(input.trim());
-	}
+		ArchiveRecordHeader header = rec.getHeader();
 
-	private String transformWarcFilename(String readerIdentifier) {
-		String warcName = readerIdentifier;
-		int index = warcName.lastIndexOf(File.separator);
-		if (index > 0 && (index + 1) < warcName.length()) {
-		    warcName = warcName.substring(index + 1);
+		String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
+		if(type.equals(WARCConstants.WARCINFO)) {
+			LOGGER.info("Skipping record type : " + type);
+			return null;
 		}
-		return warcName;
-	}
 
-	private String transformDigest(final Object o) {
-		if(o == null) {
-			return DEFAULT_VALUE;
+		CaptureSearchResult result = genericResult(rec);
+
+		if(type.equals(WARCConstants.RESPONSE)) {
+			String mime = annotater.transformHTTPMime(header.getMimetype());
+			if(mime.equals("text/dns")) {
+				// close to complete reading, then the digest is legit
+				// TODO: DO we want to use the WARC header digest for this?
+				rec.close();
+				result.setDigest(transformWARCDigest(rec.getDigestStr()));
+				result.setMimeType(mime);
+			} else {
+				result = adaptWARCHTTPResponse(result,rec);
+			}
+		} else if(type.equals(WARCConstants.REVISIT)) {
+			// also set the mime type:
+			result.setMimeType("warc/revisit");
+
+		} else if(type.equals(WARCConstants.REQUEST)) {
+			
+			if(processAll) {
+				// also set the mime type:
+				result.setMimeType("warc/request");
+			} else {
+				result = null;
+			}
+		} else if(type.equals(WARCConstants.METADATA)) {
+
+			if(processAll) {
+				// also set the mime type:
+				result.setMimeType("warc/metadata");
+			} else {
+				result = null;
+			}
+		} else {
+			LOGGER.info("Skipping record type : " + type);
 		}
-		String orig = o.toString();
-		if(orig.startsWith("sha1:")) {
-			return orig.substring(5);
-		}
-		return orig;
+
+		return result;
 	}
 
-	private CaptureSearchResult getBlankSearchResult() {
+	// ALL HELPER METHODS BELOW:
+
+	/*
+	 * Extract all common WARC fields into a CaptureSearchResult. This is the
+	 * same for all WARC record types:
+	 *  
+	 *    file, offset, timestamp, digest, urlKey, originalUrl 
+	 */
+	private CaptureSearchResult genericResult(WARCRecord rec) {
+
 		CaptureSearchResult result = new CaptureSearchResult();
 
-		result.setUrlKey(DEFAULT_VALUE);
-		result.setOriginalUrl(DEFAULT_VALUE);
-		result.setCaptureTimestamp(DEFAULT_VALUE);
-		result.setDigest(DEFAULT_VALUE);
 		result.setMimeType(DEFAULT_VALUE);
 		result.setHttpCode(DEFAULT_VALUE);
 		result.setRedirectUrl(DEFAULT_VALUE);
-		result.setFile(DEFAULT_VALUE);
-		result.setOffset(0);
-		return result;
-	}
-	
-	private void addUrlDataToSearchResult(CaptureSearchResult result, String urlStr)
-	throws IOException {
 
-		result.setOriginalUrl(urlStr);
-		String urlKey = canonicalizer.urlStringToKey(urlStr);
-		result.setUrlKey(urlKey);
-	}
+		ArchiveRecordHeader header = rec.getHeader();
 
-	private CaptureSearchResult adaptDNS(ArchiveRecordHeader header, WARCRecord rec) 
-	throws IOException {
-
-		CaptureSearchResult result = getBlankSearchResult();
-
-		result.setCaptureTimestamp(transformDate(header.getDate()));
-		result.setFile(transformWarcFilename(header.getReaderIdentifier()));
-		result.setOffset(header.getOffset());
+		String file = transformWARCFilename(header.getReaderIdentifier());
+		long offset = header.getOffset();
 		
-		String uriStr = header.getUrl();
-		
-		result.setMimeType(header.getMimetype());
-
-		result.setOriginalUrl(uriStr);
-		result.setUrlKey(uriStr);
-
-		rec.close();
-		result.setDigest(rec.getDigestStr());
-
-		return result;
-	}
-
-	private CaptureSearchResult adaptGeneric(ArchiveRecordHeader header,
-			WARCRecord rec, String mime) 
-	throws IOException {
-
-		CaptureSearchResult result = getBlankSearchResult();
-
-		result.setCaptureTimestamp(transformDate(header.getDate()));
-		result.setFile(transformWarcFilename(header.getReaderIdentifier()));
-		result.setOffset(header.getOffset());
-		result.setDigest(transformDigest(header.getHeaderValue(
+		result.setCaptureTimestamp(transformWARCDate(header.getDate()));
+		result.setFile(file);
+		result.setOffset(offset);
+		result.setDigest(transformWARCDigest(header.getHeaderValue(
 				WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
 		
-		addUrlDataToSearchResult(result,header.getUrl());
-		
-		result.setMimeType(mime);
-
+		String origUrl = header.getUrl();
+		result.setOriginalUrl(origUrl);
+		try {
+			String urlKey = canonicalizer.urlStringToKey(origUrl);
+			result.setUrlKey(urlKey);
+		} catch (URIException e) {
+			LOGGER.warn("FAILED canonicalize(" + origUrl + "):" + 
+					file + " " + offset);
+			result.setUrlKey(origUrl);
+		}
 		return result;
 	}
 
@@ -200,19 +167,55 @@
         }
         return count;
     }
-	
-	private CaptureSearchResult adaptResponse(ArchiveRecordHeader header, WARCRecord rec) 
-	throws IOException {
 
-		CaptureSearchResult result = getBlankSearchResult();
+    private String transformWARCFilename(String readerIdentifier) {
+		String warcName = readerIdentifier;
+		int index = warcName.lastIndexOf(File.separator);
+		if (index > 0 && (index + 1) < warcName.length()) {
+		    warcName = warcName.substring(index + 1);
+		}
+		return warcName;
+	}
 
-		result.setCaptureTimestamp(transformDate(header.getDate()));
-		result.setFile(transformWarcFilename(header.getReaderIdentifier()));
-		result.setOffset(header.getOffset());
+	private String transformWARCDigest(final Object o) {
+		if(o == null) {
+			return DEFAULT_VALUE;
+		}
+		String orig = o.toString();
+		if(orig.startsWith("sha1:")) {
+			return orig.substring(5);
+		}
+		return orig;
+	}
+
+	/*
+	 * Transform input date to 14-digit timestamp:
+	 * 2007-08-29T18:00:26Z => 20070829180026
+	 */
+	private static String transformWARCDate(final String input) {
 		
-		String origUrl = header.getUrl();
-		addUrlDataToSearchResult(result,origUrl);
+		StringBuilder output = new StringBuilder(14);
+		
+		output.append(input.substring(0,4));
+		output.append(input.substring(5,7));
+		output.append(input.substring(8,10));
+		output.append(input.substring(11,13));
+		output.append(input.substring(14,16));
+		output.append(input.substring(17,19));
+		
+		return output.toString();
+	}
 
+    /*
+     * Currently the WARCReader doesn't parse HTTP headers. This method parses
+     * them then calls the common ARC/WARC shared record parsing code, which
+     * addresses HTTP headers, and possibly even parses HTML content to look
+     * for Robot Meta tags.
+     */
+	private CaptureSearchResult adaptWARCHTTPResponse(CaptureSearchResult result,
+			WARCRecord rec) throws IOException {
+
+		ArchiveRecordHeader header = rec.getHeader();
 		// need to parse the documents HTTP message and headers here: WARCReader
 		// does not implement this... yet..
 		
@@ -234,66 +237,13 @@
 		Header[] headers = HttpParser.parseHeaders(rec,
                 ARCConstants.DEFAULT_ENCODING);
 
-		rec.close();
-		result.setDigest(transformDigest(header.getHeaderValue(
-						WARCRecord.HEADER_KEY_PAYLOAD_DIGEST)));
-
-		if (headers != null) {
-	
-			for (Header httpHeader : headers) {
-				if (httpHeader.getName().equals(
-						WaybackConstants.LOCATION_HTTP_HEADER)) {
-	
-					String locationStr = httpHeader.getValue();
-					// TODO: "Location" is supposed to be absolute:
-					// (http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html)
-					// (section 14.30) but Content-Location can be
-					// relative.
-					// is it correct to resolve a relative Location, as
-					// we are?
-					// it's also possible to have both in the HTTP
-					// headers...
-					// should we prefer one over the other?
-					// right now, we're ignoring "Content-Location"
-					result.setRedirectUrl(
-							UrlOperations.resolveUrl(origUrl, locationStr));
-				} else if(httpHeader.getName().toLowerCase().equals("content-type")) {
-					result.setMimeType(transformHTTPMime(httpHeader.getValue()));
-				}
-			}
-		}
-		return result;
-	}
-
-	private CaptureSearchResult adaptInner(WARCRecord rec) throws IOException {
 		
-		CaptureSearchResult result = null;
-		ArchiveRecordHeader header = rec.getHeader();
-		String type = header.getHeaderValue(WARCConstants.HEADER_KEY_TYPE).toString();
-		if(type.equals(WARCConstants.RESPONSE)) {
-			String mime = header.getMimetype();
-			if(mime.equals("text/dns")) {
-				result = adaptDNS(header,rec);
-			} else {
-				result = adaptResponse(header,rec);
-			}
-		} else if(type.equals(WARCConstants.REVISIT)) {
-			result = adaptGeneric(header,rec,"warc/revisit");
-		} else if(type.equals(WARCConstants.REQUEST)) {
-			if(processAll) {
-				result = adaptGeneric(header,rec,"warc/request");
-			}
-		} else if(type.equals(WARCConstants.METADATA)) {
-			if(processAll) {
-				result = adaptGeneric(header,rec,"warc/metadata");
-			}
-		} else {
-			LOGGER.info("Skipping record type : " + type);
-		}
+		annotater.annotateHTTPContent(result,rec,headers,header.getMimetype());
 
 		return result;
 	}
 
+
 	public UrlCanonicalizer getCanonicalizer() {
 		return canonicalizer;
 	}
@@ -301,4 +251,25 @@
 	public void setCanonicalizer(UrlCanonicalizer canonicalizer) {
 		this.canonicalizer = canonicalizer;
 	}
+
+	public boolean isProcessAll() {
+		return processAll;
+	}
+
+	public void setProcessAll(boolean processAll) {
+		this.processAll = processAll;
+	}
+	/**
+	 * @return the annotater
+	 */
+	public HTTPRecordAnnotater getAnnotater() {
+		return annotater;
+	}
+
+	/**
+	 * @param annotater the annotater to set
+	 */
+	public void setAnnotater(HTTPRecordAnnotater annotater) {
+		this.annotater = annotater;
+	}
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Archive-access-cvs] SF.net SVN: archive-access:[2887] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer

[Archive-access-cvs] SF.net SVN: archive-access:[2887] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/resourcestore/indexer