[Archive-access-cvs] SF.net SVN: archive-access:[2476] trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2476
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2476&view=rev
Author:   bradtofel
Date:     2008-07-22 02:02:01 +0000 (Tue, 22 Jul 2008)

Log Message:
-----------
RENAME: HTMLPage => TextDocument
INITIAL REV: TextReplayRenderer, an abstract ReplayRenderer base class that manages converting a Resource into a TextDocument. It provides an abstract updatePage() method that allows subclasses to perform any needed modifications to the TextDocument before it is returned to the client.

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java

Removed Paths:
-------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java

Deleted: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java
===================================================================

--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java	2008-07-22 01:52:21 UTC (rev 2475)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java	2008-07-22 02:02:01 UTC (rev 2476)
@@ -1,510 +0,0 @@
-/* HTMLPage
- *
- * $Id$
- *
- * Created on 12:39:52 PM Aug 7, 2007.
- *
- * Copyright (C) 2007 Internet Archive.
- *
- * This file is part of wayback-core.
- *
- * wayback-core is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * any later version.
- *
- * wayback-core is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU Lesser Public License for more details.
- *
- * You should have received a copy of the GNU Lesser Public License
- * along with wayback-core; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-package org.archive.wayback.replay;
-
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.OutputStream;
-import java.io.UnsupportedEncodingException;
-import java.nio.charset.Charset;
-import java.nio.charset.IllegalCharsetNameException;
-import java.text.ParseException;
-import java.util.Map;
-
-import javax.servlet.ServletException;
-import javax.servlet.http.HttpServletRequest;
-import javax.servlet.http.HttpServletResponse;
-
-import org.archive.wayback.ResultURIConverter;
-import org.archive.wayback.core.Resource;
-import org.archive.wayback.core.CaptureSearchResult;
-import org.archive.wayback.core.CaptureSearchResults;
-import org.archive.wayback.core.UIResults;
-import org.archive.wayback.core.WaybackRequest;
-import org.mozilla.universalchardet.UniversalDetector;
-
-/**
- * Class which wraps functionality for converting a Resource(InputStream + 
- * HTTP headers) into a StringBuilder, performing several common URL 
- * resolution methods against that StringBuilder, inserting arbitrary Strings
- * into the page, and then converting the page back to a byte array. 
- *
- * @author brad
- * @version $Date$, $Revision$
- */
-public class HTMLPage {
-
-	// hand off this many bytes to the chardet library
-	private final static int MAX_CHARSET_READAHEAD = 65536;
-	// ...if it also includes "charset="
-	private final static String CHARSET_TOKEN = "charset=";
-	// ...and if the chardet library fails, use the Content-Type header
-	private final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type";
-	// if documents are marked up before sending to clients, the data is
-	// decoded into a String in chunks. This is how big a chunk to decode with.
-	private final static int C_BUFFER_SIZE = 4096;
-
-	private Resource resource = null;
-	private CaptureSearchResult result = null; 
-	private ResultURIConverter uriConverter = null;
-	/**
-	 * the internal StringBuilder
-	 */
-	public StringBuilder sb = null;
-	private String charSet = null;
-	private byte[] resultBytes = null;
-
-	/**
-	 * @param resource
-	 * @param result
-	 * @param uriConverter 
-	 */
-	public HTMLPage(Resource resource, CaptureSearchResult result, 
-			ResultURIConverter uriConverter) {
-		this.resource = resource;
-		this.result = result;
-		this.uriConverter = uriConverter;
-	}
-
-	private boolean isCharsetSupported(String charsetName) {
-		// can you believe that this throws a runtime? Just asking if it's
-		// supported!!?! They coulda just said "no"...
-		try {
-			return Charset.isSupported(charsetName);
-		} catch(IllegalCharsetNameException e) {
-			return false;
-		}
-	}
-	
-	private String contentTypeToCharset(final String contentType) {
-		int offset = contentType.indexOf(CHARSET_TOKEN);
-		if (offset != -1) {
-			String cs = contentType.substring(offset + CHARSET_TOKEN.length());
-			if(isCharsetSupported(cs)) {
-				return cs;
-			}
-			// test for extra spaces... there's at least one page out there that
-			// indicates it's charset with:
-
-//  <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1">
-
-			// bad web page!
-			if(isCharsetSupported(cs.replace(" ", ""))) {
-				return cs.replace(" ", "");
-			}
-		}
-		return null;
-	}
-	
-	/**
-	 * Attempt to divine the character encoding of the document from the 
-	 * Content-Type HTTP header (with a "charset=")
-	 * 
-	 * @param resource
-	 * @return String character set found or null if the header was not present
-	 * @throws IOException 
-	 */
-	protected String getCharsetFromHeaders(Resource resource) 
-	throws IOException {
-		
-		String charsetName = null;
-
-		Map<String,String> httpHeaders = resource.getHttpHeaders();
-		String ctype = httpHeaders.get(HTTP_CONTENT_TYPE_HEADER);
-		if (ctype != null) {
-			charsetName = contentTypeToCharset(ctype);
-		}
-		return charsetName;
-	}
-
-	/**
-	 * Attempt to find a META tag in the HTML that hints at the character set
-	 * used to write the document.
-	 * 
-	 * @param resource
-	 * @return String character set found from META tags in the HTML
-	 * @throws IOException
-	 */
-	protected String getCharsetFromMeta(Resource resource) throws IOException {
-		String charsetName = null;
-
-		byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
-		resource.mark(MAX_CHARSET_READAHEAD);
-		resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
-		resource.reset();
-		// convert to UTF-8 String -- which hopefully will not mess up the
-		// characters we're interested in...
-		StringBuilder sb = new StringBuilder(new String(bbuffer,"UTF-8"));
-		String metaContentType = TagMagix.getTagAttrWhere(sb, "META",
-				"content", "http-equiv", "Content-Type");
-		if(metaContentType != null) {
-			charsetName = contentTypeToCharset(metaContentType);
-		}
-		return charsetName;
-	}
-	
-	/**
-	 * Attempts to figure out the character set of the document using
-	 * the excellent juniversalchardet library.
-	 * 
-	 * @param resource
-	 * @return String character encoding found, or null if nothing looked good.
-	 * @throws IOException
-	 */
-	protected String getCharsetFromBytes(Resource resource) throws IOException {
-		String charsetName = null;
-
-		byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
-		   // (1)
-	    UniversalDetector detector = new UniversalDetector(null);
-
-	    // (2)
-		resource.mark(MAX_CHARSET_READAHEAD);
-		int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
-		resource.reset();
-		detector.handleData(bbuffer, 0, len);
-		// (3)
-		detector.dataEnd();
-	    // (4)
-	    charsetName = detector.getDetectedCharset();
-
-	    // (5)
-	    detector.reset();
-
-		return charsetName;
-	}
-
-	/**
-	 * Use META tags, byte-character-detection, HTTP headers, hope, and prayer
-	 * to figure out what character encoding is being used for the document.
-	 * If nothing else works, assumes UTF-8 for now.
-	 * 
-	 * @param resource
-	 * @return String charset for Resource
-	 * @throws IOException
-	 */
-	protected String guessCharset() throws IOException {
-		
-		String charSet = getCharsetFromMeta(resource);
-		if(charSet == null) {
-			charSet = getCharsetFromBytes(resource);
-			if(charSet == null) {
-				charSet = getCharsetFromHeaders(resource);
-				if(charSet == null) {
-					charSet = "UTF-8";
-				}
-			}
-		}
-		return charSet;
-	}
-
-	/**
-	 * Update URLs inside the page, so those URLs which must be correct at
-	 * page load time resolve correctly to absolute URLs.
-	 * 
-	 * This means ensuring there is a BASE HREF tag, adding one if missing,
-	 * and then resolving:
-	 *     FRAME-SRC, META-URL, LINK-HREF, SCRIPT-SRC
-	 * tag-attribute pairs against either the existing BASE-HREF, or the
-	 * page's absolute URL if it was missing.
-	 */
-	public void resolvePageUrls() {
-
-		// TODO: get url from Resource instead of SearchResult?
-		String pageUrl = result.getOriginalUrl();
-		String captureDate = result.getCaptureTimestamp();
-
-		String existingBaseHref = TagMagix.getBaseHref(sb);
-		if (existingBaseHref == null) {
-			insertAtStartOfHead("<base href=\"" + pageUrl + "\" />");
-		} else {
-			pageUrl = existingBaseHref;
-		}
-
-		String markups[][] = {
-				{"FRAME","SRC"},
-				{"META","URL"},
-				{"LINK","HREF"},
-				{"SCRIPT","SRC"},
-				{TagMagix.ANY_TAGNAME,"background"}
-		};
-		// TODO: The classic WM added a js_ to the datespec, so NotInArchives
-		// can return an valid javascript doc, and not cause Javascript errors.
-		for(String tagAttr[] : markups) {
-			TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl,
-					tagAttr[0], tagAttr[1]);
-		}
-		TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl);
-		TagMagix.markupStyleUrls(sb,uriConverter,captureDate,pageUrl);
-	}
-	
-	/**
-	 * Update all URLs inside the page, so they resolve correctly to absolute 
-	 * URLs within the Wayback service.
-	 */
-	public void resolveAllPageUrls() {
-
-		// TODO: get url from Resource instead of SearchResult?
-		String pageUrl = result.getOriginalUrl();
-		String captureDate = result.getCaptureTimestamp();
-
-		String existingBaseHref = TagMagix.getBaseHref(sb);
-		if (existingBaseHref != null) {
-			pageUrl = existingBaseHref;
-		}
-		ResultURIConverter ruc = new SpecialResultURIConverter(uriConverter);
-		
-		// TODO: forms...?
-		String markups[][] = {
-				{"FRAME","SRC"},
-				{"META","URL"},
-				{"LINK","HREF"},
-				{"SCRIPT","SRC"},
-				{"IMG","SRC"},
-				{"A","HREF"},
-				{"AREA","HREF"},
-				{"OBJECT","CODEBASE"},
-				{"OBJECT","CDATA"},
-				{"APPLET","CODEBASE"},
-				{"APPLET","ARCHIVE"},
-				{"EMBED","SRC"},
-				{"IFRAME","SRC"},
-				{TagMagix.ANY_TAGNAME,"background"}
-		};
-		for(String tagAttr[] : markups) {
-			TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl,
-					tagAttr[0], tagAttr[1]);
-		}
-		TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl);
-		TagMagix.markupStyleUrls(sb,uriConverter,captureDate,pageUrl);
-	}
-	
-	public void resolveCSSUrls() {
-		// TODO: get url from Resource instead of SearchResult?
-		String pageUrl = result.getOriginalUrl();
-		String captureDate = result.getCaptureTimestamp();
-		TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl);
-	}
-
-	public void resolveASXRefUrls() {
-
-		// TODO: get url from Resource instead of SearchResult?
-		String pageUrl = result.getOriginalUrl();
-		String captureDate = result.getCaptureTimestamp();
-		ResultURIConverter ruc = new MMSToHTTPResultURIConverter(uriConverter);
-		
-		TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl,
-				"REF", "HREF");
-	}
-	
-	public void stripHTML() {
-		String stripped = sb.toString().replaceAll("\\<.*?>","");
-		sb.setLength(0);
-		sb.append(stripped);
-	}
-	/**
-	 * @param charSet
-	 * @throws IOException 
-	 */
-	public void readFully(String charSet) throws IOException {
-		if(charSet == null) {
-			charSet = guessCharset();
-		}
-		this.charSet = charSet;
-		int recordLength = (int) resource.getRecordLength();
-
-		// convert bytes to characters for charset:
-		InputStreamReader isr = new InputStreamReader(resource, charSet);
-
-		char[] cbuffer = new char[C_BUFFER_SIZE];
-
-		// slurp the whole thing into RAM:
-		sb = new StringBuilder(recordLength);
-		for (int r = -1; (r = isr.read(cbuffer, 0, C_BUFFER_SIZE)) != -1;) {
-			sb.append(cbuffer, 0, r);
-		}
-	}
-		
-	/**
-	 * Read bytes from input stream, using best-guess for character encoding
-	 * @throws IOException 
-	 */
-	public void readFully() throws IOException {
-		readFully(null);
-	}
-	
-	/**
-	 * @return raw bytes contained in internal StringBuilder
-	 * @throws UnsupportedEncodingException
-	 */
-	public byte[] getBytes() throws UnsupportedEncodingException {
-		if(sb == null) {
-			throw new IllegalStateException("No interal StringBuffer");
-		}
-		if(resultBytes == null) {
-			resultBytes = sb.toString().getBytes(charSet);
-		}
-		return resultBytes;
-	}
-	
-	/**
-	 * Write the contents of the page to the client.
-	 * 
-	 * @param os
-	 * @throws IOException
-	 */
-	public void writeToOutputStream(OutputStream os) throws IOException {
-		if(sb == null) {
-			throw new IllegalStateException("No interal StringBuffer");
-		}
-		byte[] b;
-		try {
-			b = getBytes();
-		} catch (UnsupportedEncodingException e) {
-			throw new RuntimeException(e);
-		}
-		os.write(b);
-	}
-
-	/**
-	 * @param toInsert
-	 */	
-	public void insertAtStartOfHead(String toInsert) {
-		int insertPoint = TagMagix.getEndOfFirstTag(sb,"head");
-		if (-1 == insertPoint) {
-			insertPoint = 0;
-		}
-		sb.insert(insertPoint,toInsert);
-	}
-
-	/**
-	 * @param toInsert
-	 */
-	public void insertAtEndOfBody(String toInsert) {
-		int insertPoint = sb.lastIndexOf("</body>");
-		if (-1 == insertPoint) {
-			insertPoint = sb.lastIndexOf("</BODY>");
-		}
-		if (-1 == insertPoint) {
-			insertPoint = sb.length();
-		}
-		sb.insert(insertPoint,toInsert);
-	}
-	/**
-	 * @param toInsert
-	 */
-	public void insertAtStartOfBody(String toInsert) {
-		int insertPoint = TagMagix.getEndOfFirstTag(sb,"body");
-		if (-1 == insertPoint) {
-			insertPoint = 0;
-		}
-		sb.insert(insertPoint,toInsert);
-	}	
-	/**
-	 * @param jspPath
-	 * @param httpRequest
-	 * @param httpResponse
-	 * @param wbRequest
-	 * @param results
-	 * @return
-	 * @throws IOException 
-	 * @throws ServletException 
-	 * @throws ParseException 
-	 */
-	public String includeJspString(String jspPath, 
-			HttpServletRequest httpRequest, HttpServletResponse httpResponse,
-			WaybackRequest wbRequest, CaptureSearchResults results, 
-			CaptureSearchResult result, Resource resource) 
-	throws ServletException, IOException {
-		
-		UIResults uiResults = new UIResults(wbRequest,uriConverter,results,
-				result,resource);
-
-		StringHttpServletResponseWrapper wrappedResponse = 
-			new StringHttpServletResponseWrapper(httpResponse);
-		uiResults.forward(httpRequest, wrappedResponse, jspPath);
-//		uiResults.storeInRequest(httpRequest,jspPath);
-//		RequestDispatcher dispatcher = httpRequest.getRequestDispatcher(jspPath);
-//		dispatcher.forward(httpRequest, wrappedResponse);
-		return wrappedResponse.getStringResponse();
-	}
-	
-	/**
-	 * @param jsUrl
-	 * @return
-	 */
-	public String getJSIncludeString(final String jsUrl) {
-		return "<script type=\"text/javascript\" src=\"" 
-			+ jsUrl + "\" ></script>\n";
-	}
-
-	/**
-	 * @return the charSet
-	 */
-	public String getCharSet() {
-		return charSet;
-	}
-
-	/**
-	 * @param charSet the charSet to set
-	 */
-	public void setCharSet(String charSet) {
-		this.charSet = charSet;
-	}
-
-	private class SpecialResultURIConverter implements ResultURIConverter {
-		private static final String EMAIL_PROTOCOL_PREFIX = "mailto:";
-		private static final String JAVASCRIPT_PROTOCOL_PREFIX = "javascript:";
-		private ResultURIConverter base = null;
-		public SpecialResultURIConverter(ResultURIConverter base) {
-			this.base = base;
-		}
-		public String makeReplayURI(String datespec, String url) {
-			if(url.startsWith(EMAIL_PROTOCOL_PREFIX)) {
-				return url;
-			}
-			if(url.startsWith(JAVASCRIPT_PROTOCOL_PREFIX)) {
-				return url;
-			}
-			return base.makeReplayURI(datespec, url);
-		}
-	}
-
-	private class MMSToHTTPResultURIConverter implements ResultURIConverter {
-		private static final String MMS_PROTOCOL_PREFIX = "mms://";
-		private static final String HTTP_PROTOCOL_PREFIX = "http://";
-		private ResultURIConverter base = null;
-		public MMSToHTTPResultURIConverter(ResultURIConverter base) {
-			this.base = base;
-		}
-		public String makeReplayURI(String datespec, String url) {
-			if(url.startsWith(MMS_PROTOCOL_PREFIX)) {
-				url = HTTP_PROTOCOL_PREFIX + 
-					url.substring(MMS_PROTOCOL_PREFIX.length());
-			}
-			return base.makeReplayURI(datespec, url);
-		}
-	}	
-}

Copied: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java (from rev 2448, trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/HTMLPage.java)
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java	2008-07-22 02:02:01 UTC (rev 2476)
@@ -0,0 +1,510 @@
+/* TextDocument
+ *
+ * $Id$
+ *
+ * Created on 12:39:52 PM Aug 7, 2007.
+ *
+ * Copyright (C) 2007 Internet Archive.
+ *
+ * This file is part of wayback-core.
+ *
+ * wayback-core is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback-core is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback-core; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.replay;
+
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.text.ParseException;
+import java.util.Map;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.archive.wayback.ResultURIConverter;
+import org.archive.wayback.core.Resource;
+import org.archive.wayback.core.CaptureSearchResult;
+import org.archive.wayback.core.CaptureSearchResults;
+import org.archive.wayback.core.UIResults;
+import org.archive.wayback.core.WaybackRequest;
+import org.mozilla.universalchardet.UniversalDetector;
+
+/**
+ * Class which wraps functionality for converting a Resource(InputStream + 
+ * HTTP headers) into a StringBuilder, performing several common URL 
+ * resolution methods against that StringBuilder, inserting arbitrary Strings
+ * into the page, and then converting the page back to a byte array. 
+ *
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public class TextDocument {
+
+	// hand off this many bytes to the chardet library
+	private final static int MAX_CHARSET_READAHEAD = 65536;
+	// ...if it also includes "charset="
+	private final static String CHARSET_TOKEN = "charset=";
+	// ...and if the chardet library fails, use the Content-Type header
+	private final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type";
+	// if documents are marked up before sending to clients, the data is
+	// decoded into a String in chunks. This is how big a chunk to decode with.
+	private final static int C_BUFFER_SIZE = 4096;
+
+	private Resource resource = null;
+	private CaptureSearchResult result = null; 
+	private ResultURIConverter uriConverter = null;
+	/**
+	 * the internal StringBuilder
+	 */
+	public StringBuilder sb = null;
+	private String charSet = null;
+	private byte[] resultBytes = null;
+
+	/**
+	 * @param resource
+	 * @param result
+	 * @param uriConverter 
+	 */
+	public TextDocument(Resource resource, CaptureSearchResult result, 
+			ResultURIConverter uriConverter) {
+		this.resource = resource;
+		this.result = result;
+		this.uriConverter = uriConverter;
+	}
+
+	private boolean isCharsetSupported(String charsetName) {
+		// can you believe that this throws a runtime? Just asking if it's
+		// supported!!?! They coulda just said "no"...
+		try {
+			return Charset.isSupported(charsetName);
+		} catch(IllegalCharsetNameException e) {
+			return false;
+		}
+	}
+	
+	private String contentTypeToCharset(final String contentType) {
+		int offset = contentType.indexOf(CHARSET_TOKEN);
+		if (offset != -1) {
+			String cs = contentType.substring(offset + CHARSET_TOKEN.length());
+			if(isCharsetSupported(cs)) {
+				return cs;
+			}
+			// test for extra spaces... there's at least one page out there that
+			// indicates it's charset with:
+
+//  <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1">
+
+			// bad web page!
+			if(isCharsetSupported(cs.replace(" ", ""))) {
+				return cs.replace(" ", "");
+			}
+		}
+		return null;
+	}
+	
+	/**
+	 * Attempt to divine the character encoding of the document from the 
+	 * Content-Type HTTP header (with a "charset=")
+	 * 
+	 * @param resource
+	 * @return String character set found or null if the header was not present
+	 * @throws IOException 
+	 */
+	protected String getCharsetFromHeaders(Resource resource) 
+	throws IOException {
+		
+		String charsetName = null;
+
+		Map<String,String> httpHeaders = resource.getHttpHeaders();
+		String ctype = httpHeaders.get(HTTP_CONTENT_TYPE_HEADER);
+		if (ctype != null) {
+			charsetName = contentTypeToCharset(ctype);
+		}
+		return charsetName;
+	}
+
+	/**
+	 * Attempt to find a META tag in the HTML that hints at the character set
+	 * used to write the document.
+	 * 
+	 * @param resource
+	 * @return String character set found from META tags in the HTML
+	 * @throws IOException
+	 */
+	protected String getCharsetFromMeta(Resource resource) throws IOException {
+		String charsetName = null;
+
+		byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
+		resource.mark(MAX_CHARSET_READAHEAD);
+		resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
+		resource.reset();
+		// convert to UTF-8 String -- which hopefully will not mess up the
+		// characters we're interested in...
+		StringBuilder sb = new StringBuilder(new String(bbuffer,"UTF-8"));
+		String metaContentType = TagMagix.getTagAttrWhere(sb, "META",
+				"content", "http-equiv", "Content-Type");
+		if(metaContentType != null) {
+			charsetName = contentTypeToCharset(metaContentType);
+		}
+		return charsetName;
+	}
+	
+	/**
+	 * Attempts to figure out the character set of the document using
+	 * the excellent juniversalchardet library.
+	 * 
+	 * @param resource
+	 * @return String character encoding found, or null if nothing looked good.
+	 * @throws IOException
+	 */
+	protected String getCharsetFromBytes(Resource resource) throws IOException {
+		String charsetName = null;
+
+		byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
+		   // (1)
+	    UniversalDetector detector = new UniversalDetector(null);
+
+	    // (2)
+		resource.mark(MAX_CHARSET_READAHEAD);
+		int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
+		resource.reset();
+		detector.handleData(bbuffer, 0, len);
+		// (3)
+		detector.dataEnd();
+	    // (4)
+	    charsetName = detector.getDetectedCharset();
+
+	    // (5)
+	    detector.reset();
+
+		return charsetName;
+	}
+
+	/**
+	 * Use META tags, byte-character-detection, HTTP headers, hope, and prayer
+	 * to figure out what character encoding is being used for the document.
+	 * If nothing else works, assumes UTF-8 for now.
+	 * 
+	 * @param resource
+	 * @return String charset for Resource
+	 * @throws IOException
+	 */
+	protected String guessCharset() throws IOException {
+		
+		String charSet = getCharsetFromMeta(resource);
+		if(charSet == null) {
+			charSet = getCharsetFromBytes(resource);
+			if(charSet == null) {
+				charSet = getCharsetFromHeaders(resource);
+				if(charSet == null) {
+					charSet = "UTF-8";
+				}
+			}
+		}
+		return charSet;
+	}
+
+	/**
+	 * Update URLs inside the page, so those URLs which must be correct at
+	 * page load time resolve correctly to absolute URLs.
+	 * 
+	 * This means ensuring there is a BASE HREF tag, adding one if missing,
+	 * and then resolving:
+	 *     FRAME-SRC, META-URL, LINK-HREF, SCRIPT-SRC
+	 * tag-attribute pairs against either the existing BASE-HREF, or the
+	 * page's absolute URL if it was missing.
+	 */
+	public void resolvePageUrls() {
+
+		// TODO: get url from Resource instead of SearchResult?
+		String pageUrl = result.getOriginalUrl();
+		String captureDate = result.getCaptureTimestamp();
+
+		String existingBaseHref = TagMagix.getBaseHref(sb);
+		if (existingBaseHref == null) {
+			insertAtStartOfHead("<base href=\"" + pageUrl + "\" />");
+		} else {
+			pageUrl = existingBaseHref;
+		}
+
+		String markups[][] = {
+				{"FRAME","SRC"},
+				{"META","URL"},
+				{"LINK","HREF"},
+				{"SCRIPT","SRC"},
+				{TagMagix.ANY_TAGNAME,"background"}
+		};
+		// TODO: The classic WM added a js_ to the datespec, so NotInArchives
+		// can return an valid javascript doc, and not cause Javascript errors.
+		for(String tagAttr[] : markups) {
+			TagMagix.markupTagREURIC(sb, uriConverter, captureDate, pageUrl,
+					tagAttr[0], tagAttr[1]);
+		}
+		TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl);
+		TagMagix.markupStyleUrls(sb,uriConverter,captureDate,pageUrl);
+	}
+	
+	/**
+	 * Update all URLs inside the page, so they resolve correctly to absolute 
+	 * URLs within the Wayback service.
+	 */
+	public void resolveAllPageUrls() {
+
+		// TODO: get url from Resource instead of SearchResult?
+		String pageUrl = result.getOriginalUrl();
+		String captureDate = result.getCaptureTimestamp();
+
+		String existingBaseHref = TagMagix.getBaseHref(sb);
+		if (existingBaseHref != null) {
+			pageUrl = existingBaseHref;
+		}
+		ResultURIConverter ruc = new SpecialResultURIConverter(uriConverter);
+		
+		// TODO: forms...?
+		String markups[][] = {
+				{"FRAME","SRC"},
+				{"META","URL"},
+				{"LINK","HREF"},
+				{"SCRIPT","SRC"},
+				{"IMG","SRC"},
+				{"A","HREF"},
+				{"AREA","HREF"},
+				{"OBJECT","CODEBASE"},
+				{"OBJECT","CDATA"},
+				{"APPLET","CODEBASE"},
+				{"APPLET","ARCHIVE"},
+				{"EMBED","SRC"},
+				{"IFRAME","SRC"},
+				{TagMagix.ANY_TAGNAME,"background"}
+		};
+		for(String tagAttr[] : markups) {
+			TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl,
+					tagAttr[0], tagAttr[1]);
+		}
+		TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl);
+		TagMagix.markupStyleUrls(sb,uriConverter,captureDate,pageUrl);
+	}
+	
+	public void resolveCSSUrls() {
+		// TODO: get url from Resource instead of SearchResult?
+		String pageUrl = result.getOriginalUrl();
+		String captureDate = result.getCaptureTimestamp();
+		TagMagix.markupCSSImports(sb,uriConverter, captureDate, pageUrl);
+	}
+
+	public void resolveASXRefUrls() {
+
+		// TODO: get url from Resource instead of SearchResult?
+		String pageUrl = result.getOriginalUrl();
+		String captureDate = result.getCaptureTimestamp();
+		ResultURIConverter ruc = new MMSToHTTPResultURIConverter(uriConverter);
+		
+		TagMagix.markupTagREURIC(sb, ruc, captureDate, pageUrl,
+				"REF", "HREF");
+	}
+	
+	public void stripHTML() {
+		String stripped = sb.toString().replaceAll("\\<.*?>","");
+		sb.setLength(0);
+		sb.append(stripped);
+	}
+	/**
+	 * @param charSet
+	 * @throws IOException 
+	 */
+	public void readFully(String charSet) throws IOException {
+		if(charSet == null) {
+			charSet = guessCharset();
+		}
+		this.charSet = charSet;
+		int recordLength = (int) resource.getRecordLength();
+
+		// convert bytes to characters for charset:
+		InputStreamReader isr = new InputStreamReader(resource, charSet);
+
+		char[] cbuffer = new char[C_BUFFER_SIZE];
+
+		// slurp the whole thing into RAM:
+		sb = new StringBuilder(recordLength);
+		for (int r = -1; (r = isr.read(cbuffer, 0, C_BUFFER_SIZE)) != -1;) {
+			sb.append(cbuffer, 0, r);
+		}
+	}
+		
+	/**
+	 * Read bytes from input stream, using best-guess for character encoding
+	 * @throws IOException 
+	 */
+	public void readFully() throws IOException {
+		readFully(null);
+	}
+	
+	/**
+	 * @return raw bytes contained in internal StringBuilder
+	 * @throws UnsupportedEncodingException
+	 */
+	public byte[] getBytes() throws UnsupportedEncodingException {
+		if(sb == null) {
+			throw new IllegalStateException("No interal StringBuffer");
+		}
+		if(resultBytes == null) {
+			resultBytes = sb.toString().getBytes(charSet);
+		}
+		return resultBytes;
+	}
+	
+	/**
+	 * Write the contents of the page to the client.
+	 * 
+	 * @param os
+	 * @throws IOException
+	 */
+	public void writeToOutputStream(OutputStream os) throws IOException {
+		if(sb == null) {
+			throw new IllegalStateException("No interal StringBuffer");
+		}
+		byte[] b;
+		try {
+			b = getBytes();
+		} catch (UnsupportedEncodingException e) {
+			throw new RuntimeException(e);
+		}
+		os.write(b);
+	}
+
+	/**
+	 * @param toInsert
+	 */	
+	public void insertAtStartOfHead(String toInsert) {
+		int insertPoint = TagMagix.getEndOfFirstTag(sb,"head");
+		if (-1 == insertPoint) {
+			insertPoint = 0;
+		}
+		sb.insert(insertPoint,toInsert);
+	}
+
+	/**
+	 * @param toInsert
+	 */
+	public void insertAtEndOfBody(String toInsert) {
+		int insertPoint = sb.lastIndexOf("</body>");
+		if (-1 == insertPoint) {
+			insertPoint = sb.lastIndexOf("</BODY>");
+		}
+		if (-1 == insertPoint) {
+			insertPoint = sb.length();
+		}
+		sb.insert(insertPoint,toInsert);
+	}
+	/**
+	 * @param toInsert
+	 */
+	public void insertAtStartOfBody(String toInsert) {
+		int insertPoint = TagMagix.getEndOfFirstTag(sb,"body");
+		if (-1 == insertPoint) {
+			insertPoint = 0;
+		}
+		sb.insert(insertPoint,toInsert);
+	}	
+	/**
+	 * @param jspPath
+	 * @param httpRequest
+	 * @param httpResponse
+	 * @param wbRequest
+	 * @param results
+	 * @return
+	 * @throws IOException 
+	 * @throws ServletException 
+	 * @throws ParseException 
+	 */
+	public String includeJspString(String jspPath, 
+			HttpServletRequest httpRequest, HttpServletResponse httpResponse,
+			WaybackRequest wbRequest, CaptureSearchResults results, 
+			CaptureSearchResult result, Resource resource) 
+	throws ServletException, IOException {
+		
+		UIResults uiResults = new UIResults(wbRequest,uriConverter,results,
+				result,resource);
+
+		StringHttpServletResponseWrapper wrappedResponse = 
+			new StringHttpServletResponseWrapper(httpResponse);
+		uiResults.forward(httpRequest, wrappedResponse, jspPath);
+//		uiResults.storeInRequest(httpRequest,jspPath);
+//		RequestDispatcher dispatcher = httpRequest.getRequestDispatcher(jspPath);
+//		dispatcher.forward(httpRequest, wrappedResponse);
+		return wrappedResponse.getStringResponse();
+	}
+	
+	/**
+	 * @param jsUrl
+	 * @return
+	 */
+	public String getJSIncludeString(final String jsUrl) {
+		return "<script type=\"text/javascript\" src=\"" 
+			+ jsUrl + "\" ></script>\n";
+	}
+
+	/**
+	 * @return the charSet
+	 */
+	public String getCharSet() {
+		return charSet;
+	}
+
+	/**
+	 * @param charSet the charSet to set
+	 */
+	public void setCharSet(String charSet) {
+		this.charSet = charSet;
+	}
+
+	private class SpecialResultURIConverter implements ResultURIConverter {
+		private static final String EMAIL_PROTOCOL_PREFIX = "mailto:";
+		private static final String JAVASCRIPT_PROTOCOL_PREFIX = "javascript:";
+		private ResultURIConverter base = null;
+		public SpecialResultURIConverter(ResultURIConverter base) {
+			this.base = base;
+		}
+		public String makeReplayURI(String datespec, String url) {
+			if(url.startsWith(EMAIL_PROTOCOL_PREFIX)) {
+				return url;
+			}
+			if(url.startsWith(JAVASCRIPT_PROTOCOL_PREFIX)) {
+				return url;
+			}
+			return base.makeReplayURI(datespec, url);
+		}
+	}
+
+	private class MMSToHTTPResultURIConverter implements ResultURIConverter {
+		private static final String MMS_PROTOCOL_PREFIX = "mms://";
+		private static final String HTTP_PROTOCOL_PREFIX = "http://";
+		private ResultURIConverter base = null;
+		public MMSToHTTPResultURIConverter(ResultURIConverter base) {
+			this.base = base;
+		}
+		public String makeReplayURI(String datespec, String url) {
+			if(url.startsWith(MMS_PROTOCOL_PREFIX)) {
+				url = HTTP_PROTOCOL_PREFIX + 
+					url.substring(MMS_PROTOCOL_PREFIX.length());
+			}
+			return base.makeReplayURI(datespec, url);
+		}
+	}	
+}

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java	2008-07-22 02:02:01 UTC (rev 2476)
@@ -0,0 +1,119 @@
+/* TextReplayRenderer
+ *
+ * $Id$
+ *
+ * Created on 1:07:28 PM Jul 15, 2008.
+ *
+ * Copyright (C) 2008 Internet Archive.
+ *
+ * This file is part of wayback.
+ *
+ * wayback is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * any later version.
+ *
+ * wayback is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser Public License
+ * along with wayback; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+package org.archive.wayback.replay;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+import org.archive.wayback.ReplayRenderer;
+import org.archive.wayback.ResultURIConverter;
+import org.archive.wayback.core.CaptureSearchResult;
+import org.archive.wayback.core.CaptureSearchResults;
+import org.archive.wayback.core.Resource;
+import org.archive.wayback.core.WaybackRequest;
+import org.archive.wayback.exception.BadContentException;
+
+/**
+ * Abstract ReplayRenderer base class: reads a Resource into a TextDocument,
+ * lets subclasses alter it via updatePage(), then sends it to the client.
+ * @author brad
+ * @version $Date$, $Revision$
+ */
+public abstract class TextReplayRenderer implements ReplayRenderer {
+
+	public final static String HTTP_LENGTH_HEADER = "Content-Length"; // header key overwritten in renderResource()
+	public final static String HTTP_LENGTH_HEADER_UP = 
+		HTTP_LENGTH_HEADER.toUpperCase(); // upper-cased copy; not referenced elsewhere in this class
+
+	private List<String> jspInserts = null; // only stored and exposed via the accessors below; not read in this class
+	private HttpHeaderProcessor httpHeaderProcessor; // handed to HttpHeaderOperation.processHeaders()
+
+	public TextReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) {
+		this.httpHeaderProcessor = httpHeaderProcessor;
+	}
+	// Subclass hook: called by renderResource() after page.readFully() and before the length header is computed.
+	protected abstract void updatePage(TextDocument page, 
+			HttpServletRequest httpRequest,
+			HttpServletResponse httpResponse, WaybackRequest wbRequest,
+			CaptureSearchResult result, Resource resource,
+			ResultURIConverter uriConverter, CaptureSearchResults results)
+		throws ServletException, IOException;
+
+	/* (non-Javadoc)
+	 * @see org.archive.wayback.ReplayRenderer#renderResource(javax.servlet.http.HttpServletRequest, javax.servlet.http.HttpServletResponse, org.archive.wayback.core.WaybackRequest, org.archive.wayback.core.CaptureSearchResult, org.archive.wayback.core.Resource, org.archive.wayback.ResultURIConverter, org.archive.wayback.core.CaptureSearchResults)
+	 */
+	public void renderResource(HttpServletRequest httpRequest,
+			HttpServletResponse httpResponse, WaybackRequest wbRequest,
+			CaptureSearchResult result, Resource resource,
+			ResultURIConverter uriConverter, CaptureSearchResults results)
+			throws ServletException, IOException, BadContentException {
+
+		HttpHeaderOperation.copyHTTPMessageHeader(resource, httpResponse);
+
+		Map<String,String> headers = HttpHeaderOperation.processHeaders(
+				resource, result, uriConverter, httpHeaderProcessor);
+	
+		// Load content into a TextDocument, and resolve load-time URLs:
+		TextDocument page = new TextDocument(resource,result,uriConverter);
+		page.readFully();
+		// let the subclass modify the document before it is measured and sent:
+		updatePage(page,httpRequest,httpResponse,wbRequest,result,resource,
+				uriConverter,results);
+
+		// set the corrected length (byte count after updatePage() has run):
+		int bytes = page.getBytes().length;
+		headers.put(HTTP_LENGTH_HEADER, String.valueOf(bytes)); // NOTE(review): if processHeaders() kept the original header under different casing, two length headers may be sent - confirm
+		// Tomcat will always send a charset... It's trying to be smarter than
+		// we are. If the original page didn't include a "charset" as part of
+		// the "Content-Type" HTTP header, then Tomcat will use the default..
+		// who knows what that is, or what that will do to the page..
+		// let's try explicitly setting it to what we used:
+		httpResponse.setCharacterEncoding(page.getCharSet());
+
+		// send back the headers:
+		HttpHeaderOperation.sendHeaders(headers, httpResponse);
+
+		page.writeToOutputStream(httpResponse.getOutputStream());
+	}
+
+	/**
+	 * @return the jspInserts list as last set (null until setJspInserts is called)
+	 */
+	public List<String> getJspInserts() {
+		return jspInserts;
+	}
+
+	/**
+	 * @param jspInserts the list to store; the reference is kept as-is, not copied
+	 */
+	public void setJspInserts(List<String> jspInserts) {
+		this.jspInserts = jspInserts;
+	}
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Archive-access-cvs] SF.net SVN: archive-access:[2476] trunk/archive-access/projects/wayback/ wayb

[Archive-access-cvs] SF.net SVN: archive-access:[2476] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/replay