[Archive-access-cvs] SF.net SVN: archive-access:[2812] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/replay

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 2812
          http://archive-access.svn.sourceforge.net/archive-access/?rev=2812&view=rev
Author:   bradtofel
Date:     2009-10-22 23:34:57 +0000 (Thu, 22 Oct 2009)

Log Message:
-----------
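REFACTOR: moved all character encoding detection into the abstract class CharsetDetector. There are two initial implementations: StandardCharsetDetector, which implements the previous behavior (except that META tags are now consulted before byte sniffing rather than after it), and RotatingCharsetDetector, which allows a user to "rotate" through different detection strategies.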

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java

Added Paths:
-----------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/RotatingCharsetDetector.java
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/StandardCharsetDetector.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java
===================================================================

--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java	2009-10-19 22:55:27 UTC (rev 2811)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java	2009-10-22 23:34:57 UTC (rev 2812)
@@ -28,11 +28,7 @@
 import java.io.InputStreamReader;
 import java.io.OutputStream;
 import java.io.UnsupportedEncodingException;
-import java.nio.charset.Charset;
-import java.nio.charset.IllegalCharsetNameException;
 import java.text.ParseException;
-import java.util.Iterator;
-import java.util.Map;
 
 import javax.servlet.ServletException;
 import javax.servlet.http.HttpServletRequest;
@@ -44,7 +40,6 @@
 import org.archive.wayback.core.CaptureSearchResults;
 import org.archive.wayback.core.UIResults;
 import org.archive.wayback.core.WaybackRequest;
-import org.mozilla.universalchardet.UniversalDetector;
 
 /**
  * Class which wraps functionality for converting a Resource(InputStream + 
@@ -56,13 +51,6 @@
  * @version $Date$, $Revision$
  */
 public class TextDocument {
-
-	// hand off this many bytes to the chardet library
-	private final static int MAX_CHARSET_READAHEAD = 65536;
-	// ...if it also includes "charset="
-	private final static String CHARSET_TOKEN = "charset=";
-	// ...and if the chardet library fails, use the Content-Type header
-	private final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type";
 	// if documents are marked up before sending to clients, the data is
 	// decoded into a String in chunks. This is how big a chunk to decode with.
 	private final static int C_BUFFER_SIZE = 4096;
@@ -89,155 +77,8 @@
 		this.uriConverter = uriConverter;
 	}
 
-	private boolean isCharsetSupported(String charsetName) {
-		// can you believe that this throws a runtime? Just asking if it's
-		// supported!!?! They coulda just said "no"...
-		if(charsetName == null) {
-			return false;
-		}
-		try {
-			return Charset.isSupported(charsetName);
-		} catch(IllegalCharsetNameException e) {
-			return false;
-		}
-	}
-	
-	private String contentTypeToCharset(final String contentType) {
-		int offset = 
-			contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase());
-		
-		if (offset != -1) {
-			String cs = contentType.substring(offset + CHARSET_TOKEN.length());
-			if(isCharsetSupported(cs)) {
-				return cs;
-			}
-			// test for extra spaces... there's at least one page out there that
-			// indicates it's charset with:
 
-//  <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1">
-
-			// bad web page!
-			if(isCharsetSupported(cs.replace(" ", ""))) {
-				return cs.replace(" ", "");
-			}
-		}
-		return null;
-	}
-	
 	/**
-	 * Attempt to divine the character encoding of the document from the 
-	 * Content-Type HTTP header (with a "charset=")
-	 * 
-	 * @param resource
-	 * @return String character set found or null if the header was not present
-	 * @throws IOException 
-	 */
-	protected String getCharsetFromHeaders(Resource resource) 
-	throws IOException {
-		
-		String charsetName = null;
-
-		Map<String,String> httpHeaders = resource.getHttpHeaders();
-		Iterator<String> keys = httpHeaders.keySet().iterator();
-		String ctype = null;
-		while(keys.hasNext()) {
-			String headerKey = keys.next();
-			String keyCmp = headerKey.toUpperCase().trim();
-			if(keyCmp.equals(HTTP_CONTENT_TYPE_HEADER.toUpperCase())) {
-				ctype = httpHeaders.get(headerKey);
-				break;
-			}
-		}
-		if (ctype != null) {
-			charsetName = contentTypeToCharset(ctype);
-		}
-		return charsetName;
-	}
-
-	/**
-	 * Attempt to find a META tag in the HTML that hints at the character set
-	 * used to write the document.
-	 * 
-	 * @param resource
-	 * @return String character set found from META tags in the HTML
-	 * @throws IOException
-	 */
-	protected String getCharsetFromMeta(Resource resource) throws IOException {
-		String charsetName = null;
-
-		byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
-		resource.mark(MAX_CHARSET_READAHEAD);
-		resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
-		resource.reset();
-		// convert to UTF-8 String -- which hopefully will not mess up the
-		// characters we're interested in...
-		StringBuilder sb = new StringBuilder(new String(bbuffer,"UTF-8"));
-		String metaContentType = TagMagix.getTagAttrWhere(sb, "META",
-				"content", "http-equiv", "Content-Type");
-		if(metaContentType != null) {
-			charsetName = contentTypeToCharset(metaContentType);
-		}
-		return charsetName;
-	}
-	
-	/**
-	 * Attempts to figure out the character set of the document using
-	 * the excellent juniversalchardet library.
-	 * 
-	 * @param resource
-	 * @return String character encoding found, or null if nothing looked good.
-	 * @throws IOException
-	 */
-	protected String getCharsetFromBytes(Resource resource) throws IOException {
-		String charsetName = null;
-
-		byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
-		   // (1)
-	    UniversalDetector detector = new UniversalDetector(null);
-
-	    // (2)
-		resource.mark(MAX_CHARSET_READAHEAD);
-		int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD);
-		resource.reset();
-		detector.handleData(bbuffer, 0, len);
-		// (3)
-		detector.dataEnd();
-	    // (4)
-	    charsetName = detector.getDetectedCharset();
-
-	    // (5)
-	    detector.reset();
-	    if(isCharsetSupported(charsetName)) {
-	    	return charsetName;
-	    }
-	    return null;
-	}
-
-	/**
-	 * Use META tags, byte-character-detection, HTTP headers, hope, and prayer
-	 * to figure out what character encoding is being used for the document.
-	 * If nothing else works, assumes UTF-8 for now.
-	 * 
-	 * @param resource
-	 * @return String charset for Resource
-	 * @throws IOException
-	 */
-	protected String guessCharset() throws IOException {
-		
-		String charSet = getCharsetFromHeaders(resource);
-		if(charSet == null) {
-			charSet = getCharsetFromBytes(resource);
-			if(charSet == null) {
-				charSet = getCharsetFromMeta(resource);
-				if(charSet == null) {
-					charSet = "UTF-8";
-				}
-			}
-		}
-		return charSet;
-	}
-
-	/**
 	 * Update URLs inside the page, so those URLs which must be correct at
 	 * page load time resolve correctly to absolute URLs.
 	 * 
@@ -346,9 +187,6 @@
 	 * @throws IOException 
 	 */
 	public void readFully(String charSet) throws IOException {
-		if(charSet == null) {
-			charSet = guessCharset();
-		}
 		this.charSet = charSet;
 		int recordLength = (int) resource.getRecordLength();
 

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java	2009-10-19 22:55:27 UTC (rev 2811)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java	2009-10-22 23:34:57 UTC (rev 2812)
@@ -39,6 +39,8 @@
 import org.archive.wayback.core.Resource;
 import org.archive.wayback.core.WaybackRequest;
 import org.archive.wayback.exception.BadContentException;
+import org.archive.wayback.replay.charset.CharsetDetector;
+import org.archive.wayback.replay.charset.StandardCharsetDetector;
 
 /**
  *
@@ -48,12 +50,9 @@
  */
 public abstract class TextReplayRenderer implements ReplayRenderer {
 
-	public final static String HTTP_LENGTH_HEADER = "Content-Length";
-	public final static String HTTP_LENGTH_HEADER_UP = 
-		HTTP_LENGTH_HEADER.toUpperCase();
-
 	private List<String> jspInserts = null;
 	private HttpHeaderProcessor httpHeaderProcessor;
+	private CharsetDetector charsetDetector = new StandardCharsetDetector();
 
 	public TextReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) {
 		this.httpHeaderProcessor = httpHeaderProcessor;
@@ -80,16 +79,17 @@
 		Map<String,String> headers = HttpHeaderOperation.processHeaders(
 				resource, result, uriConverter, httpHeaderProcessor);
 	
+		String charSet = charsetDetector.getCharset(resource, wbRequest);
 		// Load content into an HTML page, and resolve load-time URLs:
 		TextDocument page = new TextDocument(resource,result,uriConverter);
-		page.readFully();
+		page.readFully(charSet);
 		
 		updatePage(page,httpRequest,httpResponse,wbRequest,result,resource,
 				uriConverter,results);
 
 		// set the corrected length:
 		int bytes = page.getBytes().length;
-		headers.put(HTTP_LENGTH_HEADER, String.valueOf(bytes));
+		headers.put(HttpHeaderOperation.HTTP_LENGTH_HEADER, String.valueOf(bytes));
 		// Tomcat will always send a charset... It's trying to be smarter than
 		// we are. If the original page didn't include a "charset" as part of
 		// the "Content-Type" HTTP header, then Tomcat will use the default..
@@ -117,4 +117,18 @@
 	public void setJspInserts(List<String> jspInserts) {
 		this.jspInserts = jspInserts;
 	}
+
+	/**
+	 * @return detector used to pick the charset for decoding (a StandardCharsetDetector unless replaced)
+	 */
+	public CharsetDetector getCharsetDetector() {
+		return charsetDetector;
+	}
+
+	/**
+	 * @param charsetDetector replaces the current detector; stored as-is, with no null check
+	 */
+	public void setCharsetDetector(CharsetDetector charsetDetector) {
+		this.charsetDetector = charsetDetector;
+	}
 }

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java	2009-10-22 23:34:57 UTC (rev 2812)
@@ -0,0 +1,228 @@
+package org.archive.wayback.replay.charset;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.archive.wayback.core.Resource;
+import org.archive.wayback.core.WaybackRequest;
+import org.archive.wayback.replay.TagMagix;
+import org.mozilla.universalchardet.UniversalDetector;
+
+/**
+ * Base class for strategies that guess the character encoding of a Resource.
+ * It supplies three building blocks -- the HTTP Content-Type header, a META
+ * tag in the document, and byte sniffing -- which subclasses combine in
+ * getCharset().
+ */
+public abstract class CharsetDetector {
+	// hand off this many bytes to the chardet library
+	protected final static int MAX_CHARSET_READAHEAD = 65536;
+	// marks the start of the charset name inside a Content-Type value
+	protected final static String CHARSET_TOKEN = "charset=";
+	// name of the HTTP header that may carry a charset
+	protected final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type";
+	// encoding used to scan for META tags; subclasses also fall back to it
+	public final static String DEFAULT_CHARSET = "UTF-8";
+	
+	/**
+	 * @param charsetName name to test, may be null
+	 * @return true only if the name is non-null, legal and supported
+	 */
+	protected boolean isCharsetSupported(String charsetName) {
+		// can you believe that this throws a runtime? Just asking if it's
+		// supported!!?! They coulda just said "no"...
+		if(charsetName == null) {
+			return false;
+		}
+		try {
+			return Charset.isSupported(charsetName);
+		} catch(IllegalCharsetNameException e) {
+			return false;
+		}
+	}
+	
+	/**
+	 * Case-insensitive indexOf. Searching an upper-cased copy is not safe:
+	 * toUpperCase() may change the length of the String (e.g. "ß" becomes
+	 * "SS"), so an offset found in the copy need not fit the original.
+	 * 
+	 * @param haystack String to search
+	 * @param needle String to look for
+	 * @return offset of the first match in haystack, or -1 if there is none
+	 */
+	private static int indexOfIgnoreCase(String haystack, String needle) {
+		int last = haystack.length() - needle.length();
+		for(int i = 0; i <= last; i++) {
+			if(haystack.regionMatches(true, i, needle, 0, needle.length())) {
+				return i;
+			}
+		}
+		return -1;
+	}
+	
+	/**
+	 * Extract a supported charset name from a Content-Type style value such
+	 * as "text/html; charset=UTF-8".
+	 * 
+	 * @param contentType header or META content value, may be null
+	 * @return the charset name if one is present and supported, or null
+	 */
+	protected String contentTypeToCharset(final String contentType) {
+		if(contentType == null) {
+			return null;
+		}
+		int offset = indexOfIgnoreCase(contentType, CHARSET_TOKEN);
+		if(offset == -1) {
+			return null;
+		}
+		String cs = contentType.substring(offset + CHARSET_TOKEN.length());
+		// drop parameters that follow the charset: "utf-8; format=flowed"
+		int semi = cs.indexOf(';');
+		if(semi != -1) {
+			cs = cs.substring(0, semi);
+		}
+		// the value may be padded or quoted, as in: charset="utf-8"
+		cs = cs.trim();
+		if(cs.length() > 1 && (cs.charAt(0) == '"' || cs.charAt(0) == '\'')
+				&& cs.charAt(cs.length() - 1) == cs.charAt(0)) {
+			cs = cs.substring(1, cs.length() - 1).trim();
+		}
+		if(isCharsetSupported(cs)) {
+			return cs;
+		}
+		// test for extra spaces... there's at least one page out there that
+		// indicates its charset with:
+
+//  <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1">
+
+		// bad web page!
+		String squeezed = cs.replace(" ", "");
+		if(isCharsetSupported(squeezed)) {
+			return squeezed;
+		}
+		return null;
+	}
+	
+	/**
+	 * Attempt to divine the character encoding of the document from the 
+	 * Content-Type HTTP header (with a "charset=")
+	 * 
+	 * @param resource document whose recorded HTTP headers are inspected
+	 * @return String character set found or null if the header was not
+	 *         present or named no supported charset
+	 * @throws IOException 
+	 */
+	protected String getCharsetFromHeaders(Resource resource) 
+	throws IOException {
+		Map<String,String> httpHeaders = resource.getHttpHeaders();
+		if(httpHeaders == null) {
+			return null;
+		}
+		Iterator<String> keys = httpHeaders.keySet().iterator();
+		while(keys.hasNext()) {
+			String headerKey = keys.next();
+			// header names are case-insensitive; only the first match counts
+			if(headerKey != null && headerKey.trim().equalsIgnoreCase(
+					HTTP_CONTENT_TYPE_HEADER)) {
+				return contentTypeToCharset(httpHeaders.get(headerKey));
+			}
+		}
+		return null;
+	}
+
+	/**
+	 * Copy up to buffer.length bytes from the resource's current position
+	 * into buffer, then rewind with reset() so nothing is consumed.
+	 * 
+	 * @param resource document to peek into
+	 * @param buffer destination; its length is also used as the mark limit
+	 * @return number of bytes placed in buffer, 0 if none could be read
+	 * @throws IOException
+	 */
+	private int readAhead(Resource resource, byte[] buffer)
+	throws IOException {
+		resource.mark(buffer.length);
+		int total = 0;
+		// presumably read() follows the InputStream contract: it may return
+		// fewer bytes than requested, and -1 at end of stream
+		while(total < buffer.length) {
+			int got = resource.read(buffer, total, buffer.length - total);
+			if(got <= 0) {
+				break;
+			}
+			total += got;
+		}
+		resource.reset();
+		return total;
+	}
+
+	/**
+	 * Attempt to find a META tag in the HTML that hints at the character set
+	 * used to write the document.
+	 * 
+	 * @param resource document to scan; at most MAX_CHARSET_READAHEAD bytes
+	 *        are examined
+	 * @return String character set found from META tags in the HTML, or null
+	 * @throws IOException
+	 */
+	protected String getCharsetFromMeta(Resource resource) throws IOException {
+		byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
+		int len = readAhead(resource, bbuffer);
+		if(len == 0) {
+			return null;
+		}
+		// convert to UTF-8 String -- which hopefully will not mess up the
+		// characters we're interested in... only the bytes actually read are
+		// decoded, not the unused zero-filled tail of the buffer
+		StringBuilder sb = new StringBuilder(
+				new String(bbuffer, 0, len, DEFAULT_CHARSET));
+		String metaContentType = TagMagix.getTagAttrWhere(sb, "META",
+				"content", "http-equiv", "Content-Type");
+		if(metaContentType == null) {
+			return null;
+		}
+		return contentTypeToCharset(metaContentType);
+	}
+	
+	/**
+	 * Attempts to figure out the character set of the document using
+	 * the excellent juniversalchardet library.
+	 * 
+	 * @param resource document to sniff; at most MAX_CHARSET_READAHEAD bytes
+	 *        are examined
+	 * @return String character encoding found, or null if nothing looked good.
+	 * @throws IOException
+	 */
+	protected String getCharsetFromBytes(Resource resource) throws IOException {
+		byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD];
+		int len = readAhead(resource, bbuffer);
+		if(len == 0) {
+			// nothing to sniff; don't hand the detector an empty range
+			return null;
+		}
+		UniversalDetector detector = new UniversalDetector(null);
+		detector.handleData(bbuffer, 0, len);
+		detector.dataEnd();
+		String charsetName = detector.getDetectedCharset();
+		detector.reset();
+		// the detector may name an encoding this JVM cannot decode
+		if(isCharsetSupported(charsetName)) {
+			return charsetName;
+		}
+		return null;
+	}
+
+	/**
+	 * Decide which character encoding to decode the resource with.
+	 * 
+	 * @param resource document being replayed
+	 * @param request request that triggered the replay
+	 * @return name of the charset to use
+	 * @throws IOException
+	 */
+	public abstract String getCharset(Resource resource, WaybackRequest request)
+		throws IOException;
+}

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/RotatingCharsetDetector.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/RotatingCharsetDetector.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/RotatingCharsetDetector.java	2009-10-22 23:34:57 UTC (rev 2812)
@@ -0,0 +1,100 @@
+package org.archive.wayback.replay.charset;
+
+import java.io.IOException;
+
+import org.archive.wayback.core.Resource;
+import org.archive.wayback.core.WaybackRequest;
+
+/**
+ * Provides a way to rotate through several detection schemes.
+ *
+ * Three guess types exist: 0 = HTTP Content-Type header, 1 = META tag in the
+ * document, 2 = byte sniffing. A "mode" is an index into MODES; each row of
+ * MODES lists the guess types in the order they are tried, and the first
+ * guess that produces a charset wins.
+ *
+ * @author brad
+ */
+public class RotatingCharsetDetector extends CharsetDetector {
+	/** every ordering of the three guess types, indexed by mode */
+	public final static int MODES[][] = {
+		{0,1,2},
+		{0,2,1},
+		{1,0,2},
+		{1,2,0},
+		{2,1,0},
+		{2,0,1}
+	};
+	/** number of rows in MODES */
+	public final static int MODE_COUNT = 6;
+	/** number of guess types listed in each row of MODES */
+	public final static int GUESS_TYPES = 3;
+
+	/**
+	 * @param curMode mode currently in use
+	 * @return the mode after curMode, wrapping to 0 after the last one; 0 is
+	 *         also returned when curMode is not a valid mode
+	 */
+	public int nextMode(int curMode) {
+		if(curMode < 0 || curMode >= MODE_COUNT - 1) {
+			return 0;
+		}
+		return curMode + 1;
+	}
+
+	/**
+	 * Run a single detection strategy.
+	 *
+	 * @param resource document to inspect
+	 * @param type guess type: 0 for HTTP headers, 1 for META tag, 2 for bytes
+	 * @return charset found by that strategy, or null if it found nothing or
+	 *         type is not a known guess type
+	 * @throws IOException
+	 */
+	public String getCharsetType(Resource resource, int type) throws IOException {
+		switch(type) {
+		case 0:
+			return getCharsetFromHeaders(resource);
+		case 1:
+			return getCharsetFromMeta(resource);
+		case 2:
+			return getCharsetFromBytes(resource);
+		default:
+			return null;
+		}
+	}
+
+	/**
+	 * Try the strategies in the order given by MODES[mode] until one succeeds.
+	 *
+	 * @param resource document to inspect
+	 * @param mode index into MODES; anything outside 0..MODE_COUNT-1 is
+	 *        treated as mode 0
+	 * @return first charset found, or DEFAULT_CHARSET if every strategy failed
+	 * @throws IOException
+	 */
+	public String getCharset(Resource resource, int mode) throws IOException {
+		// guard both ends: a negative mode would otherwise index MODES and
+		// fail with an ArrayIndexOutOfBoundsException
+		if(mode < 0 || mode >= MODE_COUNT) {
+			mode = 0;
+		}
+		for(int type = 0; type < GUESS_TYPES; type++) {
+			String charset = getCharsetType(resource,MODES[mode][type]);
+			if(charset != null) {
+				return charset;
+			}
+		}
+		return DEFAULT_CHARSET;
+	}
+
+	/**
+	 * Detect using the charset mode carried by the request.
+	 */
+	@Override
+	public String getCharset(Resource resource, WaybackRequest request) 
+	throws IOException {
+		int mode = request.getCharsetMode();
+		return getCharset(resource,mode);
+	}
+}

Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/StandardCharsetDetector.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/StandardCharsetDetector.java	                        (rev 0)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/StandardCharsetDetector.java	2009-10-22 23:34:57 UTC (rev 2812)
@@ -0,0 +1,34 @@
+package org.archive.wayback.replay.charset;
+
+import java.io.IOException;
+
+import org.archive.wayback.core.Resource;
+import org.archive.wayback.core.WaybackRequest;
+
+/**
+ * Detector with a fixed order: HTTP headers first, then META tags, then byte
+ * sniffing, and finally DEFAULT_CHARSET when all three come up empty.
+ */
+public class StandardCharsetDetector extends CharsetDetector {
+
+	/**
+	 * @param resource document to inspect
+	 * @param request not consulted by this implementation
+	 * @return first charset found, or DEFAULT_CHARSET; never null
+	 * @throws IOException
+	 */
+	@Override
+	public String getCharset(Resource resource, WaybackRequest request)
+	throws IOException {
+		String fromHeaders = getCharsetFromHeaders(resource);
+		if(fromHeaders != null) {
+			return fromHeaders;
+		}
+		String fromMeta = getCharsetFromMeta(resource);
+		if(fromMeta != null) {
+			return fromMeta;
+		}
+		String fromBytes = getCharsetFromBytes(resource);
+		return (fromBytes != null) ? fromBytes : DEFAULT_CHARSET;
+	}
+}


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Archive-access-cvs] SF.net SVN: archive-access:[2812] trunk/archive-access/projects/wayback/ wayb

[Archive-access-cvs] SF.net SVN: archive-access:[2812] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/replay