[Archive-access-cvs] SF.net SVN: archive-access:[3145] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/replay/charset/ CharsetDetector.java

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 3145
          http://archive-access.svn.sourceforge.net/archive-access/?rev=3145&view=rev
Author:   bradtofel
Date:     2010-06-03 22:19:29 +0000 (Thu, 03 Jun 2010)

Log Message:
-----------
BUGFIX(ACC-97): now silently remaps iso-8859-1 to cp1252.

Modified Paths:
--------------
    trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java

Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java
===================================================================

--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java	2010-06-01 22:03:23 UTC (rev 3144)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java	2010-06-03 22:19:29 UTC (rev 3145)
@@ -36,6 +36,13 @@
 import org.archive.wayback.replay.TagMagix;
 import org.mozilla.universalchardet.UniversalDetector;
 
+/**
+ * Abstract class containing common methods for determining the character 
+ * encoding of a text Resource, most of which should be refactored into a
+ * Util package.
+ * @author brad
+ *
+ */
 public abstract class CharsetDetector {
 	// hand off this many bytes to the chardet library
 	protected final static int MAX_CHARSET_READAHEAD = 65536;
@@ -43,6 +50,7 @@
 	protected final static String CHARSET_TOKEN = "charset=";
 	// ...and if the chardet library fails, use the Content-Type header
 	protected final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type";
+	/** the default charset name to use when giving up */
 	public final static String DEFAULT_CHARSET = "UTF-8";
 	
 	protected boolean isCharsetSupported(String charsetName) {
@@ -57,7 +65,13 @@
 			return false;
 		}
 	}
-	
+	protected String mapCharset(String orig) {
+		String lc = orig.toLowerCase();
+		if(lc.contains("iso8859-1") || lc.contains("iso-8859-1")) {
+			return "cp1252";
+		}
+		return orig;
+	}
 	protected String contentTypeToCharset(final String contentType) {
 		int offset = 
 			contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase());
@@ -65,7 +79,7 @@
 		if (offset != -1) {
 			String cs = contentType.substring(offset + CHARSET_TOKEN.length());
 			if(isCharsetSupported(cs)) {
-				return cs;
+				return mapCharset(cs);
 			}
 			// test for extra spaces... there's at least one page out there that
 			// indicates it's charset with:
@@ -74,7 +88,7 @@
 
 			// bad web page!
 			if(isCharsetSupported(cs.replace(" ", ""))) {
-				return cs.replace(" ", "");
+				return mapCharset(cs.replace(" ", ""));
 			}
 		}
 		return null;
@@ -168,6 +182,13 @@
 	    }
 	    return null;
 	}
+	/**
+	 * @param resource (presumably text) Resource to determine the charset
+	 * @param request WaybackRequest which may contain additional hints to
+	 *        processing
+	 * @return String charset name for the Resource
+	 * @throws IOException if there are problems reading the Resource
+	 */
 	public abstract String getCharset(Resource resource, WaybackRequest request)
 		throws IOException;
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.




[Archive-access-cvs] SF.net SVN: archive-access:[3145] trunk/archive-access/projects/wayback/ wayb

[Archive-access-cvs] SF.net SVN: archive-access:[3145] trunk/archive-access/projects/wayback/ wayback-core/src/main/java/org/archive/wayback/replay/charset/ CharsetDetector.java