Revision: 3145
http://archive-access.svn.sourceforge.net/archive-access/?rev=3145&view=rev
Author: bradtofel
Date: 2010-06-03 22:19:29 +0000 (Thu, 03 Jun 2010)
Log Message:
-----------
BUGFIX(ACC-97): now silently remaps iso-8859-1 to cp1252.
Modified Paths:
--------------
trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java
Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java
===================================================================
--- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java 2010-06-01 22:03:23 UTC (rev 3144)
+++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java 2010-06-03 22:19:29 UTC (rev 3145)
@@ -36,6 +36,13 @@
import org.archive.wayback.replay.TagMagix;
import org.mozilla.universalchardet.UniversalDetector;
+/**
+ * Abstract class containing common methods for determining the character
+ * encoding of a text Resource, most of which should be refactored into a
+ * Util package.
+ * @author brad
+ *
+ */
public abstract class CharsetDetector {
// hand off this many bytes to the chardet library
protected final static int MAX_CHARSET_READAHEAD = 65536;
@@ -43,6 +50,7 @@
protected final static String CHARSET_TOKEN = "charset=";
// ...and if the chardet library fails, use the Content-Type header
protected final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type";
+ /** the default charset name to use when giving up */
public final static String DEFAULT_CHARSET = "UTF-8";
protected boolean isCharsetSupported(String charsetName) {
@@ -57,7 +65,13 @@
return false;
}
}
-
+ /** Remaps names containing an ISO-8859-1 label to cp1252; any other name is returned unchanged. */
+ protected String mapCharset(String orig) {
+ // Locale.ROOT: default-locale lowercasing (e.g. Turkish dotless i) could miss "ISO-8859-1"
+ String lc = orig.toLowerCase(java.util.Locale.ROOT);
+ // the lookahead keeps distinct charsets such as ISO-8859-15 from matching as ISO-8859-1
+ return lc.matches("(?s).*iso-?8859-1(?![0-9]).*") ? "cp1252" : orig;
+ }
protected String contentTypeToCharset(final String contentType) {
int offset =
contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase());
@@ -65,7 +79,7 @@
if (offset != -1) {
String cs = contentType.substring(offset + CHARSET_TOKEN.length());
if(isCharsetSupported(cs)) {
- return cs;
+ return mapCharset(cs);
}
// test for extra spaces... there's at least one page out there that
// indicates it's charset with:
@@ -74,7 +88,7 @@
// bad web page!
if(isCharsetSupported(cs.replace(" ", ""))) {
- return cs.replace(" ", "");
+ return mapCharset(cs.replace(" ", ""));
}
}
return null;
@@ -168,6 +182,13 @@
}
return null;
}
+ /**
+ * Determines the name of the character encoding to use for a Resource.
+ * @param resource (presumably text) Resource whose charset is to be determined
+ * @param request WaybackRequest which may contain additional processing hints
+ * @return String charset name for the Resource
+ * @throws IOException if there are problems reading the Resource
+ */
public abstract String getCharset(Resource resource, WaybackRequest request)
throws IOException;
}
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|