From: <bra...@us...> - 2009-10-22 23:35:11
|
Revision: 2812 http://archive-access.svn.sourceforge.net/archive-access/?rev=2812&view=rev Author: bradtofel Date: 2009-10-22 23:34:57 +0000 (Thu, 22 Oct 2009) Log Message: ----------- REFACTOR: moved all character encoding detection into CharsetDetector interface. Two initial implementations, one which implements the previous behavior, and another which allows a user to "rotate" through different detection strategies. Modified Paths: -------------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java Added Paths: ----------- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/RotatingCharsetDetector.java trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/StandardCharsetDetector.java Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java 2009-10-19 22:55:27 UTC (rev 2811) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextDocument.java 2009-10-22 23:34:57 UTC (rev 2812) @@ -28,11 +28,7 @@ import java.io.InputStreamReader; import java.io.OutputStream; import java.io.UnsupportedEncodingException; -import java.nio.charset.Charset; -import java.nio.charset.IllegalCharsetNameException; import java.text.ParseException; -import java.util.Iterator; -import java.util.Map; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; @@ -44,7 +40,6 @@ import org.archive.wayback.core.CaptureSearchResults; import org.archive.wayback.core.UIResults; import org.archive.wayback.core.WaybackRequest; -import org.mozilla.universalchardet.UniversalDetector; /** * Class which wraps functionality for converting a Resource(InputStream + @@ -56,13 +51,6 @@ * @version $Date$, $Revision$ */ public class TextDocument { - - // hand off this many bytes to the chardet library - private final static int MAX_CHARSET_READAHEAD = 65536; - // ...if it also includes "charset=" - private final static String CHARSET_TOKEN = "charset="; - // ...and if the chardet library fails, use the Content-Type header - private final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type"; // if documents are marked up before sending to clients, the data is // decoded into a String in chunks. This is how big a chunk to decode with. private final static int C_BUFFER_SIZE = 4096; @@ -89,155 +77,8 @@ this.uriConverter = uriConverter; } - private boolean isCharsetSupported(String charsetName) { - // can you believe that this throws a runtime? Just asking if it's - // supported!!?! They coulda just said "no"... - if(charsetName == null) { - return false; - } - try { - return Charset.isSupported(charsetName); - } catch(IllegalCharsetNameException e) { - return false; - } - } - - private String contentTypeToCharset(final String contentType) { - int offset = - contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase()); - - if (offset != -1) { - String cs = contentType.substring(offset + CHARSET_TOKEN.length()); - if(isCharsetSupported(cs)) { - return cs; - } - // test for extra spaces... there's at least one page out there that - // indicates it's charset with: -// <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1"> - - // bad web page! - if(isCharsetSupported(cs.replace(" ", ""))) { - return cs.replace(" ", ""); - } - } - return null; - } - /** - * Attempt to divine the character encoding of the document from the - * Content-Type HTTP header (with a "charset=") - * - * @param resource - * @return String character set found or null if the header was not present - * @throws IOException - */ - protected String getCharsetFromHeaders(Resource resource) - throws IOException { - - String charsetName = null; - - Map<String,String> httpHeaders = resource.getHttpHeaders(); - Iterator<String> keys = httpHeaders.keySet().iterator(); - String ctype = null; - while(keys.hasNext()) { - String headerKey = keys.next(); - String keyCmp = headerKey.toUpperCase().trim(); - if(keyCmp.equals(HTTP_CONTENT_TYPE_HEADER.toUpperCase())) { - ctype = httpHeaders.get(headerKey); - break; - } - } - if (ctype != null) { - charsetName = contentTypeToCharset(ctype); - } - return charsetName; - } - - /** - * Attempt to find a META tag in the HTML that hints at the character set - * used to write the document. - * - * @param resource - * @return String character set found from META tags in the HTML - * @throws IOException - */ - protected String getCharsetFromMeta(Resource resource) throws IOException { - String charsetName = null; - - byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD]; - resource.mark(MAX_CHARSET_READAHEAD); - resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD); - resource.reset(); - // convert to UTF-8 String -- which hopefully will not mess up the - // characters we're interested in... - StringBuilder sb = new StringBuilder(new String(bbuffer,"UTF-8")); - String metaContentType = TagMagix.getTagAttrWhere(sb, "META", - "content", "http-equiv", "Content-Type"); - if(metaContentType != null) { - charsetName = contentTypeToCharset(metaContentType); - } - return charsetName; - } - - /** - * Attempts to figure out the character set of the document using - * the excellent juniversalchardet library. - * - * @param resource - * @return String character encoding found, or null if nothing looked good. - * @throws IOException - */ - protected String getCharsetFromBytes(Resource resource) throws IOException { - String charsetName = null; - - byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD]; - // (1) - UniversalDetector detector = new UniversalDetector(null); - - // (2) - resource.mark(MAX_CHARSET_READAHEAD); - int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD); - resource.reset(); - detector.handleData(bbuffer, 0, len); - // (3) - detector.dataEnd(); - // (4) - charsetName = detector.getDetectedCharset(); - - // (5) - detector.reset(); - if(isCharsetSupported(charsetName)) { - return charsetName; - } - return null; - } - - /** - * Use META tags, byte-character-detection, HTTP headers, hope, and prayer - * to figure out what character encoding is being used for the document. - * If nothing else works, assumes UTF-8 for now. - * - * @param resource - * @return String charset for Resource - * @throws IOException - */ - protected String guessCharset() throws IOException { - - String charSet = getCharsetFromHeaders(resource); - if(charSet == null) { - charSet = getCharsetFromBytes(resource); - if(charSet == null) { - charSet = getCharsetFromMeta(resource); - if(charSet == null) { - charSet = "UTF-8"; - } - } - } - return charSet; - } - - /** * Update URLs inside the page, so those URLs which must be correct at * page load time resolve correctly to absolute URLs. * @@ -346,9 +187,6 @@ * @throws IOException */ public void readFully(String charSet) throws IOException { - if(charSet == null) { - charSet = guessCharset(); - } this.charSet = charSet; int recordLength = (int) resource.getRecordLength(); Modified: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java 2009-10-19 22:55:27 UTC (rev 2811) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/TextReplayRenderer.java 2009-10-22 23:34:57 UTC (rev 2812) @@ -39,6 +39,8 @@ import org.archive.wayback.core.Resource; import org.archive.wayback.core.WaybackRequest; import org.archive.wayback.exception.BadContentException; +import org.archive.wayback.replay.charset.CharsetDetector; +import org.archive.wayback.replay.charset.StandardCharsetDetector; /** * @@ -48,12 +50,9 @@ */ public abstract class TextReplayRenderer implements ReplayRenderer { - public final static String HTTP_LENGTH_HEADER = "Content-Length"; - public final static String HTTP_LENGTH_HEADER_UP = - HTTP_LENGTH_HEADER.toUpperCase(); - private List<String> jspInserts = null; private HttpHeaderProcessor httpHeaderProcessor; + private CharsetDetector charsetDetector = new StandardCharsetDetector(); public TextReplayRenderer(HttpHeaderProcessor httpHeaderProcessor) { this.httpHeaderProcessor = httpHeaderProcessor; @@ -80,16 +79,17 @@ Map<String,String> headers = HttpHeaderOperation.processHeaders( resource, result, uriConverter, httpHeaderProcessor); + String charSet = charsetDetector.getCharset(resource, wbRequest); // Load content into an HTML page, and resolve load-time URLs: TextDocument page = new TextDocument(resource,result,uriConverter); - page.readFully(); + page.readFully(charSet); updatePage(page,httpRequest,httpResponse,wbRequest,result,resource, uriConverter,results); // set the corrected length: int bytes = page.getBytes().length; - headers.put(HTTP_LENGTH_HEADER, String.valueOf(bytes)); + headers.put(HttpHeaderOperation.HTTP_LENGTH_HEADER, String.valueOf(bytes)); // Tomcat will always send a charset... It's trying to be smarter than // we are. If the original page didn't include a "charset" as part of // the "Content-Type" HTTP header, then Tomcat will use the default.. @@ -117,4 +117,18 @@ public void setJspInserts(List<String> jspInserts) { this.jspInserts = jspInserts; } + + /** + * @return the charsetDetector + */ + public CharsetDetector getCharsetDetector() { + return charsetDetector; + } + + /** + * @param charsetDetector the charsetDetector to set + */ + public void setCharsetDetector(CharsetDetector charsetDetector) { + this.charsetDetector = charsetDetector; + } } Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/CharsetDetector.java 2009-10-22 23:34:57 UTC (rev 2812) @@ -0,0 +1,148 @@ +package org.archive.wayback.replay.charset; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; +import java.util.Iterator; +import java.util.Map; + +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; +import org.archive.wayback.replay.TagMagix; +import org.mozilla.universalchardet.UniversalDetector; + +public abstract class CharsetDetector { + // hand off this many bytes to the chardet library + protected final static int MAX_CHARSET_READAHEAD = 65536; + // ...if it also includes "charset=" + protected final static String CHARSET_TOKEN = "charset="; + // ...and if the chardet library fails, use the Content-Type header + protected final static String HTTP_CONTENT_TYPE_HEADER = "Content-Type"; + public final static String DEFAULT_CHARSET = "UTF-8"; + + protected boolean isCharsetSupported(String charsetName) { + // can you believe that this throws a runtime? Just asking if it's + // supported!!?! They coulda just said "no"... + if(charsetName == null) { + return false; + } + try { + return Charset.isSupported(charsetName); + } catch(IllegalCharsetNameException e) { + return false; + } + } + + protected String contentTypeToCharset(final String contentType) { + int offset = + contentType.toUpperCase().indexOf(CHARSET_TOKEN.toUpperCase()); + + if (offset != -1) { + String cs = contentType.substring(offset + CHARSET_TOKEN.length()); + if(isCharsetSupported(cs)) { + return cs; + } + // test for extra spaces... there's at least one page out there that + // indicates it's charset with: + +// <meta http-equiv="Content-type" content="text/html; charset=i so-8859-1"> + + // bad web page! + if(isCharsetSupported(cs.replace(" ", ""))) { + return cs.replace(" ", ""); + } + } + return null; + } + + /** + * Attempt to divine the character encoding of the document from the + * Content-Type HTTP header (with a "charset=") + * + * @param resource + * @return String character set found or null if the header was not present + * @throws IOException + */ + protected String getCharsetFromHeaders(Resource resource) + throws IOException { + + String charsetName = null; + + Map<String,String> httpHeaders = resource.getHttpHeaders(); + Iterator<String> keys = httpHeaders.keySet().iterator(); + String ctype = null; + while(keys.hasNext()) { + String headerKey = keys.next(); + String keyCmp = headerKey.toUpperCase().trim(); + if(keyCmp.equals(HTTP_CONTENT_TYPE_HEADER.toUpperCase())) { + ctype = httpHeaders.get(headerKey); + break; + } + } + if (ctype != null) { + charsetName = contentTypeToCharset(ctype); + } + return charsetName; + } + + /** + * Attempt to find a META tag in the HTML that hints at the character set + * used to write the document. + * + * @param resource + * @return String character set found from META tags in the HTML + * @throws IOException + */ + protected String getCharsetFromMeta(Resource resource) throws IOException { + String charsetName = null; + + byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD]; + resource.mark(MAX_CHARSET_READAHEAD); + resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD); + resource.reset(); + // convert to UTF-8 String -- which hopefully will not mess up the + // characters we're interested in... + StringBuilder sb = new StringBuilder(new String(bbuffer,DEFAULT_CHARSET)); + String metaContentType = TagMagix.getTagAttrWhere(sb, "META", + "content", "http-equiv", "Content-Type"); + if(metaContentType != null) { + charsetName = contentTypeToCharset(metaContentType); + } + return charsetName; + } + + /** + * Attempts to figure out the character set of the document using + * the excellent juniversalchardet library. + * + * @param resource + * @return String character encoding found, or null if nothing looked good. + * @throws IOException + */ + protected String getCharsetFromBytes(Resource resource) throws IOException { + String charsetName = null; + + byte[] bbuffer = new byte[MAX_CHARSET_READAHEAD]; + // (1) + UniversalDetector detector = new UniversalDetector(null); + + // (2) + resource.mark(MAX_CHARSET_READAHEAD); + int len = resource.read(bbuffer, 0, MAX_CHARSET_READAHEAD); + resource.reset(); + detector.handleData(bbuffer, 0, len); + // (3) + detector.dataEnd(); + // (4) + charsetName = detector.getDetectedCharset(); + + // (5) + detector.reset(); + if(isCharsetSupported(charsetName)) { + return charsetName; + } + return null; + } + public abstract String getCharset(Resource resource, WaybackRequest request) + throws IOException; +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/RotatingCharsetDetector.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/RotatingCharsetDetector.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/RotatingCharsetDetector.java 2009-10-22 23:34:57 UTC (rev 2812) @@ -0,0 +1,63 @@ +package org.archive.wayback.replay.charset; + +import java.io.IOException; + +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; + +/** + * @author brad + * + * Provides a way to rotate through several detection schemes + */ +public class RotatingCharsetDetector extends CharsetDetector { + public final static int MODES[][] = { + {0,1,2}, + {0,2,1}, + {1,0,2}, + {1,2,0}, + {2,1,0}, + {2,0,1} + }; + public final static int MODE_COUNT = 6; + public final static int GUESS_TYPES = 3; + + public int nextMode(int curMode) { + if(curMode >= MODE_COUNT - 1) { + return 0; + } + return curMode + 1; + } + public String getCharsetType(Resource resource, int type) throws IOException { + if(type == 0) { + return getCharsetFromHeaders(resource); + } else if(type == 1) { + return getCharsetFromMeta(resource); + } else if(type == 2) { + return getCharsetFromBytes(resource); + } + return null; + } + public String getCharset(Resource resource, int mode) throws IOException { + String charset = null; + if(mode >= MODE_COUNT) { + mode = 0; + } + for(int type = 0; type < GUESS_TYPES; type++) { + charset = getCharsetType(resource,MODES[mode][type]); + if(charset != null) { + break; + } + } + if(charset == null) { + charset = DEFAULT_CHARSET; + } + return charset; + } + @Override + public String getCharset(Resource resource, WaybackRequest request) + throws IOException { + int mode = request.getCharsetMode(); + return getCharset(resource,mode); + } +} Added: trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/StandardCharsetDetector.java =================================================================== --- trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/StandardCharsetDetector.java (rev 0) +++ trunk/archive-access/projects/wayback/wayback-core/src/main/java/org/archive/wayback/replay/charset/StandardCharsetDetector.java 2009-10-22 23:34:57 UTC (rev 2812) @@ -0,0 +1,25 @@ +package org.archive.wayback.replay.charset; + +import java.io.IOException; + +import org.archive.wayback.core.Resource; +import org.archive.wayback.core.WaybackRequest; + +public class StandardCharsetDetector extends CharsetDetector { + + @Override + public String getCharset(Resource resource, WaybackRequest request) + throws IOException { + String charSet = getCharsetFromHeaders(resource); + if(charSet == null) { + charSet = getCharsetFromMeta(resource); + if(charSet == null) { + charSet = getCharsetFromBytes(resource); + if(charSet == null) { + charSet = DEFAULT_CHARSET; + } + } + } + return charSet; + } +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |