From: <jrb...@us...> - 2010-06-16 23:01:04
|
Revision: 1074 http://cishell.svn.sourceforge.net/cishell/?rev=1074&view=rev Author: jrbibers Date: 2010-06-16 23:00:57 +0000 (Wed, 16 Jun 2010) Log Message: ----------- Improved UTF-8 support by: * Tolerating byte-order marks in UTF-8 files, where they are unnecessary but harmless and standard-permissible. * Wrapping FileReaders throughout the converters inside UnicodeReader, a CIShell utility class. This detects the character encodings UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, and UTF-32BE according to any byte-order mark at the head of the file, defaulting to UTF-8 when unclear. The multiple existing UnicodeReaders are removed. * Specifying the preferred encoding UTF-8 on FileWriters throughout the converter set. * Adding a friendlier error message to the File > Load algorithm when an unrecognized encoding is presented. It suggests two common fixes. Reviewed by Russell. Modified Paths: -------------- trunk/clients/gui/org.cishell.reference.gui.persistence/src/org/cishell/reference/gui/persistence/load/FileLoadAlgorithm.java Added Paths: ----------- trunk/core/org.cishell.utilities/src/org/cishell/utilities/UnicodeReader.java Modified: trunk/clients/gui/org.cishell.reference.gui.persistence/src/org/cishell/reference/gui/persistence/load/FileLoadAlgorithm.java =================================================================== --- trunk/clients/gui/org.cishell.reference.gui.persistence/src/org/cishell/reference/gui/persistence/load/FileLoadAlgorithm.java 2010-06-12 01:41:40 UTC (rev 1073) +++ trunk/clients/gui/org.cishell.reference.gui.persistence/src/org/cishell/reference/gui/persistence/load/FileLoadAlgorithm.java 2010-06-16 23:00:57 UTC (rev 1074) @@ -1,6 +1,7 @@ package org.cishell.reference.gui.persistence.load; import java.io.File; +import java.io.UnsupportedEncodingException; import java.util.Dictionary; import org.cishell.framework.CIShellContext; @@ -88,30 +89,38 @@ return fileSelector.getFile(); } - private Data[] validateFile(IWorkbenchWindow window, Display display, File file) - throws AlgorithmExecutionException { + private Data[] validateFile(IWorkbenchWindow window, Display display, File file) { AlgorithmFactory validator = null; - boolean shouldTryValidator = true; - while (shouldTryValidator) { - try { - validator = getValidatorFromUser(window, display, file); + try { + validator = getValidatorFromUser(window, display, file); - if ((file == null) || (validator == null)) { - String logMessage = "File loading canceled"; - this.logger.log(LogService.LOG_WARNING, logMessage); - - shouldTryValidator = false; - } else { + if ((file == null) || (validator == null)) { + String logMessage = "File loading canceled"; + this.logger.log(LogService.LOG_WARNING, logMessage); + } else { + try { return FileValidator.validateFile( - file, validator, this.progressMonitor, this.ciShellContext, this.logger); + file, validator, this.progressMonitor, this.ciShellContext, this.logger); + } catch (AlgorithmExecutionException e) { + if (e.getCause() != null + && e.getCause() instanceof UnsupportedEncodingException) { + String logMessage = + "This file cannot be loaded; it uses the unsupported character encoding " + + e.getCause().getMessage() + "."; + this.logger.log(LogService.LOG_ERROR, logMessage); + } else { + throw e; + } } - } catch (Throwable e) { - String logMessage = - "The chosen file is not compatible with the chosen file. " + - "Please try a different format or cancel."; - this.logger.log(LogService.LOG_ERROR, logMessage); } + } catch (Throwable e) { + String logMessage = + "The chosen file is not compatible with this format. " + + "Check that your file is correctly formatted or try another validator. " + + "The reason is: " + e.getMessage(); + e.printStackTrace(); // TODO remove + this.logger.log(LogService.LOG_ERROR, logMessage); } return null; Added: trunk/core/org.cishell.utilities/src/org/cishell/utilities/UnicodeReader.java =================================================================== --- trunk/core/org.cishell.utilities/src/org/cishell/utilities/UnicodeReader.java (rev 0) +++ trunk/core/org.cishell.utilities/src/org/cishell/utilities/UnicodeReader.java 2010-06-16 23:00:57 UTC (rev 1074) @@ -0,0 +1,141 @@ +package org.cishell.utilities; + +/** + version: 1.1 / 2007-01-25 + - changed BOM recognition ordering (longer boms first) + + Original pseudocode : Thomas Weidenfeller + Implementation tweaked: Aki Nieminen + + http://www.unicode.org/unicode/faq/utf_bom.html + BOMs: + 00 00 FE FF = UTF-32, big-endian + FF FE 00 00 = UTF-32, little-endian + EF BB BF = UTF-8, + FE FF = UTF-16, big-endian + FF FE = UTF-16, little-endian + + Win2k Notepad: + Unicode format = UTF-16LE + */ + +import java.io.*; + +/** + * Generic unicode textreader, which will use BOM mark to identify the encoding + * to be used. If BOM is not found then use a given default or system encoding. + */ +public class UnicodeReader extends Reader { + public static final int BOM_SIZE = 4; + + private PushbackInputStream internalIn; + private InputStreamReader internalIn2 = null; + private String defaultEnc; + + /** + * @param in + * inputstream to be read + */ + public UnicodeReader(InputStream in) { + this(in, "UTF-8"); + } + + /** + * @param in + * inputstream to be read + * @param defaultEnc + * default encoding if stream does not have BOM marker. Give null + * to use system-level default. + */ + public UnicodeReader(InputStream in, String defaultEnc) { + internalIn = new PushbackInputStream(in, BOM_SIZE); + this.defaultEnc = defaultEnc; + } + + public String getDefaultEncoding() { + return defaultEnc; + } + + /** + * Get stream encoding or null if stream is uninitialized. Call init() or + * read() method to initialize it. + */ + public String getEncoding() { + if (internalIn2 == null) { + return null; + } + + return internalIn2.getEncoding(); + } + + /** + * Read-ahead four bytes and check for BOM marks. Extra bytes are unread + * back to the stream, only BOM bytes are skipped. + */ + protected void init() throws IOException { + if (internalIn2 != null) { + return; + } + + String encoding; + byte bom[] = new byte[BOM_SIZE]; + int n; + int unread; + n = internalIn.read(bom, 0, bom.length); + + if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE) + && (bom[3] == (byte) 0xFF)) { + encoding = "UTF-32BE"; + unread = n - 4; + System.out.println("encoding detected: " + encoding); + } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00) + && (bom[3] == (byte) 0x00)) { + encoding = "UTF-32LE"; + unread = n - 4; + System.out.println("encoding detected: " + encoding); + } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) { + encoding = "UTF-8"; + unread = n - 3; + System.out.println("encoding detected: " + encoding); + } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) { + encoding = "UTF-16BE"; + unread = n - 2; + System.out.println("encoding detected: " + encoding); + } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) { + encoding = "UTF-16LE"; + unread = n - 2; + System.out.println("encoding detected: " + encoding); + } else { + // Unicode BOM mark not found, unread all bytes + encoding = defaultEnc; + unread = n; + System.out.println("using default encoding: " + encoding); + } + + + + if (unread > 0) { + internalIn.unread(bom, (n - unread), unread); + } + + // Use given encoding + if (encoding == null) { + internalIn2 = new InputStreamReader(internalIn, "UTF-8"); + } else { + internalIn2 = new InputStreamReader(internalIn, encoding); + } + } + + @Override + public void close() throws IOException { + init(); + internalIn2.close(); + } + + @Override + public int read(char[] cbuf, int off, int len) throws IOException { + init(); + return internalIn2.read(cbuf, off, len); + } + +} \ No newline at end of file This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |