[CIShell-SVN] SF.net SVN: cishell:[1074] trunk

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Revision: 1074
          http://cishell.svn.sourceforge.net/cishell/?rev=1074&view=rev
Author:   jrbibers
Date:     2010-06-16 23:00:57 +0000 (Wed, 16 Jun 2010)

Log Message:
-----------
Improved UTF-8 support by:

* Tolerating byte-order marks in UTF-8 files, where they are unnecessary but harmless and permitted by the Unicode standard.

* Wrapping FileReaders throughout the converters inside UnicodeReader, a CIShell utility class.  This detects the character encodings UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, and UTF-32BE according to any byte-order mark at the head of the file, defaulting to UTF-8 when no byte-order mark is found.  The multiple existing copies of UnicodeReader are removed.

* Specifying the preferred encoding UTF-8 on FileWriters throughout the converter set.

* Adding a friendlier error message to the File > Load algorithm when an unrecognized encoding is presented.  It suggests two common fixes.

Reviewed by Russell.

Modified Paths:
--------------
    trunk/clients/gui/org.cishell.reference.gui.persistence/src/org/cishell/reference/gui/persistence/load/FileLoadAlgorithm.java

Added Paths:
-----------
    trunk/core/org.cishell.utilities/src/org/cishell/utilities/UnicodeReader.java

Modified: trunk/clients/gui/org.cishell.reference.gui.persistence/src/org/cishell/reference/gui/persistence/load/FileLoadAlgorithm.java
===================================================================

--- trunk/clients/gui/org.cishell.reference.gui.persistence/src/org/cishell/reference/gui/persistence/load/FileLoadAlgorithm.java	2010-06-12 01:41:40 UTC (rev 1073)
+++ trunk/clients/gui/org.cishell.reference.gui.persistence/src/org/cishell/reference/gui/persistence/load/FileLoadAlgorithm.java	2010-06-16 23:00:57 UTC (rev 1074)
@@ -1,6 +1,7 @@
 package org.cishell.reference.gui.persistence.load;
 
 import java.io.File;
+import java.io.UnsupportedEncodingException;
 import java.util.Dictionary;
 
 import org.cishell.framework.CIShellContext;
@@ -88,30 +89,38 @@
 		return fileSelector.getFile();
 	}
 
-	private Data[] validateFile(IWorkbenchWindow window, Display display, File file)
-			throws AlgorithmExecutionException {
+	private Data[] validateFile(IWorkbenchWindow window, Display display, File file) {
 		AlgorithmFactory validator = null;
-		boolean shouldTryValidator = true;
 
-		while (shouldTryValidator) {
-			try {
-				validator = getValidatorFromUser(window, display, file);
+		try {
+			validator = getValidatorFromUser(window, display, file);
 
-				if ((file == null) || (validator == null)) {
-					String logMessage = "File loading canceled";
-					this.logger.log(LogService.LOG_WARNING, logMessage);
-
-					shouldTryValidator = false;
-				} else {
+			if ((file == null) || (validator == null)) {
+				String logMessage = "File loading canceled";
+				this.logger.log(LogService.LOG_WARNING, logMessage);
+			} else {
+				try {
 					return FileValidator.validateFile(
-						file, validator, this.progressMonitor, this.ciShellContext, this.logger);
+							file, validator, this.progressMonitor, this.ciShellContext, this.logger);
+				} catch (AlgorithmExecutionException e) {
+					if (e.getCause() != null
+							&& e.getCause() instanceof UnsupportedEncodingException) {
+						String logMessage =
+							"This file cannot be loaded; it uses the unsupported character encoding "
+							+ e.getCause().getMessage() + ".";
+						this.logger.log(LogService.LOG_ERROR, logMessage);
+					} else {						
+						throw e;
+					}
 				}
-			} catch (Throwable e) {
-				String logMessage =
-					"The chosen file is not compatible with the chosen file.  " +
-					"Please try a different format or cancel.";
-				this.logger.log(LogService.LOG_ERROR, logMessage);
 			}
+		} catch (Throwable e) {
+			String logMessage =
+				"The chosen file is not compatible with this format.  " +
+				"Check that your file is correctly formatted or try another validator.  " +
+				"The reason is: " + e.getMessage();
+			e.printStackTrace(); // TODO remove
+			this.logger.log(LogService.LOG_ERROR, logMessage);
 		}
 
 		return null;

Added: trunk/core/org.cishell.utilities/src/org/cishell/utilities/UnicodeReader.java
===================================================================
--- trunk/core/org.cishell.utilities/src/org/cishell/utilities/UnicodeReader.java	                        (rev 0)
+++ trunk/core/org.cishell.utilities/src/org/cishell/utilities/UnicodeReader.java	2010-06-16 23:00:57 UTC (rev 1074)
@@ -0,0 +1,141 @@
+package org.cishell.utilities;
+
+/**
+ version: 1.1 / 2007-01-25
+ - changed BOM recognition ordering (longer boms first)
+
+ Original pseudocode   : Thomas Weidenfeller
+ Implementation tweaked: Aki Nieminen
+
+ http://www.unicode.org/unicode/faq/utf_bom.html
+ BOMs:
+ 00 00 FE FF    = UTF-32, big-endian
+ FF FE 00 00    = UTF-32, little-endian
+ EF BB BF       = UTF-8,
+ FE FF          = UTF-16, big-endian
+ FF FE          = UTF-16, little-endian
+
+ Win2k Notepad:
+ Unicode format = UTF-16LE
+ */
+
+import java.io.*;
+
+/**
+ * Generic Unicode text reader that uses a leading byte-order mark (BOM) to pick the
+ * encoding. If no BOM is found it uses the given default, or UTF-8 when that is null.
+ */
+public class UnicodeReader extends Reader {
+	public static final int BOM_SIZE = 4; // bytes read ahead in init(); also the pushback buffer size
+	
+	private PushbackInputStream internalIn; // raw byte stream; lets init() push back non-BOM bytes
+	private InputStreamReader internalIn2 = null; // decoding reader; stays null until init() has run
+	private String defaultEnc; // encoding used when no BOM is found; may be null
+
+	/**
+	 * Creates a reader whose default encoding (used when no BOM is found) is UTF-8.
+	 * @param in input stream to be read
+	 */
+	public UnicodeReader(InputStream in) {
+		this(in, "UTF-8");
+	}
+	
+	/**
+	 * Wraps the stream in a pushback buffer; no bytes are read here.
+	 * @param in
+	 *            input stream to be read
+	 * @param defaultEnc
+	 *            encoding used when no BOM is found; null makes init() use UTF-8
+	 */
+	public UnicodeReader(InputStream in, String defaultEnc) {
+		internalIn = new PushbackInputStream(in, BOM_SIZE);
+		this.defaultEnc = defaultEnc;
+	}
+
+	public String getDefaultEncoding() { // returns the constructor argument as given (may be null)
+		return defaultEnc;
+	}
+
+	/**
+	 * Returns the encoding reported by the decoding reader, or null if the stream
+	 * is still uninitialized. init() runs on the first read() or on close().
+	 */
+	public String getEncoding() {
+		if (internalIn2 == null) {
+			return null;
+		}
+		
+		return internalIn2.getEncoding();
+	}
+
+	/**
+	 * Reads up to BOM_SIZE bytes and checks them for a BOM. Bytes that are not part
+	 * of a BOM are pushed back; then the decoding reader is created. Runs only once.
+	 */
+	protected void init() throws IOException {
+		if (internalIn2 != null) { // already initialized; detection must not run twice
+			return;
+		}
+
+		String encoding;
+		byte bom[] = new byte[BOM_SIZE];
+		int n;
+		int unread;
+		n = internalIn.read(bom, 0, bom.length); // NOTE(review): one read() may return fewer than 4 bytes; unfilled slots stay 0x00, so a stream holding only FF FE matches the UTF-32LE test below
+
+		if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE)
+				&& (bom[3] == (byte) 0xFF)) {
+			encoding = "UTF-32BE";
+			unread = n - 4; // count of bytes read beyond the BOM
+			System.out.println("encoding detected: " + encoding); // NOTE(review): debug output on stdout (same in every branch) - consider removing
+		} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00)
+				&& (bom[3] == (byte) 0x00)) { // tested before UTF-16LE, whose BOM FF FE is a prefix of this one
+			encoding = "UTF-32LE";
+			unread = n - 4;
+			System.out.println("encoding detected: " + encoding);
+		} else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
+			encoding = "UTF-8";
+			unread = n - 3;
+			System.out.println("encoding detected: " + encoding);
+		} else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
+			encoding = "UTF-16BE";
+			unread = n - 2;
+			System.out.println("encoding detected: " + encoding);
+		} else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
+			encoding = "UTF-16LE";
+			unread = n - 2;
+			System.out.println("encoding detected: " + encoding);
+		} else {
+			// Unicode BOM mark not found, unread all bytes
+			encoding = defaultEnc;
+			unread = n; // n is -1 at end of stream, so the push-back below is skipped then
+			System.out.println("using default encoding: " + encoding);
+		}
+		
+		
+
+		if (unread > 0) {
+			internalIn.unread(bom, (n - unread), unread); // push back the last 'unread' of the n bytes read
+		}
+
+		// Build the decoding reader; a null encoding falls back to UTF-8
+		if (encoding == null) { // only reachable when defaultEnc was null
+			internalIn2 = new InputStreamReader(internalIn, "UTF-8");
+		} else {
+			internalIn2 = new InputStreamReader(internalIn, encoding);
+		}
+	}
+
+	@Override
+	public void close() throws IOException {
+		init(); // NOTE(review): closing a never-read reader still reads from the stream here to build the delegate
+		internalIn2.close();
+	}
+
+	@Override
+	public int read(char[] cbuf, int off, int len) throws IOException {
+		init(); // lazy: BOM detection happens on the first call
+		return internalIn2.read(cbuf, off, len);
+	}
+
+}
\ No newline at end of file


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.