|
From: <jrb...@us...> - 2010-06-16 23:01:04
|
Revision: 1074
http://cishell.svn.sourceforge.net/cishell/?rev=1074&view=rev
Author: jrbibers
Date: 2010-06-16 23:00:57 +0000 (Wed, 16 Jun 2010)
Log Message:
-----------
Improved UTF-8 support by:
* Tolerating byte-order marks in UTF-8 files, where they are unnecessary but harmless and standard-permissible.
* Wrapping FileReaders throughout the converters inside UnicodeReader, a CIShell utility class. This detects the character encodings UTF-8, UTF-16LE, UTF-16BE, UTF-32LE, and UTF-32BE according to any byte-order mark at the head of the file, defaulting to UTF-8 when unclear. The multiple existing UnicodeReaders are removed.
* Specifying the preferred encoding UTF-8 on FileWriters throughout the converter set.
* Adding a friendlier error message to the File > Load algorithm when an unrecognized encoding is presented. It suggests two common fixes.
Reviewed by Russell.
Modified Paths:
--------------
trunk/clients/gui/org.cishell.reference.gui.persistence/src/org/cishell/reference/gui/persistence/load/FileLoadAlgorithm.java
Added Paths:
-----------
trunk/core/org.cishell.utilities/src/org/cishell/utilities/UnicodeReader.java
Modified: trunk/clients/gui/org.cishell.reference.gui.persistence/src/org/cishell/reference/gui/persistence/load/FileLoadAlgorithm.java
===================================================================
--- trunk/clients/gui/org.cishell.reference.gui.persistence/src/org/cishell/reference/gui/persistence/load/FileLoadAlgorithm.java 2010-06-12 01:41:40 UTC (rev 1073)
+++ trunk/clients/gui/org.cishell.reference.gui.persistence/src/org/cishell/reference/gui/persistence/load/FileLoadAlgorithm.java 2010-06-16 23:00:57 UTC (rev 1074)
@@ -1,6 +1,7 @@
package org.cishell.reference.gui.persistence.load;
import java.io.File;
+import java.io.UnsupportedEncodingException;
import java.util.Dictionary;
import org.cishell.framework.CIShellContext;
@@ -88,30 +89,38 @@
return fileSelector.getFile();
}
- private Data[] validateFile(IWorkbenchWindow window, Display display, File file)
- throws AlgorithmExecutionException {
+ private Data[] validateFile(IWorkbenchWindow window, Display display, File file) {
AlgorithmFactory validator = null;
- boolean shouldTryValidator = true;
- while (shouldTryValidator) {
- try {
- validator = getValidatorFromUser(window, display, file);
+ try {
+ validator = getValidatorFromUser(window, display, file);
- if ((file == null) || (validator == null)) {
- String logMessage = "File loading canceled";
- this.logger.log(LogService.LOG_WARNING, logMessage);
-
- shouldTryValidator = false;
- } else {
+ if ((file == null) || (validator == null)) {
+ String logMessage = "File loading canceled";
+ this.logger.log(LogService.LOG_WARNING, logMessage);
+ } else {
+ try {
return FileValidator.validateFile(
- file, validator, this.progressMonitor, this.ciShellContext, this.logger);
+ file, validator, this.progressMonitor, this.ciShellContext, this.logger);
+ } catch (AlgorithmExecutionException e) {
+ if (e.getCause() != null
+ && e.getCause() instanceof UnsupportedEncodingException) {
+ String logMessage =
+ "This file cannot be loaded; it uses the unsupported character encoding "
+ + e.getCause().getMessage() + ".";
+ this.logger.log(LogService.LOG_ERROR, logMessage);
+ } else {
+ throw e;
+ }
}
- } catch (Throwable e) {
- String logMessage =
- "The chosen file is not compatible with the chosen file. " +
- "Please try a different format or cancel.";
- this.logger.log(LogService.LOG_ERROR, logMessage);
}
+ } catch (Throwable e) {
+ String logMessage =
+ "The chosen file is not compatible with this format. " +
+ "Check that your file is correctly formatted or try another validator. " +
+ "The reason is: " + e.getMessage();
+ e.printStackTrace(); // TODO remove
+ this.logger.log(LogService.LOG_ERROR, logMessage);
}
return null;
Added: trunk/core/org.cishell.utilities/src/org/cishell/utilities/UnicodeReader.java
===================================================================
--- trunk/core/org.cishell.utilities/src/org/cishell/utilities/UnicodeReader.java (rev 0)
+++ trunk/core/org.cishell.utilities/src/org/cishell/utilities/UnicodeReader.java 2010-06-16 23:00:57 UTC (rev 1074)
@@ -0,0 +1,141 @@
+package org.cishell.utilities;
+
+/**
+ version: 1.1 / 2007-01-25
+ - changed BOM recognition ordering (longer boms first)
+
+ Original pseudocode : Thomas Weidenfeller
+ Implementation tweaked: Aki Nieminen
+
+ http://www.unicode.org/unicode/faq/utf_bom.html
+ BOMs:
+ 00 00 FE FF = UTF-32, big-endian
+ FF FE 00 00 = UTF-32, little-endian
+ EF BB BF = UTF-8,
+ FE FF = UTF-16, big-endian
+ FF FE = UTF-16, little-endian
+
+ Win2k Notepad:
+ Unicode format = UTF-16LE
+ */
+
+import java.io.*;
+
+/**
+ * Generic Unicode text reader that uses a leading byte-order mark (BOM) to pick the encoding.
+ * If no BOM is found, the given default encoding is used, or UTF-8 when that default is null.
+ */
+public class UnicodeReader extends Reader {
+ public static final int BOM_SIZE = 4;
+
+ private PushbackInputStream internalIn;
+ private InputStreamReader internalIn2 = null;
+ private String defaultEnc;
+
+ /**
+ * @param in
+ * input stream to be read; "UTF-8" is the default encoding when no BOM is present
+ */
+ public UnicodeReader(InputStream in) {
+ this(in, "UTF-8");
+ }
+
+ /**
+ * @param in
+ * input stream to be read; it is wrapped in a PushbackInputStream of BOM_SIZE bytes
+ * @param defaultEnc
+ * default encoding if the stream does not have a BOM marker. If null,
+ * init() falls back to "UTF-8" (not the system-level default).
+ */
+ public UnicodeReader(InputStream in, String defaultEnc) {
+ internalIn = new PushbackInputStream(in, BOM_SIZE);
+ this.defaultEnc = defaultEnc;
+ }
+
+ public String getDefaultEncoding() {
+ return defaultEnc;
+ }
+
+ /**
+ * Get the stream encoding as reported by the underlying InputStreamReader, or null if
+ * the stream is uninitialized. Call init() or read() to initialize it.
+ */
+ public String getEncoding() {
+ if (internalIn2 == null) {
+ return null;
+ }
+
+ return internalIn2.getEncoding();
+ }
+
+ /**
+ * Reads ahead up to BOM_SIZE bytes and checks for a BOM; bytes after the BOM are unread back to
+ * the stream. Does nothing once initialized. NOTE(review): bytes beyond n are compared too - confirm.
+ */
+ protected void init() throws IOException {
+ if (internalIn2 != null) {
+ return;
+ }
+
+ String encoding;
+ byte bom[] = new byte[BOM_SIZE];
+ int n;
+ int unread;
+ n = internalIn.read(bom, 0, bom.length); // may return fewer than BOM_SIZE bytes, or -1 at end of stream
+
+ if ((bom[0] == (byte) 0x00) && (bom[1] == (byte) 0x00) && (bom[2] == (byte) 0xFE)
+ && (bom[3] == (byte) 0xFF)) {
+ encoding = "UTF-32BE";
+ unread = n - 4;
+ System.out.println("encoding detected: " + encoding);
+ } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE) && (bom[2] == (byte) 0x00)
+ && (bom[3] == (byte) 0x00)) {
+ encoding = "UTF-32LE";
+ unread = n - 4;
+ System.out.println("encoding detected: " + encoding);
+ } else if ((bom[0] == (byte) 0xEF) && (bom[1] == (byte) 0xBB) && (bom[2] == (byte) 0xBF)) {
+ encoding = "UTF-8";
+ unread = n - 3;
+ System.out.println("encoding detected: " + encoding);
+ } else if ((bom[0] == (byte) 0xFE) && (bom[1] == (byte) 0xFF)) {
+ encoding = "UTF-16BE";
+ unread = n - 2;
+ System.out.println("encoding detected: " + encoding);
+ } else if ((bom[0] == (byte) 0xFF) && (bom[1] == (byte) 0xFE)) {
+ encoding = "UTF-16LE";
+ unread = n - 2;
+ System.out.println("encoding detected: " + encoding);
+ } else {
+ // Unicode BOM mark not found, unread all bytes
+ encoding = defaultEnc;
+ unread = n;
+ System.out.println("using default encoding: " + encoding);
+ }
+
+
+
+ if (unread > 0) { // push back the bytes that were read after the BOM (or all of them if no BOM)
+ internalIn.unread(bom, (n - unread), unread);
+ }
+
+ // Use the chosen encoding; a null default falls back to UTF-8
+ if (encoding == null) {
+ internalIn2 = new InputStreamReader(internalIn, "UTF-8");
+ } else {
+ internalIn2 = new InputStreamReader(internalIn, encoding);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ init(); // NOTE(review): closing a never-read reader first runs init(), which reads from the stream
+ internalIn2.close();
+ }
+
+ @Override
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ init();
+ return internalIn2.read(cbuf, off, len);
+ }
+
+}
\ No newline at end of file
This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.
|