From: <wi...@us...> - 2010-04-20 09:04:33
|
Revision: 6066
          http://nsis.svn.sourceforge.net/nsis/?rev=6066&view=rev
Author:   wizou
Date:     2010-04-20 09:04:26 +0000 (Tue, 20 Apr 2010)

Log Message:
-----------
Unicode port: Support for Unicode/UTF8 input files by Jim Park.

Modified Paths:
--------------
    NSIS/trunk/Source/tstring.h

Added Paths:
-----------
    NSIS/trunk/Source/tstring.cpp
    NSIS/trunk/Source/validateunicode.cpp
    NSIS/trunk/Source/validateunicode.h

Added: NSIS/trunk/Source/tstring.cpp
===================================================================
--- NSIS/trunk/Source/tstring.cpp (rev 0)
+++ NSIS/trunk/Source/tstring.cpp 2010-04-20 09:04:26 UTC (rev 6066)
@@ -0,0 +1,175 @@
+// tstring.cpp
+//
+// This file is a part of Unicode NSIS.
+//
+// Copyright (C) 2007-2009 Jim Park
+//
+// Licensed under the zlib/libpng license (the "License");
+// you may not use this file except in compliance with the License.
+//
+// This software is provided 'as-is', without any expressed or implied
+// warranty.
+//
+// Provides TSTRING support.
+
+#ifdef _UNICODE
+
+#include "tstring.h"
+#include "validateunicode.h"
+#include <vector>
+
+// Owns a C FILE* and closes it when the object goes out of scope.  A NULL
+// pointer (a failed open) is accepted and simply leaves nothing to close.
+class ScopedFile
+{
+  public:
+    explicit ScopedFile(FILE* file) : m_file(file) {}
+
+    ~ScopedFile()
+    {
+      if (m_file != NULL)
+      {
+        // fclose() writes out buffered data itself, so no fflush() first.
+        fclose(m_file);
+      }
+    }
+
+    operator FILE*() { return m_file; }
+
+    operator bool() { return m_file != NULL; }
+
+  private:
+    // Copying would make two objects close the same stream, so both
+    // operations are declared here and deliberately never defined.
+    ScopedFile(const ScopedFile&);
+    ScopedFile& operator=(const ScopedFile&);
+
+    FILE* m_file;
+};
+
+// Inspects the existing file `file` and works out its text encoding.
+//   file     - path of the file to inspect.
+//   fallback - returned when the file cannot be opened, measured or read.
+// An empty file counts as UTF-8.  Otherwise the BOM decides; without a BOM
+// the content is tested for UTF-8 validity and UNKNOWN is returned when it
+// fails that test.  A UTF-32 BOM ends the process with exit(-1).
+static CValidateUnicode::FILE_TYPE DetectFileType(
+  const TCHAR* file,
+  CValidateUnicode::FILE_TYPE fallback)
+{
+  extern FILE *g_output;
+
+  ScopedFile fp(_tfopen(file, _T("rb")));
+  FILE* stream = fp;
+  if (stream == NULL)
+  {
+    return fallback;
+  }
+
+  // ftell() signals failure with -1L.  Stored unchecked in a size_t, that
+  // would turn into a request for an enormous buffer.
+  if (fseek(stream, 0, SEEK_END) != 0)
+  {
+    return fallback;
+  }
+  const long fileSize = ftell(stream);
+  if (fileSize < 0 || fseek(stream, 0, SEEK_SET) != 0)
+  {
+    return fallback;
+  }
+  if (fileSize == 0)
+  {
+    // Empty files are treated as UTF-8.
+    return CValidateUnicode::UTF_8;
+  }
+
+  // Keep only the bytes that were really read, so a short read cannot
+  // leave zero-filled padding for the checks below to stumble over.
+  std::vector<unsigned char> buffer(static_cast<size_t>(fileSize));
+  const size_t bytesRead =
+    fread(&buffer[0], sizeof(unsigned char), buffer.size(), stream);
+  if (bytesRead == 0)
+  {
+    return fallback;
+  }
+  buffer.resize(bytesRead);
+
+  CValidateUnicode::FILE_TYPE ftype =
+    CValidateUnicode::CheckBOM(&buffer[0], buffer.size());
+
+  switch (ftype)
+  {
+    case CValidateUnicode::UTF_8:
+    case CValidateUnicode::UTF_16LE:
+    case CValidateUnicode::UTF_16BE:
+      // The BOM names the encoding; nothing more to find out.
+      break;
+    case CValidateUnicode::UTF_32LE:
+    case CValidateUnicode::UTF_32BE:
+      _ftprintf(g_output, _T("File '%s' has a BOM marked as %s which is not supported at this time.\n"),
+                file, CValidateUnicode::TypeToName(ftype));
+      exit(-1);
+      break;
+    case CValidateUnicode::UNKNOWN:
+      // If unknown, let's see if it's not just UTF_8 without a BOM.
+      if (CValidateUnicode::ValidateUTF8(&buffer[0], buffer.size()))
+      {
+        ftype = CValidateUnicode::UTF_8;
+        _ftprintf(g_output, _T("File '%s' has no BOM but seems to be UTF-8.\n"), file);
+      }
+      else
+      {
+        _ftprintf(g_output, _T("File '%s' has no BOM and does not validate as UTF-8.\n"), file);
+      }
+      break;
+    default:
+      _ftprintf(g_output, _T("CValidateUnicode::CheckBOM() for file '%s' returned an unknown return value: %d\n"),
+                file, ftype);
+      exit(-1);
+      break;
+  }
+
+  return ftype;
+}
+
+// Opens a text file with the CRT's Unicode translation switched on.
+//   file - path of the file to open.
+//   mode - fopen()-style mode; a ", ccs=<encoding>" suffix is appended.
+// When mode contains "r" or "w+", an already existing file is inspected
+// first to pick the encoding; in all other cases UTF-16LE is used.  The
+// choice is logged to g_output.  Returns the result of _tfopen().
+FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode)
+{
+  extern FILE *g_output;
+  CValidateUnicode::FILE_TYPE ftype = CValidateUnicode::UTF_16LE;
+
+  // If we are reading an existing file, check to see what type of file it
+  // is first.
+  if (_tcsstr(mode, _T("w+")) || _tcsstr(mode, _T("r")))
+  {
+    ftype = DetectFileType(file, ftype);
+  }
+
+  tstring strMode(mode);
+
+  switch (ftype)
+  {
+    case CValidateUnicode::UTF_8:
+      strMode.append(_T(", ccs=UTF-8"));
+      _ftprintf(g_output, _T("Opening '%s' as UTF-8.\n"), file);
+      break;
+    case CValidateUnicode::UTF_16LE:
+      strMode.append(_T(", ccs=UTF-16LE"));
+      _ftprintf(g_output, _T("Opening '%s' as UTF-16LE.\n"), file);
+      break;
+    default:
+      // Looks like fopen() doesn't support other encodings of Unicode.
+      strMode.append(_T(", ccs=UNICODE"));
+      _ftprintf(g_output, _T("Opening '%s' as ANSI.\n"), file);
+      break;
+  }
+
+  return _tfopen(file, strMode.c_str());
+}
+
+#endif

Modified: NSIS/trunk/Source/tstring.h
===================================================================
--- NSIS/trunk/Source/tstring.h 2010-04-19 10:06:48 UTC (rev 6065)
+++ NSIS/trunk/Source/tstring.h 2010-04-20 09:04:26 UTC (rev 6066)
@@ -29,7 +29,8 @@
 typedef std::wofstream tofstream;
 typedef std::wifstream tifstream;
 // Use the following macros to open text files.
-#define FOPENTEXT(file, mode) _wfopen(file, mode)
+FILE* FileOpenUnicodeText(const TCHAR* file, const TCHAR* mode);
+#define FOPENTEXT(file, mode) FileOpenUnicodeText(file, mode)
 #else
 typedef std::string tstring;
 typedef std::ofstream tofstream;

Added: NSIS/trunk/Source/validateunicode.cpp
===================================================================
--- NSIS/trunk/Source/validateunicode.cpp (rev 0)
+++ NSIS/trunk/Source/validateunicode.cpp 2010-04-20 09:04:26 UTC (rev 6066)
@@ -0,0 +1,235 @@
+// validateunicode.cpp
+//
+// This file is a part of Unicode NSIS.
+//
+// Copyright (C) 2009 - Jim Park
+//
+// Licensed under the zlib/libpng license (the "License");
+// you may not use this file except in compliance with the License.
+//
+// This software is provided 'as-is', without any expressed or implied
+// warranty.
+//
+// This class can be used to check a buffer to see if it has the expected
+// Unicode encoding and look for byte order marks.
+
+#ifdef _UNICODE
+
+#include "validateunicode.h"
+#include <vector>
+
+// File-local lookup data for decoding UTF-8 lead bytes.
+namespace {
+  // One row per lead-byte form: a byte b matches a row when
+  // (b >> m_rShift) == m_result, and it then announces m_bytesToFollow
+  // continuation bytes.
+  struct CUTF8BytesToFollow {
+    unsigned char m_rShift;
+    unsigned char m_result;
+    unsigned char m_bytesToFollow;
+  };
+
+  // The bit prefixes are mutually exclusive, so at most one row matches a
+  // given byte.  The legacy 5- and 6-byte forms are still listed.
+  const CUTF8BytesToFollow g_utf8BytesToFollow[] = {
+    /* r-shift, result, length */
+    { 7, 0x0,  0},
+    { 5, 0x6,  1},
+    { 4, 0xe,  2},
+    { 3, 0x1e, 3},
+    { 2, 0x3e, 4},
+    { 1, 0x7e, 5}
+  };
+}
+
+// Checks that buf holds structurally valid UTF-8: every lead byte must be
+// followed by the number of 10xxxxxx continuation bytes it announces.
+//   buf        - bytes to check; never read when characters is 0.
+//   characters - length of buf in bytes (not in code points).
+// A NUL byte is tolerated only as the very last byte.  Returns false for a
+// stray continuation byte, for 0xfe/0xff, and for a sequence cut short by
+// the end of the buffer.  Overlong forms are not detected.
+bool CValidateUnicode::ValidateUTF8(unsigned char* buf, size_t characters)
+{
+  while (characters > 0) {
+    // Last character may be 0.
+    if (*buf == 0 && characters != 1) {
+      return false;
+    }
+
+    const int bytesToFollow = GetBytesToFollow(*buf);
+    ++buf;
+    --characters;
+
+    // -1: the byte cannot start a sequence (accepting it would let ANSI
+    // text pass as UTF-8).  The size test keeps the loop inside the buffer.
+    if (bytesToFollow < 0 || static_cast<size_t>(bytesToFollow) > characters) {
+      return false;
+    }
+
+    for (int i = 0; i < bytesToFollow; ++i) {
+      if (*buf >> 6 != 0x2) {
+        return false;
+      }
+      ++buf;
+      --characters;
+    }
+  }
+
+  return true;
+}
+
+// Maps a UTF-8 lead byte to the number of continuation bytes it announces
+// (0 for ASCII).  Returns -1 when ch matches no table row, which is the
+// case for continuation bytes (10xxxxxx) and for 0xfe/0xff.
+int CValidateUnicode::GetBytesToFollow(unsigned char ch)
+{
+  const size_t rows = sizeof(g_utf8BytesToFollow) / sizeof(g_utf8BytesToFollow[0]);
+  for (size_t i = 0; i < rows; ++i) {
+    const CUTF8BytesToFollow& row = g_utf8BytesToFollow[i];
+    if (ch >> row.m_rShift == row.m_result) {
+      return row.m_bytesToFollow;
+    }
+  }
+
+  return -1;
+}
+
+// Builds 16-bit code units from byte pairs and validates them.  Working
+// byte by byte keeps the result independent of the host's endianness and
+// of the alignment of buf.  A dangling final byte (odd byte count) is
+// ignored, and a buffer without one complete code unit counts as valid.
+static bool ValidateUTF16Bytes(const unsigned char* buf, size_t bytes, bool bigEndian)
+{
+  const size_t units = bytes / 2;
+  if (units == 0) {
+    return true;
+  }
+
+  std::vector<unsigned short> nativeBuf(units);
+  for (size_t i = 0; i < units; ++i) {
+    const unsigned int hi = buf[2 * i + (bigEndian ? 0 : 1)];
+    const unsigned int lo = buf[2 * i + (bigEndian ? 1 : 0)];
+    nativeBuf[i] = static_cast<unsigned short>(hi << 8 | lo);
+  }
+
+  return CValidateUnicode::ValidateUTF16(&nativeBuf[0], nativeBuf.size());
+}
+
+// Validates bytes as UTF-16 stored least significant byte first.
+bool CValidateUnicode::ValidateUTF16LE(unsigned char* buf, size_t bytes)
+{
+  return ValidateUTF16Bytes(buf, bytes, false);
+}
+
+// Validates bytes as UTF-16 stored most significant byte first.
+bool CValidateUnicode::ValidateUTF16BE(unsigned char* buf, size_t bytes)
+{
+  return ValidateUTF16Bytes(buf, bytes, true);
+}
+
+// Checks `characters` UTF-16 code units, in native byte order, at buf.
+// Rejected: NUL anywhere but in the last unit, unpaired surrogates, and
+// the noncharacters U+FDD0..U+FDEF plus U+xxFFFE/U+xxFFFF of every plane.
+// A leading surrogate in the final unit is invalid; its missing partner
+// is not read from beyond the buffer.
+bool CValidateUnicode::ValidateUTF16(unsigned short* buf, size_t characters)
+{
+  while (characters > 0) {
+    const unsigned short ch = *buf;
+
+    // Last character may be 0.
+    if (ch == 0 && characters != 1) {
+      return false;
+    }
+
+    if (ch >= 0xd800 && ch <= 0xdbff) {
+      // A leading surrogate needs a partner; at the end there is none.
+      if (characters < 2) {
+        return false;
+      }
+
+      const unsigned short trailing = *(++buf);
+      --characters;
+
+      // Unpaired leading surrogate found?
+      if (trailing < 0xdc00 || trailing > 0xdfff) {
+        return false;
+      }
+
+      // Leading surrogates with all six low bits set (0xd83f, 0xd87f, ...,
+      // 0xdbff) followed by 0xdffe or 0xdfff encode U+xxFFFE/U+xxFFFF.
+      if ((ch & 0x3f) == 0x3f && trailing >= 0xdffe) {
+        return false;
+      }
+    } else if (ch >= 0xdc00 && ch <= 0xdfff) {
+      // Unpaired trailing surrogate!
+      return false;
+    } else if (ch == 0xfffe || ch == 0xffff || (ch >= 0xfdd0 && ch <= 0xfdef)) {
+      // Noncharacters of the basic multilingual plane.
+      return false;
+    }
+
+    ++buf;
+    --characters;
+  }
+
+  return true;
+}
+
+// Identifies the byte order mark at the start of buf, if there is one.
+//   buf   - start of the data; not read when bytes < 2.
+//   bytes - number of bytes available at buf.
+// Returns the encoding named by the mark, or UNKNOWN when none matches.
+// The UTF-32LE mark (ff fe 00 00) begins with the UTF-16LE mark (ff fe),
+// so the four-byte marks are tested first; in the other order UTF_32LE
+// could never be returned.
+CValidateUnicode::FILE_TYPE CValidateUnicode::CheckBOM(unsigned char* buf, size_t bytes)
+{
+  if (bytes >= 4) {
+    if (buf[0] == 0 && buf[1] == 0 && buf[2] == 0xfe && buf[3] == 0xff) {
+      return UTF_32BE;
+    }
+    if (buf[0] == 0xff && buf[1] == 0xfe && buf[2] == 0 && buf[3] == 0) {
+      return UTF_32LE;
+    }
+  }
+
+  if (bytes >= 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) {
+    return UTF_8;
+  }
+
+  if (bytes >= 2) {
+    if (buf[0] == 0xff && buf[1] == 0xfe) {
+      return UTF_16LE;
+    }
+    if (buf[0] == 0xfe && buf[1] == 0xff) {
+      return UTF_16BE;
+    }
+  }
+
+  return UNKNOWN;
+}
+
+// Returns a printable name for ftype.  Values outside the enum map to
+// "UNKNOWN" instead of indexing past the end of the table.
+const TCHAR* CValidateUnicode::TypeToName(CValidateUnicode::FILE_TYPE ftype)
+{
+  // Order must match the FILE_TYPE enumerators.
+  static const TCHAR* names[] = {
+    _T("UTF-8"),
+    _T("UTF-16LE"),
+    _T("UTF-16BE"),
+    _T("UTF-32LE"),
+    _T("UTF-32BE"),
+    _T("UNKNOWN")
+  };
+
+  if (static_cast<unsigned>(ftype) > static_cast<unsigned>(UNKNOWN)) {
+    return names[UNKNOWN];
+  }
+
+  return names[ftype];
+}
+
+#endif

Added: NSIS/trunk/Source/validateunicode.h
===================================================================
--- NSIS/trunk/Source/validateunicode.h (rev 0)
+++ NSIS/trunk/Source/validateunicode.h 2010-04-20 09:04:26 UTC (rev 6066)
@@ -0,0 +1,61 @@
+// validateunicode.h
+//
+// This file is a part of Unicode NSIS.
+//
+// Copyright (C) 2009 Jim Park
+//
+// Licensed under the zlib/libpng license (the "License");
+// you may not use this file except in compliance with the License.
+//
+// This software is provided 'as-is', without any expressed or implied
+// warranty.
+//
+// This class can be used to check a buffer to see if it has the expected
+// Unicode encoding and look for byte order marks.
+
+#ifndef _VALIDATEUNICODE_
+#define _VALIDATEUNICODE_
+
+#include "tchar.h"
+
+// Stateless helpers for guessing and checking the Unicode encoding of a
+// raw byte buffer.  Every member function is static.
+class CValidateUnicode
+{
+public:
+  // Encodings a byte order mark can announce, plus UNKNOWN.
+  enum FILE_TYPE
+  {
+    UTF_8 = 0,
+    UTF_16LE,
+    UTF_16BE,
+    UTF_32LE,
+    UTF_32BE,
+    UNKNOWN
+  };
+
+  // Is the buffer valid UTF-8?
+  static bool ValidateUTF8(unsigned char* buf, size_t characters);
+
+  // Is the buffer valid UTF-16 in little-endian byte order?
+  static bool ValidateUTF16LE(unsigned char* buf, size_t bytes);
+
+  // Is the buffer valid UTF-16 in big-endian byte order?
+  static bool ValidateUTF16BE(unsigned char* buf, size_t bytes);
+
+  // Is the buffer of 16-bit code units valid UTF-16?
+  static bool ValidateUTF16(unsigned short* buf, size_t characters);
+
+  // Which byte order mark, if any, does the buffer start with?
+  static FILE_TYPE CheckBOM(unsigned char* buf, size_t bytes);
+
+  // Printable name of a FILE_TYPE value.
+  static const TCHAR* TypeToName(FILE_TYPE ftype);
+
+protected:
+  // How many more bytes does a UTF-8 character need after this initial
+  // byte?
+  static int GetBytesToFollow(unsigned char ch);
+};
+
+#endif

This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |