[St-m-svn] SF.net SVN: st-m: [1043] trunk/src

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 454-5900

Revision: 1043
          http://st-m.svn.sourceforge.net/st-m/?rev=1043&view=rev
Author:   cnx_glenn
Date:     2007-10-09 10:59:56 -0700 (Tue, 09 Oct 2007)

Log Message:
-----------
Created new UTF8 and UTF16 fstream classes:
- utf8ofstream and utf8ifstream (utf8fstream.h)
- utf16ofstream and utf16ifstream (utf16fstream.h)

Modified Paths:
--------------
    trunk/src/MainFrm.cpp
    trunk/src/Startup Manager_2003.vcproj
    trunk/src/types/tfstream.h

Added Paths:
-----------
    trunk/src/types/utf16fstream.h
    trunk/src/types/utf8fstream.cpp
    trunk/src/types/utf8fstream.h

Removed Paths:
-------------
    trunk/src/types/tfstream.cpp

Modified: trunk/src/MainFrm.cpp
===================================================================

--- trunk/src/MainFrm.cpp	2007-10-09 16:30:10 UTC (rev 1042)
+++ trunk/src/MainFrm.cpp	2007-10-09 17:59:56 UTC (rev 1043)
@@ -30,6 +30,7 @@
 #include "aboutdlg.h"
 #include "dropdlg.h"
 #include "updatedlg.h"
+#include "types\utf8fstream.h"
 
 #include <winuser.h>
 #include "lib/htmlhelp.h"
@@ -415,7 +416,7 @@
 
 void CMainFrame::ExportItemsAsHtmlVertical(tofstream &htmlfile)
 {
-	if(!htmlfile)
+	if(!htmlfile.is_open())
 		throw(CWinException(_T("ExportItemsAsHtmlVetical... htmlfile is not a valid file handle")));
 
 	tstring item     = LoadString(STR_COL1);
@@ -445,7 +446,7 @@
 
 void CMainFrame::ExportItemsAsHtmlHorizontal(tofstream &htmlfile)
 {
-	if(!htmlfile)
+	if(!htmlfile.is_open())
 		throw(CWinException(_T("ExportItemsAsHtmlHorizontal... htmlfile is not a valid file handle")));
 
 	int iItems = ListView_GetItemCount(GetListView().GetHwnd());
@@ -467,14 +468,12 @@
 	try
 	{
 	#ifdef UNICODE
-		tofstream htmlfile;
-		IMBUE_UTF8_CODECVT(htmlfile);
-		htmlfile.open(toNarrowString(path).c_str(), std::ios::out | std::ios::binary);
+		utf8ofstream htmlfile(toNarrowString(path).c_str(), std::ios::out | std::ios::binary);
 	#else
 		tofstream htmlfile(path.c_str(), std::ios::out);
 	#endif
 		
-		if(htmlfile)
+		if (htmlfile.is_open())
 		{
 			htmlfile << _T("<html>") << NEWLINE;
 			htmlfile << _T("\t<head><title>Startup Programs List</title></head>") << NEWLINE;
@@ -534,7 +533,7 @@
 
 void CMainFrame::ExportItemsAsXml(tofstream &xmlfile)
 {
-	if(!xmlfile)
+	if(!xmlfile.is_open())
 		throw(CWinException(_T("ExportItemsAsXml... xmlfile is not a valid file handle")));
 
 	int iItems = ListView_GetItemCount(GetListView().GetHwnd());
@@ -555,7 +554,7 @@
 	try
 	{
 		tofstream xmlfile(toNarrowString(path).c_str(), std::ios::out);	
-		if(xmlfile)
+		if(xmlfile.is_open())
 		{
 			xmlfile << _T("<?xml version=\"1.0\" encoding=\"UTF-8\"?>") << std::endl;
 			xmlfile << _T("<?xml-stylesheet type=\"text/xsl\" href=\"http://st-m.sourceforge.net/misc/st-m_items.xsl\"?>") << std::endl;

Modified: trunk/src/Startup Manager_2003.vcproj
===================================================================
--- trunk/src/Startup Manager_2003.vcproj	2007-10-09 16:30:10 UTC (rev 1042)
+++ trunk/src/Startup Manager_2003.vcproj	2007-10-09 17:59:56 UTC (rev 1043)
@@ -546,10 +546,10 @@
 				Name="Types"
 				Filter="">
 				<File
-					RelativePath=".\types\tfstream.cpp">
+					RelativePath=".\types\tstring.cpp">
 				</File>
 				<File
-					RelativePath=".\types\tstring.cpp">
+					RelativePath=".\types\utf8fstream.cpp">
 				</File>
 			</Filter>
 		</Filter>
@@ -681,6 +681,12 @@
 				<File
 					RelativePath=".\types\tstring.h">
 				</File>
+				<File
+					RelativePath=".\types\utf16fstream.h">
+				</File>
+				<File
+					RelativePath=".\types\utf8fstream.h">
+				</File>
 			</Filter>
 		</Filter>
 		<Filter

Deleted: trunk/src/types/tfstream.cpp
===================================================================
--- trunk/src/types/tfstream.cpp	2007-10-09 16:30:10 UTC (rev 1042)
+++ trunk/src/types/tfstream.cpp	2007-10-09 17:59:56 UTC (rev 1043)
@@ -1,262 +0,0 @@
-/*
- * this file is part of Startup Manager
- * Copyright (C) 2004-2007 Glenn Van Loon, cnx...@us...
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- */
-
-// The UTF8Codecvt class was based on the utf8_codecvt_facet boost class.
-// Copyright  2001 Ronald Garcia, Indiana University (ga...@os...)
-// Andrew Lumsdaine, Indiana University (lu...@os...). Permission to copy, 
-// use, modify, sell and distribute this software is granted provided this
-// copyright notice appears in all copies. This software is provided "as is"
-// without express or implied warranty, and with no claim as to its suitability
-// for any purpose.
-
-#include "tfstream.h"
-
-#include <cassert>
-#include <limits>
-
-// Translate incoming UTF-8 into UCS-4
-std::codecvt_base::result UTF8Codecvt::do_in(
-	std::mbstate_t&, const char * from,
-	const char * from_end, const char * & from_next,
-	wchar_t * to, wchar_t * to_end, wchar_t * & to_next) const
-{
-	// Basic algorithm:  The first octet determines how many
-	// octets total make up the UCS-4 character.  The remaining
-	// "continuing octets" all begin with "10". To convert, subtract
-	// the amount that specifies the number of octets from the first
-	// octet.  Subtract 0x80 (1000 0000) from each continuing octet,
-	// then mash the whole lot together.  Note that each continuing
-	// octet only uses 6 bits as unique values, so only shift by
-	// multiples of 6 to combine.
-	while ((from != from_end) && (to != to_end))
-	{
-		// Error checking on the first octet
-		if (invalid_leading_octet(*from))
-		{
-			from_next = from;
-			to_next = to;
-			return std::codecvt_base::error;
-		}
-
-		// The first octet is adjusted by a value dependent upon 
-		// the number of "continuing octets" encoding the character
-		const int cont_octet_count = get_cont_octet_count(*from);
-		const wchar_t octet1_modifier_table[] = {0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
-
-		// The unsigned char conversion is necessary in case char is
-		// signed (I learned this the hard way)
-		wchar_t ucs_result = (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
-
-		// Invariants   : 
-		//   1) At the start of the loop,   'i' continuing characters have been
-		//    processed 
-		//   2) *from   points to the next continuing character to be processed.
-		int i   = 0;
-		while ((i != cont_octet_count) && (from != from_end))
-		{
-			// Error checking on continuing characters
-			if (invalid_continuing_octet(*from))
-			{
-				from_next = from;
-				to_next = to;
-				return std::codecvt_base::error;
-			}
-
-			ucs_result *= (1 << 6); 
-
-			// each continuing character has an extra (10xxxxxx)b attached to 
-			// it that must be removed.
-			ucs_result += (unsigned char)(*from++) - 0x80;
-			++i;
-		}
-
-		// If the buffer ends with an incomplete unicode character...
-		if ((from == from_end) && (i != cont_octet_count))
-		{
-			// rewind "from" to before the current character translation
-			from_next = from - (i+1); 
-			to_next = to;
-			return std::codecvt_base::partial;
-		}
-		*to++ = ucs_result;
-	}
-	from_next = from;
-	to_next = to;
-
-	// Were we done converting or did we run out of destination space?
-	if (from == from_end)
-		return std::codecvt_base::ok;
-
-	return std::codecvt_base::partial;
-}
-
-std::codecvt_base::result UTF8Codecvt::do_out(
-	std::mbstate_t &, const wchar_t *   from,
-	const wchar_t * from_end, const wchar_t * & from_next,
-	char * to, char * to_end, char * & to_next) const
-{
-	// RG - consider merging this table with the other one
-	const wchar_t octet1_modifier_table[] = {0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
-
-	while ((from != from_end) && (to != to_end))
-	{
-		// Check for invalid UCS-4 character
-		if (*from > reinterpret_cast<wchar_t>(std::numeric_limits<wchar_t>::max))
-		{
-			from_next = from;
-			to_next = to;
-			return std::codecvt_base::error;
-		}
-
-		int cont_octet_count = get_cont_octet_out_count(*from);
-
-		// RG  - comment this formula better
-		int shift_exponent = (cont_octet_count) * 6;
-
-		// Process the first character
-		*to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] + (unsigned char)(*from / (1 << shift_exponent)));
-
-		// Process the continuation characters 
-		// Invariants: At   the start of the loop:
-		//   1) 'i' continuing octets   have been generated
-		//   2) '*to'   points to the next location to place an octet
-		//   3) shift_exponent is   6 more than needed for the next octet
-		int i = 0;
-		while ((i != cont_octet_count) && (to != to_end))
-		{
-			shift_exponent -= 6;
-			*to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
-			++i;
-		}
-		// If we filled up the out buffer before encoding the character
-		if ((to == to_end) && (i != cont_octet_count))
-		{
-			from_next = from;
-			to_next = to - (i+1);
-			return std::codecvt_base::partial;
-		}
-		*from++;
-	}
-	from_next = from;
-	to_next = to;
-
-	// Were we done or did we run out of destination space
-	if (from == from_end)
-		return std::codecvt_base::ok;
-	
-	return std::codecvt_base::partial;
-}
-
-// How many char objects can I process to get <= max_limit
-// wchar_t objects?
-int UTF8Codecvt::do_length(
-	std::mbstate_t &, const char * from,
-	const char * from_end, std::size_t max_limit) const throw()
-{ 
-	// RG - this code is confusing!  I need a better way to express it.
-	// and test cases.
-
-	// Invariants:
-	// 1) last_octet_count has the size of the last measured character
-	// 2) char_count holds the number of characters shown to fit
-	// within the bounds so far (no greater than max_limit)
-	// 3) from_next points to the octet 'last_octet_count' before the
-	// last measured character.  
-	int last_octet_count=0;
-	std::size_t char_count = 0;
-	const char* from_next = from;
-	// Use "<" because the buffer may represent incomplete characters
-	while ((from_next+last_octet_count <= from_end) && (char_count <= max_limit))
-	{
-		from_next += last_octet_count;
-		last_octet_count = (get_octet_count(*from_next));
-		++char_count;
-	}
-	return static_cast<int>(from_next-from_end);
-}
-
-unsigned int UTF8Codecvt::get_octet_count(unsigned char lead_octet)
-{
-	// if the 0-bit (MSB) is 0, then 1 character
-	if (lead_octet <= 0x7f) return 1;
-
-	// Otherwise the count number of consecutive 1 bits starting at MSB
-	assert((0xc0 <= lead_octet) && (lead_octet <= 0xfd));
-
-	if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2;
-	else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3;
-	else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4;
-	else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5;
-	else return 6;
-}
-
-namespace
-{
-
-	template<std::size_t s>
-	int get_cont_octet_out_count_impl(wchar_t word)
-	{
-		if (word < 0x80)
-		{
-			return 0;
-		}
-		if (word < 0x800)
-		{
-			return 1;
-		}
-		return 2;
-	}
-
-	// note the following code will generate on some platforms where
-	// wchar_t is defined as UCS2.  The warnings are superfluous as
-	// the specialization is never instantitiated with such compilers.
-	template<>
-	int get_cont_octet_out_count_impl<4>(wchar_t word)
-	{
-		if (word < 0x80)
-		{
-			return 0;
-		}
-		if (word < 0x800)
-		{
-			return 1;
-		}
-		if (word < 0x10000)
-		{
-			return 2;
-		}
-		if (word < 0x200000)
-		{
-			return 3;
-		}
-		if (word < 0x4000000)
-		{
-			return 4;
-		}
-		return 5;
-	}
-
-} // namespace anonymous
-
-// How many "continuing octets" will be needed for this word
-// ==   total octets - 1.
-int UTF8Codecvt::get_cont_octet_out_count(wchar_t word) const
-{
-	return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
-}

Modified: trunk/src/types/tfstream.h
===================================================================
--- trunk/src/types/tfstream.h	2007-10-09 16:30:10 UTC (rev 1042)
+++ trunk/src/types/tfstream.h	2007-10-09 17:59:56 UTC (rev 1043)
@@ -17,19 +17,10 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  */
 
-// The UTF8Codecvt class was based on the utf8_codecvt_facet boost class.
-// Copyright  2001 Ronald Garcia, Indiana University (ga...@os...)
-// Andrew Lumsdaine, Indiana University (lu...@os...). Permission to copy, 
-// use, modify, sell and distribute this software is granted provided this
-// copyright notice appears in all copies. This software is provided "as is"
-// without express or implied warranty, and with no claim as to its suitability
-// for any purpose.
-
 #ifndef TFSTREAM_H
 #define TFSTREAM_H
 
 #include <fstream>
-#include <locale>
 
 #ifdef UNICODE
 	typedef std::wifstream tifstream;
@@ -38,85 +29,5 @@
 	typedef std::ifstream tifstream;
 	typedef std::ofstream tofstream;
 #endif
- 
-typedef std::codecvt<wchar_t, char, mbstate_t> NullCodecvtBase;
-class UTF8Codecvt : public NullCodecvtBase
-{
-public:
-	explicit UTF8Codecvt(std::size_t no_locale_manage=0)
-		: std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage) 
-	{}
 
-protected:
-	virtual std::codecvt_base::result do_in(
-		std::mbstate_t& state, const char * from,
-		const char * from_end, const char * & from_next,
-		wchar_t * to, wchar_t * to_end, wchar_t * & to_next) const;
-
-	virtual std::codecvt_base::result do_out(
-		std::mbstate_t & state, const wchar_t * from,
-		const wchar_t * from_end, const wchar_t*  & from_next,
-		char * to, char * to_end, char * & to_next) const;
-
-	bool invalid_continuing_octet(unsigned char octet_1) const
-	{
-		return ((octet_1 < 0x80) || (0xbf < octet_1));
-	}
-
-	bool invalid_leading_octet(unsigned char octet_1) const
-	{
-		return (((0x7f < octet_1) && (octet_1 < 0xc0)) || (octet_1 > 0xfd));
-	}
-
-	// continuing octets = octets except for the leading octet
-	static unsigned int get_cont_octet_count(unsigned char lead_octet)
-	{
-		return (get_octet_count(lead_octet) - 1);
-	}
-
-	static unsigned int get_octet_count(unsigned char lead_octet);
-
-	// How many "continuing octets" will be needed for this word
-	// ==   total octets - 1.
-	int get_cont_octet_out_count(wchar_t word) const;
-
-	virtual bool do_always_noconv() const throw() {return false;}
-
-	// UTF-8 isn't really stateful since we rewind on partial conversions
-	virtual std::codecvt_base::result do_unshift(
-		std::mbstate_t&, char * from, char * /*to*/, char * & next) const 
-	{
-		next = from;
-		return ok;
-	}
-
-	virtual int do_encoding() const throw()
-	{
-		const int variable_byte_external_encoding = 0;
-		return variable_byte_external_encoding;
-	}
-
-	// How many char objects can I process to get <= max_limit
-	// wchar_t objects?
-	virtual int do_length(
-		std::mbstate_t &, const char * from, 
-		const char * from_end, std::size_t max_limit) const throw();
-
-	// Largest possible value do_length(state,from,from_end,1) could return.
-	virtual int do_max_length() const throw ()
-	{
-		return 6; // largest UTF-8 encoding of a UCS-4 character
-	}
-
-private:
-	UTF8Codecvt(const UTF8Codecvt&);
-	UTF8Codecvt& operator=(const UTF8Codecvt&);
-};
-
-#define IMBUE_UTF8_CODECVT(outputFile) \
-{ \
-	std::locale loc(std::locale(), new UTF8Codecvt); \
-	(outputFile).imbue(loc); \
-}
-
 #endif

Added: trunk/src/types/utf16fstream.h
===================================================================
--- trunk/src/types/utf16fstream.h	                        (rev 0)
+++ trunk/src/types/utf16fstream.h	2007-10-09 17:59:56 UTC (rev 1043)
@@ -0,0 +1,117 @@
+/*
+ * this file is part of Startup Manager
+ * Copyright (C) 2004-2007 Glenn Van Loon, cnx...@us...
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#ifndef UTF16FSTREAM_H
+#define UTF16FSTREAM_H
+
+#include "tfstream.h"
+
+#include <locale>
+
+typedef std::codecvt<wchar_t, char, mbstate_t> NullCodecvtBase;
+/**
+ * Pass-through ("null") codecvt facet used by the UTF-16 file streams.
+ *
+ * do_always_noconv() returns true and do_in/do_out/do_unshift all return
+ * noconv, so the facet itself never rewrites any data.
+ * NOTE(review): presumably a stream using this facet has to be opened in
+ * binary mode for the wchar_t data to reach the file untouched -- confirm
+ * against the C++ library in use.
+ */
+class UTF16Codecvt : public NullCodecvtBase
+{
+public:
+	typedef wchar_t _E;		// internal (in-memory) character type
+	typedef char _To;		// external (file) character type
+	typedef mbstate_t _St;	// conversion state type
+
+	// _R is forwarded unchanged to the base codecvt constructor.
+	explicit UTF16Codecvt(size_t _R=0) : NullCodecvtBase(_R) {}
+
+protected:
+	// External -> internal: nothing to convert.
+	virtual result do_in(_St&, const _To*, const _To*, const _To*&, _E*, _E*, _E*&) const
+	{
+		return noconv;
+	}
+	// Internal -> external: nothing to convert.
+	// The output range is described by two _To pointers; with any other
+	// parameter list this function would not override codecvt::do_out.
+	virtual result do_out(_St&, const _E*, const _E*, const _E*&, _To*, _To*, _To*&) const
+	{
+		return noconv;
+	}
+	// No shift state to terminate.
+	virtual result do_unshift(_St&, _To*, _To*, _To*&) const
+	{
+		return noconv;
+	}
+	// Returns the smaller of _N2 and the number of chars in [_F1, _L1).
+	virtual int do_length(_St&, const _To* _F1, const _To* _L1, size_t _N2) const _THROW0()
+	{
+		return static_cast<int>((_N2 < (size_t)(_L1 - _F1)) ? _N2 : _L1 - _F1);
+	}
+	virtual bool do_always_noconv() const _THROW0()
+	{
+		return true;
+	}
+	virtual int do_max_length() const _THROW0()
+	{
+		return 2;
+	}
+	// Fixed-width external encoding: the same two chars per wchar_t that
+	// do_max_length() reports.
+	virtual int do_encoding() const _THROW0()
+	{
+		return 2;
+	}
+
+private:
+	UTF16Codecvt(const UTF16Codecvt&);				// not copyable
+	UTF16Codecvt& operator=(const UTF16Codecvt&);	// not assignable
+};
+
+// Imbues outputFile with a locale built from std::locale() plus a freshly
+// allocated UTF16Codecvt facet.  The facet is created with its default
+// constructor argument (0) and handed to the std::locale constructor.
+// The macro expands to a braced block, so `loc` stays local to the expansion;
+// a trailing ';' after the macro call adds an empty statement, which matters
+// if the call is used as the unbraced body of an if/else.
+#define IMBUE_UTF16_CODECVT(outputFile) \
+{ \
+	std::locale loc(std::locale(), new UTF16Codecvt); \
+	(outputFile).imbue(loc); \
+}
+
+/**
+ * Output file stream (tofstream) that imbues itself with the UTF16Codecvt
+ * facet before the file is opened.
+ *
+ * filename - path of the file to open, as a narrow string.
+ * mode     - open mode passed on to tofstream::open (defaults to out).
+ *
+ * NOTE(review): open() is declared virtual here, but it only takes effect when
+ * called on a utf16ofstream; presumably the base class open() is non-virtual,
+ * so a call through a tofstream reference would skip the imbue -- confirm.
+ */
+class utf16ofstream : public tofstream
+{
+public:
+	utf16ofstream() {}
+	utf16ofstream(const char * filename, std::ios_base::openmode mode = std::ios_base::out)
+	{
+		open(filename, mode);
+	}
+	virtual ~utf16ofstream() {}
+	virtual void open(const char * filename, std::ios_base::openmode mode = std::ios_base::out)
+	{
+		// Install the facet first, then let the base class open the file.
+		IMBUE_UTF16_CODECVT(*this);
+		tofstream::open(filename, mode);
+	}
+
+private:
+	utf16ofstream(const utf16ofstream &s);				//prevent copy construction
+	utf16ofstream& operator=(const utf16ofstream &s);	//prevent copy assignment
+};
+
+// Input file stream (tifstream) that installs a UTF16Codecvt facet on itself
+// each time it is opened through this class.
+class utf16ifstream : public tifstream
+{
+private:
+	// Copying is disabled: both members are declared private and only declared.
+	utf16ifstream(const utf16ifstream &s);
+	utf16ifstream& operator=(const utf16ifstream &s);
+
+public:
+	// Creates a stream that is not yet associated with a file.
+	utf16ifstream() {}
+
+	// Creates the stream and immediately opens filename with the given mode.
+	utf16ifstream(const char * filename, std::ios_base::openmode mode = std::ios_base::in)
+	{
+		utf16ifstream::open(filename, mode);
+	}
+
+	virtual ~utf16ifstream() {}
+
+	// Imbues the facet, then forwards to the base-class open().
+	virtual void open(const char * filename, std::ios_base::openmode mode = std::ios_base::in)
+	{
+		std::locale facetLocale(std::locale(), new UTF16Codecvt);
+		this->imbue(facetLocale);
+		this->tifstream::open(filename, mode);
+	}
+};
+
+#endif //UTF16FSTREAM_H

Added: trunk/src/types/utf8fstream.cpp
===================================================================
--- trunk/src/types/utf8fstream.cpp	                        (rev 0)
+++ trunk/src/types/utf8fstream.cpp	2007-10-09 17:59:56 UTC (rev 1043)
@@ -0,0 +1,254 @@
+/*
+ * this file is part of Startup Manager
+ * Copyright (C) 2004-2007 Glenn Van Loon, cnx...@us...
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#include "utf8fstream.h"
+
+#include <cassert>
+#include <limits>
+
+// Translate incoming UTF-8 octets from [from, from_end) into wide characters
+// written to [to, to_end).
+//
+// On return from_next / to_next tell the caller how far input and output got:
+//   error   - an invalid leading or continuing octet was met; from_next points
+//             at the offending octet.
+//   partial - either the input ended in the middle of a multi-octet character
+//             (from_next is rewound to that character's leading octet), or the
+//             output buffer filled up before all input was consumed.
+//   ok      - all input was consumed.
+// The mbstate_t argument is unused.
+// NOTE(review): the decoded value is accumulated in a wchar_t, so where wchar_t
+// is 16 bits wide, sequences encoding larger values presumably do not fit --
+// confirm whether callers rely on that range.
+std::codecvt_base::result UTF8Codecvt::do_in(
+	std::mbstate_t&, const char * from,
+	const char * from_end, const char * & from_next,
+	wchar_t * to, wchar_t * to_end, wchar_t * & to_next) const
+{
+	// Basic algorithm:  The first octet determines how many
+	// octets total make up the UCS-4 character.  The remaining
+	// "continuing octets" all begin with "10". To convert, subtract
+	// the amount that specifies the number of octets from the first
+	// octet.  Subtract 0x80 (1000 0000) from each continuing octet,
+	// then mash the whole lot together.  Note that each continuing
+	// octet only uses 6 bits as unique values, so only shift by
+	// multiples of 6 to combine.
+	while ((from != from_end) && (to != to_end))
+	{
+		// Error checking on the first octet
+		if (invalid_leading_octet(*from))
+		{
+			from_next = from;
+			to_next = to;
+			return std::codecvt_base::error;
+		}
+
+		// The first octet is adjusted by a value dependent upon
+		// the number of "continuing octets" encoding the character
+		const int cont_octet_count = get_cont_octet_count(*from);
+		const wchar_t octet1_modifier_table[] = {0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+
+		// The unsigned char conversion is necessary in case char is
+		// signed (I learned this the hard way)
+		wchar_t ucs_result = (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count];
+
+		// Invariants:
+		//   1) At the start of the loop, 'i' continuing characters have been
+		//      processed
+		//   2) *from points to the next continuing character to be processed.
+		int i   = 0;
+		while ((i != cont_octet_count) && (from != from_end))
+		{
+			// Error checking on continuing characters
+			if (invalid_continuing_octet(*from))
+			{
+				from_next = from;
+				to_next = to;
+				return std::codecvt_base::error;
+			}
+
+			// Make room for the next 6 payload bits.
+			ucs_result *= (1 << 6); 
+
+			// each continuing character has an extra (10xxxxxx)b attached to
+			// it that must be removed.
+			ucs_result += (unsigned char)(*from++) - 0x80;
+			++i;
+		}
+
+		// If the buffer ends with an incomplete unicode character...
+		if ((from == from_end) && (i != cont_octet_count))
+		{
+			// rewind "from" to before the current character translation:
+			// the leading octet plus 'i' continuing octets were consumed.
+			from_next = from - (i+1); 
+			to_next = to;
+			return std::codecvt_base::partial;
+		}
+		*to++ = ucs_result;
+	}
+	from_next = from;
+	to_next = to;
+
+	// Were we done converting or did we run out of destination space?
+	if (from == from_end)
+		return std::codecvt_base::ok;
+
+	return std::codecvt_base::partial;
+}
+
+// Encode the wide characters in [from, from_end) as UTF-8 octets written to
+// [to, to_end).
+//
+// On return from_next / to_next tell the caller how far input and output got:
+//   ok      - every input character was encoded.
+//   partial - the output buffer filled up first.  A character whose octets did
+//             not all fit is not counted as consumed, and to_next is rewound
+//             to the position where its first octet had been written.
+//   error   - a character compared greater than the largest wchar_t value
+//             (kept as a range guard; from_next points at that character).
+// The mbstate_t argument is unused.
+std::codecvt_base::result UTF8Codecvt::do_out(
+	std::mbstate_t &, const wchar_t *   from,
+	const wchar_t * from_end, const wchar_t * & from_next,
+	char * to, char * to_end, char * & to_next) const
+{
+	// Marker bits of the lead octet, indexed by the number of continuing octets.
+	// RG - consider merging this table with the other one
+	const wchar_t octet1_modifier_table[] = {0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
+
+	// Largest representable wide character.  The function name is
+	// parenthesised so that a function-like max() macro cannot capture the call.
+	const wchar_t max_wchar = (std::numeric_limits<wchar_t>::max)();
+
+	while ((from != from_end) && (to != to_end))
+	{
+		// Check for invalid UCS-4 character
+		if (*from > max_wchar)
+		{
+			from_next = from;
+			to_next = to;
+			return std::codecvt_base::error;
+		}
+
+		const int cont_octet_count = get_cont_octet_out_count(*from);
+
+		// Every continuing octet carries 6 payload bits, so the lead octet
+		// holds whatever lies above bit (6 * cont_octet_count).
+		int shift_exponent = cont_octet_count * 6;
+
+		// Process the first character
+		*to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] + static_cast<unsigned char>(*from / (1 << shift_exponent)));
+
+		// Process the continuation characters
+		// Invariants: At the start of the loop:
+		//   1) 'i' continuing octets have been generated
+		//   2) '*to' points to the next location to place an octet
+		//   3) shift_exponent is 6 more than needed for the next octet
+		int i = 0;
+		while ((i != cont_octet_count) && (to != to_end))
+		{
+			shift_exponent -= 6;
+			*to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6)));
+			++i;
+		}
+		// If we filled up the out buffer before encoding the character
+		if ((to == to_end) && (i != cont_octet_count))
+		{
+			// Drop the lead octet and the 'i' continuing octets already written.
+			from_next = from;
+			to_next = to - (i+1);
+			return std::codecvt_base::partial;
+		}
+		++from;
+	}
+	from_next = from;
+	to_next = to;
+
+	// Were we done or did we run out of destination space
+	if (from == from_end)
+		return std::codecvt_base::ok;
+
+	return std::codecvt_base::partial;
+}
+
+// How many char objects can I process to get <= max_limit
+// wchar_t objects?
+//
+// Walks [from, from_end) one UTF-8 character at a time, using only the
+// leading octet of each character to determine its length, and stops when
+//   - max_limit characters have been measured,
+//   - the end of the buffer is reached, or
+//   - the next character would extend past from_end (incomplete character).
+// Returns the number of chars covered by the characters measured, i.e. the
+// distance from 'from' to the end of the last complete character counted.
+// The mbstate_t argument is unused.
+int UTF8Codecvt::do_length(
+	std::mbstate_t &, const char * from,
+	const char * from_end, std::size_t max_limit) const throw()
+{
+	const char * from_next = from;
+	for (std::size_t char_count = 0; (char_count < max_limit) && (from_next < from_end); ++char_count)
+	{
+		const unsigned int octet_count = get_octet_count(*from_next);
+
+		// The buffer may end with an incomplete character; do not count it.
+		if (octet_count > static_cast<std::size_t>(from_end - from_next))
+			break;
+
+		from_next += octet_count;
+	}
+	return static_cast<int>(from_next - from);
+}
+
+// Total number of octets (leading octet included) in the UTF-8 sequence that
+// starts with lead_octet.  Octets up to 0x7f stand alone; for anything else
+// the count follows from the leading octet's value range.  Values outside
+// 0xc0..0xfd trip the assertion; when assertions are compiled out they yield 6.
+unsigned int UTF8Codecvt::get_octet_count(unsigned char lead_octet)
+{
+	// MSB clear: single-octet character
+	if (lead_octet < 0x80)
+		return 1;
+
+	assert((0xc0 <= lead_octet) && (lead_octet <= 0xfd));
+
+	if (lead_octet < 0xc0)
+		return 6;	// only reachable when the assertion above is disabled
+	if (lead_octet < 0xe0)
+		return 2;
+	if (lead_octet < 0xf0)
+		return 3;
+	if (lead_octet < 0xf8)
+		return 4;
+	if (lead_octet < 0xfc)
+		return 5;
+	return 6;
+}
+
+namespace
+{
+
+	// Number of continuing octets needed to encode 'word' in UTF-8.
+	// Primary template: used for every wchar_t size that has no
+	// specialization below; it distinguishes up to two continuing octets.
+	template<std::size_t s>
+	int get_cont_octet_out_count_impl(wchar_t word)
+	{
+		return (word < 0x80) ? 0 : ((word < 0x800) ? 1 : 2);
+	}
+
+	// Specialization for a 4-byte wchar_t, distinguishing up to five
+	// continuing octets.
+	// Note: the following code will generate warnings on some platforms where
+	// wchar_t is defined as UCS2.  The warnings are superfluous as
+	// the specialization is never instantiated with such compilers.
+	template<>
+	int get_cont_octet_out_count_impl<4>(wchar_t word)
+	{
+		// upper_bounds[k] is the first value that needs more than k
+		// continuing octets.
+		const int upper_bounds[] = {0x80, 0x800, 0x10000, 0x200000, 0x4000000};
+
+		int cont_octets = 0;
+		while ((cont_octets != 5) && !(word < upper_bounds[cont_octets]))
+		{
+			++cont_octets;
+		}
+		return cont_octets;
+	}
+
+} // namespace anonymous
+
+// How many "continuing octets" will be needed for this word
+// ==   total octets - 1.
+int UTF8Codecvt::get_cont_octet_out_count(wchar_t word) const
+{
+	return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word);
+}

Added: trunk/src/types/utf8fstream.h
===================================================================
--- trunk/src/types/utf8fstream.h	                        (rev 0)
+++ trunk/src/types/utf8fstream.h	2007-10-09 17:59:56 UTC (rev 1043)
@@ -0,0 +1,155 @@
+/*
+ * this file is part of Startup Manager
+ * Copyright (C) 2004-2007 Glenn Van Loon, cnx...@us...
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#ifndef UTF8FSTREAM_H
+#define UTF8FSTREAM_H
+
+#include "tfstream.h"
+
+#include <locale>
+
+// The UTF8Codecvt class was based on the utf8_codecvt_facet boost class.
+// Copyright  2001 Ronald Garcia, Indiana University (ga...@os...)
+// Andrew Lumsdaine, Indiana University (lu...@os...). Permission to copy, 
+// use, modify, sell and distribute this software is granted provided this
+// copyright notice appears in all copies. This software is provided "as is"
+// without express or implied warranty, and with no claim as to its suitability
+// for any purpose.
+
+typedef std::codecvt<wchar_t, char, mbstate_t> NullCodecvtBase;
+// codecvt facet that converts between UTF-8 octets (external side) and
+// wchar_t (internal side).  do_in, do_out, do_length, get_octet_count and
+// get_cont_octet_out_count are only declared here.
+class UTF8Codecvt : public NullCodecvtBase
+{
+public:
+	// no_locale_manage is passed straight through to the base facet.
+	explicit UTF8Codecvt(std::size_t no_locale_manage=0)
+		: NullCodecvtBase(no_locale_manage)
+	{
+	}
+
+protected:
+	// UTF-8 -> wchar_t
+	virtual std::codecvt_base::result do_in(
+		std::mbstate_t& state, const char * from,
+		const char * from_end, const char * & from_next,
+		wchar_t * to, wchar_t * to_end, wchar_t * & to_next) const;
+
+	// wchar_t -> UTF-8
+	virtual std::codecvt_base::result do_out(
+		std::mbstate_t & state, const wchar_t * from,
+		const wchar_t * from_end, const wchar_t*  & from_next,
+		char * to, char * to_end, char * & to_next) const;
+
+	// A continuing octet is valid only in 0x80..0xbf, i.e. when its two
+	// top bits are "10".
+	bool invalid_continuing_octet(unsigned char octet_1) const
+	{
+		return (octet_1 & 0xc0) != 0x80;
+	}
+
+	// A leading octet is invalid when it looks like a continuing octet
+	// (0x80..0xbf) or lies above 0xfd.
+	bool invalid_leading_octet(unsigned char octet_1) const
+	{
+		if (octet_1 > 0xfd)
+			return true;
+		return (octet_1 & 0xc0) == 0x80;
+	}
+
+	// continuing octets = all octets of the sequence minus the leading one
+	static unsigned int get_cont_octet_count(unsigned char lead_octet)
+	{
+		const unsigned int total_octets = get_octet_count(lead_octet);
+		return total_octets - 1;
+	}
+
+	// Total number of octets announced by a leading octet.
+	static unsigned int get_octet_count(unsigned char lead_octet);
+
+	// Number of continuing octets (total octets - 1) needed to encode 'word'.
+	int get_cont_octet_out_count(wchar_t word) const;
+
+	// This facet always has conversion work to do.
+	virtual bool do_always_noconv() const throw() {return false;}
+
+	// UTF-8 isn't really stateful since we rewind on partial conversions,
+	// so there is nothing to write: report 'from' as the end position.
+	virtual std::codecvt_base::result do_unshift(
+		std::mbstate_t&, char * from, char * /*to*/, char * & next) const 
+	{
+		next = from;
+		return ok;
+	}
+
+	// 0 designates a variable-width external encoding.
+	virtual int do_encoding() const throw()
+	{
+		return 0;
+	}
+
+	// How many char objects can be processed to obtain at most max_limit
+	// wchar_t objects?
+	virtual int do_length(
+		std::mbstate_t &, const char * from, 
+		const char * from_end, std::size_t max_limit) const throw();
+
+	// Largest possible value do_length(state,from,from_end,1) could return:
+	// the longest UTF-8 encoding of a UCS-4 character is 6 octets.
+	virtual int do_max_length() const throw ()
+	{
+		return 6;
+	}
+
+private:
+	// Declared private to disable copying.
+	UTF8Codecvt(const UTF8Codecvt&);
+	UTF8Codecvt& operator=(const UTF8Codecvt&);
+};
+
+// Imbues outputFile with a locale built from std::locale() plus a freshly
+// allocated UTF8Codecvt facet.  The facet is created with its default
+// constructor argument (0) and handed to the std::locale constructor.
+// The macro expands to a braced block, so `loc` stays local to the expansion;
+// a trailing ';' after the macro call adds an empty statement, which matters
+// if the call is used as the unbraced body of an if/else.
+#define IMBUE_UTF8_CODECVT(outputFile) \
+{ \
+	std::locale loc(std::locale(), new UTF8Codecvt); \
+	(outputFile).imbue(loc); \
+}
+
+// Output file stream (tofstream) that equips itself with a UTF8Codecvt facet
+// whenever it is opened through this class.
+class utf8ofstream : public tofstream
+{
+private:
+	// Copying is disabled: both members are declared private and only declared.
+	utf8ofstream(const utf8ofstream &s);
+	utf8ofstream& operator=(const utf8ofstream &s);
+
+public:
+	// Creates a stream that is not yet associated with a file.
+	utf8ofstream() {}
+
+	// Creates the stream and immediately opens filename with the given mode.
+	utf8ofstream(const char * filename, std::ios_base::openmode mode = std::ios_base::out)
+	{
+		utf8ofstream::open(filename, mode);
+	}
+
+	virtual ~utf8ofstream() {}
+
+	// Imbues the facet, then forwards to the base-class open().
+	virtual void open(const char * filename, std::ios_base::openmode mode = std::ios_base::out)
+	{
+		std::locale facetLocale(std::locale(), new UTF8Codecvt);
+		this->imbue(facetLocale);
+		this->tofstream::open(filename, mode);
+	}
+};
+
+// Input file stream (tifstream) that equips itself with a UTF8Codecvt facet
+// whenever it is opened through this class.
+class utf8ifstream : public tifstream
+{
+private:
+	// Copying is disabled: both members are declared private and only declared.
+	utf8ifstream(const utf8ifstream &s);
+	utf8ifstream& operator=(const utf8ifstream &s);
+
+public:
+	// Creates a stream that is not yet associated with a file.
+	utf8ifstream() {}
+
+	// Creates the stream and immediately opens filename with the given mode.
+	utf8ifstream(const char * filename, std::ios_base::openmode mode = std::ios_base::in)
+	{
+		utf8ifstream::open(filename, mode);
+	}
+
+	virtual ~utf8ifstream() {}
+
+	// Imbues the facet, then forwards to the base-class open().
+	virtual void open(const char * filename, std::ios_base::openmode mode = std::ios_base::in)
+	{
+		std::locale facetLocale(std::locale(), new UTF8Codecvt);
+		this->imbue(facetLocale);
+		this->tifstream::open(filename, mode);
+	}
+};
+
+#endif //UTF8FSTREAM_H


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.