[St-m-svn] SF.net SVN: st-m: [1043] trunk/src
Brought to you by:
cnx_glenn
From: <cnx...@us...> - 2007-10-09 18:00:00
|
Revision: 1043 http://st-m.svn.sourceforge.net/st-m/?rev=1043&view=rev Author: cnx_glenn Date: 2007-10-09 10:59:56 -0700 (Tue, 09 Oct 2007) Log Message: ----------- Created new UTF8 and UTF16 fstream classes: - utf8ofstream and utf8ifstream (utf8fstream.h) - utf16ofstream and utf16ifstream (utf16fstream.h) Modified Paths: -------------- trunk/src/MainFrm.cpp trunk/src/Startup Manager_2003.vcproj trunk/src/types/tfstream.h Added Paths: ----------- trunk/src/types/utf16fstream.h trunk/src/types/utf8fstream.cpp trunk/src/types/utf8fstream.h Removed Paths: ------------- trunk/src/types/tfstream.cpp Modified: trunk/src/MainFrm.cpp =================================================================== --- trunk/src/MainFrm.cpp 2007-10-09 16:30:10 UTC (rev 1042) +++ trunk/src/MainFrm.cpp 2007-10-09 17:59:56 UTC (rev 1043) @@ -30,6 +30,7 @@ #include "aboutdlg.h" #include "dropdlg.h" #include "updatedlg.h" +#include "types\utf8fstream.h" #include <winuser.h> #include "lib/htmlhelp.h" @@ -415,7 +416,7 @@ void CMainFrame::ExportItemsAsHtmlVertical(tofstream &htmlfile) { - if(!htmlfile) + if(!htmlfile.is_open()) throw(CWinException(_T("ExportItemsAsHtmlVetical... htmlfile is not a valid file handle"))); tstring item = LoadString(STR_COL1); @@ -445,7 +446,7 @@ void CMainFrame::ExportItemsAsHtmlHorizontal(tofstream &htmlfile) { - if(!htmlfile) + if(!htmlfile.is_open()) throw(CWinException(_T("ExportItemsAsHtmlHorizontal... htmlfile is not a valid file handle"))); int iItems = ListView_GetItemCount(GetListView().GetHwnd()); @@ -467,14 +468,12 @@ try { #ifdef UNICODE - tofstream htmlfile; - IMBUE_UTF8_CODECVT(htmlfile); - htmlfile.open(toNarrowString(path).c_str(), std::ios::out | std::ios::binary); + utf8ofstream htmlfile(toNarrowString(path).c_str(), std::ios::out | std::ios::binary); #else tofstream htmlfile(path.c_str(), std::ios::out); #endif - if(htmlfile) + if (htmlfile.is_open()) { htmlfile << _T("<html>") << NEWLINE; htmlfile << _T("\t<head><title>Startup Programs List</title></head>") << NEWLINE; @@ -534,7 +533,7 @@ void CMainFrame::ExportItemsAsXml(tofstream &xmlfile) { - if(!xmlfile) + if(!xmlfile.is_open()) throw(CWinException(_T("ExportItemsAsXml... xmlfile is not a valid file handle"))); int iItems = ListView_GetItemCount(GetListView().GetHwnd()); @@ -555,7 +554,7 @@ try { tofstream xmlfile(toNarrowString(path).c_str(), std::ios::out); - if(xmlfile) + if(xmlfile.is_open()) { xmlfile << _T("<?xml version=\"1.0\" encoding=\"UTF-8\"?>") << std::endl; xmlfile << _T("<?xml-stylesheet type=\"text/xsl\" href=\"http://st-m.sourceforge.net/misc/st-m_items.xsl\"?>") << std::endl; Modified: trunk/src/Startup Manager_2003.vcproj =================================================================== --- trunk/src/Startup Manager_2003.vcproj 2007-10-09 16:30:10 UTC (rev 1042) +++ trunk/src/Startup Manager_2003.vcproj 2007-10-09 17:59:56 UTC (rev 1043) @@ -546,10 +546,10 @@ Name="Types" Filter=""> <File - RelativePath=".\types\tfstream.cpp"> + RelativePath=".\types\tstring.cpp"> </File> <File - RelativePath=".\types\tstring.cpp"> + RelativePath=".\types\utf8fstream.cpp"> </File> </Filter> </Filter> @@ -681,6 +681,12 @@ <File RelativePath=".\types\tstring.h"> </File> + <File + RelativePath=".\types\utf16fstream.h"> + </File> + <File + RelativePath=".\types\utf8fstream.h"> + </File> </Filter> </Filter> <Filter Deleted: trunk/src/types/tfstream.cpp =================================================================== --- trunk/src/types/tfstream.cpp 2007-10-09 16:30:10 UTC (rev 1042) +++ trunk/src/types/tfstream.cpp 2007-10-09 17:59:56 UTC (rev 1043) @@ -1,262 +0,0 @@ -/* - * this file is part of Startup Manager - * Copyright (C) 2004-2007 Glenn Van Loon, cnx...@us... - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - */ - -// The UTF8Codecvt class was based on the utf8_codecvt_facet boost class. -// Copyright 2001 Ronald Garcia, Indiana University (ga...@os...) -// Andrew Lumsdaine, Indiana University (lu...@os...). Permission to copy, -// use, modify, sell and distribute this software is granted provided this -// copyright notice appears in all copies. This software is provided "as is" -// without express or implied warranty, and with no claim as to its suitability -// for any purpose. - -#include "tfstream.h" - -#include <cassert> -#include <limits> - -// Translate incoming UTF-8 into UCS-4 -std::codecvt_base::result UTF8Codecvt::do_in( - std::mbstate_t&, const char * from, - const char * from_end, const char * & from_next, - wchar_t * to, wchar_t * to_end, wchar_t * & to_next) const -{ - // Basic algorithm: The first octet determines how many - // octets total make up the UCS-4 character. The remaining - // "continuing octets" all begin with "10". To convert, subtract - // the amount that specifies the number of octets from the first - // octet. Subtract 0x80 (1000 0000) from each continuing octet, - // then mash the whole lot together. Note that each continuing - // octet only uses 6 bits as unique values, so only shift by - // multiples of 6 to combine. - while ((from != from_end) && (to != to_end)) - { - // Error checking on the first octet - if (invalid_leading_octet(*from)) - { - from_next = from; - to_next = to; - return std::codecvt_base::error; - } - - // The first octet is adjusted by a value dependent upon - // the number of "continuing octets" encoding the character - const int cont_octet_count = get_cont_octet_count(*from); - const wchar_t octet1_modifier_table[] = {0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; - - // The unsigned char conversion is necessary in case char is - // signed (I learned this the hard way) - wchar_t ucs_result = (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count]; - - // Invariants : - // 1) At the start of the loop, 'i' continuing characters have been - // processed - // 2) *from points to the next continuing character to be processed. - int i = 0; - while ((i != cont_octet_count) && (from != from_end)) - { - // Error checking on continuing characters - if (invalid_continuing_octet(*from)) - { - from_next = from; - to_next = to; - return std::codecvt_base::error; - } - - ucs_result *= (1 << 6); - - // each continuing character has an extra (10xxxxxx)b attached to - // it that must be removed. - ucs_result += (unsigned char)(*from++) - 0x80; - ++i; - } - - // If the buffer ends with an incomplete unicode character... - if ((from == from_end) && (i != cont_octet_count)) - { - // rewind "from" to before the current character translation - from_next = from - (i+1); - to_next = to; - return std::codecvt_base::partial; - } - *to++ = ucs_result; - } - from_next = from; - to_next = to; - - // Were we done converting or did we run out of destination space? - if (from == from_end) - return std::codecvt_base::ok; - - return std::codecvt_base::partial; -} - -std::codecvt_base::result UTF8Codecvt::do_out( - std::mbstate_t &, const wchar_t * from, - const wchar_t * from_end, const wchar_t * & from_next, - char * to, char * to_end, char * & to_next) const -{ - // RG - consider merging this table with the other one - const wchar_t octet1_modifier_table[] = {0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; - - while ((from != from_end) && (to != to_end)) - { - // Check for invalid UCS-4 character - if (*from > reinterpret_cast<wchar_t>(std::numeric_limits<wchar_t>::max)) - { - from_next = from; - to_next = to; - return std::codecvt_base::error; - } - - int cont_octet_count = get_cont_octet_out_count(*from); - - // RG - comment this formula better - int shift_exponent = (cont_octet_count) * 6; - - // Process the first character - *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] + (unsigned char)(*from / (1 << shift_exponent))); - - // Process the continuation characters - // Invariants: At the start of the loop: - // 1) 'i' continuing octets have been generated - // 2) '*to' points to the next location to place an octet - // 3) shift_exponent is 6 more than needed for the next octet - int i = 0; - while ((i != cont_octet_count) && (to != to_end)) - { - shift_exponent -= 6; - *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6))); - ++i; - } - // If we filled up the out buffer before encoding the character - if ((to == to_end) && (i != cont_octet_count)) - { - from_next = from; - to_next = to - (i+1); - return std::codecvt_base::partial; - } - *from++; - } - from_next = from; - to_next = to; - - // Were we done or did we run out of destination space - if (from == from_end) - return std::codecvt_base::ok; - - return std::codecvt_base::partial; -} - -// How many char objects can I process to get <= max_limit -// wchar_t objects? -int UTF8Codecvt::do_length( - std::mbstate_t &, const char * from, - const char * from_end, std::size_t max_limit) const throw() -{ - // RG - this code is confusing! I need a better way to express it. - // and test cases. - - // Invariants: - // 1) last_octet_count has the size of the last measured character - // 2) char_count holds the number of characters shown to fit - // within the bounds so far (no greater than max_limit) - // 3) from_next points to the octet 'last_octet_count' before the - // last measured character. - int last_octet_count=0; - std::size_t char_count = 0; - const char* from_next = from; - // Use "<" because the buffer may represent incomplete characters - while ((from_next+last_octet_count <= from_end) && (char_count <= max_limit)) - { - from_next += last_octet_count; - last_octet_count = (get_octet_count(*from_next)); - ++char_count; - } - return static_cast<int>(from_next-from_end); -} - -unsigned int UTF8Codecvt::get_octet_count(unsigned char lead_octet) -{ - // if the 0-bit (MSB) is 0, then 1 character - if (lead_octet <= 0x7f) return 1; - - // Otherwise the count number of consecutive 1 bits starting at MSB - assert((0xc0 <= lead_octet) && (lead_octet <= 0xfd)); - - if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2; - else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3; - else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4; - else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5; - else return 6; -} - -namespace -{ - - template<std::size_t s> - int get_cont_octet_out_count_impl(wchar_t word) - { - if (word < 0x80) - { - return 0; - } - if (word < 0x800) - { - return 1; - } - return 2; - } - - // note the following code will generate on some platforms where - // wchar_t is defined as UCS2. The warnings are superfluous as - // the specialization is never instantitiated with such compilers. - template<> - int get_cont_octet_out_count_impl<4>(wchar_t word) - { - if (word < 0x80) - { - return 0; - } - if (word < 0x800) - { - return 1; - } - if (word < 0x10000) - { - return 2; - } - if (word < 0x200000) - { - return 3; - } - if (word < 0x4000000) - { - return 4; - } - return 5; - } - -} // namespace anonymous - -// How many "continuing octets" will be needed for this word -// == total octets - 1. -int UTF8Codecvt::get_cont_octet_out_count(wchar_t word) const -{ - return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word); -} Modified: trunk/src/types/tfstream.h =================================================================== --- trunk/src/types/tfstream.h 2007-10-09 16:30:10 UTC (rev 1042) +++ trunk/src/types/tfstream.h 2007-10-09 17:59:56 UTC (rev 1043) @@ -17,19 +17,10 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -// The UTF8Codecvt class was based on the utf8_codecvt_facet boost class. -// Copyright 2001 Ronald Garcia, Indiana University (ga...@os...) -// Andrew Lumsdaine, Indiana University (lu...@os...). Permission to copy, -// use, modify, sell and distribute this software is granted provided this -// copyright notice appears in all copies. This software is provided "as is" -// without express or implied warranty, and with no claim as to its suitability -// for any purpose. - #ifndef TFSTREAM_H #define TFSTREAM_H #include <fstream> -#include <locale> #ifdef UNICODE typedef std::wifstream tifstream; @@ -38,85 +29,5 @@ typedef std::ifstream tifstream; typedef std::ofstream tofstream; #endif - -typedef std::codecvt<wchar_t, char, mbstate_t> NullCodecvtBase; -class UTF8Codecvt : public NullCodecvtBase -{ -public: - explicit UTF8Codecvt(std::size_t no_locale_manage=0) - : std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage) - {} -protected: - virtual std::codecvt_base::result do_in( - std::mbstate_t& state, const char * from, - const char * from_end, const char * & from_next, - wchar_t * to, wchar_t * to_end, wchar_t * & to_next) const; - - virtual std::codecvt_base::result do_out( - std::mbstate_t & state, const wchar_t * from, - const wchar_t * from_end, const wchar_t* & from_next, - char * to, char * to_end, char * & to_next) const; - - bool invalid_continuing_octet(unsigned char octet_1) const - { - return ((octet_1 < 0x80) || (0xbf < octet_1)); - } - - bool invalid_leading_octet(unsigned char octet_1) const - { - return (((0x7f < octet_1) && (octet_1 < 0xc0)) || (octet_1 > 0xfd)); - } - - // continuing octets = octets except for the leading octet - static unsigned int get_cont_octet_count(unsigned char lead_octet) - { - return (get_octet_count(lead_octet) - 1); - } - - static unsigned int get_octet_count(unsigned char lead_octet); - - // How many "continuing octets" will be needed for this word - // == total octets - 1. - int get_cont_octet_out_count(wchar_t word) const; - - virtual bool do_always_noconv() const throw() {return false;} - - // UTF-8 isn't really stateful since we rewind on partial conversions - virtual std::codecvt_base::result do_unshift( - std::mbstate_t&, char * from, char * /*to*/, char * & next) const - { - next = from; - return ok; - } - - virtual int do_encoding() const throw() - { - const int variable_byte_external_encoding = 0; - return variable_byte_external_encoding; - } - - // How many char objects can I process to get <= max_limit - // wchar_t objects? - virtual int do_length( - std::mbstate_t &, const char * from, - const char * from_end, std::size_t max_limit) const throw(); - - // Largest possible value do_length(state,from,from_end,1) could return. - virtual int do_max_length() const throw () - { - return 6; // largest UTF-8 encoding of a UCS-4 character - } - -private: - UTF8Codecvt(const UTF8Codecvt&); - UTF8Codecvt& operator=(const UTF8Codecvt&); -}; - -#define IMBUE_UTF8_CODECVT(outputFile) \ -{ \ - std::locale loc(std::locale(), new UTF8Codecvt); \ - (outputFile).imbue(loc); \ -} - #endif Added: trunk/src/types/utf16fstream.h =================================================================== --- trunk/src/types/utf16fstream.h (rev 0) +++ trunk/src/types/utf16fstream.h 2007-10-09 17:59:56 UTC (rev 1043) @@ -0,0 +1,117 @@ +/* + * this file is part of Startup Manager + * Copyright (C) 2004-2007 Glenn Van Loon, cnx...@us... + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef UTF16FSTREAM_H +#define UTF16FSTREAM_H + +#include "tfstream.h" + +#include <locale> + +typedef std::codecvt<wchar_t, char, mbstate_t> NullCodecvtBase; +class UTF16Codecvt : public NullCodecvtBase +{ +public: + typedef wchar_t _E; + typedef char _To; + typedef mbstate_t _St; + + explicit UTF16Codecvt(size_t _R=0) : NullCodecvtBase(_R) {} + +protected: + virtual result do_in(_St&, const _To*, const _To*, const _To*&, _E*, _E*, _E*&) const + { + return noconv; + } + virtual result do_out(_St&, const _E*, const _E*, const _E*&, _To*, _E*, _To*&) const + { + return noconv; + } + virtual result do_unshift(_St&, _To*, _To*, _To*&) const + { + return noconv; + } + virtual int do_length(_St&, const _To* _F1, const _To* _L1, size_t _N2) const _THROW0() + { + return static_cast<int>((_N2 < (size_t)(_L1 - _F1)) ? _N2 : _L1 - _F1); + } + virtual bool do_always_noconv() const _THROW0() + { + return true; + } + virtual int do_max_length() const _THROW0() + { + return 2; + } + virtual int do_encoding() const _THROW0() + { + + +private: + UTF16Codecvt(const UTF16Codecvt&); + UTF16Codecvt& operator=(const UTF16Codecvt&); +}; + +#define IMBUE_UTF16_CODECVT(outputFile) \ +{ \ + std::locale loc(std::locale(), new UTF16Codecvt); \ + (outputFile).imbue(loc); \ +} + +class utf8ofstream : public tofstream +{ +public: + utf16ofstream() {} + utf16ofstream(const char * filename, std::ios_base::openmode mode = std::ios_base::out) + { + open(filename, mode); + } + virtual ~utf16ofstream() {} + virtual void open(const char * filename, std::ios_base::openmode mode = std::ios_base::out) + { + IMBUE_UTF16_CODECVT(*this); + tofstream::open(filename, mode); + } + +private: + utf16ofstream(const utf16ofstream &s); //prevent copy construction + utf16ofstream& operator=(const utf16ofstream &s); //prevent copy assignment +}; + +class utf16ifstream : public tifstream +{ +public: + utf16ifstream() {} + utf16ifstream(const char * filename, std::ios_base::openmode mode = std::ios_base::in) + { + open(filename, mode); + } + virtual ~utf16ifstream() {} + virtual void open(const char * filename, std::ios_base::openmode mode = std::ios_base::in) + { + IMBUE_UTF16_CODECVT(*this); + tifstream::open(filename, mode); + } + +private: + utf16ifstream(const utf16ifstream &s); //prevent copy construction + utf16ifstream& operator=(const utf16ifstream &s); //prevent copy assignment +}; + +#endif //UTF16FSTREAM_H Added: trunk/src/types/utf8fstream.cpp =================================================================== --- trunk/src/types/utf8fstream.cpp (rev 0) +++ trunk/src/types/utf8fstream.cpp 2007-10-09 17:59:56 UTC (rev 1043) @@ -0,0 +1,254 @@ +/* + * this file is part of Startup Manager + * Copyright (C) 2004-2007 Glenn Van Loon, cnx...@us... + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#include "utf8fstream.h" + +#include <cassert> +#include <limits> + +// Translate incoming UTF-8 into UCS-4 +std::codecvt_base::result UTF8Codecvt::do_in( + std::mbstate_t&, const char * from, + const char * from_end, const char * & from_next, + wchar_t * to, wchar_t * to_end, wchar_t * & to_next) const +{ + // Basic algorithm: The first octet determines how many + // octets total make up the UCS-4 character. The remaining + // "continuing octets" all begin with "10". To convert, subtract + // the amount that specifies the number of octets from the first + // octet. Subtract 0x80 (1000 0000) from each continuing octet, + // then mash the whole lot together. Note that each continuing + // octet only uses 6 bits as unique values, so only shift by + // multiples of 6 to combine. + while ((from != from_end) && (to != to_end)) + { + // Error checking on the first octet + if (invalid_leading_octet(*from)) + { + from_next = from; + to_next = to; + return std::codecvt_base::error; + } + + // The first octet is adjusted by a value dependent upon + // the number of "continuing octets" encoding the character + const int cont_octet_count = get_cont_octet_count(*from); + const wchar_t octet1_modifier_table[] = {0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; + + // The unsigned char conversion is necessary in case char is + // signed (I learned this the hard way) + wchar_t ucs_result = (unsigned char)(*from++) - octet1_modifier_table[cont_octet_count]; + + // Invariants : + // 1) At the start of the loop, 'i' continuing characters have been + // processed + // 2) *from points to the next continuing character to be processed. + int i = 0; + while ((i != cont_octet_count) && (from != from_end)) + { + // Error checking on continuing characters + if (invalid_continuing_octet(*from)) + { + from_next = from; + to_next = to; + return std::codecvt_base::error; + } + + ucs_result *= (1 << 6); + + // each continuing character has an extra (10xxxxxx)b attached to + // it that must be removed. + ucs_result += (unsigned char)(*from++) - 0x80; + ++i; + } + + // If the buffer ends with an incomplete unicode character... + if ((from == from_end) && (i != cont_octet_count)) + { + // rewind "from" to before the current character translation + from_next = from - (i+1); + to_next = to; + return std::codecvt_base::partial; + } + *to++ = ucs_result; + } + from_next = from; + to_next = to; + + // Were we done converting or did we run out of destination space? + if (from == from_end) + return std::codecvt_base::ok; + + return std::codecvt_base::partial; +} + +std::codecvt_base::result UTF8Codecvt::do_out( + std::mbstate_t &, const wchar_t * from, + const wchar_t * from_end, const wchar_t * & from_next, + char * to, char * to_end, char * & to_next) const +{ + // RG - consider merging this table with the other one + const wchar_t octet1_modifier_table[] = {0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc}; + + while ((from != from_end) && (to != to_end)) + { + // Check for invalid UCS-4 character + if (*from > reinterpret_cast<wchar_t>(std::numeric_limits<wchar_t>::max)) + { + from_next = from; + to_next = to; + return std::codecvt_base::error; + } + + int cont_octet_count = get_cont_octet_out_count(*from); + + // RG - comment this formula better + int shift_exponent = (cont_octet_count) * 6; + + // Process the first character + *to++ = static_cast<char>(octet1_modifier_table[cont_octet_count] + (unsigned char)(*from / (1 << shift_exponent))); + + // Process the continuation characters + // Invariants: At the start of the loop: + // 1) 'i' continuing octets have been generated + // 2) '*to' points to the next location to place an octet + // 3) shift_exponent is 6 more than needed for the next octet + int i = 0; + while ((i != cont_octet_count) && (to != to_end)) + { + shift_exponent -= 6; + *to++ = static_cast<char>(0x80 + ((*from / (1 << shift_exponent)) % (1 << 6))); + ++i; + } + // If we filled up the out buffer before encoding the character + if ((to == to_end) && (i != cont_octet_count)) + { + from_next = from; + to_next = to - (i+1); + return std::codecvt_base::partial; + } + *from++; + } + from_next = from; + to_next = to; + + // Were we done or did we run out of destination space + if (from == from_end) + return std::codecvt_base::ok; + + return std::codecvt_base::partial; +} + +// How many char objects can I process to get <= max_limit +// wchar_t objects? +int UTF8Codecvt::do_length( + std::mbstate_t &, const char * from, + const char * from_end, std::size_t max_limit) const throw() +{ + // RG - this code is confusing! I need a better way to express it. + // and test cases. + + // Invariants: + // 1) last_octet_count has the size of the last measured character + // 2) char_count holds the number of characters shown to fit + // within the bounds so far (no greater than max_limit) + // 3) from_next points to the octet 'last_octet_count' before the + // last measured character. + int last_octet_count=0; + std::size_t char_count = 0; + const char* from_next = from; + // Use "<" because the buffer may represent incomplete characters + while ((from_next+last_octet_count <= from_end) && (char_count <= max_limit)) + { + from_next += last_octet_count; + last_octet_count = (get_octet_count(*from_next)); + ++char_count; + } + return static_cast<int>(from_next-from_end); +} + +unsigned int UTF8Codecvt::get_octet_count(unsigned char lead_octet) +{ + // if the 0-bit (MSB) is 0, then 1 character + if (lead_octet <= 0x7f) return 1; + + // Otherwise the count number of consecutive 1 bits starting at MSB + assert((0xc0 <= lead_octet) && (lead_octet <= 0xfd)); + + if (0xc0 <= lead_octet && lead_octet <= 0xdf) return 2; + else if (0xe0 <= lead_octet && lead_octet <= 0xef) return 3; + else if (0xf0 <= lead_octet && lead_octet <= 0xf7) return 4; + else if (0xf8 <= lead_octet && lead_octet <= 0xfb) return 5; + else return 6; +} + +namespace +{ + + template<std::size_t s> + int get_cont_octet_out_count_impl(wchar_t word) + { + if (word < 0x80) + { + return 0; + } + if (word < 0x800) + { + return 1; + } + return 2; + } + + // note the following code will generate on some platforms where + // wchar_t is defined as UCS2. The warnings are superfluous as + // the specialization is never instantitiated with such compilers. + template<> + int get_cont_octet_out_count_impl<4>(wchar_t word) + { + if (word < 0x80) + { + return 0; + } + if (word < 0x800) + { + return 1; + } + if (word < 0x10000) + { + return 2; + } + if (word < 0x200000) + { + return 3; + } + if (word < 0x4000000) + { + return 4; + } + return 5; + } + +} // namespace anonymous + +// How many "continuing octets" will be needed for this word +// == total octets - 1. +int UTF8Codecvt::get_cont_octet_out_count(wchar_t word) const +{ + return get_cont_octet_out_count_impl<sizeof(wchar_t)>(word); +} Added: trunk/src/types/utf8fstream.h =================================================================== --- trunk/src/types/utf8fstream.h (rev 0) +++ trunk/src/types/utf8fstream.h 2007-10-09 17:59:56 UTC (rev 1043) @@ -0,0 +1,155 @@ +/* + * this file is part of Startup Manager + * Copyright (C) 2004-2007 Glenn Van Loon, cnx...@us... + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + */ + +#ifndef UTF8FSTREAM_H +#define UTF8FSTREAM_H + +#include "tfstream.h" + +#include <locale> + +// The UTF8Codecvt class was based on the utf8_codecvt_facet boost class. +// Copyright 2001 Ronald Garcia, Indiana University (ga...@os...) +// Andrew Lumsdaine, Indiana University (lu...@os...). Permission to copy, +// use, modify, sell and distribute this software is granted provided this +// copyright notice appears in all copies. This software is provided "as is" +// without express or implied warranty, and with no claim as to its suitability +// for any purpose. + +typedef std::codecvt<wchar_t, char, mbstate_t> NullCodecvtBase; +class UTF8Codecvt : public NullCodecvtBase +{ +public: + explicit UTF8Codecvt(std::size_t no_locale_manage=0) + : std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage) + {} + +protected: + virtual std::codecvt_base::result do_in( + std::mbstate_t& state, const char * from, + const char * from_end, const char * & from_next, + wchar_t * to, wchar_t * to_end, wchar_t * & to_next) const; + + virtual std::codecvt_base::result do_out( + std::mbstate_t & state, const wchar_t * from, + const wchar_t * from_end, const wchar_t* & from_next, + char * to, char * to_end, char * & to_next) const; + + bool invalid_continuing_octet(unsigned char octet_1) const + { + return ((octet_1 < 0x80) || (0xbf < octet_1)); + } + + bool invalid_leading_octet(unsigned char octet_1) const + { + return (((0x7f < octet_1) && (octet_1 < 0xc0)) || (octet_1 > 0xfd)); + } + + // continuing octets = octets except for the leading octet + static unsigned int get_cont_octet_count(unsigned char lead_octet) + { + return (get_octet_count(lead_octet) - 1); + } + + static unsigned int get_octet_count(unsigned char lead_octet); + + // How many "continuing octets" will be needed for this word + // == total octets - 1. + int get_cont_octet_out_count(wchar_t word) const; + + virtual bool do_always_noconv() const throw() {return false;} + + // UTF-8 isn't really stateful since we rewind on partial conversions + virtual std::codecvt_base::result do_unshift( + std::mbstate_t&, char * from, char * /*to*/, char * & next) const + { + next = from; + return ok; + } + + virtual int do_encoding() const throw() + { + const int variable_byte_external_encoding = 0; + return variable_byte_external_encoding; + } + + // How many char objects can I process to get <= max_limit + // wchar_t objects? + virtual int do_length( + std::mbstate_t &, const char * from, + const char * from_end, std::size_t max_limit) const throw(); + + // Largest possible value do_length(state,from,from_end,1) could return. + virtual int do_max_length() const throw () + { + return 6; // largest UTF-8 encoding of a UCS-4 character + } + +private: + UTF8Codecvt(const UTF8Codecvt&); + UTF8Codecvt& operator=(const UTF8Codecvt&); +}; + +#define IMBUE_UTF8_CODECVT(outputFile) \ +{ \ + std::locale loc(std::locale(), new UTF8Codecvt); \ + (outputFile).imbue(loc); \ +} + +class utf8ofstream : public tofstream +{ +public: + utf8ofstream() {} + utf8ofstream(const char * filename, std::ios_base::openmode mode = std::ios_base::out) + { + open(filename, mode); + } + virtual ~utf8ofstream() {} + virtual void open(const char * filename, std::ios_base::openmode mode = std::ios_base::out) + { + IMBUE_UTF8_CODECVT(*this); + tofstream::open(filename, mode); + } + +private: + utf8ofstream(const utf8ofstream &s); //prevent copy construction + utf8ofstream& operator=(const utf8ofstream &s); //prevent copy assignment +}; + +class utf8ifstream : public tifstream +{ +public: + utf8ifstream() {} + utf8ifstream(const char * filename, std::ios_base::openmode mode = std::ios_base::in) + { + open(filename, mode); + } + virtual ~utf8ifstream() {} + virtual void open(const char * filename, std::ios_base::openmode mode = std::ios_base::in) + { + IMBUE_UTF8_CODECVT(*this); + tifstream::open(filename, mode); + } + +private: + utf8ifstream(const utf8ifstream &s); //prevent copy construction + utf8ifstream& operator=(const utf8ifstream &s); //prevent copy assignment +}; + +#endif //UTF8FSTREAM_H This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |