[Opentrep-svn] SF.net SVN: opentrep:[179] trunk/opentrep
Status: Beta
Brought to you by:
denis_arnaud
From: <den...@us...> - 2009-08-16 14:08:17
|
Revision: 179 http://opentrep.svn.sourceforge.net/opentrep/?rev=179&view=rev Author: denis_arnaud Date: 2009-08-16 14:08:09 +0000 (Sun, 16 Aug 2009) Log Message: ----------- [i18n] Added a utility class for conversion from/to UTF8 strings to/from wide-character strings. Modified Paths: -------------- trunk/opentrep/opentrep/basic/sources.mk trunk/opentrep/test/i18n/icu/Makefile.am trunk/opentrep/test/i18n/stdlocru.cpp trunk/opentrep/test/i18n/utf8/Makefile.am trunk/opentrep/test/i18n/utf8/utf8.cpp trunk/opentrep/test/i18n/utf8/utf8.hpp trunk/opentrep/test/i18n/utf8/utf8string.cpp Added Paths: ----------- trunk/opentrep/opentrep/basic/UTF8Handler.cpp trunk/opentrep/opentrep/basic/UTF8Handler.hpp Added: trunk/opentrep/opentrep/basic/UTF8Handler.cpp =================================================================== --- trunk/opentrep/opentrep/basic/UTF8Handler.cpp (rev 0) +++ trunk/opentrep/opentrep/basic/UTF8Handler.cpp 2009-08-16 14:08:09 UTC (rev 179) @@ -0,0 +1,183 @@ +// ////////////////////////////////////////////////////////////////////// +// Import section +// ////////////////////////////////////////////////////////////////////// +// STL +#include <cassert> +#include <sstream> +#include <string> +// OpenTrep +#include <opentrep/basic/UTF8Handler.hpp> + +namespace OPENTREP { + + // ////////////////////////////////////////////////////////////////////// + static const wchar_t offsetsFromUTF8[6] = { + 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL + }; + + // ////////////////////////////////////////////////////////////////////// + static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 + }; + + // ////////////////////////////////////////////////////////////////////// + std::wstring UTF8Handler::toWideString (const std::string& iSrc) { + std::basic_ostringstream<wchar_t> oStr; + + // Length of the source string + const size_t lStringSize = iSrc.size(); + + // Transform the source string in a regular C-string (char*) + const char* src = iSrc.c_str(); + + // + typedef unsigned char uchar_t; + + size_t idx = 0; + while (idx != lStringSize) { + + uchar_t lCurrentChar = static_cast<uchar_t> (src[idx]); + + // When there are multi-byte characters (e.g., for UTF-8 encoded + // STL strings), the size of the STL string corresponds to the + // total number of bytes. For instance, "München" has a size of 8 + // bytes (and not 7 characters). However, the iteration is made on + // the number of characters (idx); when the end of the string is + // reached, the loop must therefore be exited. 
+ if (lCurrentChar == '\0') { + break; + } + + const int nb = trailingBytesForUTF8[lCurrentChar]; + + wchar_t tmpChar = 0; + switch (nb) { + // These fall through deliberately + case 3: { + lCurrentChar = static_cast<uchar_t> (src[idx]); ++idx; + tmpChar += lCurrentChar; tmpChar <<= 6; + } + case 2: { + lCurrentChar = static_cast<uchar_t> (src[idx]); ++idx; + tmpChar += lCurrentChar; tmpChar <<= 6; + } + case 1: { + lCurrentChar = static_cast<uchar_t> (src[idx]); ++idx; + tmpChar += lCurrentChar; tmpChar <<= 6; + } + case 0: { + lCurrentChar = static_cast<uchar_t> (src[idx]); ++idx; + tmpChar += lCurrentChar; + } + } + + tmpChar -= offsetsFromUTF8[nb]; + oStr << tmpChar; + } + + oStr << '\0'; + return oStr.str(); + } + + // ////////////////////////////////////////////////////////////////////// + std::string UTF8Handler::toSimpleString (const std::wstring& iStr) { + std::ostringstream oStr; + + const wchar_t* src = iStr.c_str(); + size_t idx = 0; + size_t i = 0; + + while (src[i] != 0) { + wchar_t ch = src[i]; + + if (ch < 0x80) { + const char tmpChar = static_cast<const char> (ch); + oStr << tmpChar; ++idx; + + } else if (ch < 0x800) { + char tmpChar = static_cast<const char> ((ch >> 6) | 0xC0); + oStr << tmpChar; ++idx; + + tmpChar = static_cast<const char> ((ch & 0x3F) | 0x80); + oStr << tmpChar; ++idx; + + } else if (ch < 0x10000) { + char tmpChar = static_cast<const char> ((ch>>12) | 0xE0); + oStr << tmpChar; ++idx; + + tmpChar = static_cast<const char> (((ch>>6) & 0x3F) | 0x80); + oStr << tmpChar; ++idx; + + tmpChar = static_cast<const char> ((ch & 0x3F) | 0x80); + oStr << tmpChar; ++idx; + + } else if (ch < 0x110000) { + char tmpChar = static_cast<const char> ((ch>>18) | 0xF0); + oStr << tmpChar; ++idx; + + tmpChar = static_cast<const char> (((ch>>12) & 0x3F) | 0x80); + oStr << tmpChar; ++idx; + + tmpChar = static_cast<const char> (((ch>>6) & 0x3F) | 0x80); + oStr << tmpChar; ++idx; + + tmpChar = static_cast<const char> ((ch & 0x3F) | 0x80); + oStr << 
tmpChar; ++idx; + } + i++; + } + + oStr << '\0'; + + return oStr.str(); + } + + // ////////////////////////////////////////////////////////////////////// + std::string UTF8Handler::displayCharString (const char* iString) { + std::ostringstream oStr; + + bool hasReachedEnd = false; + for (size_t idx = 0; hasReachedEnd == false; ++idx) { + if (idx != 0) { + oStr << "; "; + } + const unsigned char lChar = iString[idx]; + // const wchar_t lChar = iString[idx]; + if (lChar == '\0') { + hasReachedEnd = true; + } + oStr << "[" << idx << "]: " << std::hex << lChar; + } + oStr << std::endl; + + return oStr.str(); + } + + // ////////////////////////////////////////////////////////////////////// + std::string UTF8Handler::displaySTLWString (const std::wstring& iString) { + std::ostringstream oStr; + + size_t idx = 0; + for (std::wstring::const_iterator itChar = iString.begin(); + itChar != iString.end(); ++itChar, ++idx) { + if (idx != 0) { + oStr << "; "; + } + const wchar_t lChar = *itChar; + oStr << "[" << idx << "]: " << std::hex << lChar; + } + oStr << std::endl; + + return oStr.str(); + } + +} + Added: trunk/opentrep/opentrep/basic/UTF8Handler.hpp =================================================================== --- trunk/opentrep/opentrep/basic/UTF8Handler.hpp (rev 0) +++ trunk/opentrep/opentrep/basic/UTF8Handler.hpp 2009-08-16 14:08:09 UTC (rev 179) @@ -0,0 +1,53 @@ +#ifndef __OPENTREP_BAS_UTF8HANDLER_HPP +#define __OPENTREP_BAS_UTF8HANDLER_HPP + +// ////////////////////////////////////////////////////////////////////// +// Import section +// ////////////////////////////////////////////////////////////////////// +// STL +#include <string> + +namespace OPENTREP { + + /** Utility class for basic handling of UTF-8 encoded strings. + <br>Most of the methods have taken their inspiration from Jeff + Bezanson's work in the Wikix project + (see http://meta.wikimedia.org/wiki/Wikix for further details), + and have been "C++-ified". 
*/ + class UTF8Handler { + public: + /* Conversion from a UTF-8-encoded "simple character" (though + potentially multi-byte) STL string into a wide character STL + string. + <br>Note that as there are no checks for appropriate encoding, it + only works for valid UTF-8, i.e. no 5- or 6-byte sequences. + <br>Note that the "simple characters", within an STL string, may be + multi-byte (e.g., if they are UTF-8-encoded). + @param std::string The "simple character" (though potentially + multi-byte) STL string. + @return std::wstring The wide character STL string. + */ + static std::wstring toWideString (const std::string& iSrc); + + /* Conversion from a wide character STL string into a UTF-8-encoded + "simple character" (though potentially multi-byte) STL string. + <br>Note that as there are no checks for appropriate encoding, it + only works for valid UTF-8, i.e. no 5- or 6-byte sequences. + <br>Note that the "simple characters", within an STL string, may be + multi-byte (e.g., if they are UTF-8-encoded). + @param std::wstring The wide character STL string. + @return std::string The "simple character" (though potentially + multi-byte) STL string. + */ + static std::string toSimpleString (const std::wstring& iStr); + + /** Display the sequence of characters for the simple C-string. */ + static std::string displayCharString (const char* iString); + + /** Display the sequence of characters (one by one) for the given + STL wide character string. 
*/ + static std::string displaySTLWString (const std::wstring& iString); + }; + +} +#endif // __OPENTREP_BAS_UTF8HANDLER_HPP Modified: trunk/opentrep/opentrep/basic/sources.mk =================================================================== --- trunk/opentrep/opentrep/basic/sources.mk 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/opentrep/basic/sources.mk 2009-08-16 14:08:09 UTC (rev 179) @@ -1,5 +1,7 @@ bas_h_sources = $(top_srcdir)/opentrep/basic/BasConst_General.hpp \ $(top_srcdir)/opentrep/basic/BasConst_OPENTREP_Service.hpp \ - $(top_srcdir)/opentrep/basic/BasChronometer.hpp + $(top_srcdir)/opentrep/basic/BasChronometer.hpp \ + $(top_srcdir)/opentrep/basic/UTF8Handler.hpp bas_cc_sources = $(top_srcdir)/opentrep/basic/BasConst.cpp \ - $(top_srcdir)/opentrep/basic/BasChronometer.cpp + $(top_srcdir)/opentrep/basic/BasChronometer.cpp \ + $(top_srcdir)/opentrep/basic/UTF8Handler.cpp Modified: trunk/opentrep/test/i18n/icu/Makefile.am =================================================================== --- trunk/opentrep/test/i18n/icu/Makefile.am 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/test/i18n/icu/Makefile.am 2009-08-16 14:08:09 UTC (rev 179) @@ -3,7 +3,7 @@ MAINTAINERCLEANFILES = Makefile.in -check_PROGRAMS = icufmt icuustring icucharsetdetector icuconv +check_PROGRAMS = icufmt icuustring icucharsetdetector icuconv icuutext icufmt_SOURCES = icufmt.cpp icufmt_CXXFLAGS = $(ICU_CFLAGS) @@ -21,4 +21,8 @@ icuconv_CXXFLAGS = $(ICU_CFLAGS) icuconv_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB) +icuutext_SOURCES = icuutext.cpp +icuutext_CXXFLAGS = $(ICU_CFLAGS) +icuutext_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB) + EXTRA_DIST = Modified: trunk/opentrep/test/i18n/stdlocru.cpp =================================================================== --- trunk/opentrep/test/i18n/stdlocru.cpp 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/test/i18n/stdlocru.cpp 2009-08-16 14:08:09 UTC (rev 179) @@ -59,7 +59,7 @@ std::cout << "de: " << mucDEWCharString << std::endl; 
std::cout << "ru: " << mucRUWCharString << std::endl; - // STL ctypes on char* + // STL ctypes on wchar_t std::use_facet<std::ctype<wchar_t> > (langLocale).toupper(mucDEWCharString, mucDEWCharString+7); std::use_facet<std::ctype<wchar_t> > (langLocale).toupper(mucRUWCharString, Modified: trunk/opentrep/test/i18n/utf8/Makefile.am =================================================================== --- trunk/opentrep/test/i18n/utf8/Makefile.am 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/test/i18n/utf8/Makefile.am 2009-08-16 14:08:09 UTC (rev 179) @@ -11,6 +11,8 @@ utf8string_SOURCES = utf8string.cpp utf8string_CXXFLAGS = -utf8string_LDFLAGS = +utf8string_LDFLAGS = \ + $(BOOST_LIBS) $(SOCI_LIBS) $(CPPUNIT_LIBS) \ + $(top_builddir)/@PACKAGE@/lib@PACKAGE@.la EXTRA_DIST = Modified: trunk/opentrep/test/i18n/utf8/utf8.cpp =================================================================== --- trunk/opentrep/test/i18n/utf8/utf8.cpp 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/test/i18n/utf8/utf8.cpp 2009-08-16 14:08:09 UTC (rev 179) @@ -55,10 +55,10 @@ for all the characters. if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space. */ -int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz) +int u8_toucs(u_int32_t *dest, int sz, const char *src, int srcsz) { u_int32_t ch; - char *src_end = src + srcsz; + const char* src_end = src + srcsz; int nb; int i=0; @@ -100,7 +100,7 @@ the NUL as well. the destination string will never be bigger than the source string. 
*/ -int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz) +int u8_toutf8(char *dest, int sz, const u_int32_t *src, int srcsz) { u_int32_t ch; int i = 0; Modified: trunk/opentrep/test/i18n/utf8/utf8.hpp =================================================================== --- trunk/opentrep/test/i18n/utf8/utf8.hpp 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/test/i18n/utf8/utf8.hpp 2009-08-16 14:08:09 UTC (rev 179) @@ -5,10 +5,10 @@ #define isutf(c) (((c)&0xC0)!=0x80) /* convert UTF-8 data to wide character */ -int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz); +int u8_toucs(u_int32_t *dest, int sz, const char *src, int srcsz); /* the opposite conversion */ -int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz); +int u8_toutf8(char *dest, int sz, const u_int32_t *src, int srcsz); /* single character to UTF-8 */ int u8_wc_toutf8(char *dest, u_int32_t ch); Modified: trunk/opentrep/test/i18n/utf8/utf8string.cpp =================================================================== --- trunk/opentrep/test/i18n/utf8/utf8string.cpp 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/test/i18n/utf8/utf8string.cpp 2009-08-16 14:08:09 UTC (rev 179) @@ -1,113 +1,43 @@ // STL #include <iostream> -#include <locale> -#include <string> -#include <cstring> +// OpenTrep +#include <opentrep/basic/UTF8Handler.hpp> -// /////////////////////////////////////////////// -void displayCharString (const char* iString) { - // Store current formatting flags of std::cout - std::ios::fmtflags oldFlags = std::cout.flags(); - - const size_t lLength = std::strlen (iString); - for (size_t idx = 0; idx != lLength; ++idx) { - if (idx != 0) { - std::cout << "; "; - } - const unsigned short lChar = iString[idx]; - // const wchar_t lChar = iString[idx]; - std::cout << "[" << idx << "]: " << std::hex << lChar; - } - std::cout << std::endl; - - // Reset formatting flags of std::cout - std::cout.flags (oldFlags); -} - -// /////////////////////////////////////////////// -void 
displayWCharString (const wchar_t* iString, const size_t iLength) { - // Store current formatting flags of std::cout - std::ios::fmtflags oldFlags = std::cout.flags(); - - for (size_t idx = 0; idx != iLength; ++idx) { - if (idx != 0) { - std::cout << "; "; - } - const wchar_t lChar = iString[idx]; - std::cout << "[" << idx << "]: " << std::hex << lChar; - } - std::cout << std::endl; - - // Reset formatting flags of std::cout - std::cout.flags (oldFlags); -} - -// /////////////////////////////////////////////// -void displaySTLString (const std::string& iString) { - // Store current formatting flags of std::cout - std::ios::fmtflags oldFlags = std::cout.flags(); - - unsigned short idx = 0; - for (std::string::const_iterator itChar = iString.begin(); - itChar != iString.end(); ++itChar, ++idx) { - if (idx != 0) { - std::cout << "; "; - } - const unsigned short lChar = *itChar; - // const char lChar = *itChar; - // const wchar_t lChar = *itChar; - std::cout << "[" << idx << "]: " << std::hex << lChar; - } - std::cout << std::endl; - - // Reset formatting flags of std::cout - std::cout.flags (oldFlags); -} - // //////////////////////// M A I N ///////////////////////// int main (int argc, char* argv[]) { - // Single char strings - const char mucDECharString[] = ("München"); - const char mucRUCharString[] = ("Мюнхен"); + // STL strings + std::string mucDESTLString ("München"); + std::string mucRUSTLString ("Мюнхен"); - std::cout << "--------" << std::endl << "Single char strings" << std::endl; - std::cout << "Deutsch ('" << mucDECharString << "'): " << std::endl; - displayCharString (mucDECharString); + std::cout << "--------" << std::endl + << "STL strings without processing" << std::endl; + std::cout << "Deutsch: '" << mucDESTLString << "'" << std::endl; + std::cout << "Russian: '" << mucRUSTLString << "'" << std::endl; - std::cout << "Russian ('" << mucRUCharString << "'): " << std::endl; - displayCharString (mucRUCharString); - - // Wide char strings - wchar_t 
mucDEWCharString[7]; - wchar_t mucRUWCharString[6]; - - // Conversion from char* to wchar_t thanks to the STL locale - std::locale lLocale; - std::use_facet<std::ctype<wchar_t> > (lLocale).widen (mucDECharString, - mucDECharString+7, - mucDEWCharString); - std::use_facet<std::ctype<wchar_t> > (lLocale).widen (mucRUCharString, - mucRUCharString+6, - mucRUWCharString); + // + std::wstring mucDESTLWString = + OPENTREP::UTF8Handler::toWideString (mucDESTLString); + std::wstring mucRUSTLWString = + OPENTREP::UTF8Handler::toWideString (mucRUSTLString); - std::cout << "--------" << std::endl << "Wide char strings" << std::endl; - std::cout << "Deutsch ('" << mucDEWCharString << "'): " << std::endl; - displayWCharString (mucDEWCharString, 7); + std::cout << "--------" << std::endl + << "UTF-8 decoded wide char strings" << std::endl; + std::cout << "Deutsch: " << std::endl; + // std::cout << "Deutsch: '" << mucDESTLWString << "'" << std::endl; + std::cout << OPENTREP::UTF8Handler::displaySTLWString (mucDESTLWString); - std::cout << "Russian ('" << mucRUWCharString << "'): " << std::endl; - displayWCharString (mucRUWCharString, 6); + std::cout << "Russian: " << std::endl; + // std::cout << "Russian: '" << mucRUSTLWString << "'" << std::endl; + std::cout << OPENTREP::UTF8Handler::displaySTLWString (mucRUSTLWString); - // STL strings - std::string mucDESTLString ("München"); - std::string mucRUSTLString ("Мюнхен"); - - std::cout << "--------" << std::endl << "STL strings" << std::endl; - std::cout << "Deutsch ('" << mucDESTLString << "'): " << std::endl; - displaySTLString (mucDESTLString); + mucDESTLString = OPENTREP::UTF8Handler::toSimpleString (mucDESTLWString); + mucRUSTLString = OPENTREP::UTF8Handler::toSimpleString (mucRUSTLWString); - std::cout << "Russian ('" << mucRUSTLString << "'): " << std::endl; - displaySTLString (mucRUSTLString); + std::cout << "--------" << std::endl + << "STL strings after processing" << std::endl; + std::cout << "Deutsch: '" << mucDESTLString 
<< "'" << std::endl; + std::cout << "Russian: '" << mucRUSTLString << "'" << std::endl; return 0; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |