[Opentrep-svn] SF.net SVN: opentrep:[179] trunk/opentrep

SourceForge Headquarters 225 Broadway Suite 1600 San Diego, CA 92101 +1 (858) 422-6466

Revision: 179
          http://opentrep.svn.sourceforge.net/opentrep/?rev=179&view=rev
Author:   denis_arnaud
Date:     2009-08-16 14:08:09 +0000 (Sun, 16 Aug 2009)

Log Message:
-----------
[i18n] Added a utility class for conversion from/to UTF8 strings to/from wide-character strings.

Modified Paths:
--------------
    trunk/opentrep/opentrep/basic/sources.mk
    trunk/opentrep/test/i18n/icu/Makefile.am
    trunk/opentrep/test/i18n/stdlocru.cpp
    trunk/opentrep/test/i18n/utf8/Makefile.am
    trunk/opentrep/test/i18n/utf8/utf8.cpp
    trunk/opentrep/test/i18n/utf8/utf8.hpp
    trunk/opentrep/test/i18n/utf8/utf8string.cpp

Added Paths:
-----------
    trunk/opentrep/opentrep/basic/UTF8Handler.cpp
    trunk/opentrep/opentrep/basic/UTF8Handler.hpp

Added: trunk/opentrep/opentrep/basic/UTF8Handler.cpp
===================================================================

--- trunk/opentrep/opentrep/basic/UTF8Handler.cpp	                        (rev 0)
+++ trunk/opentrep/opentrep/basic/UTF8Handler.cpp	2009-08-16 14:08:09 UTC (rev 179)
@@ -0,0 +1,183 @@
+// //////////////////////////////////////////////////////////////////////
+// Import section
+// //////////////////////////////////////////////////////////////////////
+// STL
+#include <cassert>
+#include <sstream>
+#include <string>
+// OpenTrep
+#include <opentrep/basic/UTF8Handler.hpp>
+
+namespace OPENTREP {
+
+  // //////////////////////////////////////////////////////////////////////
+  // Values subtracted from the raw accumulation of a UTF-8 sequence to
+  // strip the marker bits of its bytes, indexed by the number of trailing
+  // bytes. Unsigned long: the last two entries overflow a signed 32-bit
+  // wchar_t (a narrowing braced initialiser is ill-formed since C++11).
+  static const unsigned long offsetsFromUTF8[6] = {
+    0x00000000UL, 0x00003080UL, 0x000E2080UL,
+    0x03C82080UL, 0xFA082080UL, 0x82082080UL
+  };
+
+  // //////////////////////////////////////////////////////////////////////
+  // Number of bytes following a given lead byte within a UTF-8 sequence.
+  static const char trailingBytesForUTF8[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+  };
+
+  // //////////////////////////////////////////////////////////////////////
+  // Decode a UTF-8 byte string into a wide-character string, one element
+  // per code point. Decoding stops at the first NUL byte. Invalid lead
+  // bytes and truncated sequences yield U+FFFD instead of being read past
+  // the end of the input; the result carries no trailing NUL element.
+  std::wstring UTF8Handler::toWideString (const std::string& iSrc) {
+    // Unicode replacement character, emitted for undecodable input
+    const wchar_t lReplacementChar = static_cast<wchar_t> (0xFFFD);
+    // Total number of bytes (not of characters) of the source string
+    const size_t lStringSize = iSrc.size();
+
+    std::wstring oStr;
+    oStr.reserve (lStringSize);
+
+    size_t idx = 0;
+    while (idx < lStringSize) {
+      const unsigned char lLeadByte = static_cast<unsigned char> (iSrc[idx]);
+
+      // As before, the conversion stops at the first embedded NUL byte
+      if (lLeadByte == '\0') {
+        break;
+      }
+
+      const size_t nb = static_cast<size_t> (trailingBytesForUTF8[lLeadByte]);
+
+      // Lead bytes announcing 5- or 6-byte sequences are not valid UTF-8:
+      // replace them and resynchronise on the next byte
+      if (nb > 3) {
+        oStr += lReplacementChar;
+        ++idx;
+        continue;
+      }
+
+      // A sequence cut short by the end of the string cannot be decoded;
+      // never read past the end of the source
+      if (nb >= lStringSize - idx) {
+        oStr += lReplacementChar;
+        break;
+      }
+
+      // Accumulate the (nb + 1) bytes, six bits at a time, in an unsigned
+      // type wide enough whatever the size and signedness of wchar_t
+      unsigned long lCodePoint = 0;
+      for (size_t lByteIdx = 0; lByteIdx <= nb; ++lByteIdx, ++idx) {
+        lCodePoint <<= 6;
+        lCodePoint += static_cast<unsigned char> (iSrc[idx]);
+      }
+      lCodePoint -= offsetsFromUTF8[nb];
+      oStr += static_cast<wchar_t> (lCodePoint);
+    }
+
+    return oStr;
+  }
+
+  // //////////////////////////////////////////////////////////////////////
+  std::string UTF8Handler::toSimpleString (const std::wstring& iStr) {
+    // Encode each wide character of iStr as a UTF-8 byte sequence.
+    // Encoding stops at the first NUL wide character; values that are
+    // not Unicode code points (0x110000 and above) are skipped.
+    // No NUL byte is appended to the content of the result, as
+    // std::string maintains its own terminator: the size of the result
+    // is exactly the number of encoded bytes.
+    std::string oStr;
+    // At least one byte is needed per wide character
+    oStr.reserve (iStr.size());
+
+    for (std::wstring::const_iterator itChar = iStr.begin();
+         itChar != iStr.end(); ++itChar) {
+      // Unsigned copy: the comparisons and shifts below are then well
+      // defined whatever the size and signedness of wchar_t
+      const unsigned long ch = static_cast<unsigned long> (*itChar);
+
+      // Same stop condition as the historical C-string based loop; it
+      // also discards a NUL element stored at the end of the input
+      if (ch == 0) {
+        break;
+      }
+
+      if (ch < 0x80) {
+        // 1 byte: 0xxxxxxx
+        oStr += static_cast<char> (ch);
+
+      } else if (ch < 0x800) {
+        // 2 bytes: 110xxxxx 10xxxxxx
+        oStr += static_cast<char> ((ch >> 6) | 0xC0);
+        oStr += static_cast<char> ((ch & 0x3F) | 0x80);
+
+      } else if (ch < 0x10000) {
+        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
+        oStr += static_cast<char> ((ch >> 12) | 0xE0);
+        oStr += static_cast<char> (((ch >> 6) & 0x3F) | 0x80);
+        oStr += static_cast<char> ((ch & 0x3F) | 0x80);
+
+      } else if (ch < 0x110000) {
+        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        oStr += static_cast<char> ((ch >> 18) | 0xF0);
+        oStr += static_cast<char> (((ch >> 12) & 0x3F) | 0x80);
+        oStr += static_cast<char> (((ch >> 6) & 0x3F) | 0x80);
+        oStr += static_cast<char> ((ch & 0x3F) | 0x80);
+      }
+      // Anything at or above 0x110000 is not a Unicode code point and
+      // is skipped, as it has always been
+    }
+
+    return oStr;
+  }
+
+  // //////////////////////////////////////////////////////////////////////
+  std::string UTF8Handler::displayCharString (const char* iString) {
+    // Dump every byte of the C-string, terminating NUL included, as
+    // "[index]: hexadecimal value" entries separated by "; "
+    std::ostringstream oStr;
+    // A null pointer is displayed as an empty sequence
+    bool hasReachedEnd = (iString == 0);
+    for (size_t idx = 0; hasReachedEnd == false; ++idx) {
+      if (idx != 0) {
+        oStr << "; ";
+      }
+      // Print the byte as a number (an unsigned char would be inserted as
+      // a raw character); std::hex is sticky, so the index resets to dec
+      const unsigned int lByte = static_cast<unsigned char> (iString[idx]);
+      hasReachedEnd = (lByte == 0);
+      oStr << "[" << std::dec << idx << "]: " << std::hex << lByte;
+    }
+    oStr << std::endl;
+    return oStr.str();
+  }
+
+  // //////////////////////////////////////////////////////////////////////
+  std::string UTF8Handler::displaySTLWString (const std::wstring& iString) {
+    // Dump every element of the wide string as "[index]: hex value"
+    std::ostringstream oStr;
+    size_t idx = 0;
+    for (std::wstring::const_iterator itChar = iString.begin();
+         itChar != iString.end(); ++itChar, ++idx) {
+      // Explicit numeric value (wchar_t insertion into a narrow stream is
+      // deleted in C++20); std::hex is sticky, hence std::dec for the index
+      const unsigned long lValue =
+        static_cast<unsigned long> (*itChar) & 0xFFFFFFFFUL;
+      oStr << (idx != 0 ? "; " : "") << "[" << std::dec << idx << "]: "
+           << std::hex << lValue;
+    }
+    oStr << std::endl;
+    return oStr.str();
+  }
+
+}
+

Added: trunk/opentrep/opentrep/basic/UTF8Handler.hpp
===================================================================
--- trunk/opentrep/opentrep/basic/UTF8Handler.hpp	                        (rev 0)
+++ trunk/opentrep/opentrep/basic/UTF8Handler.hpp	2009-08-16 14:08:09 UTC (rev 179)
@@ -0,0 +1,53 @@
+#ifndef __OPENTREP_BAS_UTF8HANDLER_HPP
+#define __OPENTREP_BAS_UTF8HANDLER_HPP
+
+// //////////////////////////////////////////////////////////////////////
+// Import section
+// //////////////////////////////////////////////////////////////////////
+// STL
+#include <string>
+
+namespace OPENTREP {
+
+  /** Utility class for basic handling of UTF-8 encoded strings; it only
+      exposes static methods.
+      <br>Most of the methods have taken their inspiration from Jeff
+      Bezanson's work in the Wikix project (see
+      http://meta.wikimedia.org/wiki/Wikix), and have been "C++-ified". */
+  class UTF8Handler {
+  public:    
+    /** Conversion from a UTF-8-encoded "simple character" (though
+        potentially multi-byte) STL string into a wide character STL
+        string.
+        <br>Note that the input is expected to be valid UTF-8 (i.e.,
+        without 5- or 6-byte sequences); malformed input is not reported.
+        <br>Note that the "simple characters", within a STL string, may be
+        multi-byte (e.g., if they are UTF-8-encoded).
+        @param iSrc The "simple character" (though potentially
+               multi-byte) STL string, encoded in UTF-8.
+        @return std::wstring The wide character STL string.
+    */
+    static std::wstring toWideString (const std::string& iSrc);
+
+    /** Conversion from a wide character STL string into a UTF-8-encoded
+        "simple character" (though potentially multi-byte) STL string.
+        <br>Note that the wide characters are expected to be valid code
+        points: values of 0x110000 and above are not encoded.
+        <br>Note that the "simple characters", within a STL string, may be
+        multi-byte (e.g., if they are UTF-8-encoded).
+        @param iStr The wide character STL string.
+        @return std::string The "simple character" (though potentially
+                multi-byte) STL string, encoded in UTF-8.
+    */
+    static std::string toSimpleString (const std::wstring& iStr);
+
+    /** Display, one by one, the characters of the given simple C-string. */
+    static std::string displayCharString (const char* iString);
+
+    /** Display, one by one, the characters of the given STL wide
+        character string. */
+    static std::string displaySTLWString (const std::wstring& iString);
+  };
+
+}
+#endif // __OPENTREP_BAS_UTF8HANDLER_HPP

Modified: trunk/opentrep/opentrep/basic/sources.mk
===================================================================
--- trunk/opentrep/opentrep/basic/sources.mk	2009-08-15 18:24:37 UTC (rev 178)
+++ trunk/opentrep/opentrep/basic/sources.mk	2009-08-16 14:08:09 UTC (rev 179)
@@ -1,5 +1,7 @@
 bas_h_sources = $(top_srcdir)/opentrep/basic/BasConst_General.hpp \
 				$(top_srcdir)/opentrep/basic/BasConst_OPENTREP_Service.hpp \
-				$(top_srcdir)/opentrep/basic/BasChronometer.hpp
+				$(top_srcdir)/opentrep/basic/BasChronometer.hpp \
+				$(top_srcdir)/opentrep/basic/UTF8Handler.hpp
 bas_cc_sources = $(top_srcdir)/opentrep/basic/BasConst.cpp \
-				$(top_srcdir)/opentrep/basic/BasChronometer.cpp
+				$(top_srcdir)/opentrep/basic/BasChronometer.cpp \
+				$(top_srcdir)/opentrep/basic/UTF8Handler.cpp

Modified: trunk/opentrep/test/i18n/icu/Makefile.am
===================================================================
--- trunk/opentrep/test/i18n/icu/Makefile.am	2009-08-15 18:24:37 UTC (rev 178)
+++ trunk/opentrep/test/i18n/icu/Makefile.am	2009-08-16 14:08:09 UTC (rev 179)
@@ -3,7 +3,7 @@
 
 MAINTAINERCLEANFILES = Makefile.in
 
-check_PROGRAMS = icufmt icuustring icucharsetdetector icuconv
+check_PROGRAMS = icufmt icuustring icucharsetdetector icuconv icuutext
 
 icufmt_SOURCES = icufmt.cpp
 icufmt_CXXFLAGS = $(ICU_CFLAGS)
@@ -21,4 +21,8 @@
 icuconv_CXXFLAGS = $(ICU_CFLAGS)
 icuconv_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB)
 
+icuutext_SOURCES = icuutext.cpp
+icuutext_CXXFLAGS = $(ICU_CFLAGS)
+icuutext_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB)
+
 EXTRA_DIST =

Modified: trunk/opentrep/test/i18n/stdlocru.cpp
===================================================================
--- trunk/opentrep/test/i18n/stdlocru.cpp	2009-08-15 18:24:37 UTC (rev 178)
+++ trunk/opentrep/test/i18n/stdlocru.cpp	2009-08-16 14:08:09 UTC (rev 179)
@@ -59,7 +59,7 @@
   std::cout << "de: " << mucDEWCharString << std::endl;
   std::cout << "ru: " << mucRUWCharString << std::endl;
   
-  // STL ctypes on char*
+  // STL ctypes on wchar_t
   std::use_facet<std::ctype<wchar_t> > (langLocale).toupper(mucDEWCharString,
                                                             mucDEWCharString+7);
   std::use_facet<std::ctype<wchar_t> > (langLocale).toupper(mucRUWCharString,

Modified: trunk/opentrep/test/i18n/utf8/Makefile.am
===================================================================
--- trunk/opentrep/test/i18n/utf8/Makefile.am	2009-08-15 18:24:37 UTC (rev 178)
+++ trunk/opentrep/test/i18n/utf8/Makefile.am	2009-08-16 14:08:09 UTC (rev 179)
@@ -11,6 +11,8 @@
 
 utf8string_SOURCES = utf8string.cpp
 utf8string_CXXFLAGS =
-utf8string_LDFLAGS =
+utf8string_LDFLAGS = \
+		$(BOOST_LIBS) $(SOCI_LIBS) $(CPPUNIT_LIBS) \
+		$(top_builddir)/@PACKAGE@/lib@PACKAGE@.la
 
 EXTRA_DIST =

Modified: trunk/opentrep/test/i18n/utf8/utf8.cpp
===================================================================
--- trunk/opentrep/test/i18n/utf8/utf8.cpp	2009-08-15 18:24:37 UTC (rev 178)
+++ trunk/opentrep/test/i18n/utf8/utf8.cpp	2009-08-16 14:08:09 UTC (rev 179)
@@ -55,10 +55,10 @@
    for all the characters.
    if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
 */
-int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz)
+int u8_toucs(u_int32_t *dest, int sz, const char *src, int srcsz)
 {
     u_int32_t ch;
-    char *src_end = src + srcsz;
+    const char* src_end = src + srcsz;
     int nb;
     int i=0;
 
@@ -100,7 +100,7 @@
    the NUL as well.
    the destination string will never be bigger than the source string.
 */
-int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz)
+int u8_toutf8(char *dest, int sz, const u_int32_t *src, int srcsz)
 {
     u_int32_t ch;
     int i = 0;

Modified: trunk/opentrep/test/i18n/utf8/utf8.hpp
===================================================================
--- trunk/opentrep/test/i18n/utf8/utf8.hpp	2009-08-15 18:24:37 UTC (rev 178)
+++ trunk/opentrep/test/i18n/utf8/utf8.hpp	2009-08-16 14:08:09 UTC (rev 179)
@@ -5,10 +5,10 @@
 #define isutf(c) (((c)&0xC0)!=0x80)
 
 /* convert UTF-8 data to wide character */
-int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz);
+int u8_toucs(u_int32_t *dest, int sz, const char *src, int srcsz);
 
 /* the opposite conversion */
-int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz);
+int u8_toutf8(char *dest, int sz, const u_int32_t *src, int srcsz);
 
 /* single character to UTF-8 */
 int u8_wc_toutf8(char *dest, u_int32_t ch);

Modified: trunk/opentrep/test/i18n/utf8/utf8string.cpp
===================================================================
--- trunk/opentrep/test/i18n/utf8/utf8string.cpp	2009-08-15 18:24:37 UTC (rev 178)
+++ trunk/opentrep/test/i18n/utf8/utf8string.cpp	2009-08-16 14:08:09 UTC (rev 179)
@@ -1,113 +1,43 @@
 // STL
 #include <iostream>
-#include <locale>
-#include <string>
-#include <cstring>
+// OpenTrep
+#include <opentrep/basic/UTF8Handler.hpp>
 
-// ///////////////////////////////////////////////
-void displayCharString (const char* iString) {
-  // Store current formatting flags of std::cout
-  std::ios::fmtflags oldFlags = std::cout.flags();
-  
-  const size_t lLength = std::strlen (iString);
-  for (size_t idx = 0; idx != lLength; ++idx) {
-    if (idx != 0) {
-      std::cout << "; ";
-    }
-    const unsigned short lChar = iString[idx];
-    // const wchar_t lChar = iString[idx];
-    std::cout << "[" << idx << "]: " << std::hex << lChar;
-  }
-  std::cout << std::endl;
-  
-  // Reset formatting flags of std::cout
-  std::cout.flags (oldFlags);
-}
-
-// ///////////////////////////////////////////////
-void displayWCharString (const wchar_t* iString, const size_t iLength) {
-  // Store current formatting flags of std::cout
-  std::ios::fmtflags oldFlags = std::cout.flags();
-  
-  for (size_t idx = 0; idx != iLength; ++idx) {
-    if (idx != 0) {
-      std::cout << "; ";
-    }
-    const wchar_t lChar = iString[idx];
-    std::cout << "[" << idx << "]: " << std::hex << lChar;
-  }
-  std::cout << std::endl;
-
-  // Reset formatting flags of std::cout
-  std::cout.flags (oldFlags);
-}
-
-// ///////////////////////////////////////////////
-void displaySTLString (const std::string& iString) {
-  // Store current formatting flags of std::cout
-  std::ios::fmtflags oldFlags = std::cout.flags();
-  
-  unsigned short idx = 0;
-  for (std::string::const_iterator itChar = iString.begin();
-       itChar != iString.end(); ++itChar, ++idx) {
-    if (idx != 0) {
-      std::cout << "; ";
-    }
-    const unsigned short lChar = *itChar;
-    // const char lChar = *itChar;
-    // const wchar_t lChar = *itChar;
-    std::cout << "[" << idx << "]: " << std::hex << lChar;
-  }
-  std::cout << std::endl;
-
-  // Reset formatting flags of std::cout
-  std::cout.flags (oldFlags);
-}
-
 // //////////////////////// M A I N /////////////////////////
 int main (int argc, char* argv[]) {
 
-  // Single char strings
-  const char mucDECharString[] = ("München");
-  const char mucRUCharString[] = ("Мюнхен");
+  // STL strings
+  std::string mucDESTLString ("München");
+  std::string mucRUSTLString ("Мюнхен");
 
-  std::cout << "--------" << std::endl << "Single char strings" << std::endl;
-  std::cout << "Deutsch ('" << mucDECharString << "'): " << std::endl;
-  displayCharString (mucDECharString);
+  std::cout << "--------" << std::endl
+            << "STL strings without processing" << std::endl;
+  std::cout << "Deutsch: '" << mucDESTLString << "'" << std::endl;
+  std::cout << "Russian: '" << mucRUSTLString << "'" << std::endl;
   
-  std::cout << "Russian ('" << mucRUCharString << "'): " << std::endl;
-  displayCharString (mucRUCharString);
-
-  // Wide char strings
-  wchar_t mucDEWCharString[7];
-  wchar_t mucRUWCharString[6];
-
-  // Conversion from char* to wchar_t thanks to the STL locale
-  std::locale lLocale;
-  std::use_facet<std::ctype<wchar_t> > (lLocale).widen (mucDECharString,
-                                                        mucDECharString+7,
-                                                        mucDEWCharString);
-  std::use_facet<std::ctype<wchar_t> > (lLocale).widen (mucRUCharString,
-                                                        mucRUCharString+6,
-                                                        mucRUWCharString);
+  //
+  std::wstring mucDESTLWString =
+    OPENTREP::UTF8Handler::toWideString (mucDESTLString);
+  std::wstring mucRUSTLWString =
+    OPENTREP::UTF8Handler::toWideString (mucRUSTLString);
   
-  std::cout << "--------" << std::endl << "Wide char strings" << std::endl;
-  std::cout << "Deutsch ('" << mucDEWCharString << "'): " << std::endl;
-  displayWCharString (mucDEWCharString, 7);
+  std::cout << "--------" << std::endl
+            << "UTF-8 decoded wide char strings" << std::endl;
+  std::cout << "Deutsch: " << std::endl;
+  // std::cout << "Deutsch: '" << mucDESTLWString << "'" << std::endl;
+  std::cout << OPENTREP::UTF8Handler::displaySTLWString (mucDESTLWString);
   
-  std::cout << "Russian ('" << mucRUWCharString << "'): " << std::endl;
-  displayWCharString (mucRUWCharString, 6);
+  std::cout << "Russian: " << std::endl;
+  // std::cout << "Russian: '" << mucRUSTLWString << "'" << std::endl;
+  std::cout << OPENTREP::UTF8Handler::displaySTLWString (mucRUSTLWString);
 
-  // STL strings
-  std::string mucDESTLString ("München");
-  std::string mucRUSTLString ("Мюнхен");
-
-  std::cout << "--------" << std::endl << "STL strings" << std::endl;
-  std::cout << "Deutsch ('" << mucDESTLString << "'): " << std::endl;
-  displaySTLString (mucDESTLString);
+  mucDESTLString = OPENTREP::UTF8Handler::toSimpleString (mucDESTLWString);
+  mucRUSTLString = OPENTREP::UTF8Handler::toSimpleString (mucRUSTLWString);
   
-  std::cout << "Russian ('" << mucRUSTLString << "'): " << std::endl;
-  displaySTLString (mucRUSTLString);
+  std::cout << "--------" << std::endl
+            << "STL strings after processing" << std::endl;
+  std::cout << "Deutsch: '" << mucDESTLString << "'" << std::endl;
+  std::cout << "Russian: '" << mucRUSTLString << "'" << std::endl;
   
   return 0;
 }


This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site.