[hoc-cvs] hoc/HOC/HOC Unicode.hs,NONE,1.1 StdArgumentTypes.hs,1.4,1.5

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/hoc/hoc/HOC/HOC
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv24465/HOC/HOC

Modified Files:
	StdArgumentTypes.hs 
Added Files:
	Unicode.hs 
Log Message:
Support arbitrary unicode for NSString <-> String conversions

--- NEW FILE: Unicode.hs ---
--------------------------------------------------------------------------------
--
--  This module has been copied from the HXML Toolbox library:
--    http://www.fh-wedel.de/~si/HXmlToolbox/
--  and is subject to the following terms:
--
--  The MIT License
--
--  Copyright (c) 2002 Uwe Schmidt, Martin Schmidt
--
--  Permission is hereby granted, free of charge, to any person obtaining a copy
--  of this software and associated documentation files (the "Software"),
--  to deal in the Software without restriction, including without limitation
--  the rights to use, copy, modify, merge, publish, distribute, sublicense,
--  and/or sell copies of the Software, and to permit persons to whom the
--  Software is furnished to do so, subject to the following conditions:
--
--  The above copyright notice and this permission notice shall be included in
--  all copies or substantial portions of the Software.
--
--  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
--  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
--  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
--  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
--  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
--  OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
--  OR OTHER DEALINGS IN THE SOFTWARE.
--
--------------------------------------------------------------------------------

--  Modified, May 2004, Graham Klyne:
--
--  - Added generic UTF-to-Unicode conversion function
--  - Return a null Unicode character when an invalid UTF-8 sequence is encountered

--  Modified, March 2006, Wolfgang Thaller;
--
--  - Brutally cut down for inclusion in HOC
--    all we need is Unicode <-> UTF8 conversion.
--  - Use String for unicode and [Word8] for utf-8,
--    because that's The Way It Should Be (according to me).


module HOC.Unicode
    (
     -- * Unicode Type declarations
     UTF8Char,
     UTF8String,

      -- * Unicode and UTF-8 conversions
      utf8ToUnicode,
      unicodeToUtf8
    )
where

import Data.Char( toUpper )
import Data.Word( Word8 )

-- ------------------------------------------------------------

-- | Unicode is represented as the Char type
--   Precondition for this is the support of Unicode character range
--   in the compiler (e.g. ghc but not hugs)
type Unicode    = Char

-- | the type for Unicode strings
type UString    = [Unicode]

type UTF8Char   = Word8

type UTF8String = [UTF8Char]

maxCharValue = fromEnum ('\x10FFFF')

-- ------------------------------------------------------------
--
-- Unicode predicates

-- |
-- test for leading multibyte UTF-8 character

isLeadingMultiByteChar  :: UTF8Char -> Bool
isLeadingMultiByteChar  c
    = c >= 0xC0 && c <= 0xFD

-- |
-- test for following multibyte UTF-8 character

isFollowingMultiByteChar        :: UTF8Char -> Bool
isFollowingMultiByteChar        c
    = c >= 0x80 && c < 0xC0

-- |
-- compute the number of following bytes and the mask bits of a leading UTF-8 multibyte char

isNByteChar     :: UTF8Char -> (Int, Int, Int)
isNByteChar c
    | c >= 0xc0 && c <= 0xdf = (1, 0xC0, 0x00000080)
    | c >= 0xe0 && c <= 0xef = (2, 0xE0, 0x00000800)
    | c >= 0xf0 && c <= 0xf7 = (3, 0xF0, 0x00010000)
    | c >= 0xf8 && c <= 0xfb = (4, 0xF8, 0x00200000)
    | c >= 0xfc && c <= 0xfd = (5, 0xFC, 0x04000000)
    | otherwise = (0,fromIntegral c,0)

-- ------------------------------------------------------------

-- |
-- conversion of a UTF-8 encoded string into a sequence of unicode values.
-- precondition: the string is a valid UTF-8 encoded string

utf8ToUnicode           :: UTF8String -> String
utf8ToUnicode (c : cs)
    | c < 0x80
        = toEnum (fromIntegral c) : utf8ToUnicode cs
    | isLeadingMultiByteChar c
      &&
      resAsInt <= maxCharValue
      &&
      l1 == length cs0
        = toEnum resAsInt
          : utf8ToUnicode cs1
    | otherwise
        = toEnum 0 : utf8ToUnicode cs1
          where
          (l1, mask, _min)      = isNByteChar c
          (cs0, cs1)            = splitAt l1 cs

          resAsInt              :: Int
          resAsInt              = utf8ToU (fromIntegral c - mask) cs0

          utf8ToU i []      = i
          utf8ToU i (c1:l)
              | isFollowingMultiByteChar c1
                  = utf8ToU (i * 0x40 + (fromIntegral c1 - 0x80)) l
              | otherwise
                  = 0 -- error ("utf8ToUnicode: illegal UTF-8 multibyte character " ++ show (c : cs0) )

utf8ToUnicode []
    = []

-- ------------------------------------------------------------

-- |
-- conversion from Unicode strings (UString) to UTF8 encoded strings.

unicodeToUtf8           :: String -> UTF8String
unicodeToUtf8           = concatMap unicodeCharToUtf8

-- |
-- conversion from Char to a UTF8 encoded string.

unicodeCharToUtf8       :: Char -> UTF8String
unicodeCharToUtf8 c
    | i >= 0          && i <= 0x0000007F        -- 1 byte UTF8 (7 bits)
        = [ fromIntegral i ]
    | i >= 0x00000080 && i <= 0x000007FF        -- 2 byte UTF8 (5 + 6 bits)
        = [ fromIntegral (0xC0 + i `div` 0x40)
          , fromIntegral (0x80 + i                  `mod` 0x40)
          ]
    | i >= 0x00000800 && i <= 0x0000FFFF        -- 3 byte UTF8 (4 + 6 + 6 bits)
        = [ fromIntegral (0xE0 +  i `div`   0x1000)
          , fromIntegral (0x80 + (i `div`     0x40) `mod` 0x40)
          , fromIntegral (0x80 +  i                 `mod` 0x40)
          ]
    | i >= 0x00010000 && i <= 0x001FFFFF        -- 4 byte UTF8 (3 + 6 + 6 + 6 bits)
        = [ fromIntegral (0xF0 +  i `div`    0x40000)
          , fromIntegral (0x80 + (i `div`     0x1000) `mod` 0x40)
          , fromIntegral (0x80 + (i `div`       0x40) `mod` 0x40)
          , fromIntegral (0x80 +  i                   `mod` 0x40)
          ]
    | i >= 0x00200000 && i <= 0x03FFFFFF        -- 5 byte UTF8 (2 + 6 + 6 + 6 + 6 bits)
        = [ fromIntegral (0xF8 +  i `div`  0x1000000)
          , fromIntegral (0x80 + (i `div`    0x40000) `mod` 0x40)
          , fromIntegral (0x80 + (i `div`     0x1000) `mod` 0x40)
          , fromIntegral (0x80 + (i `div`       0x40) `mod` 0x40)
          , fromIntegral (0x80 +  i                   `mod` 0x40)
          ]
    | i >= 0x04000000 && i <= 0x7FFFFFFF        -- 6 byte UTF8 (1 + 6 + 6 + 6 + 6 + 6 bits)
        = [ fromIntegral (0xFC +  i `div` 0x40000000)
          , fromIntegral (0x80 + (i `div`  0x1000000) `mod` 0x40)
          , fromIntegral (0x80 + (i `div`    0x40000) `mod` 0x40)
          , fromIntegral (0x80 + (i `div`     0x1000) `mod` 0x40)
          , fromIntegral (0x80 + (i `div`       0x40) `mod` 0x40)
          , fromIntegral (0x80 +  i                   `mod` 0x40)
          ]
    | otherwise                                 -- other values not supported
        = error ("unicodeCharToUtf8: illegal integer argument " ++ show i)
    where
    i = fromEnum c


Index: StdArgumentTypes.hs
===================================================================
RCS file: /cvsroot/hoc/hoc/HOC/HOC/StdArgumentTypes.hs,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- StdArgumentTypes.hs	26 Jul 2005 01:27:44 -0000	1.4
+++ StdArgumentTypes.hs	17 Mar 2006 05:32:04 -0000	1.5
@@ -1,3 +1,4 @@
+{-# OPTIONS -fallow-undecidable-instances #-}
 module HOC.StdArgumentTypes where
 
 import HOC.Base
@@ -10,6 +11,8 @@
 import Foreign.C.Types
 import Foreign.C.String
 
+import HOC.Unicode
+
 -- Objective C
 
 -- ID: already defined 
@@ -63,19 +66,23 @@
 
 -- String
 
+
 foreign import ccall safe "Marshalling.h nsStringToUTF8"
-    nsStringToUTF8 :: Ptr ObjCObject -> IO CString
+    nsStringToUTF8 :: Ptr ObjCObject -> IO (Ptr Word8)
 
 foreign import ccall unsafe "Marshalling.h utf8ToNSString"
-    utf8ToNSString :: CString -> IO (Ptr ObjCObject)
+    utf8ToNSString :: Ptr Word8 -> IO (Ptr ObjCObject)
+
+withUTF8String str = withArray0 0 (unicodeToUtf8 str)
 
 instance ObjCArgument String (Ptr ObjCObject) where
     withExportedArgument arg action =
-        bracket (withCString arg utf8ToNSString) releaseObject action
+        bracket (withUTF8String arg utf8ToNSString) releaseObject action
     exportArgument arg = do
-        nsstr <- withCString arg utf8ToNSString
+        nsstr <- withUTF8String arg utf8ToNSString
         autoreleaseObject nsstr
         return nsstr
-    importArgument arg = nsStringToUTF8 arg >>= peekCString
-    
+    importArgument arg = nsStringToUTF8 arg >>= peekArray0 0
+    					 >>= return . utf8ToUnicode
+   
     objCTypeString _ = "*"