From: <mee...@us...> - 2005-02-20 00:44:45
|
Update of /cvsroot/sc2/sc2/src/sc2code/libs/strings In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv12949/sc2code/libs/strings Modified Files: strings.c unicode.c Log Message: Unicode support. Index: strings.c =================================================================== RCS file: /cvsroot/sc2/sc2/src/sc2code/libs/strings/strings.c,v retrieving revision 1.3 retrieving revision 1.4 diff -C2 -d -r1.3 -r1.4 *** strings.c 18 Jan 2003 23:08:39 -0000 1.3 --- strings.c 20 Feb 2005 00:44:35 -0000 1.4 *************** *** 145,152 **** StringIndex = STRING_INDEX (String); LockStringTable (StringTable, &StringTablePtr); StringLength = (COUNT)( StringTablePtr->StringOffsets[StringIndex + 1] ! - StringTablePtr->StringOffsets[StringIndex] ! ); UnlockStringTable (StringTable); } --- 145,165 ---- StringIndex = STRING_INDEX (String); LockStringTable (StringTable, &StringTablePtr); + + { + STRINGPTR start; + STRINGPTR end; + + start = (STRINGPTR) ((BYTE *) StringTablePtr + + StringTablePtr->StringOffsets[StringIndex]); + end = (STRINGPTR) ((BYTE *) StringTablePtr + + StringTablePtr->StringOffsets[StringIndex + 1]); + StringLength = utf8StringCountN(start, end); + } + + #if 0 StringLength = (COUNT)( StringTablePtr->StringOffsets[StringIndex + 1] ! - StringTablePtr->StringOffsets[StringIndex]); ! #endif UnlockStringTable (StringTable); } Index: unicode.c =================================================================== RCS file: /cvsroot/sc2/sc2/src/sc2code/libs/strings/unicode.c,v retrieving revision 1.1 retrieving revision 1.2 diff -C2 -d -r1.1 -r1.2 *** unicode.c 5 Feb 2005 01:24:12 -0000 1.1 --- unicode.c 20 Feb 2005 00:44:35 -0000 1.2 *************** *** 18,25 **** #include <stdio.h> // Get one character from a UTF-8 encoded string. // *ptr will point to the start of the next character. // Returns 0 if the encoding is bad. This can be distinguished from the ! // '\0' character by checking whether **ptr == '\0'. 
wchar_t getCharFromString(const unsigned char **ptr) { --- 18,33 ---- #include <stdio.h> + // Resynchronise (skip every continuation byte, i.e. binary 10xxxxxx): + static inline void + resyncUTF8(const unsigned char **ptr) { + while ((**ptr & 0xc0) == 0x80) + *ptr++; + } + // Get one character from a UTF-8 encoded string. // *ptr will point to the start of the next character. // Returns 0 if the encoding is bad. This can be distinguished from the ! // '\0' character by checking whether **ptr == '\0' before calling this ! // function. wchar_t getCharFromString(const unsigned char **ptr) { *************** *** 107,116 **** // Resynchronise (skip every continuation byte, i.e. binary 10xxxxxx): ! while ((**ptr & 0xc0) == 0x80) ! *ptr++; return 0; } --- 115,245 ---- // Resynchronise (skip every continuation byte, i.e. binary 10xxxxxx): ! resyncUTF8(ptr); return 0; } + wchar_t + getCharFromStringN(const unsigned char **ptr, const unsigned char *end) { + int numBytes; + + if (**ptr < 0x80) { + numBytes = 1; + } else if ((**ptr & 0xe0) == 0xc0) { + numBytes = 2; + } else if ((**ptr & 0xf0) == 0xe0) { + numBytes = 3; + } else if ((**ptr & 0xf8) == 0xf0) { + numBytes = 4; + } else + goto err; + + if (*ptr + numBytes > end) + goto err; + + return getCharFromString(ptr); + + err: + *ptr = end; + return 0; + } + + // Get one line from a string. + // A line is terminated with either CRLF (DOS/Windows), + // LF (Unix, MacOS X), or CR (old MacOS). + // The end of the string is reached when **startNext == '\0'. + // NULL is returned if the string is not valid UTF8. In this case + // *end points to the first invalid character (or the character before if + // it was a LF), and *startNext to the start of the next (possibly invalid + // too) character. + unsigned char * + getLineFromString(const unsigned char *start, const unsigned char **end, + const unsigned char **startNext) { + const unsigned char *ptr = start; + const unsigned char *lastPtr; + wchar_t ch; + + // Search for the first newline. 
+ for (;;) { + if (*ptr == '\0') { + *end = ptr; + *startNext = ptr; + return (unsigned char *) start; + } + lastPtr = ptr; + ch = getCharFromString(&ptr); + if (ch == '\0') { + // Bad string + *end = lastPtr; + *startNext = ptr; + return NULL; + } + if (ch == '\n') { + *end = lastPtr; + if (*ptr == '\0'){ + // LF at the end of the string. + *startNext = ptr; + return (unsigned char *) start; + } + ch = getCharFromString(&ptr); + if (ch == '\0') { + // Bad string + return NULL; + } + if (ch == '\r') { + // LFCR + *startNext = ptr; + } else { + // LF + *startNext = *end; + } + return (unsigned char *) start; + } else if (ch == '\r') { + *end = lastPtr; + *startNext = ptr; + return (unsigned char *) start; + } // else: a normal character + } + } + + size_t + utf8StringCount(const unsigned char *start) { + size_t count = 0; + wchar_t ch; + + for (;;) { + ch = getCharFromString(&start); + if (ch == '\0') + return count; + count++; + } + } + + size_t + utf8StringCountN(const unsigned char *start, const unsigned char *end) { + size_t count = 0; + wchar_t ch; + + for (;;) { + ch = getCharFromStringN(&start, end); + if (ch == '\0') + return count; + count++; + } + } + + unsigned char * + skipUTF8Chars(const unsigned char *ptr, size_t num) { + wchar_t ch; + const unsigned char *oldPtr; + + while (num--) { + oldPtr = ptr; + ch = getCharFromString(&ptr); + if (ch == '\0') + return (unsigned char *) oldPtr; + } + return (unsigned char *) ptr; + } |