Update of /cvsroot/pclasses/pclasses2/src/Unicode In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv7861 Modified Files: String.cpp Char.cpp Makefile.am Removed Files: uctype.cpp unicodedata.awk unicodedata.cpp ustring.cpp genunicodedata.cpp unicodedata.h Log Message: - Dropping own Unicode implementation. We now use ICU --- genunicodedata.cpp DELETED --- --- unicodedata.cpp DELETED --- Index: Makefile.am =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/Makefile.am,v retrieving revision 1.9 retrieving revision 1.10 diff -u -d -r1.9 -r1.10 --- Makefile.am 6 May 2005 15:32:11 -0000 1.9 +++ Makefile.am 20 May 2005 14:14:39 -0000 1.10 @@ -1,11 +1,3 @@ -noinst_HEADERS = unicodedata.h - -UnicodeData.txt: - wget --passive-ftp http://www.unicode.org/Public/UNIDATA/UnicodeData.txt - -unicodedata_db.cpp unicodedata_casemap_db.cpp unicodedata_number_db.cpp: genunicodedata$(EXEEXT) UnicodeData.txt - $(top_builddir)/src/Unicode/genunicodedata$(EXEEXT) UnicodeData.txt - INCLUDES = -I$(top_srcdir)/include -I$(top_builddir)/include -I$(top_builddir)/src/Unicode $(all_includes) METASOURCES = AUTO @@ -13,15 +5,8 @@ CPPFLAGS = -DPUNICODE_BUILD -libpclasses_unicode_la_SOURCES = unicodedata.cpp unicodedata_db.cpp unicodedata_casemap_db.cpp \ - unicodedata_number_db.cpp unicodedata_decomp_db.cpp uctype.cpp ustring.cpp Char.cpp String.cpp TextStream.cpp +libpclasses_unicode_la_SOURCES = Char.cpp String.cpp TextStream.cpp libpclasses_unicode_la_LDFLAGS = -no-undefined -libpclasses_unicode_la_LIBADD = $(top_builddir)/src/libpclasses.la $(LIBICONV) - -noinst_PROGRAMS = genunicodedata -genunicodedata_SOURCES = genunicodedata.cpp - -CLEAN_FILES = UnicodeData.txt unicodedata_db.cpp unicodedata_casemap_db.cpp unicodedata_number_db.cpp \ - unicodedata_decomp_db.cpp +libpclasses_unicode_la_LIBADD = $(top_builddir)/src/libpclasses.la $(LIBICONV) -licuuc Index: Char.cpp 
=================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/Char.cpp,v retrieving revision 1.6 retrieving revision 1.7 diff -u -d -r1.6 -r1.7 --- Char.cpp 6 May 2005 15:32:11 -0000 1.6 +++ Char.cpp 20 May 2005 14:14:39 -0000 1.7 @@ -19,214 +19,182 @@ */ #include "pclasses/Unicode/Char.h" -#include "unicodedata.h" +#include <unicode/uchar.h> namespace P { namespace Unicode { -Char::Char(uchar_t ch) -: _char(ch) -{ } - -Char::Char(const Char& ch) -: _char(ch._char) +InvalidCharError::InvalidCharError(const char* what, const SourceInfo& si) throw() +: RuntimeError(what, si) { } -bool Char::isNumber() const +InvalidCharError::~InvalidCharError() throw() { - return isudigit(_char) == 1 ? true : false; } -bool Char::isSymbol() const -{ - bool ret = false; - switch(category()) - { - case Symbol_Math: - case Symbol_Currency: - case Symbol_Modifier: - case Symbol_Other: - ret = true; - default: - break; - } - - return ret; +Char::Char(uchar32_t ch) throw(InvalidCharError) +: _char(ch) +{ + if(ch < UCHAR_MIN_VALUE || ch > UCHAR_MAX_VALUE) + throw InvalidCharError("Invalid UNICODE character", P_SOURCEINFO); } -bool Char::isMark() const +Char::Char(const Char& ch) throw() +: _char(ch._char) { - bool ret = false; - - switch(category()) - { - case Mark_NonSpacing: - case Mark_SpacingCombining: - case Mark_Enclosing: - ret = true; - default: - break; - } +} - return ret; +bool Char::isDefined() const throw() +{ + return u_isdefined(_char); } -bool Char::isPunct() const +bool Char::isAlphabetic() const throw() { - bool ret = false; + return u_isUAlphabetic(_char); +} - switch(category()) - { - case Punctuation_Connector: - case Punctuation_Dash: - case Punctuation_Open: - case Punctuation_Close: - case Punctuation_InitialQuote: - case Punctuation_FinalQuote: - case Punctuation_Other: - ret = true; - default: - break; - } +bool Char::isAlphaNumeric() const throw() +{ + return u_isalnum(_char); +} - return ret; +bool 
Char::isDigit() const throw() +{ + return u_isdigit(_char); } -bool Char::isLetter() const +int Char::digitValue() const throw() { - return isualpha(_char) == 1 ? true : false; + return u_charDigitValue(_char); } -float Char::toNumber() const +bool Char::isNumeric() const throw() { - float ret = 0.0f; - const codePointData* data = lookupCodePoint(_char); + return u_getNumericValue(_char) != U_NO_NUMERIC_VALUE; +} - if(data) - { - const numberMappingData* numberData = lookupNumberMappingData(data); - if(numberData) - ret = numberData->num; - } +double Char::numericValue() const throw() +{ + return u_getNumericValue(_char); +} - return ret; +bool Char::isLower() const throw() +{ + return u_isULowercase(_char); } -bool Char::isLower() const +Char Char::toLower() const throw() { - return isulower(_char) == 1 ? true : false; + return u_tolower(_char); } -Char Char::toLower() const +bool Char::isUpper() const throw() { - return toulower(_char); + return u_isUUppercase(_char); } -bool Char::isUpper() const +Char Char::toUpper() const throw() { - return isuupper(_char) == 1 ? 
true : false; + return u_toupper(_char); } -Char Char::toUpper() const +bool Char::isTitle() const throw() { - return touupper(_char); + return u_istitle(_char); } -bool Char::isMirrored() const +Char Char::toTitle() const throw() { - const codePointData* data = lookupCodePoint(_char); - return data->mirrored; + return u_totitle(_char); } -Char::Category Char::category() const +bool Char::isMirrored() const throw() { - const codePointData* data = lookupCodePoint(_char); - return (Char::Category)data->category; + return u_isMirrored(_char); } -Char::BidiClass Char::bidiClass() const +Char Char::mirrorChar() const throw() { - const codePointData* data = lookupCodePoint(_char); - return (Char::BidiClass)data->bidi; + return u_charMirror(_char); } -Char::Decomposition Char::decompTag() const +bool Char::isWhiteSpace() const throw() { - const codePointData* data = lookupCodePoint(_char); - return (Char::Decomposition)data->decomp; + return u_isWhitespace(_char); } -Char::CombiningClass Char::combiningClass() const +bool Char::isIgnorable() const throw() { - const codePointData* data = lookupCodePoint(_char); - return (Char::CombiningClass)data->combining; + return u_hasBinaryProperty(_char, UCHAR_DEFAULT_IGNORABLE_CODE_POINT); } -Char::operator const uchar_t& () const throw() +Char::operator const uchar32_t& () const throw() { return _char; } -Char& Char::operator=(uchar_t ch) +Char& Char::operator=(uchar32_t ch) throw(InvalidCharError) { + if(ch < UCHAR_MIN_VALUE || ch > UCHAR_MAX_VALUE) + throw InvalidCharError("Invalid UNICODE character", P_SOURCEINFO); + _char = ch; return *this; } -Char& Char::operator=(const Char& ch) +Char& Char::operator=(const Char& ch) throw() { _char = ch._char; return *this; } -bool Char::operator==(const Char& ch) const +bool Char::operator==(const Char& ch) const throw() { return _char == ch._char; } -bool Char::operator!=(const Char& ch) const +bool Char::operator!=(const Char& ch) const throw() { return _char != ch._char; } -bool 
Char::operator<(const Char& ch) const +bool Char::operator<(const Char& ch) const throw() { - //@todo Char::operator< - return false; + return _char < ch._char; } -bool Char::operator>(const Char& ch) const +bool Char::operator>(const Char& ch) const throw() { - //@todo Char::operator> - return false; + return _char > ch._char; } -bool Char::operator<=(const Char& ch) const +bool Char::operator<=(const Char& ch) const throw() { return (operator<(ch) || operator==(ch)); } -bool Char::operator>=(const Char& ch) const +bool Char::operator>=(const Char& ch) const throw() { return (operator>(ch) || operator==(ch)); } -const Char& Char::eof() +const Char& Char::eof() throw() { - static Char _eof(UEOF); + static Char _eof(0); return _eof; } -const Char& Char::nl() +const Char& Char::nl() throw() { static Char _nl('\n'); return _nl; } -const Char& Char::cr() +const Char& Char::cr() throw() { static Char _cr('\r'); return _cr; --- uctype.cpp DELETED --- Index: String.cpp =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/String.cpp,v retrieving revision 1.6 retrieving revision 1.7 diff -u -d -r1.6 -r1.7 --- String.cpp 6 May 2005 15:32:11 -0000 1.6 +++ String.cpp 20 May 2005 14:14:39 -0000 1.7 @@ -23,60 +23,140 @@ #include <cstring> #include <sstream> -#include <iconv.h> -#include <errno.h> +#include <unicode/ustring.h> #include <langinfo.h> +#include <errno.h> namespace P { namespace Unicode { +struct String::Data { + size_t size; // size of str counted in uchar16_t + size_t length; // length of str counted in uchar16_t + uchar16_t str[1]; +}; + String::String() { } +String::String(size_t sz) +: _data(alloc(sz * 2)) +{ +} + String::String(const String& str) -: _str(str._str) +: _data(str._data) { } -String::String(const ustring& str) -: _str(str) +String::String(const String& str, size_t offset, size_t count/*= npos*/) { + if(!str._data) + return; + + if(count == npos) + count = str.size() - offset; + + 
_data = alloc(count * 2); + + size_t charOffset = 0; + U16_FWD_N(str._data->str, charOffset, str._data->length, offset); + + size_t charCount = charOffset; + U16_FWD_N(str._data->str, charCount, str._data->length, count); + charCount -= charOffset; + + u_strncpy(_data->str, str._data->str + charOffset, charCount); + _data->length = charCount; } String::String(const std::string& str) { - *this = fromLatin1(str.c_str(), str.size()); + *this = fromLocal(str.c_str(), str.size()); } String::String(const char* str, size_t count) { - *this = fromLatin1(str, count); + *this = fromLocal(str, count); } String::~String() throw() { } +String::Data* String::alloc(size_t size) +{ + Data* d = (Data*)new char[sizeof(Data) + sizeof(uchar16_t) * size]; + d->size = size; + d->length = 0; + d->str[0] = 0; + return d; +} + +void String::deepCopy() +{ + if(_data.useCount() > 1) + { + Data* newData = (Data*)new char[sizeof(Data) + sizeof(uchar16_t) * _data->size]; + newData->size = _data->size; + newData->length = _data->length; + u_strncpy(newData->str, _data->str, _data->length); + _data = newData; + } +} + void String::swap(String& b) { - std::swap(b._str, _str); + _data.swap(b._data); } bool String::empty() const throw() { - return _str.empty(); + if(_data) + return !_data->length; + + return true; } size_t String::size() const throw() { - return _str.size(); + if(_data) + return u_countChar32(_data->str, _data->length); + + return 0; } -String String::substr(size_t offset, size_t length) const +size_t String::capacity() const throw() { - return _str.substr(offset, length); + if(_data) + return _data->size; + + return 0; +} + +void String::resize(size_t sz) +{ + if(!sz) + { + _data = 0; + return; + } + + Data* newData = alloc(sz); + if(_data) + { + newData->length = _data->length < sz ? 
_data->length : sz; + u_strncpy(newData->str, _data->str, newData->length); + } + + _data = newData; +} + +String String::substr(size_t offset, size_t length/*=npos*/) const +{ + return String(*this, offset, length); } String String::left(size_t length) const @@ -86,137 +166,234 @@ String String::right(size_t length) const { - return substr(_str.size() - length, length); + return substr(size() - length, length); } String& String::append(const String& str) { - _str.append(str._str); + if(!_data || _data->length == 0) + { + _data = str._data; + return *this; + } + + if(!str._data || str._data->length == 0) + return *this; + + if(_data->size <= _data->length + str._data->length) + resize(_data->length + str._data->length); + else + deepCopy(); + + u_strncat(_data->str, str._data->str, str._data->length); return *this; } String& String::append(const Char& ch) { - _str.append(1, ch); + if(!_data) + _data = alloc(2); + else if(_data->size <= _data->length + 2) + resize(_data->length + 2); + else + deepCopy(); + + bool error = false; + U16_APPEND(_data->str, _data->length, _data->size, (const uchar32_t)ch, error); + return *this; } -size_t String::find(const String& str, size_t pos) const +#define getUCharOffset(data, pos, offset) U16_FWD_N(data->str, offset, data->length, pos) + +size_t String::find(const String& str, size_t pos/*=0*/) const { - return _str.find(str._str, pos); + if(!_data || !_data->length || !str._data || !str._data->length) + return npos; + + size_t offset = 0; + getUCharOffset(_data, pos, offset); + + UChar* res = u_strFindFirst(_data->str + offset, _data->length, + str._data->str, str._data->length); + + if(!res) + return npos; + + return u_countChar32(_data->str + offset, res - (_data->str + offset)); } -size_t String::find(const Char& ch, size_t pos) const +size_t String::find(const Char& ch, size_t pos/*=0*/) const { - return _str.find(ch, pos); + if(!_data || !_data->length) + return npos; + + size_t offset = 0; + getUCharOffset(_data, pos, 
offset); + + UChar* res = u_memchr32(_data->str + offset, (const uchar32_t&)ch, _data->length); + + if(!res) + return npos; + + return u_countChar32(_data->str + offset, res - (_data->str + offset)); } -size_t String::rfind(const String& str, size_t pos) const +size_t String::rfind(const String& str, size_t pos/*=npos*/) const { - return _str.rfind(str._str, pos); + if(!_data || !_data->length || !str._data || !str._data->length) + return npos; + + if(pos==npos) + pos = size(); + + size_t searchLen = 0; + getUCharOffset(_data, pos, searchLen); + + UChar* res = u_strFindLast(_data->str, searchLen, + str._data->str, str._data->length); + + if(!res) + return npos; + + return u_countChar32(_data->str, res - _data->str); } -size_t String::rfind(const Char& ch, size_t pos) const +size_t String::rfind(const Char& ch, size_t pos/*=npos*/) const { - return _str.rfind(ch, pos); + if(!_data || !_data->length) + return npos; + + if(pos==npos) + pos = size(); + + size_t searchLen = 0; + getUCharOffset(_data, pos, searchLen); + + UChar* res = u_memrchr32(_data->str, (const uchar32_t&)ch, searchLen); + + if(!res) + return npos; + + return u_countChar32(_data->str, res - _data->str); } -size_t String::find_first_of(const String& str, size_t pos) const +size_t String::find_first_of(const String& str, size_t pos/*=0*/) const { - return _str.find_first_of(str._str, pos); + return find(str, pos); } -size_t String::find_first_of(const Char& ch, size_t pos) const +size_t String::find_first_of(const Char& ch, size_t pos/*=0*/) const { - return _str.find_first_of(ch, pos); + return find(ch, pos); } size_t String::find_first_not_of(const String& str, size_t pos) const { - return _str.find_first_not_of(str._str, pos); + return 0; } size_t String::find_first_not_of(const Char& ch, size_t pos) const { - return _str.find_first_not_of(ch, pos); + return 0; } -size_t String::find_last_of(const String& str, size_t pos) const +size_t String::find_last_of(const String& str, size_t pos /*=npos*/) 
const { - return _str.find_last_of(str._str, pos); + return rfind(str, pos); } -size_t String::find_last_of(const Char& ch, size_t pos) const +size_t String::find_last_of(const Char& ch, size_t pos /*=npos*/) const { - return _str.find_last_of(ch, pos); + return rfind(ch, pos); } size_t String::find_last_not_of(const String& str, size_t pos) const { - return _str.find_last_not_of(str._str, pos); + return 0; } size_t String::find_last_not_of(const Char& ch, size_t pos) const { - return _str.find_last_not_of(ch, pos); + return 0; } -const Char& String::at(size_t pos) const +Char String::at(size_t pos) const { - return at(pos); + uchar32_t cp; + U16_GET(_data->str, 0, pos, _data->length, cp); + return Char(cp); } -Char& String::at(size_t pos) +int String::compare(const String& str) const throw() { - return at(pos); + if(!_data || _data->length == 0) + { + if(!str._data || _data->length == 0) + return 0; + + return +1; + } + else if(!str._data || _data->length == 0) + return -1; + + return u_strncmp(_data->str, str._data->str, + _data->length < str._data->length ? _data->length : str._data->length); } std::string String::local() const { - return toCharset(nl_langinfo(CODESET)); -} + if(!_data || _data->length == 0) + return std::string(); -std::string String::latin1() const -{ - return toCharset("LATIN1"); + char* ret = new char[_data->length]; + u_austrncpy(ret, _data->str, _data->length); + std::string retstr(ret); + delete[] ret; + return retstr; } std::string String::utf8() const { - return toCharset("UTF8"); -} + if(!_data || _data->length == 0) + return std::string(); -const Char& String::operator[](size_t pos) const -{ - return (Char&)_str.operator[](pos); -} + ::int32_t resultLen = 0; + UErrorCode errorCode = U_ZERO_ERROR; -Char& String::operator[](size_t pos) -{ - return (Char&)_str.operator[](pos); + // pre-flight .. get the required storage ... 
+ u_strToUTF8(0, 0, &resultLen, _data->str, _data->length, &errorCode); + + char* ret = new char[resultLen + 1]; + ::int32_t retLen = 0; + + errorCode = U_ZERO_ERROR; + u_strToUTF8(ret, resultLen + 1, &retLen, _data->str, _data->length, &errorCode); + std::string retstr(ret); + delete[] ret; + return retstr; } -String& String::operator=(const String& str) +Char String::operator[](size_t pos) const { - _str = str._str; - return *this; + return at(pos); } -String& String::operator=(const ustring& str) +String& String::operator=(const String& str) { - _str = str; + _data = str._data; return *this; } String& String::operator=(const char* str) { - *this = fromLatin1(str); + *this = fromLocal(str); return *this; } String& String::operator=(const std::string& str) { - *this = fromLatin1(str.c_str(), str.size()); + *this = fromLocal(str.c_str(), str.size()); return *this; } @@ -234,23 +411,22 @@ bool String::operator==(const String& str) const throw() { - return _str == str._str; + return compare(str) == 0; } bool String::operator!=(const String& str) const throw() { - return !operator==(str); - return false; + return compare(str) != 0; } bool String::operator<(const String& str) const throw() { - return _str < str._str; + return compare(str) < 0; } bool String::operator>(const String& str) const throw() { - return _str > str._str; + return compare(str) > 0; } bool String::operator<=(const String& str) const throw() @@ -268,15 +444,14 @@ if(count == npos) count = strlen(str); - return fromCharset(nl_langinfo(CODESET), str, count); -} + SharedPtr<Data> data(alloc(count + 1)); -String String::fromLatin1(const char* str, size_t count) -{ - if(count == npos) - count = strlen(str); + u_uastrncpy(data->str, str, count); + data->length = u_strlen(data->str); - return fromCharset("LATIN1", str, count); + String ret; + ret._data = data; + return ret; } String String::fromUtf8(const char* str, size_t count) @@ -294,85 +469,12 @@ String String::fromCharset(const char* charset, const 
char* str, size_t count) { - ustring outstr; - char outstrbuf[1024]; - - iconv_t icd = iconv_open("UCS4", charset); - - char* inbuf = (char*)str; - size_t inbytesleft = count; - - char* outbuf = (char*)outstrbuf; - size_t outbytesleft = sizeof(outstrbuf); - -_do_iconv: - size_t ret = iconv(icd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - - // get number of chars converted ... - unsigned int numchars = sizeof(outstrbuf) - outbytesleft; - unsigned int numuchars = numchars / sizeof(uchar_t); - - // append already converted chars to our string ... - outstr.append((uchar_t*)outstrbuf, numuchars); - - if(ret == (size_t)-1) - { - if(errno == E2BIG) - { - outbytesleft = sizeof(outstrbuf); - outbuf = outstrbuf; - goto _do_iconv; - } - else - { - //TODO: throw InvalidString - throw; - } - } - - iconv_close(icd); - return outstr; + return String(); } std::string String::toCharset(const char* charset) const { - std::string outstr; - char outstrbuf[1024]; - - iconv_t icd = iconv_open(charset, "UCS4"); - - char* inbuf = (char*)_str.c_str(); - size_t inbytesleft = _str.size() * sizeof(uchar_t); - - char* outbuf = (char*)outstrbuf; - size_t outbytesleft = sizeof(outstrbuf); - -_do_iconv: - size_t ret = iconv(icd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - - // get number of chars converted ... - unsigned int numchars = sizeof(outstrbuf) - outbytesleft; - - // append already converted chars to our string ... 
- outstr.append(outstrbuf, numchars); - - if(ret == (size_t)-1) - { - if(errno == E2BIG) - { - outbytesleft = sizeof(outstrbuf); - outbuf = outstrbuf; - goto _do_iconv; - } - else - { - //TODO: throw InvalidString - throw; - } - } - - iconv_close(icd); - return outstr; + return std::string(); } String operator+(const String& lhs, const String& rhs) @@ -387,7 +489,7 @@ std::ostream& operator<<(std::ostream& os, const String& str) { - os << str.utf8(); + os << str.local(); return os; } --- unicodedata.h DELETED --- --- ustring.cpp DELETED --- --- unicodedata.awk DELETED --- |