From: Christian P. <cp...@us...> - 2005-04-26 12:16:21
|
Update of /cvsroot/pclasses/pclasses2/src/Unicode In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26729/src/Unicode Modified Files: Char.cpp Makefile.am String.cpp uctype.cpp ustring.cpp Log Message: - More work on unicode support Index: String.cpp =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/String.cpp,v retrieving revision 1.3 retrieving revision 1.4 diff -u -d -r1.3 -r1.4 --- String.cpp 27 Dec 2004 07:00:31 -0000 1.3 +++ String.cpp 26 Apr 2005 12:16:02 -0000 1.4 @@ -1,5 +1,5 @@ /*************************************************************************** - * Copyright (C) 2004 by Christian Prochnow * + * Copyright (C) 2004,2005 by Christian Prochnow, SecuLogiX GmbH * * cp...@se... * * * * This program is free software; you can redistribute it and/or modify * @@ -19,50 +19,40 @@ ***************************************************************************/ #include "pclasses/Unicode/String.h" -#include "pclasses/ScopedPtr.h" -#include "pclasses/ScopedArrayPtr.h" -#include "pclasses/Algorithm.h" #include <cstring> #include <sstream> +#include <iconv.h> +#include <errno.h> +#include <langinfo.h> + namespace P { namespace Unicode { -String::String(size_t reserve) throw(OutOfMemory) -: _offset(0), _length(0) -{ - ScopedPtr<Data> d(new Data); - d->str = (Char*)new char[sizeof(Char) * reserve]; - d->size = reserve; - _data = d.release(); -} - -String::String(const String& str) throw(OutOfMemory) -: _offset(str._offset), _length(str._length), _data(str._data) +String::String() { } -String::String(const char* str, size_t count) throw(OutOfMemory) +String::String(const String& str) +: _str(str._str) { - *this = fromLatin1(str, count); } -String::String(const wchar_t* str, size_t count) throw(OutOfMemory) +String::String(const ustring& str) +: _str(str) { - *this = fromUcs2(str, count); } -String::String(const String& str, size_t offset, size_t length) - throw(OutOfMemory) -: _offset(str._offset + offset), _length(length), _data(str._data) +String::String(const std::string& str) { + *this = fromLatin1(str.c_str(), str.size()); } -String::String(const std::string& str) +String::String(const char* str, size_t count) { - *this = fromLatin1(str.c_str()); + *this = fromLatin1(str, count); } String::~String() throw() @@ -71,150 +61,89 @@ void String::swap(String& b) { - SharedPtr<Data> tmp = _data; - _data = b._data; - b._data = tmp; + std::swap(b._str, _str); } size_t String::size() const throw() { - return _data->size; -} - -void String::resize(size_t sz) throw(OutOfMemory) -{ - resize(sz, 0, 0); + return _str.size(); } -void String::resize(size_t sz, size_t holeOffset, size_t holeLen) throw(OutOfMemory) +String String::substr(size_t offset, size_t length) const + throw(OutOfBounds) { - SharedPtr<Data> newData(new Data); - char* newStr = 0; - size_t newLength = 0; - - size_t length = sz >= _length ? _length : sz; - - if(holeLen == 0 || holeOffset >= length) - { - size_t reserve = sz - length; - - newStr = new char[sizeof(Char) * sz]; - copy_construct((Char*)newStr, data(), length); - construct((Char*)newStr + length, reserve); - - newLength = length; - } - else - { - size_t len1 = holeOffset - length; - size_t off2 = holeOffset; - size_t len2 = length - off2; - - newStr = new char[sizeof(Char) * sz]; - copy_construct((Char*)newStr, data(), len1); - construct((Char*)newStr + holeOffset, holeLen); - copy_construct((Char*)newStr + off2, data(), len2); - - newLength = length + holeLen; - } + if(offset + length > _str.size()) + throw OutOfBounds("Offset and/or length is out of bounds", P_SOURCEINFO); - newData->str = (Char*)newStr; - newData->size = sz; - _data = newData; - _offset = 0; - _length = newLength; + return _str.substr(offset, length); } -size_t String::length() const throw() +String String::left(size_t length) const throw(OutOfBounds) { - return _length; + return substr(0, length); } -std::string String::utf8() const +String String::right(size_t length) const throw(OutOfBounds) { - std::ostringstream os; - ConstIterator i = begin(); - while(i != end()) - { - os << (*i).latin1(); - ++i; - } - - return os.str(); + return substr(_str.size() - length, length); } -String String::part(size_t offset, size_t length) const - throw(OutOfMemory, OutOfBounds) +String& String::append(const String& str) { - if(offset + length > _length) - throw OutOfBounds("Offset and/or length is out of bounds", P_SOURCEINFO); - - return String(*this, offset, length); + _str.append(str._str); + return *this; } -String String::left(size_t length) const throw(OutOfMemory, OutOfBounds) +const Char& String::at(size_t pos) const throw(OutOfBounds) { - return part(0, length); + return operator[](pos); } -String String::right(size_t length) const throw(OutOfMemory, OutOfBounds) +Char& String::at(size_t pos) throw(OutOfBounds) { - return part(_length - length, length); + return operator[](pos); } -String String::deepCopy() const +std::string String::local() const { - String str(_length + 8); - copy_construct(str.data(), data(), _length); - return str; + return toCharset(nl_langinfo(CODESET)); } -const Char& String::at(size_t pos) const throw(OutOfBounds) +std::string String::latin1() const { - if(_offset + pos >= _length) - throw OutOfBounds("Index is out of bounds", P_SOURCEINFO); - - return *(data() + pos); + return toCharset("LATIN1"); } -Char& String::at(size_t pos) throw(OutOfMemory, OutOfBounds) +std::string String::utf8() const { - return operator[](pos); + return toCharset("UTF8"); } -Char& String::operator[](ptrdiff_t pos) throw(OutOfMemory, OutOfBounds) +const Char& String::operator[](size_t pos) const throw(OutOfBounds) { - if( static_cast<size_t>( pos ) - /** ACHTUNG ^^^^ cast added by stephan to avoid warning. - @Christian: how do you want to handle this case? - */ - >= _length) + if(pos >= _str.size()) throw OutOfBounds("Index is out of bounds", P_SOURCEINFO); - if(_data.useCount() > 1) - *this = deepCopy(); + return (Char&)_str.operator[](pos); +} - // construct Char's that were not yet constructed, but allocated - /*if(pos >= _length) - size_t ccount = pos - _length; - construct(_data->str + _length, ccount); - _length += pos + 1; - }*/ +Char& String::operator[](size_t pos) throw(OutOfBounds) +{ + if(pos >= _str.size()) + throw OutOfBounds("Index is out of bounds", P_SOURCEINFO); - return *(_data->str + _offset + pos); + return (Char&)_str.operator[](pos); } String& String::operator=(const String& str) { - if(this != &str) - { - if(_data != str._data) - _data = str._data; - - _offset = str._offset; - _length = str._length; - } + _str = str._str; + return *this; +} +String& String::operator=(const ustring& str) +{ + _str = str; return *this; } @@ -224,59 +153,39 @@ return *this; } -String& String::operator=(const wchar_t* str) +String& String::operator=(const std::string& str) { - *this = fromUcs2(str); + *this = fromLatin1(str.c_str(), str.size()); return *this; } -String& String::operator=(const std::string& str) +String& String::operator+=(const String& str) { - *this = fromLatin1(str.c_str()); + append(str); return *this; } bool String::operator==(const String& str) const throw() { - if(&str == this || (str.data() == data() && str._length == _length)) - return true; - - if(str._length == _length) - { - Char* lhsBegin = data(); - Char* lhsEnd = data() + _length; - Char* rhsBegin = str.data(); - - while(lhsBegin != lhsEnd) - { - if(*lhsBegin != *rhsBegin) - return false; - - ++lhsBegin, ++rhsBegin; - } - - return true; - } - - return false; + return _str == str._str; } bool String::operator!=(const String& str) const throw() { return !operator==(str); - return false; + return false; } bool String::operator<(const String& str) const throw() { //@todo String::operator< - return false; + return false; } bool String::operator>(const String& str) const throw() { //@todo String::operator> - return false; + return false; } bool String::operator<=(const String& str) const throw() @@ -289,256 +198,118 @@ return (operator>(str) || operator==(str)); } -Char* String::data() const throw() -{ - return _data->str + _offset; -} - -String String::fromLatin1(const char* str, size_t count) throw(OutOfMemory) +String String::fromLocal(const char* str, size_t count) { if(count == npos) count = strlen(str); - Char ch; - String str2(count + 8); - - const char* end = str + count; - Char* mystr = str2.data(); - - while(str != end) - { - ch = *(str++); - copy_construct(mystr++, &ch, 1); - } - - str2._length = count; - return str2; + return fromCharset(nl_langinfo(CODESET), str, count); } -String String::fromUcs2(const wchar_t* str, size_t count) throw(OutOfMemory) +String String::fromLatin1(const char* str, size_t count) { if(count == npos) - count = wcslen(str); - - Char ch; - String str2(count + 8); - - const wchar_t* end = str + count; - Char* mystr = str2.data(); - - while(str != end) - { - ch = *(str++); - copy_construct(mystr++, &ch, 1); - } - - str2._length = count; - return str2; -} - -String::Iterator String::begin() -{ - if(_data.useCount() > 1) - *this = deepCopy(); - - return Iterator(data()); -} - -String::Iterator String::end() -{ - if(_data.useCount() > 1) - *this = deepCopy(); - - return Iterator(data() + _length); -} - -String::ConstIterator String::begin() const -{ - return ConstIterator(data()); -} - -String::ConstIterator String::end() const -{ - return ConstIterator(data() + _length); -} - -void String::insert(const Iterator& pos, const Char& ch) -{ - if(_data.useCount() > 1 || _length + 1 > _data->size) - resize(size() + 1, &(*pos) - data(), 1); - else - { /* @todo */ } - - *pos = ch; -} - -void String::insert(const Iterator& pos, const String& str) -{ - -} - -void String::insert(size_t pos, const Char& ch) -{ - insert(Iterator(data() + pos), ch); -} - -void String::insert(size_t pos, const String& str) -{ - insert(Iterator(data() + pos), str); -} - -void String::append(const Char& ch) -{ - insert(end(), ch); -} - -void String::append(const String& str) -{ - insert(end(), str); -} - -void String::prepend(const Char& ch) -{ - insert(begin(), ch); -} - -void String::prepend(const String& str) -{ - insert(begin(), str); -} - -void String::erase(const Iterator& pos) -{ -} - -void String::erase(size_t pos) -{ - erase(Iterator(data() + pos)); -} - + count = strlen(str); -String::Iterator::Iterator(const String::Iterator& iter) -: _current(iter._current) -{ + return fromCharset("LATIN1", str, count); } -String::Iterator::Iterator(Char* current) -: _current(current) +String String::fromUtf8(const char* str, size_t count) { -} + if(count == npos) + count = strlen(str); -String::Iterator::~Iterator() -{ + return fromCharset("UTF8", str, count); } -Char& String::Iterator::operator*() const +String String::fromUcs2(const char* str, size_t count) { - return *_current; + return fromCharset("UCS2", (const char*)str, count); } -String::Iterator& String::Iterator::operator++() +String String::fromCharset(const char* charset, const char* str, size_t count) { - ++_current; - return *this; -} + ustring outstr; + char outstrbuf[1024]; -String::Iterator String::Iterator::operator++(int) -{ - Iterator tmp = *this; - ++_current; - return tmp; -} + iconv_t icd = iconv_open("UCS4", charset); -String::Iterator& String::Iterator::operator--() -{ - --_current; - return *this; -} + char* inbuf = (char*)str; + size_t inbytesleft = count; -String::Iterator String::Iterator::operator--(int) -{ - Iterator tmp = *this; - --_current; - return tmp; -} + char* outbuf = (char*)outstrbuf; + size_t outbytesleft = sizeof(outstrbuf); -String::Iterator& String::Iterator::operator=(const String::Iterator& iter) -{ - _current = iter._current; - return *this; -} +_do_iconv: + size_t ret = iconv(icd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); -bool String::Iterator::operator==(const String::Iterator& iter) const -{ - return (_current == iter._current); -} + // get number of chars converted ... + unsigned int numchars = sizeof(outstrbuf) - outbytesleft; + unsigned int numuchars = numchars / sizeof(uchar_t); -bool String::Iterator::operator!=(const String::Iterator& iter) const -{ - return (_current != iter._current); -} + // append already converted chars to our string ... + outstr.append((uchar_t*)outstrbuf, numuchars); + if(ret == (size_t)-1) + { + if(errno == E2BIG) + { + outbytesleft = sizeof(outstrbuf); + outbuf = outstrbuf; + goto _do_iconv; + } + else + { + //TODO: throw InvalidString + throw; + } + } -String::ConstIterator::ConstIterator(const String::ConstIterator& iter) -: _current(iter._current) -{ + iconv_close(icd); + return outstr; } -String::ConstIterator::ConstIterator(const Char* current) -: _current(current) +std::string String::toCharset(const char* charset) const { -} + std::string outstr; + char outstrbuf[1024]; -String::ConstIterator::~ConstIterator() -{ -} + iconv_t icd = iconv_open(charset, "UCS4"); -const Char& String::ConstIterator::operator*() const -{ - return *_current; -} + char* inbuf = (char*)_str.c_str(); + size_t inbytesleft = _str.size() * sizeof(uchar_t); -String::ConstIterator& String::ConstIterator::operator++() -{ - ++_current; - return *this; -} + char* outbuf = (char*)outstrbuf; + size_t outbytesleft = sizeof(outstrbuf); -String::ConstIterator String::ConstIterator::operator++(int) -{ - ConstIterator tmp = *this; - ++_current; - return tmp; -} +_do_iconv: + size_t ret = iconv(icd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); -String::ConstIterator& String::ConstIterator::operator--() -{ - --_current; - return *this; -} + // get number of chars converted ... + unsigned int numchars = sizeof(outstrbuf) - outbytesleft; -String::ConstIterator String::ConstIterator::operator--(int) -{ - ConstIterator tmp = *this; - --_current; - return tmp; -} + // append already converted chars to our string ... + outstr.append(outstrbuf, numchars); -String::ConstIterator& String::ConstIterator::operator=(const String::ConstIterator& iter) -{ - _current = iter._current; - return *this; -} + if(ret == (size_t)-1) + { + if(errno == E2BIG) + { + outbytesleft = sizeof(outstrbuf); + outbuf = outstrbuf; + goto _do_iconv; + } + else + { + //TODO: throw InvalidString + throw; + } + } -bool String::ConstIterator::operator==(const String::ConstIterator& iter) const -{ - return (_current == iter._current); + iconv_close(icd); + return outstr; } -bool String::ConstIterator::operator!=(const String::ConstIterator& iter) const -{ - return (_current != iter._current); -} } // !namespace Unicode Index: Makefile.am =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/Makefile.am,v retrieving revision 1.7 retrieving revision 1.8 diff -u -d -r1.7 -r1.8 --- Makefile.am 17 Jan 2005 22:50:41 -0000 1.7 +++ Makefile.am 26 Apr 2005 12:16:02 -0000 1.8 @@ -22,7 +22,7 @@ libpclasses_unicode_la_LDFLAGS = -no-undefined -libpclasses_unicode_la_LIBADD = $(top_builddir)/src/libpclasses.la +libpclasses_unicode_la_LIBADD = $(top_builddir)/src/libpclasses.la $(LIBICONV) all: unicodedata_db Index: ustring.cpp =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/ustring.cpp,v retrieving revision 1.1 retrieving revision 1.2 diff -u -d -r1.1 -r1.2 --- ustring.cpp 14 Jan 2005 14:46:02 -0000 1.1 +++ ustring.cpp 26 Apr 2005 12:16:02 -0000 1.2 @@ -20,6 +20,8 @@ #include "pclasses/Unicode/ustring.h" #include <string.h> +#include <iconv.h> +#include <errno.h> namespace P { @@ -45,7 +47,7 @@ size_t ucslen(const uchar_t* s) { size_t n = 0; - while(*(s++) != 0) + while(*(s++) != UEOF) ++n; return n; @@ -80,19 +82,6 @@ return s; } -ustring str(const char* str) -{ - size_t len = strlen(str); - ustring ret; - ret.reserve(len); - - size_t i = 0; - while(len-- > 0) - ret[i++] = touchar(*(str++)); - - return ret; -} - } // !namespace Unicode } // !namespace P Index: Char.cpp =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/Char.cpp,v retrieving revision 1.3 retrieving revision 1.4 diff -u -d -r1.3 -r1.4 --- Char.cpp 14 Jan 2005 14:46:02 -0000 1.3 +++ Char.cpp 26 Apr 2005 12:16:02 -0000 1.4 @@ -25,55 +25,18 @@ namespace Unicode { -Char::Char(uint32_t ch) +Char::Char(uchar_t ch) : _char(ch) { } -Char::Char(char ch) -: _char(ch) -{ -} - -Char::Char(wchar_t ch) -: _char(ch) -{ -} - Char::Char(const Char& ch) : _char(ch._char) { } -char Char::latin1() const -{ - if(_char < 0x7f) - return _char; - - return '?'; -} - -wchar_t Char::ucs2() const -{ - //@todo - return '?'; -} - bool Char::isNumber() const { - bool ret = false; - - switch(category()) - { - case Number_DecimalDigit: - case Number_Letter: - case Number_Other: - ret = true; - break; - default: - break; - } - - return ret; + return isudigit(_char) == 1 ? true : false; } bool Char::isSymbol() const @@ -134,21 +97,7 @@ bool Char::isLetter() const { - bool ret = false; - - switch(category()) - { - case Letter_Uppercase: - case Letter_Lowercase: - case Letter_Titlecase: - case Letter_Modifier: - case Letter_Other: - ret = true; - default: - break; - } - - return ret; + return isualpha(_char) == 1 ? true : false; } int Char::toNumber() const @@ -182,36 +131,22 @@ bool Char::isLower() const { - return category() == Letter_Lowercase; + return isulower(_char) == 1 ? true : false; } Char Char::toLower() const { - const codePointData* data = lookupCodePoint(_char); - if(data && data->extra) - { - const letterExtraData* extraData = (const letterExtraData*)data->extra; - return Char(extraData->lower); - } - - return *this; + return toulower(_char); } bool Char::isUpper() const { - return category() == Letter_Uppercase; + return isuupper(_char) == 1 ? true : false; } Char Char::toUpper() const { - const codePointData* data = lookupCodePoint(_char); - if(data && data->extra) - { - const letterExtraData* extraData = (const letterExtraData*)data->extra; - return Char(extraData->upper); - } - - return *this; + return touupper(_char); } bool Char::isMirrored() const @@ -244,19 +179,7 @@ return (Char::CombiningClass)data->combining; } -Char& Char::operator=(uint32_t ch) -{ - _char = ch; - return *this; -} - -Char& Char::operator=(char ch) -{ - _char = ch; - return *this; -} - -Char& Char::operator=(wchar_t ch) +Char& Char::operator=(uchar_t ch) { _char = ch; return *this; @@ -302,7 +225,7 @@ const Char& Char::eof() { - static Char _eof((char)0); + static Char _eof(UEOF); return _eof; } Index: uctype.cpp =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/uctype.cpp,v retrieving revision 1.1 retrieving revision 1.2 diff -u -d -r1.1 -r1.2 --- uctype.cpp 14 Jan 2005 14:46:02 -0000 1.1 +++ uctype.cpp 26 Apr 2005 12:16:02 -0000 1.2 @@ -174,14 +174,6 @@ return c; } -uchar_t touchar(char c) -{ - if(c < 0x7f) - return c; - - return '?'; -} - } // !namespace Unicode } // !namespace P |