From: <tbr...@us...> - 2012-03-20 13:35:22
|
Revision: 200 http://xmlwrapp.svn.sourceforge.net/xmlwrapp/?rev=200&view=rev Author: tbrowder2 Date: 2012-03-20 13:35:13 +0000 (Tue, 20 Mar 2012) Log Message: ----------- add ustring source from glibmm project Modified Paths: -------------- trunk/src/Makefile.am Added Paths: ----------- trunk/src/libustring/ trunk/src/libustring/ustring.cc trunk/src/libustring/ustring.h Modified: trunk/src/Makefile.am =================================================================== --- trunk/src/Makefile.am 2012-03-20 13:21:45 UTC (rev 199) +++ trunk/src/Makefile.am 2012-03-20 13:35:13 UTC (rev 200) @@ -2,16 +2,24 @@ AM_CPPFLAGS = -I$(top_srcdir)/include $(CXXFLAGS_VISIBILITY) if WITH_XSLT -lib_LTLIBRARIES = libxmlwrapp.la libxsltwrapp.la +lib_LTLIBRARIES = libxmlwrapp.la libxsltwrapp.la libustring.la else -lib_LTLIBRARIES = libxmlwrapp.la +lib_LTLIBRARIES = libxmlwrapp.la libustring.la endif +libustring_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBXML_CFLAGS) +libustring_la_LIBADD = $(LIBXML_LIBS) +libustring_la_LDFLAGS = -version-info 6:0:1 -no-undefined + +libustring_la_SOURCES = \ + libustring/ustring.cc \ + libustring/ustring.h \ + libxmlwrapp_la_CPPFLAGS = $(AM_CPPFLAGS) $(LIBXML_CFLAGS) -libxmlwrapp_la_LIBADD = $(LIBXML_LIBS) -libxmlwrapp_la_LDFLAGS = -version-info 6:0:1 -no-undefined +libxmlwrapp_la_LIBADD = $(LIBXML_LIBS) +libxmlwrapp_la_LDFLAGS = -version-info 6:0:1 -no-undefined -libxmlwrapp_la_SOURCES = \ +libxmlwrapp_la_SOURCES = \ libxml/ait_impl.cc \ libxml/ait_impl.h \ libxml/attributes.cc \ Added: trunk/src/libustring/ustring.cc =================================================================== --- trunk/src/libustring/ustring.cc (rev 0) +++ trunk/src/libustring/ustring.cc 2012-03-20 13:35:13 UTC (rev 200) @@ -0,0 +1,1418 @@ +// -*- c++ -*- +/* $Id$ */ + +/* Copyright (C) 2002 The gtkmm Development Team + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free 
Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <glibmmconfig.h> +#include <glibmm/ustring.h> +#include <glibmm/convert.h> +#include <glibmm/error.h> +#include <glibmm/utility.h> + +#include <algorithm> +#include <iostream> +#include <cstring> +# include <stdexcept> +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +namespace +{ + +using Glib::ustring; + +// Little helper to make the conversion from gunichar to UTF-8 a one-liner. +// +struct UnicharToUtf8 +{ + char buf[6]; + ustring::size_type len; + + explicit UnicharToUtf8(gunichar uc) + : len (g_unichar_to_utf8(uc, buf)) {} +}; + + +// All utf8_*_offset() functions return npos if offset is out of range. +// The caller should decide if npos is a valid argument and just marks +// the whole string, or if it is not allowed (e.g. for start positions). +// In the latter case std::out_of_range should be thrown, but usually +// std::string will do that for us. + +// First overload: stop on '\0' character. +static +ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset) +{ + if(offset == ustring::npos) + return ustring::npos; + + const char *const utf8_skip = g_utf8_skip; + const char* p = str; + + for(; offset != 0; --offset) + { + const unsigned int c = static_cast<unsigned char>(*p); + + if(c == 0) + return ustring::npos; + + p += utf8_skip[c]; + } + + return (p - str); +} + +// Second overload: stop when reaching maxlen. 
+static +ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset, + ustring::size_type maxlen) +{ + if(offset == ustring::npos) + return ustring::npos; + + const char *const utf8_skip = g_utf8_skip; + const char *const pend = str + maxlen; + const char* p = str; + + for(; offset != 0; --offset) + { + if(p >= pend) + return ustring::npos; + + p += utf8_skip[static_cast<unsigned char>(*p)]; + } + + return (p - str); +} + +// Third overload: stop when reaching str.size(). +// +inline +ustring::size_type utf8_byte_offset(const std::string& str, ustring::size_type offset) +{ + return utf8_byte_offset(str.data(), offset, str.size()); +} + +// Takes UTF-8 character offset and count in ci and cn. +// Returns the byte offset and count in i and n. +// +struct Utf8SubstrBounds +{ + ustring::size_type i; + ustring::size_type n; + + Utf8SubstrBounds(const std::string& str, ustring::size_type ci, ustring::size_type cn) + : + i (utf8_byte_offset(str, ci)), + n (ustring::npos) + { + if(i != ustring::npos) + n = utf8_byte_offset(str.data() + i, cn, str.size() - i); + } +}; + +// Converts byte offset to UTF-8 character offset. +inline +ustring::size_type utf8_char_offset(const std::string& str, ustring::size_type offset) +{ + if(offset == ustring::npos) + return ustring::npos; + + const char *const pdata = str.data(); + return g_utf8_pointer_to_offset(pdata, pdata + offset); +} + + +// Helper to implement ustring::find_first_of() and find_first_not_of(). +// Returns the UTF-8 character offset, or ustring::npos if not found. 
+static +ustring::size_type utf8_find_first_of(const std::string& str, ustring::size_type offset, + const char* utf8_match, long utf8_match_size, + bool find_not_of) +{ + const ustring::size_type byte_offset = utf8_byte_offset(str, offset); + if(byte_offset == ustring::npos) + return ustring::npos; + + long ucs4_match_size = 0; + const Glib::ScopedPtr<gunichar> ucs4_match + (g_utf8_to_ucs4_fast(utf8_match, utf8_match_size, &ucs4_match_size)); + + const gunichar *const match_begin = ucs4_match.get(); + const gunichar *const match_end = match_begin + ucs4_match_size; + + const char *const str_begin = str.data(); + const char *const str_end = str_begin + str.size(); + + for(const char* pstr = str_begin + byte_offset; + pstr < str_end; + pstr = g_utf8_next_char(pstr)) + { + const gunichar *const pfound = std::find(match_begin, match_end, g_utf8_get_char(pstr)); + + if((pfound != match_end) != find_not_of) + return offset; + + ++offset; + } + + return ustring::npos; +} + +// Helper to implement ustring::find_last_of() and find_last_not_of(). +// Returns the UTF-8 character offset, or ustring::npos if not found. +static +ustring::size_type utf8_find_last_of(const std::string& str, ustring::size_type offset, + const char* utf8_match, long utf8_match_size, + bool find_not_of) +{ + long ucs4_match_size = 0; + const Glib::ScopedPtr<gunichar> ucs4_match + (g_utf8_to_ucs4_fast(utf8_match, utf8_match_size, &ucs4_match_size)); + + const gunichar *const match_begin = ucs4_match.get(); + const gunichar *const match_end = match_begin + ucs4_match_size; + + const char *const str_begin = str.data(); + const char* pstr = str_begin; + + // Set pstr one byte beyond the actual start position. + const ustring::size_type byte_offset = utf8_byte_offset(str, offset); + pstr += (byte_offset < str.size()) ? byte_offset + 1 : str.size(); + + while(pstr > str_begin) + { + // Move to previous character. 
+ do + --pstr; + while((static_cast<unsigned char>(*pstr) & 0xC0u) == 0x80); + + const gunichar *const pfound = std::find(match_begin, match_end, g_utf8_get_char(pstr)); + + if((pfound != match_end) != find_not_of) + return g_utf8_pointer_to_offset(str_begin, pstr); + } + + return ustring::npos; +} + +} // anonymous namespace + + +namespace Glib +{ + +#ifndef GLIBMM_HAVE_ALLOWS_STATIC_INLINE_NPOS +// Initialize static member here, +// because the compiler did not allow us do it inline. +const ustring::size_type ustring::npos = std::string::npos; +#endif + +/* + * We need our own version of g_utf8_get_char(), because the std::string + * iterator is not necessarily a plain pointer (it's in fact not in GCC's + * libstdc++-v3). Copying the UTF-8 data into a temporary buffer isn't an + * option since this operation is quite time critical. The implementation + * is quite different from g_utf8_get_char() -- both more generic and likely + * faster. + * + * By looking at the first byte of a UTF-8 character one can determine the + * number of bytes used. GLib offers the g_utf8_skip[] array for this purpose, + * but accessing this global variable would, on IA32 at least, introduce + * a function call to fetch the Global Offset Table, plus two levels of + * indirection in order to read the value. Even worse, fetching the GOT is + * always done right at the start of the function instead of the branch that + * actually uses the variable. + * + * Fortunately, there's a better way to get the byte count. 
As this table + * shows, there's a nice regular pattern in the UTF-8 encoding scheme: + * + * 0x00000000 - 0x0000007F: 0xxxxxxx + * 0x00000080 - 0x000007FF: 110xxxxx 10xxxxxx + * 0x00000800 - 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx + * 0x00010000 - 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x00200000 - 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * 0x04000000 - 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * Except for the single byte case, the number of leading 1-bits equals the + * byte count. All that is needed is to shift the first byte to the left + * until bit 7 becomes 0. Naturally, doing so requires a loop -- but since + * we already have one, no additional cost is introduced. This shifting can + * further be combined with the computation of the bitmask needed to eliminate + * the leading length bits, thus saving yet another register. + * + * Note: If you change this code, it is advisable to also review what the + * compiler makes of it in the assembler output. Except for some pointless + * register moves, the generated code is sufficiently close to the optimum + * with GCC 4.1.2 on x86_64. 
+ */ +gunichar get_unichar_from_std_iterator(std::string::const_iterator pos) +{ + unsigned int result = static_cast<unsigned char>(*pos); + + if((result & 0x80) != 0) + { + unsigned int mask = 0x40; + + do + { + result <<= 6; + const unsigned int c = static_cast<unsigned char>(*++pos); + mask <<= 5; + result += c - 0x80; + } + while((result & mask) != 0); + + result &= mask - 1; + } + + return result; +} + + +/**** Glib::ustring ********************************************************/ + +ustring::ustring() +: + string_ () +{} + +ustring::ustring(const ustring& other) +: + string_ (other.string_) +{} + +ustring::ustring(const ustring& src, ustring::size_type i, ustring::size_type n) +: + string_ () +{ + const Utf8SubstrBounds bounds (src.string_, i, n); + string_.assign(src.string_, bounds.i, bounds.n); +} + +ustring::ustring(const char* src, ustring::size_type n) +: + string_ (src, utf8_byte_offset(src, n)) +{} + +ustring::ustring(const char* src) +: + string_ (src) +{} + +ustring::ustring(ustring::size_type n, gunichar uc) +: + string_ () +{ + if(uc < 0x80) + { + // Optimize the probably most common case. 
+ string_.assign(n, static_cast<char>(uc)); + } + else + { + const UnicharToUtf8 conv (uc); + string_.reserve(n * conv.len); + + for(; n > 0; --n) + string_.append(conv.buf, conv.len); + } +} + +ustring::ustring(ustring::size_type n, char c) +: + string_ (n, c) +{} + +ustring::ustring(const std::string& src) +: + string_ (src) +{} + +ustring::~ustring() +{} + +void ustring::swap(ustring& other) +{ + string_.swap(other.string_); +} + + +/**** Glib::ustring::operator=() *******************************************/ + +ustring& ustring::operator=(const ustring& other) +{ + string_ = other.string_; + return *this; +} + +ustring& ustring::operator=(const std::string& src) +{ + string_ = src; + return *this; +} + +ustring& ustring::operator=(const char* src) +{ + string_ = src; + return *this; +} + +ustring& ustring::operator=(gunichar uc) +{ + const UnicharToUtf8 conv (uc); + string_.assign(conv.buf, conv.len); + return *this; +} + +ustring& ustring::operator=(char c) +{ + string_ = c; + return *this; +} + + +/**** Glib::ustring::assign() **********************************************/ + +ustring& ustring::assign(const ustring& src) +{ + string_ = src.string_; + return *this; +} + +ustring& ustring::assign(const ustring& src, ustring::size_type i, ustring::size_type n) +{ + const Utf8SubstrBounds bounds (src.string_, i, n); + string_.assign(src.string_, bounds.i, bounds.n); + return *this; +} + +ustring& ustring::assign(const char* src, ustring::size_type n) +{ + string_.assign(src, utf8_byte_offset(src, n)); + return *this; +} + +ustring& ustring::assign(const char* src) +{ + string_ = src; + return *this; +} + +ustring& ustring::assign(ustring::size_type n, gunichar uc) +{ + ustring temp (n, uc); + string_.swap(temp.string_); + return *this; +} + +ustring& ustring::assign(ustring::size_type n, char c) +{ + string_.assign(n, c); + return *this; +} + + +/**** Glib::ustring::operator+=() ******************************************/ + +ustring& ustring::operator+=(const 
ustring& src) +{ + string_ += src.string_; + return *this; +} + +ustring& ustring::operator+=(const char* src) +{ + string_ += src; + return *this; +} + +ustring& ustring::operator+=(gunichar uc) +{ + const UnicharToUtf8 conv (uc); + string_.append(conv.buf, conv.len); + return *this; +} + +ustring& ustring::operator+=(char c) +{ + string_ += c; + return *this; +} + + +/**** Glib::ustring::push_back() *******************************************/ + +void ustring::push_back(gunichar uc) +{ + const UnicharToUtf8 conv (uc); + string_.append(conv.buf, conv.len); +} + +void ustring::push_back(char c) +{ + string_ += c; +} + + +/**** Glib::ustring::append() **********************************************/ + +ustring& ustring::append(const ustring& src) +{ + string_ += src.string_; + return *this; +} + +ustring& ustring::append(const ustring& src, ustring::size_type i, ustring::size_type n) +{ + const Utf8SubstrBounds bounds (src.string_, i, n); + string_.append(src.string_, bounds.i, bounds.n); + return *this; +} + +ustring& ustring::append(const char* src, ustring::size_type n) +{ + string_.append(src, utf8_byte_offset(src, n)); + return *this; +} + +ustring& ustring::append(const char* src) +{ + string_ += src; + return *this; +} + +ustring& ustring::append(ustring::size_type n, gunichar uc) +{ + string_.append(ustring(n, uc).string_); + return *this; +} + +ustring& ustring::append(ustring::size_type n, char c) +{ + string_.append(n, c); + return *this; +} + + +/**** Glib::ustring::insert() **********************************************/ + +ustring& ustring::insert(ustring::size_type i, const ustring& src) +{ + string_.insert(utf8_byte_offset(string_, i), src.string_); + return *this; +} + +ustring& ustring::insert(ustring::size_type i, const ustring& src, + ustring::size_type i2, ustring::size_type n) +{ + const Utf8SubstrBounds bounds2 (src.string_, i2, n); + string_.insert(utf8_byte_offset(string_, i), src.string_, bounds2.i, bounds2.n); + return *this; +} + +ustring& 
ustring::insert(ustring::size_type i, const char* src, ustring::size_type n) +{ + string_.insert(utf8_byte_offset(string_, i), src, utf8_byte_offset(src, n)); + return *this; +} + +ustring& ustring::insert(ustring::size_type i, const char* src) +{ + string_.insert(utf8_byte_offset(string_, i), src); + return *this; +} + +ustring& ustring::insert(ustring::size_type i, ustring::size_type n, gunichar uc) +{ + string_.insert(utf8_byte_offset(string_, i), ustring(n, uc).string_); + return *this; +} + +ustring& ustring::insert(ustring::size_type i, ustring::size_type n, char c) +{ + string_.insert(utf8_byte_offset(string_, i), n, c); + return *this; +} + +ustring::iterator ustring::insert(ustring::iterator p, gunichar uc) +{ + const size_type offset = p.base() - string_.begin(); + const UnicharToUtf8 conv (uc); + string_.insert(offset, conv.buf, conv.len); + return iterator(string_.begin() + offset); +} + +ustring::iterator ustring::insert(ustring::iterator p, char c) +{ + return iterator(string_.insert(p.base(), c)); +} + +void ustring::insert(ustring::iterator p, ustring::size_type n, gunichar uc) +{ + string_.insert(p.base() - string_.begin(), ustring(n, uc).string_); +} + +void ustring::insert(ustring::iterator p, ustring::size_type n, char c) +{ + string_.insert(p.base(), n, c); +} + + +/**** Glib::ustring::replace() *********************************************/ + +ustring& ustring::replace(ustring::size_type i, ustring::size_type n, const ustring& src) +{ + const Utf8SubstrBounds bounds (string_, i, n); + string_.replace(bounds.i, bounds.n, src.string_); + return *this; +} + +ustring& ustring::replace(ustring::size_type i, ustring::size_type n, + const ustring& src, ustring::size_type i2, ustring::size_type n2) +{ + const Utf8SubstrBounds bounds (string_, i, n); + const Utf8SubstrBounds bounds2 (src.string_, i2, n2); + string_.replace(bounds.i, bounds.n, src.string_, bounds2.i, bounds2.n); + return *this; +} + +ustring& ustring::replace(ustring::size_type i, 
ustring::size_type n, + const char* src, ustring::size_type n2) +{ + const Utf8SubstrBounds bounds (string_, i, n); + string_.replace(bounds.i, bounds.n, src, utf8_byte_offset(src, n2)); + return *this; +} + +ustring& ustring::replace(ustring::size_type i, ustring::size_type n, const char* src) +{ + const Utf8SubstrBounds bounds (string_, i, n); + string_.replace(bounds.i, bounds.n, src); + return *this; +} + +ustring& ustring::replace(ustring::size_type i, ustring::size_type n, + ustring::size_type n2, gunichar uc) +{ + const Utf8SubstrBounds bounds (string_, i, n); + string_.replace(bounds.i, bounds.n, ustring(n2, uc).string_); + return *this; +} + +ustring& ustring::replace(ustring::size_type i, ustring::size_type n, + ustring::size_type n2, char c) +{ + const Utf8SubstrBounds bounds (string_, i, n); + string_.replace(bounds.i, bounds.n, n2, c); + return *this; +} + +ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend, const ustring& src) +{ + string_.replace(pbegin.base(), pend.base(), src.string_); + return *this; +} + +ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend, + const char* src, ustring::size_type n) +{ + string_.replace(pbegin.base(), pend.base(), src, utf8_byte_offset(src, n)); + return *this; +} + +ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend, const char* src) +{ + string_.replace(pbegin.base(), pend.base(), src); + return *this; +} + +ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend, + ustring::size_type n, gunichar uc) +{ + string_.replace(pbegin.base(), pend.base(), ustring(n, uc).string_); + return *this; +} + +ustring& ustring::replace(ustring::iterator pbegin, ustring::iterator pend, + ustring::size_type n, char c) +{ + string_.replace(pbegin.base(), pend.base(), n, c); + return *this; +} + + +/**** Glib::ustring::erase() ***********************************************/ + +void ustring::clear() +{ + string_.erase(); +} + +ustring& 
ustring::erase(ustring::size_type i, ustring::size_type n) +{ + const Utf8SubstrBounds bounds (string_, i, n); + string_.erase(bounds.i, bounds.n); + return *this; +} + +ustring& ustring::erase() +{ + string_.erase(); + return *this; +} + +ustring::iterator ustring::erase(ustring::iterator p) +{ + ustring::iterator iter_end = p; + ++iter_end; + + return iterator(string_.erase(p.base(), iter_end.base())); +} + +ustring::iterator ustring::erase(ustring::iterator pbegin, ustring::iterator pend) +{ + return iterator(string_.erase(pbegin.base(), pend.base())); +} + + +/**** Glib::ustring::compare() *********************************************/ + +int ustring::compare(const ustring& rhs) const +{ + return g_utf8_collate(string_.c_str(), rhs.string_.c_str()); +} + +int ustring::compare(const char* rhs) const +{ + return g_utf8_collate(string_.c_str(), rhs); +} + +int ustring::compare(ustring::size_type i, ustring::size_type n, const ustring& rhs) const +{ + return ustring(*this, i, n).compare(rhs); +} + +int ustring::compare(ustring::size_type i, ustring::size_type n, + const ustring& rhs, ustring::size_type i2, ustring::size_type n2) const +{ + return ustring(*this, i, n).compare(ustring(rhs, i2, n2)); +} + +int ustring::compare(ustring::size_type i, ustring::size_type n, + const char* rhs, ustring::size_type n2) const +{ + return ustring(*this, i, n).compare(ustring(rhs, n2)); +} + +int ustring::compare(ustring::size_type i, ustring::size_type n, const char* rhs) const +{ + return ustring(*this, i, n).compare(rhs); +} + + +/**** Glib::ustring -- index access ****************************************/ + +ustring::value_type ustring::operator[](ustring::size_type i) const +{ + return g_utf8_get_char(g_utf8_offset_to_pointer(string_.data(), i)); +} + +ustring::value_type ustring::at(ustring::size_type i) const +{ + const size_type byte_offset = utf8_byte_offset(string_, i); + + // Throws std::out_of_range if the index is invalid. 
+ return g_utf8_get_char(&string_.at(byte_offset)); +} + + +/**** Glib::ustring -- iterator access *************************************/ + +ustring::iterator ustring::begin() +{ + return iterator(string_.begin()); +} + +ustring::iterator ustring::end() +{ + return iterator(string_.end()); +} + +ustring::const_iterator ustring::begin() const +{ + return const_iterator(string_.begin()); +} + +ustring::const_iterator ustring::end() const +{ + return const_iterator(string_.end()); +} + +ustring::reverse_iterator ustring::rbegin() +{ + return reverse_iterator(iterator(string_.end())); +} + +ustring::reverse_iterator ustring::rend() +{ + return reverse_iterator(iterator(string_.begin())); +} + +ustring::const_reverse_iterator ustring::rbegin() const +{ + return const_reverse_iterator(const_iterator(string_.end())); +} + +ustring::const_reverse_iterator ustring::rend() const +{ + return const_reverse_iterator(const_iterator(string_.begin())); +} + + +/**** Glib::ustring::find() ************************************************/ + +ustring::size_type ustring::find(const ustring& str, ustring::size_type i) const +{ + return utf8_char_offset(string_, string_.find(str.string_, utf8_byte_offset(string_, i))); +} + +ustring::size_type ustring::find(const char* str, ustring::size_type i, ustring::size_type n) const +{ + return utf8_char_offset(string_, string_.find(str, utf8_byte_offset(string_, i), + utf8_byte_offset(str, n))); +} + +ustring::size_type ustring::find(const char* str, ustring::size_type i) const +{ + return utf8_char_offset(string_, string_.find(str, utf8_byte_offset(string_, i))); +} + +ustring::size_type ustring::find(gunichar uc, ustring::size_type i) const +{ + const UnicharToUtf8 conv (uc); + return utf8_char_offset(string_, string_.find(conv.buf, utf8_byte_offset(string_, i), conv.len)); +} + +ustring::size_type ustring::find(char c, ustring::size_type i) const +{ + return utf8_char_offset(string_, string_.find(c, utf8_byte_offset(string_, i))); +} + + 
+/**** Glib::ustring::rfind() ***********************************************/ + +ustring::size_type ustring::rfind(const ustring& str, ustring::size_type i) const +{ + return utf8_char_offset(string_, string_.rfind(str.string_, utf8_byte_offset(string_, i))); +} + +ustring::size_type ustring::rfind(const char* str, ustring::size_type i, + ustring::size_type n) const +{ + return utf8_char_offset(string_, string_.rfind(str, utf8_byte_offset(string_, i), + utf8_byte_offset(str, n))); +} + +ustring::size_type ustring::rfind(const char* str, ustring::size_type i) const +{ + return utf8_char_offset(string_, string_.rfind(str, utf8_byte_offset(string_, i))); +} + +ustring::size_type ustring::rfind(gunichar uc, ustring::size_type i) const +{ + const UnicharToUtf8 conv (uc); + return utf8_char_offset(string_, string_.rfind(conv.buf, utf8_byte_offset(string_, i), conv.len)); +} + +ustring::size_type ustring::rfind(char c, ustring::size_type i) const +{ + return utf8_char_offset(string_, string_.rfind(c, utf8_byte_offset(string_, i))); +} + + +/**** Glib::ustring::find_first_of() ***************************************/ + +ustring::size_type ustring::find_first_of(const ustring& match, ustring::size_type i) const +{ + return utf8_find_first_of(string_, i, match.string_.data(), match.string_.size(), false); +} + +ustring::size_type ustring::find_first_of(const char* match, + ustring::size_type i, ustring::size_type n) const +{ + return utf8_find_first_of(string_, i, match, n, false); +} + +ustring::size_type ustring::find_first_of(const char* match, ustring::size_type i) const +{ + return utf8_find_first_of(string_, i, match, -1, false); +} + +ustring::size_type ustring::find_first_of(gunichar uc, ustring::size_type i) const +{ + return find(uc, i); +} + +ustring::size_type ustring::find_first_of(char c, ustring::size_type i) const +{ + return find(c, i); +} + + +/**** Glib::ustring::find_last_of() ****************************************/ + +ustring::size_type 
ustring::find_last_of(const ustring& match, ustring::size_type i) const +{ + return utf8_find_last_of(string_, i, match.string_.data(), match.string_.size(), false); +} + +ustring::size_type ustring::find_last_of(const char* match, + ustring::size_type i, ustring::size_type n) const +{ + return utf8_find_last_of(string_, i, match, n, false); +} + +ustring::size_type ustring::find_last_of(const char* match, ustring::size_type i) const +{ + return utf8_find_last_of(string_, i, match, -1, false); +} + +ustring::size_type ustring::find_last_of(gunichar uc, ustring::size_type i) const +{ + return rfind(uc, i); +} + +ustring::size_type ustring::find_last_of(char c, ustring::size_type i) const +{ + return rfind(c, i); +} + + +/**** Glib::ustring::find_first_not_of() ***********************************/ + +ustring::size_type ustring::find_first_not_of(const ustring& match, ustring::size_type i) const +{ + return utf8_find_first_of(string_, i, match.string_.data(), match.string_.size(), true); +} + +ustring::size_type ustring::find_first_not_of(const char* match, + ustring::size_type i, ustring::size_type n) const +{ + return utf8_find_first_of(string_, i, match, n, true); +} + +ustring::size_type ustring::find_first_not_of(const char* match, ustring::size_type i) const +{ + return utf8_find_first_of(string_, i, match, -1, true); +} + +// Unfortunately, all of the find_*_not_of() methods for single +// characters need their own special implementation. 
+// +ustring::size_type ustring::find_first_not_of(gunichar uc, ustring::size_type i) const +{ + const size_type bi = utf8_byte_offset(string_, i); + if(bi != npos) + { + const char *const pbegin = string_.data(); + const char *const pend = pbegin + string_.size(); + + for(const char* p = pbegin + bi; + p < pend; + p = g_utf8_next_char(p), ++i) + { + if(g_utf8_get_char(p) != uc) + return i; + } + } + return npos; +} + +ustring::size_type ustring::find_first_not_of(char c, ustring::size_type i) const +{ + const size_type bi = utf8_byte_offset(string_, i); + if(bi != npos) + { + const char *const pbegin = string_.data(); + const char *const pend = pbegin + string_.size(); + + for(const char* p = pbegin + bi; + p < pend; + p = g_utf8_next_char(p), ++i) + { + if(*p != c) + return i; + } + } + return npos; +} + + +/**** Glib::ustring::find_last_not_of() ************************************/ + +ustring::size_type ustring::find_last_not_of(const ustring& match, ustring::size_type i) const +{ + return utf8_find_last_of(string_, i, match.string_.data(), match.string_.size(), true); +} + +ustring::size_type ustring::find_last_not_of(const char* match, + ustring::size_type i, ustring::size_type n) const +{ + return utf8_find_last_of(string_, i, match, n, true); +} + +ustring::size_type ustring::find_last_not_of(const char* match, ustring::size_type i) const +{ + return utf8_find_last_of(string_, i, match, -1, true); +} + +// Unfortunately, all of the find_*_not_of() methods for single +// characters need their own special implementation. 
+// +ustring::size_type ustring::find_last_not_of(gunichar uc, ustring::size_type i) const +{ + const char *const pbegin = string_.data(); + const char *const pend = pbegin + string_.size(); + size_type i_cur = 0; + size_type i_found = npos; + + for(const char* p = pbegin; + p < pend && i_cur <= i; + p = g_utf8_next_char(p), ++i_cur) + { + if(g_utf8_get_char(p) != uc) + i_found = i_cur; + } + return i_found; +} + +ustring::size_type ustring::find_last_not_of(char c, ustring::size_type i) const +{ + const char *const pbegin = string_.data(); + const char *const pend = pbegin + string_.size(); + size_type i_cur = 0; + size_type i_found = npos; + + for(const char* p = pbegin; + p < pend && i_cur <= i; + p = g_utf8_next_char(p), ++i_cur) + { + if(*p != c) + i_found = i_cur; + } + return i_found; +} + + +/**** Glib::ustring -- get size and resize *********************************/ + +bool ustring::empty() const +{ + return string_.empty(); +} + +ustring::size_type ustring::size() const +{ + const char *const pdata = string_.data(); + return g_utf8_pointer_to_offset(pdata, pdata + string_.size()); +} + +ustring::size_type ustring::length() const +{ + const char *const pdata = string_.data(); + return g_utf8_pointer_to_offset(pdata, pdata + string_.size()); +} + +ustring::size_type ustring::bytes() const +{ + return string_.size(); +} + +ustring::size_type ustring::capacity() const +{ + return string_.capacity(); +} + +ustring::size_type ustring::max_size() const +{ + return string_.max_size(); +} + +void ustring::resize(ustring::size_type n, gunichar uc) +{ + const size_type size_now = size(); + if(n < size_now) + erase(n, npos); + else if(n > size_now) + append(n - size_now, uc); +} + +void ustring::resize(ustring::size_type n, char c) +{ + const size_type size_now = size(); + if(n < size_now) + erase(n, npos); + else if(n > size_now) + string_.append(n - size_now, c); +} + +void ustring::reserve(ustring::size_type n) +{ + string_.reserve(n); +} + + +/**** Glib::ustring 
-- C string access *************************************/ + +const char* ustring::data() const +{ + return string_.data(); +} + +const char* ustring::c_str() const +{ + return string_.c_str(); +} + +// Note that copy() requests UTF-8 character offsets as +// parameters, but returns the number of copied bytes. +// +ustring::size_type ustring::copy(char* dest, ustring::size_type n, ustring::size_type i) const +{ + const Utf8SubstrBounds bounds (string_, i, n); + return string_.copy(dest, bounds.n, bounds.i); +} + + +/**** Glib::ustring -- UTF-8 utilities *************************************/ + +bool ustring::validate() const +{ + return (g_utf8_validate(string_.data(), string_.size(), 0) != 0); +} + +bool ustring::validate(ustring::iterator& first_invalid) +{ + const char *const pdata = string_.data(); + const char* valid_end = pdata; + const int is_valid = g_utf8_validate(pdata, string_.size(), &valid_end); + + first_invalid = iterator(string_.begin() + (valid_end - pdata)); + return (is_valid != 0); +} + +bool ustring::validate(ustring::const_iterator& first_invalid) const +{ + const char *const pdata = string_.data(); + const char* valid_end = pdata; + const int is_valid = g_utf8_validate(pdata, string_.size(), &valid_end); + + first_invalid = const_iterator(string_.begin() + (valid_end - pdata)); + return (is_valid != 0); +} + +bool ustring::is_ascii() const +{ + const char* p = string_.data(); + const char *const pend = p + string_.size(); + + for(; p != pend; ++p) + { + if((static_cast<unsigned char>(*p) & 0x80u) != 0) + return false; + } + + return true; +} + +ustring ustring::normalize(NormalizeMode mode) const +{ + const ScopedPtr<char> buf (g_utf8_normalize(string_.data(), string_.size(), + static_cast<GNormalizeMode>(int(mode)))); + return ustring(buf.get()); +} + +ustring ustring::uppercase() const +{ + const ScopedPtr<char> buf (g_utf8_strup(string_.data(), string_.size())); + return ustring(buf.get()); +} + +ustring ustring::lowercase() const +{ + 
const ScopedPtr<char> buf (g_utf8_strdown(string_.data(), string_.size())); + return ustring(buf.get()); +} + +ustring ustring::casefold() const +{ + const ScopedPtr<char> buf (g_utf8_casefold(string_.data(), string_.size())); + return ustring(buf.get()); +} + +std::string ustring::collate_key() const +{ + const ScopedPtr<char> buf (g_utf8_collate_key(string_.data(), string_.size())); + return std::string(buf.get()); +} + +std::string ustring::casefold_collate_key() const +{ + char *const casefold_buf = g_utf8_casefold(string_.data(), string_.size()); + char *const key_buf = g_utf8_collate_key(casefold_buf, -1); + g_free(casefold_buf); + return std::string(ScopedPtr<char>(key_buf).get()); +} + +/**** Glib::ustring -- Message formatting **********************************/ + +// static +ustring ustring::compose_argv(const Glib::ustring& fmt, int argc, const ustring* const* argv) +{ + std::string::size_type result_size = fmt.raw().size(); + + // Guesstimate the final string size. + for (int i = 0; i < argc; ++i) + result_size += argv[i]->raw().size(); + + std::string result; + result.reserve(result_size); + + const char* const pfmt = fmt.raw().c_str(); + const char* start = pfmt; + + while (const char* const stop = std::strchr(start, '%')) + { + if (stop[1] == '%') + { + result.append(start, stop - start + 1); + start = stop + 2; + } + else + { + const int index = Ascii::digit_value(stop[1]) - 1; + + if (index >= 0 && index < argc) + { + result.append(start, stop - start); + result += argv[index]->raw(); + start = stop + 2; + } + else + { + const char* const next = (stop[1] != '\0') ? g_utf8_next_char(stop + 1) : (stop + 1); + + // Copy invalid substitutions literally to the output. 
+ result.append(start, next - start); + + g_warning("invalid substitution \"%s\" in fmt string \"%s\"", + result.c_str() + result.size() - (next - stop), pfmt); + start = next; + } + } + } + + result.append(start, pfmt + fmt.raw().size() - start); + + return result; +} + +/**** Glib::ustring::SequenceToString **************************************/ + +ustring::SequenceToString<Glib::ustring::iterator,gunichar> + ::SequenceToString(Glib::ustring::iterator pbegin, Glib::ustring::iterator pend) +: + std::string(pbegin.base(), pend.base()) +{} + +ustring::SequenceToString<Glib::ustring::const_iterator,gunichar> + ::SequenceToString(Glib::ustring::const_iterator pbegin, Glib::ustring::const_iterator pend) +: + std::string(pbegin.base(), pend.base()) +{} + +/**** Glib::ustring::FormatStream ******************************************/ + +ustring::FormatStream::FormatStream() +: + stream_ () +{} + +ustring::FormatStream::~FormatStream() +{} + +ustring ustring::FormatStream::to_string() const +{ + GError* error = 0; + +#ifdef GLIBMM_HAVE_WIDE_STREAM + const std::wstring str = stream_.str(); + +# if defined(__STDC_ISO_10646__) && SIZEOF_WCHAR_T == 4 + // Avoid going through iconv if wchar_t always contains UCS-4. + glong n_bytes = 0; + const ScopedPtr<char> buf (g_ucs4_to_utf8(reinterpret_cast<const gunichar*>(str.data()), + str.size(), 0, &n_bytes, &error)); +# elif defined(G_OS_WIN32) && SIZEOF_WCHAR_T == 2 + // Avoid going through iconv if wchar_t always contains UTF-16. 
+ glong n_bytes = 0; + const ScopedPtr<char> buf (g_utf16_to_utf8(reinterpret_cast<const gunichar2*>(str.data()), + str.size(), 0, &n_bytes, &error)); +# else + gsize n_bytes = 0; + const ScopedPtr<char> buf (g_convert(reinterpret_cast<const char*>(str.data()), + str.size() * sizeof(std::wstring::value_type), + "UTF-8", "WCHAR_T", 0, &n_bytes, &error)); +# endif /* !(__STDC_ISO_10646__ || G_OS_WIN32) */ + +#else /* !GLIBMM_HAVE_WIDE_STREAM */ + const std::string str = stream_.str(); + + gsize n_bytes = 0; + const ScopedPtr<char> buf (g_locale_to_utf8(str.data(), str.size(), 0, &n_bytes, &error)); +#endif /* !GLIBMM_HAVE_WIDE_STREAM */ + + if (error) + { + Glib::Error::throw_exception(error); + } + + return ustring(buf.get(), buf.get() + n_bytes); +} + +/**** Glib::ustring -- stream I/O operators ********************************/ + +std::istream& operator>>(std::istream& is, Glib::ustring& utf8_string) +{ + std::string str; + is >> str; + + GError* error = 0; + gsize n_bytes = 0; + const ScopedPtr<char> buf (g_locale_to_utf8(str.data(), str.size(), 0, &n_bytes, &error)); + + if (error) + { + Glib::Error::throw_exception(error); + } + + utf8_string.assign(buf.get(), buf.get() + n_bytes); + + return is; +} + +std::ostream& operator<<(std::ostream& os, const Glib::ustring& utf8_string) +{ + GError* error = 0; + const ScopedPtr<char> buf (g_locale_from_utf8(utf8_string.raw().data(), + utf8_string.raw().size(), 0, 0, &error)); + if (error) + { + Glib::Error::throw_exception(error); + } + + // This won't work if the string contains NUL characters. Unfortunately, + // std::ostream::write() ignores format flags, so we cannot use that. + // The only option would be to create a temporary std::string. However, + // even then GCC's libstdc++-v3 prints only the characters up to the first + // NUL. Given this, there doesn't seem much of a point in allowing NUL in + // formatted output. The semantics would be unclear anyway: what's the + // screen width of a NUL? 
+ os << buf.get(); + + return os; +} + +#ifdef GLIBMM_HAVE_WIDE_STREAM + +std::wistream& operator>>(std::wistream& is, ustring& utf8_string) +{ + GError* error = 0; + + std::wstring wstr; + is >> wstr; + +#if defined(__STDC_ISO_10646__) && SIZEOF_WCHAR_T == 4 + // Avoid going through iconv if wchar_t always contains UCS-4. + glong n_bytes = 0; + const ScopedPtr<char> buf (g_ucs4_to_utf8(reinterpret_cast<const gunichar*>(wstr.data()), + wstr.size(), 0, &n_bytes, &error)); +#elif defined(G_OS_WIN32) && SIZEOF_WCHAR_T == 2 + // Avoid going through iconv if wchar_t always contains UTF-16. + glong n_bytes = 0; + const ScopedPtr<char> buf (g_utf16_to_utf8(reinterpret_cast<const gunichar2*>(wstr.data()), + wstr.size(), 0, &n_bytes, &error)); +#else + gsize n_bytes = 0; + const ScopedPtr<char> buf (g_convert(reinterpret_cast<const char*>(wstr.data()), + wstr.size() * sizeof(std::wstring::value_type), + "UTF-8", "WCHAR_T", 0, &n_bytes, &error)); +#endif // !(__STDC_ISO_10646__ || G_OS_WIN32) + + if (error) + { + Glib::Error::throw_exception(error); + } + + utf8_string.assign(buf.get(), buf.get() + n_bytes); + + return is; +} + +std::wostream& operator<<(std::wostream& os, const ustring& utf8_string) +{ + GError* error = 0; + +#if defined(__STDC_ISO_10646__) && SIZEOF_WCHAR_T == 4 + // Avoid going through iconv if wchar_t always contains UCS-4. + const ScopedPtr<gunichar> buf (g_utf8_to_ucs4(utf8_string.raw().data(), + utf8_string.raw().size(), 0, 0, &error)); +#elif defined(G_OS_WIN32) && SIZEOF_WCHAR_T == 2 + // Avoid going through iconv if wchar_t always contains UTF-16. + const ScopedPtr<gunichar2> buf (g_utf8_to_utf16(utf8_string.raw().data(), + utf8_string.raw().size(), 0, 0, &error)); +#else + // TODO: For some reason the conversion from UTF-8 to WCHAR_T doesn't work + // with g_convert(), while iconv on the command line handles it just fine. + // Maybe a bug in GLib? 
+ const ScopedPtr<char> buf (g_convert(utf8_string.raw().data(), utf8_string.raw().size(), + "WCHAR_T", "UTF-8", 0, 0, &error)); +#endif // !(__STDC_ISO_10646__ || G_OS_WIN32) + + if (error) + { + Glib::Error::throw_exception(error); + } + + // This won't work if the string contains NUL characters. Unfortunately, + // std::wostream::write() ignores format flags, so we cannot use that. + // The only option would be to create a temporary std::wstring. However, + // even then GCC's libstdc++-v3 prints only the characters up to the first + // NUL. Given this, there doesn't seem much of a point in allowing NUL in + // formatted output. The semantics would be unclear anyway: what's the + // screen width of a NUL? + os << reinterpret_cast<wchar_t*>(buf.get()); + + return os; +} + +#endif /* GLIBMM_HAVE_WIDE_STREAM */ + +} // namespace Glib Property changes on: trunk/src/libustring/ustring.cc ___________________________________________________________________ Added: svn:mime-type + text/plain Added: svn:eol-style + native Added: trunk/src/libustring/ustring.h =================================================================== --- trunk/src/libustring/ustring.h (rev 0) +++ trunk/src/libustring/ustring.h 2012-03-20 13:35:13 UTC (rev 200) @@ -0,0 +1,1615 @@ +// -*- c++ -*- +#ifndef _GLIBMM_USTRING_H +#define _GLIBMM_USTRING_H + +/* $Id$ */ + +/* Copyright (C) 2002 The gtkmm Development Team + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <glibmmconfig.h> +#include <glibmm/unicode.h> +#include <glib.h> + +#include <iosfwd> +#include <iterator> +#include <sstream> +#include <string> +#ifndef GLIBMM_HAVE_STD_ITERATOR_TRAITS +#include <cstddef> /* for ptrdiff_t */ +#endif + +namespace Glib +{ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +#ifndef GLIBMM_HAVE_STD_ITERATOR_TRAITS + +template <class T> +struct IteratorTraits +{ + typedef typename T::iterator_category iterator_category; + typedef typename T::value_type value_type; + typedef typename T::difference_type difference_type; + typedef typename T::pointer pointer; + typedef typename T::reference reference; +}; + +template <class T> +struct IteratorTraits<T*> +{ + typedef std::random_access_iterator_tag iterator_category; + typedef T value_type; + typedef ptrdiff_t difference_type; + typedef T* pointer; + typedef T& reference; +}; + +template <class T> +struct IteratorTraits<const T*> +{ + typedef std::random_access_iterator_tag iterator_category; + typedef T value_type; + typedef ptrdiff_t difference_type; + typedef const T* pointer; + typedef const T& reference; +}; + +#endif /* GLIBMM_HAVE_STD_ITERATOR_TRAITS */ +#endif /* DOXYGEN_SHOULD_SKIP_THIS */ + + +/** The iterator type of Glib::ustring. + * Note this is not a random access iterator but a bidirectional one, + * since all index operations need to iterate over the UTF-8 data. Use + * std::advance() to move to a certain position. However, all of the + * relational operators are available: + * <tt>== != < > <= >=</tt> + * + * A writeable iterator isn't provided because: The number of bytes of + * the old UTF-8 character and the new one to write could be different. + * Therefore, any write operation would invalidate all other iterators + * pointing into the same string. 
+ */ +template <class T> +class ustring_Iterator +{ +public: + typedef std::bidirectional_iterator_tag iterator_category; + typedef gunichar value_type; + typedef std::string::difference_type difference_type; + typedef value_type reference; + typedef void pointer; + + inline ustring_Iterator(); + inline ustring_Iterator(const ustring_Iterator<std::string::iterator>& other); + + inline value_type operator*() const; + + inline ustring_Iterator<T> & operator++(); + inline const ustring_Iterator<T> operator++(int); + inline ustring_Iterator<T> & operator--(); + inline const ustring_Iterator<T> operator--(int); + + explicit inline ustring_Iterator(T pos); + inline T base() const; + +private: + T pos_; +}; + + +/** Extract a UCS-4 character from UTF-8 data. + * Convert a single UTF-8 (multibyte) character starting at @p pos to + * a UCS-4 wide character. This may read up to 6 bytes after the start + * position, depending on the UTF-8 character width. You have to make + * sure the source contains at least one valid UTF-8 character. + * + * This is mainly used by the implementation of Glib::ustring::iterator, + * but it might be useful as utility function if you prefer using + * std::string even for UTF-8 encoding. + */ +gunichar get_unichar_from_std_iterator(std::string::const_iterator pos) G_GNUC_PURE; + + +/** Glib::ustring has much the same interface as std::string, but contains + * %Unicode characters encoded as UTF-8. + * + * @par About UTF-8 and ASCII + * @par + * The standard character set ANSI_X3.4-1968 -- more commonly known as + * ASCII -- is a subset of UTF-8. So, if you want to, you can use + * Glib::ustring without even thinking about UTF-8. + * @par + * Whenever ASCII is mentioned in this manual, we mean the @em real ASCII + * (i.e. as defined in ANSI_X3.4-1968), which contains only 7-bit characters. + * Glib::ustring can @em not be used with ASCII-compatible extended 8-bit + * charsets like ISO-8859-1. 
It's a good idea to avoid string literals + * containing non-ASCII characters (e.g. German umlauts) in source code, + * or at least you should use UTF-8 literals. + * @par + * You can find a detailed UTF-8 and %Unicode FAQ here: + * http://www.cl.cam.ac.uk/~mgk25/unicode.html + * + * @par Glib::ustring vs. std::string + * @par + * Glib::ustring has implicit type conversions to and from std::string. + * These conversions do @em not convert to/from the current locale (see + * Glib::locale_from_utf8() and Glib::locale_to_utf8() if you need that). You + * can always use std::string instead of Glib::ustring -- however, using + * std::string with multi-byte characters is quite hard. For instance, + * <tt>std::string::operator[]</tt> might return a byte in the middle of a + * character, and <tt>std::string::length()</tt> returns the number of bytes + * rather than characters. So don't do that without a good reason. + * @par + * In a perfect world the C++ Standard Library would contain a UTF-8 string + * class. Unfortunately, the C++ standard doesn't mention UTF-8 at all. Note + * that std::wstring is not a UTF-8 string class because it contains only + * fixed-width characters (where width could be 32, 16, or even 8 bits). + * + * @par Glib::ustring and stream input/output + * @par + * The stream I/O operators, that is operator<<() and operator>>(), perform + * implicit charset conversion to/from the current locale. If that's not + * what you intended (e.g. when writing to a configuration file that should + * always be UTF-8 encoded) use ustring::raw() to override this behaviour. 
+ * @par + * If you're using std::ostringstream to build strings for display in the + * user interface, you must convert the result back to UTF-8 as shown below: + * @code + * std::ostringstream output; + * output.imbue(std::locale("")); // use the user's locale for this stream + * output << percentage << " % done"; + * label->set_text(Glib::locale_to_utf8(output.str())); + * @endcode + * + * @par Formatted output and internationalization + * @par + * The methods ustring::compose() and ustring::format() provide a convenient + * and powerful alternative to string streams, as shown in the example below. + * Refer to the method documentation of compose() and format() for details. + * @code + * using Glib::ustring; + * + * ustring message = ustring::compose("%1 is lower than 0x%2.", + * 12, ustring::format(std::hex, 16)); + * @endcode + * + * @par Implementation notes + * @par + * Glib::ustring does not inherit from std::string, because std::string was + * intended to be a final class. For instance, it does not have a virtual + * destructor. Also, a HAS-A relationship is more appropriate because + * ustring can't just enhance the std::string interface. Rather, it has to + * reimplement the interface so that all operations are based on characters + * instead of bytes. 
+ */ +class ustring +{ +public: + typedef std::string::size_type size_type; + typedef std::string::difference_type difference_type; + + typedef gunichar value_type; + typedef gunichar & reference; + typedef const gunichar & const_reference; + + typedef ustring_Iterator<std::string::iterator> iterator; + typedef ustring_Iterator<std::string::const_iterator> const_iterator; + +#ifndef GLIBMM_HAVE_SUN_REVERSE_ITERATOR + + typedef std::reverse_iterator<iterator> reverse_iterator; + typedef std::reverse_iterator<const_iterator> const_reverse_iterator; + +#else + + typedef std::reverse_iterator<iterator, + iterator::iterator_category, + iterator::value_type, + iterator::reference, + iterator::pointer, + iterator::difference_type> reverse_iterator; + typedef std::reverse_iterator<const_iterator, + const_iterator::iterator_category, + const_iterator::value_type, + const_iterator::reference, + const_iterator::pointer, + const_iterator::difference_type> const_reverse_iterator; + +#endif /* GLIBMM_HAVE_SUN_REVERSE_ITERATOR */ + +#ifdef GLIBMM_HAVE_ALLOWS_STATIC_INLINE_NPOS + static GLIBMM_API const size_type npos = std::string::npos; +#else + //The IRIX MipsPro compiler says "The indicated constant value is not known", + //so we need to initalize the static member data elsewhere. + static GLIBMM_API const size_type npos; +#endif + + /*! Default constructor, which creates an empty string. + */ + ustring(); + + ~ustring(); + + /*! Construct a ustring as a copy of another ustring. + * @param other A source string. + */ + ustring(const ustring& other); + + /*! Assign the value of another string to this string. + * @param other A source string. + */ + ustring& operator=(const ustring& other); + + /*! Swap contents with another string. + * @param other String to swap with. + */ + void swap(ustring& other); + + /*! Construct a ustring as a copy of another std::string. + * @param src A source <tt>std::string</tt> containing text encoded as UTF-8. 
+ */ + ustring(const std::string& src); + + /*! Construct a ustring as a copy of a substring. + * @param src %Source ustring. + * @param i Index of first character to copy from. + * @param n Number of UTF-8 characters to copy (defaults to copying the remainder). + */ + ustring(const ustring& src, size_type i, size_type n=npos); + + /*! Construct a ustring as a partial copy of a C string. + * @param src %Source C string encoded as UTF-8. + * @param n Number of UTF-8 characters to copy. + */ + ustring(const char* src, size_type n); + + /*! Construct a ustring as a copy of a C string. + * @param src %Source C string encoded as UTF-8. + */ + ustring(const char* src); + + /*! Construct a ustring as multiple characters. + * @param n Number of characters. + * @param uc UCS-4 code point to use. + */ + ustring(size_type n, gunichar uc); + + /*! Construct a ustring as multiple characters. + * @param n Number of characters. + * @param c ASCII character to use. + */ + ustring(size_type n, char c); + + /*! Construct a ustring as a copy of a range. + * @param pbegin Start of range. + * @param pend End of range. + */ + template <class In> ustring(In pbegin, In pend); + + +//! @name Assign new contents. +//! @{ + + ustring& operator=(const std::string& src); + ustring& operator=(const char* src); + ustring& operator=(gunichar uc); + ustring& operator=(char c); + + ustring& assign(const ustring& src); + ustring& assign(const ustring& src, size_type i, size_type n); + ustring& assign(const char* src, size_type n); + ustring& assign(const char* src); + ustring& assign(size_type n, gunichar uc); + ustring& assign(size_type n, char c); + template <class In> ustring& assign(In pbegin, In pend); + +//! @} +//! @name Append to the string. +//! 
@{ + + ustring& operator+=(const ustring& src); + ustring& operator+=(const char* src); + ustring& operator+=(gunichar uc); + ustring& operator+=(char c); + void push_back(gunichar uc); + void push_back(char c); + + ustring& append(const ustring& src); + ustring& append(const ustring& src, size_type i, size_type n); + ustring& append(const char* src, size_type n); + ustring& append(const char* src); + ustring& append(size_type n, gunichar uc); + ustring& append(size_type n, char c); + template <class In> ustring& append(In pbegin, In pend); + +//! @} +//! @name Insert into the string. +//! @{ + + ustring& insert(size_type i, const ustring& src); + ustring& insert(size_type i, const ustring& src, size_type i2, size_type n); + ustring& insert(size_type i, const char* src, size_type n); + ustring& insert(size_type i, const char* src); + ustring& insert(size_type i, size_type n, gunichar uc); + ustring& insert(size_type i, size_type n, char c); + + iterator insert(iterator p, gunichar uc); + iterator insert(iterator p, char c); + void insert(iterator p, size_type n, gunichar uc); + void insert(iterator p, size_type n, char c); + template <class In> void insert(iterator p, In pbegin, In pend); + +//! @} +//! @name Replace sub-strings. +//! 
@{ + + ustring& replace(size_type i, size_type n, const ustring& src); + ustring& replace(size_type i, size_type n, const ustring& src, size_type i2, size_type n2); + ustring& replace(size_type i, size_type n, const char* src, size_type n2); + ustring& replace(size_type i, size_type n, const char* src); + ustring& replace(size_type i, size_type n, size_type n2, gunichar uc); + ustring& replace(size_type i, size_type n, size_type n2, char c); + + ustring& replace(iterator pbegin, iterator pend, const ustring& src); + ustring& replace(iterator pbegin, iterator pend, const char* src, size_type n); + ustring& replace(iterator pbegin, iterator pend, const char* src); + ustring& replace(iterator pbegin, iterator pend, size_type n, gunichar uc); + ustring& replace(iterator pbegin, iterator pend, size_type n, char c); + template <class In> ustring& replace(iterator pbegin, iterator pend, In pbegin2, In pend2); + +//! @} +//! @name Erase sub-strings. +//! @{ + + void clear(); + ustring& erase(size_type i, size_type n=npos); + ustring& erase(); + iterator erase(iterator p); + iterator erase(iterator pbegin, iterator pend); + +//! @} +//! @name Compare and collate. +//! @{ + + int compare(const ustring& rhs) const; + int compare(const char* rhs) const; + int compare(size_type i, size_type n, const ustring& rhs) const; + int compare(size_type i, size_type n, const ustring& rhs, size_type i2, size_type n2) const; + int compare(size_type i, size_type n, const char* rhs, size_type n2) const; + int compare(size_type i, size_type n, const char* rhs) const; + + /*! Create a unique sorting key for the UTF-8 string. If you need to + * compare UTF-8 strings regularly, e.g. for sorted containers such as + * <tt>std::set<></tt>, you should consider creating a collate key first + * and compare this key instead of the actual string. 
+ * + * The ustring::compare() methods as well as the relational operators + * <tt>== != < > <= >=</tt> are quite costly + * because they have to deal with %Unicode and the collation rules defined by + * the current locale. Converting both operands to UCS-4 is just the first + * of several costly steps involved when comparing ustrings. So be careful. + */ + std::string collate_key() const; + + /*! Create a unique key for the UTF-8 string that can be used for caseless + * sorting. <tt>ustr.casefold_collate_key()</tt> results in the same string + * as <tt>ustr.casefold().collate_key()</tt>, but the former is likely more + * efficient. + */ + std::string casefold_collate_key() const; + +//! @} +//! @name Extract characters and sub-strings. +//! @{ + + /*! No reference return; use replace() to write characters. */ + value_type operator[](size_type i) const; + + /*! No reference return; use replace() to write characters. @throw std::out_of_range */ + value_type at(size_type i) const; + + inline ustring substr(size_type i=0, size_type n=npos) const; + +//! @} +//! @name Access a sequence of characters. +//! @{ + + iterator begin(); + iterator end(); + const_iterator begin() const; + const_iterator end() const; + reverse_iterator rbegin(); + reverse_iterator rend(); + const_reverse_iterator rbegin() const; + const_reverse_iterator rend() const; + +//! @} +//! @name Find sub-strings. +//! @{ + + size_type find(const ustring& str, size_type i=0) const; + size_type find(const char* str, size_type i, size_type n) const; + size_type find(const char* str, size_type i=0) const; + size_type find(gunichar uc, size_type i=0) const; + size_type find(char c, size_type i=0) const; + + size_type rfind(const ustring& str, size_type i=npos) const; + size_type rfind(const char* str, size_type i, size_type n) const; + size_type rfind(const char* str, size_type i=npos) const; + size_type rfind(gunichar uc, size_type i=npos) const; + size_type rfind(char c, size_type i=npos) const; + +//! 
@} +//! @name Match against a set of characters. +//! @{ + + size_type find_first_of(const ustring& match, size_type i=0) const; + size_type find_first_of(const char* match, size_type i, size_type n) const; + size_type find_first_of(const char* match, size_type i=0) const; + size_type find_first_of(gunichar uc, size_type i=0) const; + size_type find_first_of(char c, size_type i=0) const; + + size_type find_last_of(const ustri... [truncated message content] |