|
From: Gregorio L. <g.l...@gm...> - 2024-12-20 01:59:43
|
```c++
// Converters keyed by codepage name; UnicodeUtil::getConverter() inserts into this map and returns references to its entries.
std::map<std::string, Converter> UnicodeUtil::m_converters{};
/// Opens an ICU converter for the given codepage.
/// @param codepage  Codepage name; stored in m_codepage and handed to ucnv_open().
/// @throws std::runtime_error if m_error reports a failure after ucnv_open().
Converter::Converter(std::string const& codepage): m_codepage(codepage), m_converter(nullptr, &ucnv_close) {
    // m_converter starts out empty and takes ownership of the freshly opened handle here;
    // ucnv_close stays its deleter.
    m_converter.reset(ucnv_open(m_codepage.c_str(), m_error));
    if (!m_error.isFailure()) {
        return;
    }
    std::string message = "unicode/error: ";
    message += std::to_string(m_error.get());
    message += ": ";
    message += m_error.errorName();
    throw std::runtime_error(message);
}
Converter::Converter(Converter&& c) noexcept:
m_codepage(std::move(c.m_codepage)),
m_converter(std::move(c.m_converter)),
m_error(std::move(c.m_error)) {}
/// Decodes @p sv using this converter and returns the text as an icu::UnicodeString.
/// Note that, despite the name, the return value is an icu::UnicodeString rather than UTF-8 bytes;
/// the caller turns it into UTF-8 afterwards.
/// Calls are serialized on m_lock.
/// @param sv  Raw bytes in this converter's codepage.
/// @throws std::runtime_error if ICU reports a failure during the conversion.
icu::UnicodeString Converter::convertToUTF8(std::string_view sv) {
    std::scoped_lock l(m_lock);
    // m_error is a member and outlives each call. ICU functions return immediately when the
    // status they are handed already signals a failure, so a stale error from an earlier call
    // would make every later conversion on this converter fail as well. Start from a clean state.
    m_error.reset();
    icu::UnicodeString ret(sv.data(), static_cast<int>(sv.length()), m_converter.get(), m_error);
    if (m_error.isFailure()) throw std::runtime_error("Couldn't convert string: " + std::string(sv) + " to UTF-8. Error: " + std::to_string(m_error.get()) + ": " + m_error.errorName());
    return ret;
}
/// Returns the converter stored in m_converters for codepage @p s, creating it on first use.
/// @param s  Codepage name, used both as the map key and as the Converter constructor argument.
/// @return Reference to the map entry; std::map never relocates its elements, so the reference
///         stays valid after later insertions.
/// @throws std::runtime_error (from the Converter constructor) if the converter cannot be opened.
Converter& UnicodeUtil::getConverter(std::string const& s) {
    // Serializes lookups/insertions made through this function.
    // NOTE(review): any other code touching m_converters directly is not covered by this lock.
    static std::mutex convertersLock;
    std::scoped_lock guard(convertersLock);
    // Pass only the constructor argument: try_emplace builds the Converter in place and only
    // when the key is absent, instead of opening (and discarding) an ICU converter on every hit.
    return m_converters.try_emplace(s, s).first->second;
}
/// Converts @p str to UTF-8, optionally applying a case mapping.
/// @param str         Input bytes in a detected (or assumed) encoding.
/// @param _filename   Only used in the log message emitted when the input is not UTF-8; may be empty.
/// @param toCase      Case mapping applied to the decoded text (UPPER, LOWER, TITLE or NONE).
/// @param assumeUTF8  When true, charset detection is skipped and the input is treated as UTF-8.
/// @return The UTF-8 text, minus its first three bytes when removeUTF8BOM() reports a BOM.
/// @throws std::runtime_error propagated from getConverter() / Converter::convertToUTF8().
std::string UnicodeUtil::convertToUTF8 (std::string_view str, std::string _filename, CaseMapping toCase, bool assumeUTF8) {
    std::string charset;
    if (assumeUTF8) charset = "UTF-8";
    else charset = UnicodeUtil::getCharset(str);

    icu::UnicodeString ustring;
    if (charset != "UTF-8") {
        if (!_filename.empty()) {
            SpdLogger::info(LogSystem::I18N, "Filename={} does not seem to be UTF-8. Detected encoding={}", _filename, charset);
        }
        ustring = UnicodeUtil::getConverter(charset).convertToUTF8(str);
    }
    else {
        // A string_view is not guaranteed to be NUL-terminated, so pass the length explicitly.
        // Handing fromUTF8() a bare pointer would measure the text with strlen(), which can read
        // past the end of the view or stop early at an embedded NUL.
        ustring = icu::UnicodeString::fromUTF8(icu::StringPiece(str.data(), static_cast<int>(str.length())));
    }

    switch(toCase) {
        case CaseMapping::UPPER:
            ustring.toUpper();
            break;
        case CaseMapping::LOWER:
            ustring.toLower();
            break;
        case CaseMapping::TITLE:
            ustring.toTitle(0, icu::Locale(TranslationEngine::getCurrentLanguageCode().c_str()), U_TITLECASE_NO_LOWERCASE);
            break;
        case CaseMapping::NONE:
            break;
    }

    std::string ret;
    if (!ustring.isEmpty()) {
        ustring.toUTF8String(ret);
    }
    else if (!str.empty()) {
        // Non-empty input that decoded to nothing. (The previous check tested the still-empty
        // output string, so this message could never be logged.)
        SpdLogger::error(LogSystem::I18N, "Unable to convert text in unknown encoding={}", charset);
    }
    // A UTF-16 source has been observed to come out with a leading EF BB BF.
    // NOTE(review): most likely this is the source's own U+FEFF BOM, which ICU's UTF-16LE/UTF-16BE
    // converters pass through as an ordinary character; confirm against the detected charset name.
    return ret.substr(removeUTF8BOM(ret) ? 3 : 0);
}
```
Before adding the `substr` solution at the end, I had put the following lines to get a better look at what was going on:
```c++
// Temporary diagnostics: dump the first five bytes of the converted string, and the first five
// bytes that follow the BOM when removeUTF8BOM() reports one.
if (ret.length() >= 5 && SpdLogger::initialized()) {
    // Go through unsigned char so bytes >= 0x80 are logged as 80..FF whatever the signedness of char.
    auto const byteAt = [](std::string_view bytes, std::size_t index) {
        return static_cast<unsigned>(static_cast<unsigned char>(bytes[index]));
    };
    std::string_view const converted(ret);
    SpdLogger::debug(LogSystem::I18N, "Converted from charset={} -- Original string beginning: {}\nret[0]={:X}, ret[1]={:X}, ret[2]={:X}, ret[3]={:X}, ret[4]={:X}", charset, str.substr(0,16), byteAt(converted, 0), byteAt(converted, 1), byteAt(converted, 2), byteAt(converted, 3), byteAt(converted, 4));
    // Five bytes after a three-byte BOM need at least eight bytes in total; indexing
    // ret.substr(3)[4] on a shorter string would read out of bounds.
    if (removeUTF8BOM(ret) && ret.length() >= 8) {
        // One view instead of five temporary std::string copies.
        std::string_view const afterBom = std::string_view(ret).substr(3);
        SpdLogger::debug(LogSystem::I18N, "After trying to remove the BOM... ret[0]={:X}, ret[1]={:X}, ret[2]={:X}, ret[3]={:X}, ret[4]={:X}", byteAt(afterBom, 0), byteAt(afterBom, 1), byteAt(afterBom, 2), byteAt(afterBom, 3), byteAt(afterBom, 4));
    }
}
```
It seems that the BOM only gets prepended for a UTF-16 source (I also tried converting from ANSI and from Shift-JIS, and neither produced one).
Considering that a BOM is discouraged for UTF-8, I would expect ICU to just remove the UTF-16 BOM rather than add a new one.
Gregorio Litenstein Goldzweig
Médico Cirujano
• Fono: +56 9 96343643
• E-Mail: g.l...@gm...
On 17 Dec 2024 13:30 -0300, Steven R. Loomis <sr...@gm...>, wrote:
> Hi,
> Can you post the exact code you’re using? You may have chosen an encoding which includes a BOM.
>
> -s
>
> --
> Steven R. Loomis
> Code Hive Tx, LLC
> https://codehivetx.us
>
>
>
> > On Dec 17, 2024, at 9:32 AM, Gregorio Litenstein <g.l...@gm...> wrote:
> >
> > While debugging some stuff related to text conversion, I have noticed that converting from UTF16 to UTF8 (via an intermediary `UnicodeString` and `toUTF8String`) results in a UTF8 string that starts with \xEF\xBB\xBF. Why is this BOM being appended to my string, and why does it only seem to happen when converting from UTF16?
> >
> > P.S. I am using icu4c 74.2
> >
> > --
> > You received this message because you are subscribed to the Google Groups "icu-support" group.
> > To unsubscribe from this group and stop receiving emails from it, send an email to icu...@un....
> > To view this discussion visit https://groups.google.com/a/unicode.org/d/msgid/icu-support/72cd8a2b-6e86-4b94-85e7-a77aca1f031bn%40unicode.org.
> >
> > --
> > You received this message because you are subscribed to the Google Groups "ICU - Team" group.
> > To unsubscribe from this group and stop receiving emails from it, send an email to icu...@un....
> > To view this discussion visit https://groups.google.com/a/unicode.org/d/msgid/icu-team/72cd8a2b-6e86-4b94-85e7-a77aca1f031bn%40unicode.org.
>
--
You received this message because you are subscribed to the Google Groups "icu-support" group.
To unsubscribe from this group and stop receiving emails from it, send an email to icu...@un....
To view this discussion visit https://groups.google.com/a/unicode.org/d/msgid/icu-support/e3e41b35-ea3a-4130-8560-d3a19b632867%40Spark.
|