[Opentrep-svn] SF.net SVN: opentrep:[184] trunk/opentrep/test/i18n/icu/icutranslit.cpp
Status: Beta
Brought to you by:
denis_arnaud
From: <den...@us...> - 2009-08-21 18:46:33
|
Revision: 184 http://opentrep.svn.sourceforge.net/opentrep/?rev=184&view=rev Author: denis_arnaud Date: 2009-08-21 18:46:25 +0000 (Fri, 21 Aug 2009) Log Message: ----------- [i18n] A first version of transliteration is working (but not specific to any particular language). Support for specific languages should be added (from Unicode 4.2?). Modified Paths: -------------- trunk/opentrep/test/i18n/icu/icutranslit.cpp Modified: trunk/opentrep/test/i18n/icu/icutranslit.cpp =================================================================== --- trunk/opentrep/test/i18n/icu/icutranslit.cpp 2009-08-21 15:46:51 UTC (rev 183) +++ trunk/opentrep/test/i18n/icu/icutranslit.cpp 2009-08-21 18:46:25 UTC (rev 184) @@ -4,9 +4,8 @@ * others. All Rights Reserved. ********************************************************************/ // STL -//#include <cstdio> -//#include <cstdio> #include <iostream> +#include <cassert> // ICU #include <unicode/translit.h> //#include <unicode/rbt.h> @@ -17,17 +16,6 @@ #include "icutranslit_util.hpp" #include "icutranslit_unaccent.hpp" -// RuleBasedTransliterator rules to remove accents from characters -// so they can be displayed as ASCIIx -UnicodeString UNACCENT_RULES( - "[\\u00C0-\\u00C5] > A;" - "[\\u00C8-\\u00CB] > E;" - "[\\u00CC-\\u00CF] > I;" - "[\\u00E0-\\u00E5] > a;" - "[\\u00E8-\\u00EB] > e;" - "[\\u00EC-\\u00EF] > i;" - ); - /** Display the available Transliterators. */ void displayTransliterators () { UErrorCode status = U_ZERO_ERROR; @@ -75,49 +63,63 @@ defFmt->setCalendar (*cal); // Create a Any-Latin Transliterator - const char* lLatinTransliteratorID = "Any-Latin"; + const char* lLatinTransliteratorID = "Any-Latin; NFD; [:M:] Remove; NFC;"; Transliterator* lLatinTransliterator = Transliterator::createInstance (lLatinTransliteratorID, UTRANS_FORWARD, status); - if (lLatinTransliterator == 0) { + + if (lLatinTransliterator == NULL || U_FAILURE (status)) { std::cerr << "ERROR: Transliterator::createInstance() failed for " << lLatinTransliteratorID << std::endl; - return -1; + return -1; } + assert (lLatinTransliterator != NULL); - // Create a Unaccent Transliterator - const char* lUnaccentTransliteratorID = "Accents-Any"; - // const char* lUnaccentTransliteratorID = "Any-NFC"; - Transliterator* lUnaccentTransliterator = - Transliterator::createInstance (lUnaccentTransliteratorID, UTRANS_FORWARD, - status); - if (lUnaccentTransliterator == 0) { + // Register the Transliterator object, so that the ICU library + // manages the corresponding memory. + Transliterator::registerInstance (lLatinTransliterator); + + // RuleBasedTransliterator rules to transform alternate forms of + // quotes, so that they can be removed by the transliterator removing + // punctuation. + // For instance, ʹ (\u02B9) is transformed into ' (\u0027) + // (see + UnicodeString lUnquoteRules ("[\\u02B9] > \\u0027;"); + + // Create a transformation of alternate forms of quotes into + // standard quotes + UParseError pError; + Transliterator* lPunctuationTransliterator = + Transliterator::createFromRules ("RBTUnaccent", lUnquoteRules, + UTRANS_FORWARD, pError, status); + if (lPunctuationTransliterator == NULL || U_FAILURE (status)) { std::cerr << "ERROR: Transliterator::createInstance() failed for " - << lUnaccentTransliteratorID << std::endl; - return -1; + << toUTF8String (lUnquoteRules) << std::endl; + return -1; } + assert (lPunctuationTransliterator != NULL); - // Create a Unaccent Transliterator - const char* lNFCTransliteratorID = "Any-NFC"; - Transliterator* lNFCTransliterator = - Transliterator::createInstance (lNFCTransliteratorID, UTRANS_FORWARD, - status); - if (lNFCTransliterator == 0) { + // Register the Transliterator object, so that the ICU library + // manages the corresponding memory. + Transliterator::registerInstance (lPunctuationTransliterator); + + // Create a punctuation-remover Transliterator + const char* lUnpunctuateTransliteratorID = "[:P:] Remove;"; + Transliterator* lUnpunctuateTransliterator = + Transliterator::createInstance (lUnpunctuateTransliteratorID, + UTRANS_FORWARD, status); + + if (lUnpunctuateTransliterator == NULL || U_FAILURE (status)) { std::cerr << "ERROR: Transliterator::createInstance() failed for " - << lNFCTransliteratorID << std::endl; - return -1; + << lUnpunctuateTransliteratorID << std::endl; + return -1; } + assert (lUnpunctuateTransliterator != NULL); - // Create a custom Transliterator - UParseError pError; - Transliterator* rbtUnaccent = - Transliterator::createFromRules ("RBTUnaccent", UNACCENT_RULES, - UTRANS_FORWARD, pError, status); - check (status, "Transliterator::createFromRules"); + // Register the Transliterator object, so that the ICU library + // manages the corresponding memory. + Transliterator::registerInstance (lUnpunctuateTransliterator); - // Create a custom Transliterator - Transliterator* unaccent = new UnaccentTransliterator(); - // Loop over various months for (int32_t month = Calendar::JANUARY; month <= Calendar::DECEMBER; @@ -148,31 +150,23 @@ // Transliterate result lLatinTransliterator->transliterate (str); - //lUnaccentTransliterator->transliterate (str); - lNFCTransliterator->transliterate (str); - std::cout << "Transliterated via " << lLatinTransliteratorID - << " and " << lUnaccentTransliteratorID << ": "; + std::cout << "Transliterated via " << lLatinTransliteratorID << ": "; std::cout << toUTF8String (str) << std::endl; // Transliterate result - UnicodeString str2; - str2 = str; - rbtUnaccent->transliterate(str); + lPunctuationTransliterator->transliterate (str); + lLatinTransliterator->transliterate (str); + lUnpunctuateTransliterator->transliterate (str); std::cout << "Transliterated via RBT unaccent: "; std::cout << toUTF8String (str) << std::endl; - - unaccent->transliterate(str2); - std::cout << "Transliterated via normalizer unaccent: "; - std::cout << toUTF8String (str2) << std::endl; } // Clean up delete fmt; fmt = NULL; delete cal; cal = NULL; - delete lLatinTransliterator; lLatinTransliterator = NULL; - delete unaccent; unaccent = NULL; - delete rbtUnaccent; rbtUnaccent = NULL; + // delete lLatinTransliterator; lLatinTransliterator = NULL; + // delete lPunctuationTransliterator; lPunctuationTransliterator = NULL; std::cout << "Exiting successfully" << std::endl; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |