opentrep-svn Mailing List for Open Travel Request Parser (Page 3)
Status: Beta
Brought to you by:
denis_arnaud
You can subscribe to this list here.
2009 |
Jan
|
Feb
|
Mar
|
Apr
|
May
|
Jun
|
Jul
(52) |
Aug
(19) |
Sep
(4) |
Oct
(10) |
Nov
(2) |
Dec
(4) |
---|---|---|---|---|---|---|---|---|---|---|---|---|
2010 |
Jan
(3) |
Feb
|
Mar
(3) |
Apr
|
May
|
Jun
(1) |
Jul
(2) |
Aug
(1) |
Sep
(9) |
Oct
|
Nov
(1) |
Dec
|
2011 |
Jan
|
Feb
(8) |
Mar
|
Apr
|
May
|
Jun
|
Jul
(2) |
Aug
|
Sep
|
Oct
|
Nov
|
Dec
|
From: <den...@us...> - 2009-08-24 23:11:47
|
Revision: 189 http://opentrep.svn.sourceforge.net/opentrep/?rev=189&view=rev Author: denis_arnaud Date: 2009-08-24 23:11:36 +0000 (Mon, 24 Aug 2009) Log Message: ----------- [i18n] Added both the Russian-to-Latin and Ukrainian-to-Latin transliterators, and added a few examples of Ukrainian cities, both in Russian and in Ukrainian, so as to compare both transliterations. Modified Paths: -------------- trunk/opentrep/test/i18n/icu/Makefile.am Added Paths: ----------- trunk/opentrep/test/i18n/icu/icutranslitru.cpp trunk/opentrep/test/i18n/icu/russian_latin_bgn.hpp trunk/opentrep/test/i18n/icu/ukrainian_latin_bgn.hpp Property Changed: ---------------- trunk/opentrep/test/i18n/icu/ Property changes on: trunk/opentrep/test/i18n/icu ___________________________________________________________________ Modified: svn:ignore - .libs .deps Makefile.in Makefile icufmt icucharsetdetector icuustring icuconv icuutext icutranslit + .libs .deps Makefile.in Makefile icufmt icucharsetdetector icuustring icuconv icuutext icutranslit icutranslitru Modified: trunk/opentrep/test/i18n/icu/Makefile.am =================================================================== --- trunk/opentrep/test/i18n/icu/Makefile.am 2009-08-24 20:07:46 UTC (rev 188) +++ trunk/opentrep/test/i18n/icu/Makefile.am 2009-08-24 23:11:36 UTC (rev 189) @@ -4,7 +4,7 @@ MAINTAINERCLEANFILES = Makefile.in check_PROGRAMS = icufmt icuustring icucharsetdetector icuconv \ - icuutext icutranslit + icuutext icutranslit icutranslitru icufmt_SOURCES = icufmt.cpp icufmt_CXXFLAGS = $(ICU_CFLAGS) @@ -31,4 +31,9 @@ icutranslit_CXXFLAGS = $(ICU_CFLAGS) icutranslit_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB) +icutranslitru_SOURCES = icutranslitru.cpp \ + russian_latin_bgn.hpp ukrainian_latin_bgn.hpp +icutranslitru_CXXFLAGS = $(ICU_CFLAGS) +icutranslitru_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB) + EXTRA_DIST = Added: trunk/opentrep/test/i18n/icu/icutranslitru.cpp =================================================================== --- 
trunk/opentrep/test/i18n/icu/icutranslitru.cpp (rev 0) +++ trunk/opentrep/test/i18n/icu/icutranslitru.cpp 2009-08-24 23:11:36 UTC (rev 189) @@ -0,0 +1,289 @@ +// STL +#include <cassert> +#include <iostream> +#include <sstream> +#include <string> +// ICU +#include <unicode/translit.h> +#include <unicode/unistr.h> +#include <unicode/ucnv.h> +// +#include "russian_latin_bgn.hpp" +#include "ukrainian_latin_bgn.hpp" + +// ///////////////////////////////////////////////////////////////////// +std::string toUTF8String (const UnicodeString& iString) { + std::ostringstream oStr; + + // String length + // const int32_t lLength = iString.length(); + + // Default codepage conversion + const int32_t lCapacity = 1000; + UChar lUCharString[lCapacity]; + UErrorCode status = U_ZERO_ERROR; + const int32_t actualLen = iString.extract (lUCharString, lCapacity, status); + assert (U_SUCCESS (status)); + lUCharString[actualLen] = '\0'; + + // UTF-8 converter + UConverter* cnv = ucnv_open ("UTF-8", &status); + assert (U_SUCCESS (status)); + + char lCharString[1000]; + // const int32_t nbOfConvertedChars = + ucnv_fromUChars (cnv, lCharString, 1000, lUCharString, -1, &status); + assert (U_SUCCESS (status)); + + // DEBUG + /* + std::cout << "toUTF8String(): converted " << nbOfConvertedChars + << " for the UnicodeString '" << uprintf(iString) + << "' (of length " << lLength << std::endl; + */ + + oStr << lCharString; + + return oStr.str(); +} + +// ///////////////////////////////////////////////////////////////////// +void initPunctuationTransliterators(Transliterator*& lNormaliser, + Transliterator*& lPunctuationTransliterator, + Transliterator*& lUnpunctuateTransliterator) { + lPunctuationTransliterator = NULL; + lUnpunctuateTransliterator = NULL; + + // Create a Normaliser + UErrorCode status = U_ZERO_ERROR; + const char* lNormaliserID = "NFD; [:M:] Remove; NFC;"; + lNormaliser = + Transliterator::createInstance (lNormaliserID, UTRANS_FORWARD, status); + + if (lNormaliser == NULL || 
U_FAILURE (status)) { + std::cerr << "ERROR: Transliterator::createInstance() failed for " + << lNormaliserID << std::endl; + return; + } + assert (lNormaliser != NULL); + + // Register the Transliterator object, so that the ICU library + // manages the corresponding memory. + Transliterator::registerInstance (lNormaliser); + + // RuleBasedTransliterator rules to transform alternate forms of + // quotes, so that they can be removed by the transliterator removing + // punctuation. + // For instance, ʹ (\u02B9) is transformed into ' (\u0027) + // and - (\u002D) is transformed into space (\u0020) + // (see + UnicodeString lUnquoteRules ("[\\u02B9] > \\u0027; [\\u002D] > \\u0020;"); + + // Create a transformation of alternate forms of quotes into + // standard quotes + UParseError pError; + lPunctuationTransliterator = + Transliterator::createFromRules ("RBTUnaccent", lUnquoteRules, + UTRANS_FORWARD, pError, status); + if (lPunctuationTransliterator == NULL || U_FAILURE (status)) { + std::cerr << "ERROR: Transliterator::createInstance() failed for " + << toUTF8String (lUnquoteRules) << std::endl; + return; + } + assert (lPunctuationTransliterator != NULL); + + // Register the Transliterator object, so that the ICU library + // manages the corresponding memory. + Transliterator::registerInstance (lPunctuationTransliterator); + + // Create a punctuation-remover Transliterator + const char* lUnpunctuateTransliteratorID = "[:P:] Remove;"; + lUnpunctuateTransliterator = + Transliterator::createInstance (lUnpunctuateTransliteratorID, + UTRANS_FORWARD, status); + + if (lUnpunctuateTransliterator == NULL || U_FAILURE (status)) { + std::cerr << "ERROR: Transliterator::createInstance() failed for " + << lUnpunctuateTransliteratorID << std::endl; + return; + } + assert (lUnpunctuateTransliterator != NULL); + + // Register the Transliterator object, so that the ICU library + // manages the corresponding memory. 
+ Transliterator::registerInstance (lUnpunctuateTransliterator); +} + +// ///////////////////////////////////////////////////////////////////// +void removeQuotesAndPunctuation (UnicodeString& ioString, + Transliterator& lNormaliser, + Transliterator& lPunctuationTransliterator, + Transliterator& lUnpunctuateTransliterator) { + + // Remove quotes and punctuations + lNormaliser.transliterate (ioString); + lPunctuationTransliterator.transliterate (ioString); + lUnpunctuateTransliterator.transliterate (ioString); + + // Display the result + std::cout << "After normalisation and without accents: "; + std::cout << toUTF8String (ioString) << std::endl; +} + +// ////////////////////////// M A I N ////////////////////////////// +int main (int argc, char* argv[]) { + + // Default values + UErrorCode status = U_ZERO_ERROR; + + // + Transliterator* lNormaliser = NULL; + Transliterator* lPunctuationTransliterator = NULL; + Transliterator* lUnpunctuateTransliterator = NULL; + initPunctuationTransliterators (lNormaliser, lPunctuationTransliterator, + lUnpunctuateTransliterator); + assert (lPunctuationTransliterator != NULL + && lUnpunctuateTransliterator != NULL); + + // Create a Russian-Latin/BGN transliterator (which may be + // integrated in the more recent versions of the ICU library) + UParseError pError; + Transliterator* lRussianLatinBGNTransliterator = + Transliterator::createFromRules ("RBTRussianLatinBGN", + RUSSIAN_LATIN_BGN_TRANSLITERATION_RULES, + UTRANS_FORWARD, pError, status); + if (lRussianLatinBGNTransliterator == NULL || U_FAILURE (status)) { + std::cerr << "ERROR: Transliterator::createInstance() failed for " + << "RBTRussianLatinBGN" << std::endl; + return -1; + } + assert (lRussianLatinBGNTransliterator != NULL); + + // Create a Ukrainian-Latin/BGN transliterator (which may be + // integrated in the more recent versions of the ICU library) + Transliterator* lUkrainianLatinBGNTransliterator = + Transliterator::createFromRules("RBTUkrainianLatinBGN", + 
UKRAINIAN_LATIN_BGN_TRANSLITERATION_RULES, + UTRANS_FORWARD, pError, status); + if (lUkrainianLatinBGNTransliterator == NULL || U_FAILURE (status)) { + std::cerr << "ERROR: Transliterator::createInstance() failed for " + << "RBTUkrainianLatinBGN" << std::endl; + return -1; + } + assert (lUkrainianLatinBGNTransliterator != NULL); + + // Create a Cyrillic-Latin Transliterator + const char* lCyrillicLatinTransliteratorID = "Cyrillic-Latin; NFD; [:M:] Remove; NFC;"; + Transliterator* lCyrillicLatinTransliterator = + Transliterator::createInstance (lCyrillicLatinTransliteratorID, + UTRANS_FORWARD, status); + + if (lCyrillicLatinTransliterator == NULL || U_FAILURE (status)) { + std::cerr << "ERROR: Transliterator::createInstance() failed for " + << lCyrillicLatinTransliteratorID << std::endl; + return -1; + } + assert (lCyrillicLatinTransliterator != NULL); + + // Register the Transliterator object, so that the ICU library + // manages the corresponding memory. + Transliterator::registerInstance (lCyrillicLatinTransliterator); + + // Create a Any-Latin Transliterator + const char* lLatinTransliteratorID = "Any-Latin; NFD; [:M:] Remove; NFC;"; + Transliterator* lLatinTransliterator = + Transliterator::createInstance (lLatinTransliteratorID, UTRANS_FORWARD, + status); + + if (lLatinTransliterator == NULL || U_FAILURE (status)) { + std::cerr << "ERROR: Transliterator::createInstance() failed for " + << lLatinTransliteratorID << std::endl; + return -1; + } + assert (lLatinTransliterator != NULL); + + // Register the Transliterator object, so that the ICU library + // manages the corresponding memory. + Transliterator::registerInstance (lLatinTransliterator); + + // + // Define an UTF-8-encoded query string, with a mix of Russian and + // Ukrainian places. 
+ UnicodeString lQueryString ("München (de) - Мюнхен (ru), " + "Житомир (ru & uk), Ужгород (ru & uk), " + "Херсон (ru & uk), " + "Украина (ru) - Україна (uk), " + "Киев (ru) - Київ (uk), " + "Львов (ru) - Львів (uk), " + "Днепропетровск (ru) - Дніпропетровськ (uk), " + "Ивано-Франковск (ru) - Івано-Франківськ (uk), " + "Хортица (ru) - Хортиця (uk), " + "Запорожье (ru) - Запоріжжя (uk), " + "Хмельницкий (ru) - Хмельницький (uk), " + "Одесса (ru) - Одеса (uk), " + "Кировоград (ru) - Кіровоград (uk), " + "Луганск (ru) - Луганськ (uk), " + "Харьков (ru) - Харків (uk), " + "Чернигов (ru) - Чернігів (uk), " + "Донецк (ru) - Донецьк (uk), " + "Черкассы (ru) - Черкаси (uk), " + "Черновцы (ru) - Чернівці (uk), " + "Винница (ru) - Вінниця (uk)"); + UnicodeString lQueryString2 (lQueryString); + UnicodeString lQueryString3 (lQueryString); + UnicodeString lQueryString4 (lQueryString); + + // + std::cout << std::endl << "-----------------" << std::endl + << "Cities in Russian and Ukrainian languages: "; + std::cout << toUTF8String (lQueryString) << std::endl; + + // Transliterate with the classical Any-Latin transliterator + lLatinTransliterator->transliterate (lQueryString); + + std::cout << std::endl << "Transliterated via " + << lLatinTransliteratorID << ": "; + std::cout << toUTF8String (lQueryString) << std::endl; + + removeQuotesAndPunctuation (lQueryString, *lNormaliser, + *lPunctuationTransliterator, + *lUnpunctuateTransliterator); + + // Transliterate with the Cyrillic-Latin transliterator + /** (it seems to be the same as the Any-Latin transliteration) + lCyrillicLatinTransliterator->transliterate (lQueryString4); + + std::cout << std::endl << "Transliterated via " + << lCyrillicLatinTransliteratorID << ": "; + std::cout << toUTF8String (lQueryString4) << std::endl; + + removeQuotesAndPunctuation (lQueryString4, *lNormaliser, + *lPunctuationTransliterator, + *lUnpunctuateTransliterator); + */ + + // Transliterate with the rule-based Russian-Latin/BGN transliterator + 
lRussianLatinBGNTransliterator->transliterate (lQueryString2); + + std::cout << std::endl << "Transliterated via RBTRussianLatinBGN: "; + std::cout << toUTF8String (lQueryString2) << std::endl; + + removeQuotesAndPunctuation (lQueryString2, *lNormaliser, + *lPunctuationTransliterator, + *lUnpunctuateTransliterator); + + // Transliterate with the rule-based Ukrainian-Latin/BGN transliterator + lUkrainianLatinBGNTransliterator->transliterate (lQueryString3); + + std::cout << std::endl << "Transliterated via RBTUkrainianLatinBGN: "; + std::cout << toUTF8String (lQueryString3) << std::endl; + + removeQuotesAndPunctuation (lQueryString3, *lNormaliser, + *lPunctuationTransliterator, + *lUnpunctuateTransliterator); + + + std::cout << "Exiting successfully" << std::endl; + + return 0; +} Added: trunk/opentrep/test/i18n/icu/russian_latin_bgn.hpp =================================================================== --- trunk/opentrep/test/i18n/icu/russian_latin_bgn.hpp (rev 0) +++ trunk/opentrep/test/i18n/icu/russian_latin_bgn.hpp 2009-08-24 23:11:36 UTC (rev 189) @@ -0,0 +1,107 @@ +// ICU +#include <unicode/unistr.h> + +extern const UnicodeString RUSSIAN_LATIN_BGN_TRANSLITERATION_RULES ( +"::[XxЁА-ФЦ-фц-яё];" +"::NFD(NFC);" +"[аеиоуыэ-яё]ы > y;" +"[ЁАЕИОУЫЭ-Я][Ыы] > Y;" +"А > A;" +"а > a;" +"Б > B;" +"б > b;" +"В > V;" +"в > v;" +"Г > G;" +"г > g;" +"Д > D;" +"д > d;" +"{Е}[[ЁАЕИОУЫЭ-Я][ЙЪЬ]] > YE;" +"{Е}[[аеиоуыэ-яё][йъь]] > Ye;" +"[^[:L:][:M:][:N:]]{Е} > Ye;" +"Е > E;" +"{е}[[ЁАЕИОУЫЭ-Я][аеиоуыэ-яё][ЙЪЬйъь]] > ye;" +"[^[:L:][:M:][:N:]]{е} > ye;" +"е > e;" +"{Ё}[[ЁАЕИОУЫЭ-Я][ЙЪЬ]] > YË;" +"{Ё}[[аеиоуыэ-яё][йъь]] > Yë;" +"[^[:L:][:M:][:N:]]{Ё} > YË;" +"Ё > Ë;" +"{ё}[[ЁАЕИОУЫЭ-Я][аеиоуыэ-яё][ЙЪЬйъь]] > yë;" +"[^[:L:][:M:][:N:]]{ё} > yë;" +"ё > ë;" +"{Ж}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Zh;" +"Ж > ZH;" +"ж > zh;" +"З > Z;" +"з > z;" +"{[[[Б-ДЖЙ-НП-ТФ-ЩЭ][б-джй-нп-тф-щэ]]-[Йй]]}З > ·Е;" +"{[[[Б-ДЖЙ-НП-ТФ-ЩЭ][б-джй-нп-тф-щэ]]-[Йй]]}з > ·е;" +"И > I;" +"и > i;" +"{Й}[АУЫЭауыэ] > 
Y·;" +"{й}[АУЫЭауыэ] > y·;" +"Й > Y;" +"й > y;" +"К > K;" +"к > k;" +"Л > L;" +"л > l;" +"М > M;" +"м > m;" +"Н > N;" +"н > n;" +"О > O;" +"о > o;" +"П > P;" +"п > p;" +"Р > R;" +"р > r;" +"С > S;" +"с > s;" +"ТС > T·S;" +"Тс > T·s;" +"тс > t·s;" +"Т > T;" +"т > t;" +"У > U;" +"у > u;" +"Ф > F;" +"ф > f;" +"{Х}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Kh;" +"Х > KH;" +"х > kh;" +"{Ц}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Ts;" +"Ц > TS;" +"ц > ts;" +"{Ч}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Ch;" +"Ч > CH;" +"ч > ch;" +"ШЧ > SH·CH;" +"Шч > Sh·ch;" +"шч > sh·ch;" +"{Ш}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Sh;" +"Ш > SH;" +"ш > sh;" +"{Щ}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Shch;" +"Щ > SHCH;" +"щ > shch;" +"Ъ > ʺ;" +"ъ > ʺ;" +"{[[ЁАЕИОУЫЭ-Я][аеиоуыэ-яё]]}Ы > ·Y;" +"{[[ЁАЕИОУЫЭ-Я][аеиоуыэ-яё]]}ы > ·y;" +"{Ы}[АУЫЭауыэ] > Y·;" +"{ы}[ауыэ] > y·;" +"Ы > Y;" +"ы > y;" +"Ь > ʹ;" +"ь > ʹ;" +"Э > E;" +"э > e;" +"{Ю}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Yu;" +"Ю > YU;" +"ю > yu;" +"{Я}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Ya;" +"Я > YA;" +"я > ya;" +); Added: trunk/opentrep/test/i18n/icu/ukrainian_latin_bgn.hpp =================================================================== --- trunk/opentrep/test/i18n/icu/ukrainian_latin_bgn.hpp (rev 0) +++ trunk/opentrep/test/i18n/icu/ukrainian_latin_bgn.hpp 2009-08-24 23:11:36 UTC (rev 189) @@ -0,0 +1,102 @@ +// ICU +#include <unicode/unistr.h> + +extern const UnicodeString UKRAINIAN_LATIN_BGN_TRANSLITERATION_RULES ( +"::[ЄІЇА-ЩЬЮ-щьюяєіїҐґ’];" +"::NFD(NFC);" +"А > A;" +"а > a;" +"Б > B;" +"б > b;" +"В > V;" +"в > v;" +"Г > H;" +"г > h;" +"Ґ > G;" +"ґ > g;" +"Д > D;" +"д > d;" +"Е > E;" +"е > e;" +"{Є}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Ye;" +"Є > YE;" +"є > ye;" +"{Ж}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Zh;" +"Ж > ZH;" +"ж > zh;" +"ЗГ > Z·H;" +"Зг > Z·h;" +"зг > z·h;" +"З > Z;" +"з > z;" +"И > Y;" +"и > y;" +"І > I;" +"і > i;" +"{Ї}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Yi;" +"Ї > YI;" +"ї > yi;" +"Й > I;" +"й > i;" +"КГ > K·H;" +"Кг > K·h;" +"кг > k·h;" +"К > K;" +"к > k;" +"Л 
> L;" +"л > l;" +"М > M;" +"м > m;" +"Н > N;" +"н > n;" +"О > O;" +"о > o;" +"П > P;" +"п > p;" +"Р > R;" +"р > r;" +"СГ > S·H;" +"Сг > S·h;" +"сг > s·h;" +"С > S;" +"с > s;" +"ТС > T·S;" +"Тс > T·s;" +"тс > t·s;" +"Т > T;" +"т > t;" +"У > U;" +"у > u;" +"Ф > F;" +"ф > f;" +"{Х}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Kh;" +"Х > KH;" +"х > kh;" +"ЦГ > TS·H;" +"Цг > Ts·h;" +"цг > ts·h;" +"{Ц}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Ts;" +"Ц > TS;" +"ц > ts;" +"{Ч}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Ch;" +"Ч > CH;" +"ч > ch;" +"ШЧ > SH·CH;" +"Шч > Sh·ch;" +"шч > sh·ch;" +"{Ш}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Sh;" +"Ш > SH;" +"ш > sh;" +"{Щ}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Shch;" +"Щ > SHCH;" +"щ > shch;" +"{Ю}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Yu;" +"Ю > YU;" +"ю > yu;" +"{Я}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Ya;" +"Я > YA;" +"я > ya;" +"Ь > ʹ;" +"ь > ʹ;" +"’ > ʺ;" +); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-24 20:07:55
|
Revision: 188 http://opentrep.svn.sourceforge.net/opentrep/?rev=188&view=rev Author: denis_arnaud Date: 2009-08-24 20:07:46 +0000 (Mon, 24 Aug 2009) Log Message: ----------- [i18n] Added the Rules for the Rule-Based Transformation (RBT) of Russian and Ukrainian to Latin/BGN. Added Paths: ----------- trunk/opentrep/test/i18n/icu/russian_latin_bgn.txt trunk/opentrep/test/i18n/icu/ukrainian_latin_bgn.txt Added: trunk/opentrep/test/i18n/icu/russian_latin_bgn.txt =================================================================== --- trunk/opentrep/test/i18n/icu/russian_latin_bgn.txt (rev 0) +++ trunk/opentrep/test/i18n/icu/russian_latin_bgn.txt 2009-08-24 20:07:46 UTC (rev 188) @@ -0,0 +1,102 @@ +::[XxЁА-ФЦ-фц-яё]; +::NFD(NFC); +[аеиоуыэ-яё]ы > y; +[ЁАЕИОУЫЭ-Я][Ыы] > Y; +А > A; +а > a; +Б > B; +б > b; +В > V; +в > v; +Г > G; +г > g; +Д > D; +д > d; +{Е}[[ЁАЕИОУЫЭ-Я][ЙЪЬ]] > YE; +{Е}[[аеиоуыэ-яё][йъь]] > Ye; +[^[:L:][:M:][:N:]]{Е} > Ye; +Е > E; +{е}[[ЁАЕИОУЫЭ-Я][аеиоуыэ-яё][ЙЪЬйъь]] > ye; +[^[:L:][:M:][:N:]]{е} > ye; +е > e; +{Ё}[[ЁАЕИОУЫЭ-Я][ЙЪЬ]] > YË; +{Ё}[[аеиоуыэ-яё][йъь]] > Yë; +[^[:L:][:M:][:N:]]{Ё} > YË; +Ё > Ë; +{ё}[[ЁАЕИОУЫЭ-Я][аеиоуыэ-яё][ЙЪЬйъь]] > yë; +[^[:L:][:M:][:N:]]{ё} > yë; +ё > ë; +{Ж}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Zh; +Ж > ZH; +ж > zh; +З > Z; +з > z; +{[[[Б-ДЖЙ-НП-ТФ-ЩЭ][б-джй-нп-тф-щэ]]-[Йй]]}З > ·Е; +{[[[Б-ДЖЙ-НП-ТФ-ЩЭ][б-джй-нп-тф-щэ]]-[Йй]]}з > ·е; +И > I; +и > i; +{Й}[АУЫЭауыэ] > Y·; +{й}[АУЫЭауыэ] > y·; +Й > Y; +й > y; +К > K; +к > k; +Л > L; +л > l; +М > M; +м > m; +Н > N; +н > n; +О > O; +о > o; +П > P; +п > p; +Р > R; +р > r; +С > S; +с > s; +ТС > T·S; +Тс > T·s; +тс > t·s; +Т > T; +т > t; +У > U; +у > u; +Ф > F; +ф > f; +{Х}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Kh; +Х > KH; +х > kh; +{Ц}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Ts; +Ц > TS; +ц > ts; +{Ч}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Ch; +Ч > CH; +ч > ch; +ШЧ > SH·CH; +Шч > Sh·ch; +шч > sh·ch; +{Ш}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Sh; +Ш > SH; +ш > sh; +{Щ}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Shch; +Щ > 
SHCH; +щ > shch; +Ъ > ʺ; +ъ > ʺ; +{[[ЁАЕИОУЫЭ-Я][аеиоуыэ-яё]]}Ы > ·Y; +{[[ЁАЕИОУЫЭ-Я][аеиоуыэ-яё]]}ы > ·y; +{Ы}[АУЫЭауыэ] > Y·; +{ы}[ауыэ] > y·; +Ы > Y; +ы > y; +Ь > ʹ; +ь > ʹ; +Э > E; +э > e; +{Ю}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Yu; +Ю > YU; +ю > yu; +{Я}[[б-джй-нп-тф-щэ][аеиоуыэ-яё]] > Ya; +Я > YA; +я > ya; Added: trunk/opentrep/test/i18n/icu/ukrainian_latin_bgn.txt =================================================================== --- trunk/opentrep/test/i18n/icu/ukrainian_latin_bgn.txt (rev 0) +++ trunk/opentrep/test/i18n/icu/ukrainian_latin_bgn.txt 2009-08-24 20:07:46 UTC (rev 188) @@ -0,0 +1,97 @@ +::[ЄІЇА-ЩЬЮ-щьюяєіїҐґ’]; +::NFD(NFC); +А > A; +а > a; +Б > B; +б > b; +В > V; +в > v; +Г > H; +г > h; +Ґ > G; +ґ > g; +Д > D; +д > d; +Е > E; +е > e; +{Є}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Ye; +Є > YE; +є > ye; +{Ж}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Zh; +Ж > ZH; +ж > zh; +ЗГ > Z·H; +Зг > Z·h; +зг > z·h; +З > Z; +з > z; +И > Y; +и > y; +І > I; +і > i; +{Ї}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Yi; +Ї > YI; +ї > yi; +Й > Y; +й > y; +КГ > K·H; +Кг > K·h; +кг > k·h; +К > K; +к > k; +Л > L; +л > l; +М > M; +м > m; +Н > N; +н > n; +О > O; +о > o; +П > P; +п > p; +Р > R; +р > r; +СГ > S·H; +Сг > S·h; +сг > s·h; +С > S; +с > s; +ТС > T·S; +Тс > T·s; +тс > t·s; +Т > T; +т > t; +У > U; +у > u; +Ф > F; +ф > f; +{Х}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Kh; +Х > KH; +х > kh; +ЦГ > TS·H; +Цг > Ts·h; +цг > ts·h; +{Ц}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Ts; +Ц > TS; +ц > ts; +{Ч}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Ch; +Ч > CH; +ч > ch; +ШЧ > SH·CH; +Шч > Sh·ch; +шч > sh·ch; +{Ш}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Sh; +Ш > SH; +ш > sh; +{Щ}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Shch; +Щ > SHCH; +щ > shch; +{Ю}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Yu; +Ю > YU; +ю > yu; +{Я}[[б-джзй-нп-тф-щьґ’][аеиоуюяєії]] > Ya; +Я > YA; +я > ya; +Ь > ʹ; +ь > ʹ; +’ > ʺ; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-24 20:05:52
|
Revision: 187 http://opentrep.svn.sourceforge.net/opentrep/?rev=187&view=rev Author: denis_arnaud Date: 2009-08-24 20:05:41 +0000 (Mon, 24 Aug 2009) Log Message: ----------- [Dev] 1. Fixed a bug in the edit distance/error storage. 2. The Location structure now gets the original and corrected keywords. Modified Paths: -------------- trunk/opentrep/opentrep/Location.hpp trunk/opentrep/opentrep/bom/Document.hpp trunk/opentrep/opentrep/bom/Place.cpp trunk/opentrep/opentrep/bom/Place.hpp trunk/opentrep/opentrep/bom/StringMatcher.cpp trunk/opentrep/opentrep/command/RequestInterpreter.cpp trunk/opentrep/opentrep/python/pyopentrep.py Modified: trunk/opentrep/opentrep/Location.hpp =================================================================== --- trunk/opentrep/opentrep/Location.hpp 2009-08-23 16:50:59 UTC (rev 186) +++ trunk/opentrep/opentrep/Location.hpp 2009-08-24 20:05:41 UTC (rev 187) @@ -74,6 +74,16 @@ return _nameList; } + /** Get the original keywords. */ + std::string getOriginalKeywords() const { + return _originalKeywords; + } + + /** Get the corrected keywords. */ + std::string getCorrectedKeywords() const { + return _correctedKeywords; + } + /** Get the matching percentage. */ const MatchingPercentage_T& getPercentage() const { return _percentage; @@ -152,6 +162,16 @@ _nameList = iNameList; } + /** Set the original keywords. */ + void setOriginalKeywords (const std::string& iOriginalKeywords) { + _originalKeywords = iOriginalKeywords; + } + + /** Set the corrected keywords. */ + void setCorrectedKeywords (const std::string& iCorrectedKeywords) { + _correctedKeywords = iCorrectedKeywords; + } + /** Set the Xapian matching percentage. 
*/ void setPercentage (const MatchingPercentage_T& iPercentage) { _percentage = iPercentage; @@ -198,15 +218,18 @@ oStr << _locationCode << ", " << _cityCode << ", " << _stateCode << ", " << _countryCode << ", " << _regionCode << ", " << _continentCode << ", " << _timeZoneGroup - << ", " << _longitude << ", " << _latitude << ", " << _percentage + << ", " << _longitude << ", " << _latitude + << ", " << _originalKeywords << ", " << _correctedKeywords + << ", " << _percentage << ", " << _editDistance << ", " << _allowableEditDistance; if (_extraLocationList.empty() == false) { - oStr << " " << _extraLocationList.size() << " extra match(es)"; + oStr << " with " << _extraLocationList.size() << " extra match(es)"; } if (_alternateLocationList.empty() == false) { - oStr << " " << _alternateLocationList.size() << " alternate match(es)"; + oStr << " with " << _alternateLocationList.size() + << " alternate match(es)"; } return oStr.str(); @@ -262,6 +285,8 @@ const std::string& iTimeZoneGroup, const double iLongitude, const double iLatitude, const LocationNameList_T& iNameList, + const std::string& iOriginalKeywords, + const std::string& iCorrectedKeywords, const MatchingPercentage_T& iPercentage, const NbOfErrors_T& iEditDistance, const NbOfErrors_T& iAllowableEditDistance) @@ -270,6 +295,8 @@ _regionCode (iRegionCode), _continentCode (iContinentCode), _timeZoneGroup (iTimeZoneGroup), _longitude (iLongitude), _latitude (iLatitude), _nameList (iNameList), + _originalKeywords (iOriginalKeywords), + _correctedKeywords (iCorrectedKeywords), _percentage (iPercentage), _editDistance (iEditDistance), _allowableEditDistance (iAllowableEditDistance) { } @@ -306,6 +333,12 @@ /** List of (American) English names. */ LocationNameList_T _nameList; + /** Original keywords. */ + std::string _originalKeywords; + + /** Original keywords. */ + std::string _correctedKeywords; + /** Matching percentage. 
*/ MatchingPercentage_T _percentage; Modified: trunk/opentrep/opentrep/bom/Document.hpp =================================================================== --- trunk/opentrep/opentrep/bom/Document.hpp 2009-08-23 16:50:59 UTC (rev 186) +++ trunk/opentrep/opentrep/bom/Document.hpp 2009-08-24 20:05:41 UTC (rev 187) @@ -34,13 +34,13 @@ public: // ////////////////// Getters //////////////// /** Get the query string. */ - const TravelQuery_T& getTravelQuery() { + const TravelQuery_T& getTravelQuery() const { return _queryString; } /** Get the corrected query string. <br>When empty, it means that no correction was necessary. */ - const TravelQuery_T& getCorrectedTravelQuery() { + const TravelQuery_T& getCorrectedTravelQuery() const { return _correctedQueryString; } Modified: trunk/opentrep/opentrep/bom/Place.cpp =================================================================== --- trunk/opentrep/opentrep/bom/Place.cpp 2009-08-23 16:50:59 UTC (rev 186) +++ trunk/opentrep/opentrep/bom/Place.cpp 2009-08-24 20:05:41 UTC (rev 187) @@ -22,6 +22,8 @@ _regionCode (iPlace._regionCode), _continentCode (iPlace._continentCode), _timeZoneGroup (iPlace._timeZoneGroup), _longitude (iPlace._longitude), _latitude (iPlace._latitude), _nameMatrix (iPlace._nameMatrix), + _originalKeywords (iPlace._originalKeywords), + _correctedKeywords (iPlace._correctedKeywords), _docID (iPlace._docID), _percentage (iPlace._percentage), _editDistance (iPlace._editDistance), _allowableEditDistance (iPlace._allowableEditDistance) { @@ -80,6 +82,7 @@ << ", " << _countryCode << ", " << _regionCode << ", " << _continentCode << ", " << _timeZoneGroup << ", " << _longitude << ", " << _latitude + << ", " << _originalKeywords << ", " << _correctedKeywords << ", " << _docID << ", " << _percentage << ", " << _editDistance << ", " << _allowableEditDistance << ". 
"; @@ -137,6 +140,7 @@ << ", " << _countryCode << ", " << _regionCode << ", " << _continentCode << ", " << _timeZoneGroup << ", " << _longitude << ", " << _latitude + << ", " << _originalKeywords << ", " << _correctedKeywords << ", " << _docID << ", " << _percentage << ", " << _editDistance << ", " << _allowableEditDistance; @@ -188,6 +192,8 @@ << ", time zone group = " << _timeZoneGroup << ", longitude = " << _longitude << ", latitude = " << _latitude + << ", original keywords = " << _originalKeywords + << ", corrected keywords = " << _correctedKeywords << ", docID = " << _docID << ", percentage = " << _percentage << "%" << ", edit distance = " << _editDistance @@ -265,6 +271,7 @@ Location oLocation (_placeCode, lCityCode, _stateCode, _countryCode, _regionCode, _continentCode, _timeZoneGroup, _longitude, _latitude, lNameList, + _originalKeywords, _correctedKeywords, _percentage, _editDistance, _allowableEditDistance); // Add extra matching locations, whenever they exist Modified: trunk/opentrep/opentrep/bom/Place.hpp =================================================================== --- trunk/opentrep/opentrep/bom/Place.hpp 2009-08-23 16:50:59 UTC (rev 186) +++ trunk/opentrep/opentrep/bom/Place.hpp 2009-08-24 20:05:41 UTC (rev 187) @@ -76,6 +76,16 @@ return _latitude; } + /** Get the original keywords. */ + std::string getOriginalKeywords() const { + return _originalKeywords; + } + + /** Get the corrected keywords. */ + std::string getCorrectedKeywords() const { + return _correctedKeywords; + } + /** Get the Xapian document ID. */ const XapianDocID_T& getDocID() const { return _docID; @@ -167,6 +177,16 @@ _latitude = iLatitude; } + /** Set the original keywords. */ + void setOriginalKeywords (const std::string& iOriginalKeywords) { + _originalKeywords = iOriginalKeywords; + } + + /** Set the corrected keywords. */ + void setCorrectedKeywords (const std::string& iCorrectedKeywords) { + _correctedKeywords = iCorrectedKeywords; + } + /** Set the Xapian document ID. 
*/ void setDocID (const XapianDocID_T& iDocID) { _docID = iDocID; @@ -280,6 +300,12 @@ /** List of names, for each given language. */ NameMatrix_T _nameMatrix; + /** Original keywords. */ + std::string _originalKeywords; + + /** Original keywords. */ + std::string _correctedKeywords; + /** Xapian document ID. */ XapianDocID_T _docID; Modified: trunk/opentrep/opentrep/bom/StringMatcher.cpp =================================================================== --- trunk/opentrep/opentrep/bom/StringMatcher.cpp 2009-08-23 16:50:59 UTC (rev 186) +++ trunk/opentrep/opentrep/bom/StringMatcher.cpp 2009-08-24 20:05:41 UTC (rev 187) @@ -398,7 +398,7 @@ // Display the results nbMatches = ioMatchingSet.size(); - + // DEBUG /* OPENTREP_LOG_DEBUG ("Corrected query `" << lCorrectedQueryString @@ -417,6 +417,10 @@ ioHasReachedMaximalAllowableEditDistance = true; } + // Store the effective (Levenshtein) edit distance/error + ioEditDistance = Levenshtein::getDistance (lOriginalQueryString, + lCorrectedQueryString); + oMatchedString = lCorrectedQueryString; return oMatchedString; } Modified: trunk/opentrep/opentrep/command/RequestInterpreter.cpp =================================================================== --- trunk/opentrep/opentrep/command/RequestInterpreter.cpp 2009-08-23 16:50:59 UTC (rev 186) +++ trunk/opentrep/opentrep/command/RequestInterpreter.cpp 2009-08-24 20:05:41 UTC (rev 187) @@ -63,11 +63,17 @@ /** Helper function. 
*/ // ////////////////////////////////////////////////////////////////////// - bool retrieveAndFillPlace (const Xapian::Document& iDocument, + bool retrieveAndFillPlace (const std::string& iOriginalKeywords, + const std::string& iCorrectedKeywords, + const Xapian::Document& iDocument, const Xapian::percent& iDocPercentage, soci::session& ioSociSession, Place& ioPlace) { bool hasRetrievedPlace = false; + // Set the original and corrected/suggested keywords + ioPlace.setOriginalKeywords (iOriginalKeywords); + ioPlace.setCorrectedKeywords (iCorrectedKeywords); + // Set the matching percentage ioPlace.setPercentage (iDocPercentage); @@ -111,10 +117,16 @@ // ////////////////////////////////////////////////////////////////////// bool retrieveAndFillPlace (const Document& iDocument, soci::session& ioSociSession, Place& ioPlace) { + // Note that Document::getTravelQuery() returns a TravelQuery_T, which + // is actually a std::string + const std::string& lOriginalKeywords = iDocument.getTravelQuery(); + const std::string& lCorrectedKeywords = iDocument.getCorrectedTravelQuery(); + // Delegate const Xapian::Document& lXapianDocument = iDocument.getXapianDocument(); const Xapian::percent& lDocPercentage = iDocument.getXapianPercentage(); - return retrieveAndFillPlace (lXapianDocument, lDocPercentage, + return retrieveAndFillPlace (lOriginalKeywords, lCorrectedKeywords, + lXapianDocument, lDocPercentage, ioSociSession, ioPlace); } @@ -166,6 +178,9 @@ // Retrieve the list of extra matching documents (documents // matching with the same weight/percentage) + const std::string& lOriginalKeywords = lDocument.getTravelQuery(); + const std::string& lCorrectedKeywords = + lDocument.getCorrectedTravelQuery(); const Xapian::percent& lExtraDocPercentage = lDocument.getXapianPercentage(); const XapianDocumentList_T& lExtraDocumentList = @@ -183,9 +198,10 @@ // Retrieve, in the MySQL database, the place corresponding to // the place code located as the first word of the Xapian // document 
data. - hasRetrievedPlace = retrieveAndFillPlace (lExtraDocument, - lExtraDocPercentage, - ioSociSession, lExtraPlace); + hasRetrievedPlace = + retrieveAndFillPlace (lOriginalKeywords, lCorrectedKeywords, + lExtraDocument, lExtraDocPercentage, + ioSociSession, lExtraPlace); // Same remark as above assert (hasRetrievedPlace == true); @@ -222,9 +238,10 @@ // Retrieve, in the MySQL database, the place corresponding to // the place code located as the first word of the Xapian // document data. - hasRetrievedPlace = retrieveAndFillPlace (lAlterDocument, - lAlterDocPercentage, - ioSociSession, lAlterPlace); + hasRetrievedPlace = + retrieveAndFillPlace (lOriginalKeywords, lCorrectedKeywords, + lAlterDocument, lAlterDocPercentage, + ioSociSession, lAlterPlace); // Same remark as above assert (hasRetrievedPlace == true); Modified: trunk/opentrep/opentrep/python/pyopentrep.py =================================================================== --- trunk/opentrep/opentrep/python/pyopentrep.py 2009-08-23 16:50:59 UTC (rev 186) +++ trunk/opentrep/opentrep/python/pyopentrep.py 2009-08-24 20:05:41 UTC (rev 187) @@ -5,6 +5,8 @@ # Default search string defaultSearchString = 'sna francicso rio de janero lso angles reykyavki' +needDetails = True + # Parser helpers def getMain(locations): return locations[:3] @@ -23,12 +25,24 @@ if searchString == '' : searchString = defaultSearchString # Call the OpenTrep C++ library -result = openTrepLibrary.search(searchString) -#result = openTrepLibrary.searchWithFullDetails(searchString) +if needDetails == True: + result = openTrepLibrary.searchWithFullDetails (searchString) +else: + result = openTrepLibrary.search (searchString) + print 'Raw result from the OpenTrep library:' print result -# defaults +# If we have requested the detailed display, the result string is +# potentially big and complex, and is not aimed to be parsed. So, it +# is better to stop here. 
+if needDetails == True: + quit() + +# As we have requested no details, the result string is aimed to be +# parsed, so as to get the whole meaning of it. + +# Defaults msg, form_value, original_form_value, unrecognized = '', '', '', '' # Sample of result string to be parsed: This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-23 16:51:12
|
Revision: 186 http://opentrep.svn.sourceforge.net/opentrep/?rev=186&view=rev Author: denis_arnaud Date: 2009-08-23 16:50:59 +0000 (Sun, 23 Aug 2009) Log Message: ----------- [GUI] Fixed the display bug in the PSP page (though, there is still some work to be done for the 'Did you mean Xxx?' to properly work. Modified Paths: -------------- trunk/opentrep/opentrep/python/pyopentrep.py Modified: trunk/opentrep/opentrep/python/pyopentrep.py =================================================================== --- trunk/opentrep/opentrep/python/pyopentrep.py 2009-08-23 16:43:59 UTC (rev 185) +++ trunk/opentrep/opentrep/python/pyopentrep.py 2009-08-23 16:50:59 UTC (rev 186) @@ -46,22 +46,26 @@ # 1. First, strip out the unrecongised keywords if ';' in str_matches: str_matches, unrecognized = str_matches.split(';') - msg = 'unrecognized: %s. ' % unrecognized - str_value = unrecognized - if str_matches != '': - # 2. Then, for each matching location, the - # alternate matches have to be stored aside - alter_locations = [x for x in str_matches.split(',')] - locations = [getMain(x) for x in alter_locations] - for alter_location_list in alter_locations: - alter_location_list = [x for x in alter_location_list.split('-')] - for extra_location_list in alter_location_list: - extra_location_list = [x for x in extra_location_list.split(':')] +msg = 'unrecognized: %s. ' % unrecognized +str_value = unrecognized + +if str_matches != '': + # 2. 
Then, for each matching location, the + # alternate matches have to be stored aside + alter_locations = [x for x in str_matches.split(',')] + locations = [getMain(x) for x in alter_locations] + + for alter_location_list in alter_locations: + alter_location_list = [x for x in alter_location_list.split('-')] + for extra_location_list in alter_location_list: + extra_location_list = [x for x in extra_location_list.split(':')] - codes = [x[:3].upper() for x in alter_locations] - if len(codes)>0: form_value = ' '.join(codes) - if str_value != '': form_value += ' ' + str_value + codes = [x[:3].upper() for x in alter_locations] + if len(codes) > 0: + form_value = ' '.join(codes) + if str_value != '': + form_value += ' ' + str_value print "Parsed entries:" print form_value This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-23 16:44:07
|
Revision: 185 http://opentrep.svn.sourceforge.net/opentrep/?rev=185&view=rev Author: denis_arnaud Date: 2009-08-23 16:43:59 +0000 (Sun, 23 Aug 2009) Log Message: ----------- [GUI] Fixed the display bug in the PSP page (though, there is still some work to be done for the 'Did you mean Xxx?' to properly work. Modified Paths: -------------- trunk/opentrep/gui/psp/opentrep.psp trunk/opentrep/gui/psp/result_parser.py trunk/opentrep/opentrep/python/pyopentrep.py Modified: trunk/opentrep/gui/psp/opentrep.psp =================================================================== --- trunk/opentrep/gui/psp/opentrep.psp 2009-08-21 18:46:25 UTC (rev 184) +++ trunk/opentrep/gui/psp/opentrep.psp 2009-08-23 16:43:59 UTC (rev 185) @@ -8,13 +8,20 @@ localize = apache.import_module('localize', path=[local_path]) log_service = apache.import_module('log_service', path=[local_path]) +# Parser helpers +def getMain(locations): + return locations[:3] + # defaults msg, form_value, original_form_value, unrecognized = '', '', '', '' language = 'en' quiet = True +# Sample of result string to be parsed: +# 'nce/100,sfo/100-emb/98-jcc/97,yvr/100-cxh/83-xea/83-ydt/83;niznayou' # parsing: recognize sequence of three-letter codes codes = [] +locations = [] alter_locations = [] queryStringForm = form if queryStringForm.has_key('data'): @@ -24,26 +31,42 @@ if form_value.rstrip(' ') == '': pass else: - # Use opentrep + # Use the OpenTrep library libpyopentrep_proxy = apache.import_module('libpyopentrep_proxy', path=[local_path]) libpyopentrep = libpyopentrep_proxy.import_libpyopentrep(localize.libpyopentrep_path) mySearch = libpyopentrep.OpenTrepSearcher() mySearch.init(localize.traveldb_path, localize.tmp_trep_log_filename, localize.opentrep_dbparams['user'], localize.opentrep_dbparams['password'], localize.opentrep_dbparams['host'], localize.opentrep_dbparams['port'], localize.opentrep_dbparams['db']) + + # Delegate the interpretation to the underlying + # OpenTrep library str_matches = 
mySearch.search(form_value) + + # Parsing begins + # 1. First, strip out the unrecongised keywords if ';' in str_matches: str_matches, unrecognized = str_matches.split(';') msg = 'unrecognized: %s. ' % unrecognized str_value = unrecognized + if str_matches != '': + # 2. Then, for each matching location, the + # alternate matches have to be stored aside alter_locations = [x for x in str_matches.split(',')] + locations = [getMain(x) for x in alter_locations] for alter_location_list in alter_locations: alter_location_list = [x for x in alter_location_list.split('-')] for extra_location_list in alter_location_list: extra_location_list = [x for x in extra_location_list.split(':')] - codes = [x[0].upper() for x in alter_locations] - if len(codes)>0: form_value = ' '.join(codes) - if str_value != '': form_value += ' ' + str_value + codes = [x[:3].upper() for x in alter_locations] + + # Re-build the form entry text from the location codes + if len(codes) > 0: + form_value = ' '.join(codes) + + # Append the unrecognised items at the end + if str_value != '': + form_value += ' ' + str_value # Logging log_service.log(localize.www_log_filename, req, queryStringForm['data'], codes, unrecognized) Modified: trunk/opentrep/gui/psp/result_parser.py =================================================================== --- trunk/opentrep/gui/psp/result_parser.py 2009-08-21 18:46:25 UTC (rev 184) +++ trunk/opentrep/gui/psp/result_parser.py 2009-08-23 16:43:59 UTC (rev 185) @@ -1,64 +1,71 @@ -#!/usr/bin/env python - -import sys - -# Default result string -defaultResultString = 'yvr:xea/98-xtw/87,sfo/100,led:dft:htl/96;niznayou' - -# If no result string was supplied as arguments of the command-line, -# ask the user for some -resultString = ','.join(sys.argv[1:]) -if resultString == '' : resultString = defaultResultString - -# Function to parse the result string -def parseResultString(iResultString): - form_value, unrecognized = '', '' - msg = '(parsing successful)' - str_matches = 
iResultString - alter_locations = [] - - if ';' in str_matches: - str_matches, unrecognized = str_matches.split(';') - msg = '(unrecognized: %s)' % unrecognized - str_value = unrecognized - - if str_matches != '': - alter_locations = str_matches.split(',') - - print 'alter_locations: ', alter_locations - - idx1 = 0 - while idx1 != len(alter_locations): - -# print 'Before - alter_locations['+str(idx1)+']: ', alter_locations[idx1] - alter_locations[idx1] = alter_locations[idx1].split('-') -# print 'After - alter_locations['+str(idx1)+']: ', alter_locations[idx1], alter_locations - - idx2 = 0 - while idx2 != len(alter_locations[idx1]): - - alter_locations[idx1][idx2] = alter_locations[idx1][idx2].split(':') - - idx3 = 0 - while idx3 != len(alter_locations[idx1][idx2]): - - alter_locations[idx1][idx2][idx3] = alter_locations[idx1][idx2][idx3].split('/') - idx3 += 1 - - idx2 += 1 - - idx1 += 1 - -# codes = [x.upper() for x in alter_locations] -# if len(codes) > 0: form_value = ' '.join(codes) - if str_value != '': form_value += ' ' + str_value - - print 'After - alter_locations: ', alter_locations - - print 'Result ' + msg + ':' - return form_value - -# Main -print 'Before: ' + resultString -resultString = parseResultString(resultString) -print 'After: ' + resultString +#!/usr/bin/env python + +import sys + +# Parser helpers +def getMain(locations): + return locations[:3].upper() + +# Default result string +defaultResultString = 'yvr:xea/98-xtw/87,sfo/100,led:dft:htl/96;niznayou' + +# If no result string was supplied as arguments of the command-line, +# ask the user for some +resultString = ','.join(sys.argv[1:]) +if resultString == '' : resultString = defaultResultString + +# Function to parse the result string +def parseResultString(iResultString): + form_value, unrecognized = '', '' + msg = '(parsing successful)' + str_matches = iResultString + locations = [] + alter_locations = [] + + if ';' in str_matches: + str_matches, unrecognized = str_matches.split(';') + msg 
= '(unrecognized: %s)' % unrecognized + str_value = unrecognized + + if str_matches != '': + alter_locations = str_matches.split(',') + locations = [getMain(x) for x in alter_locations] + + print 'locations: ', locations + print 'alter_locations: ', alter_locations + + idx1 = 0 + while idx1 != len(alter_locations): + +# print 'Before - alter_locations['+str(idx1)+']: ', alter_locations[idx1] + alter_locations[idx1] = alter_locations[idx1].split('-') +# print 'After - alter_locations['+str(idx1)+']: ', alter_locations[idx1], alter_locations + + idx2 = 0 + while idx2 != len(alter_locations[idx1]): + + alter_locations[idx1][idx2] = alter_locations[idx1][idx2].split(':') + + idx3 = 0 + while idx3 != len(alter_locations[idx1][idx2]): + + alter_locations[idx1][idx2][idx3] = alter_locations[idx1][idx2][idx3].split('/') + idx3 += 1 + + idx2 += 1 + + idx1 += 1 + +# codes = [x.upper() for x in alter_locations] +# if len(codes) > 0: form_value = ' '.join(codes) + if str_value != '': form_value += ' ' + str_value + + print 'After - alter_locations: ', alter_locations + + print 'Result: ' + ' '.join(locations) + ' ' + msg + ':' + return form_value + +# Main +print 'Before: ' + resultString +resultString = parseResultString(resultString) +print 'After: ' + resultString Modified: trunk/opentrep/opentrep/python/pyopentrep.py =================================================================== --- trunk/opentrep/opentrep/python/pyopentrep.py 2009-08-21 18:46:25 UTC (rev 184) +++ trunk/opentrep/opentrep/python/pyopentrep.py 2009-08-23 16:43:59 UTC (rev 185) @@ -5,6 +5,9 @@ # Default search string defaultSearchString = 'sna francicso rio de janero lso angles reykyavki' +# Parser helpers +def getMain(locations): + return locations[:3] # Initialise the OpenTrep C++ library import libpyopentrep @@ -20,7 +23,48 @@ if searchString == '' : searchString = defaultSearchString # Call the OpenTrep C++ library -#result = openTrepLibrary.search(searchString) -result = 
openTrepLibrary.searchWithFullDetails(searchString) -print 'Result:' +result = openTrepLibrary.search(searchString) +#result = openTrepLibrary.searchWithFullDetails(searchString) +print 'Raw result from the OpenTrep library:' print result + +# defaults +msg, form_value, original_form_value, unrecognized = '', '', '', '' + +# Sample of result string to be parsed: +# 'nce/100,sfo/100-emb/98-jcc/97,yvr/100-cxh/83-xea/83-ydt/83;niznayou' +# parsing: recognize sequence of three-letter codes +codes = [] +locations = [] +alter_locations = [] + +# Delegate the interpretation to the underlying +# OpenTrep library +str_matches = result + +# Parsing begins +# 1. First, strip out the unrecongised keywords +if ';' in str_matches: + str_matches, unrecognized = str_matches.split(';') + msg = 'unrecognized: %s. ' % unrecognized + str_value = unrecognized + + if str_matches != '': + # 2. Then, for each matching location, the + # alternate matches have to be stored aside + alter_locations = [x for x in str_matches.split(',')] + locations = [getMain(x) for x in alter_locations] + for alter_location_list in alter_locations: + alter_location_list = [x for x in alter_location_list.split('-')] + for extra_location_list in alter_location_list: + extra_location_list = [x for x in extra_location_list.split(':')] + + codes = [x[:3].upper() for x in alter_locations] + if len(codes)>0: form_value = ' '.join(codes) + if str_value != '': form_value += ' ' + str_value + +print "Parsed entries:" +print form_value + +#print "Locations:" +#print locations This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-21 18:46:33
|
Revision: 184 http://opentrep.svn.sourceforge.net/opentrep/?rev=184&view=rev Author: denis_arnaud Date: 2009-08-21 18:46:25 +0000 (Fri, 21 Aug 2009) Log Message: ----------- [i18n] A first version of transliteration is working (but not specific to any particular language). Support for specific languages should be added (from Unicode 4.2?). Modified Paths: -------------- trunk/opentrep/test/i18n/icu/icutranslit.cpp Modified: trunk/opentrep/test/i18n/icu/icutranslit.cpp =================================================================== --- trunk/opentrep/test/i18n/icu/icutranslit.cpp 2009-08-21 15:46:51 UTC (rev 183) +++ trunk/opentrep/test/i18n/icu/icutranslit.cpp 2009-08-21 18:46:25 UTC (rev 184) @@ -4,9 +4,8 @@ * others. All Rights Reserved. ********************************************************************/ // STL -//#include <cstdio> -//#include <cstdio> #include <iostream> +#include <cassert> // ICU #include <unicode/translit.h> //#include <unicode/rbt.h> @@ -17,17 +16,6 @@ #include "icutranslit_util.hpp" #include "icutranslit_unaccent.hpp" -// RuleBasedTransliterator rules to remove accents from characters -// so they can be displayed as ASCIIx -UnicodeString UNACCENT_RULES( - "[\\u00C0-\\u00C5] > A;" - "[\\u00C8-\\u00CB] > E;" - "[\\u00CC-\\u00CF] > I;" - "[\\u00E0-\\u00E5] > a;" - "[\\u00E8-\\u00EB] > e;" - "[\\u00EC-\\u00EF] > i;" - ); - /** Display the available Transliterators. 
*/ void displayTransliterators () { UErrorCode status = U_ZERO_ERROR; @@ -75,49 +63,63 @@ defFmt->setCalendar (*cal); // Create a Any-Latin Transliterator - const char* lLatinTransliteratorID = "Any-Latin"; + const char* lLatinTransliteratorID = "Any-Latin; NFD; [:M:] Remove; NFC;"; Transliterator* lLatinTransliterator = Transliterator::createInstance (lLatinTransliteratorID, UTRANS_FORWARD, status); - if (lLatinTransliterator == 0) { + + if (lLatinTransliterator == NULL || U_FAILURE (status)) { std::cerr << "ERROR: Transliterator::createInstance() failed for " << lLatinTransliteratorID << std::endl; - return -1; + return -1; } + assert (lLatinTransliterator != NULL); - // Create a Unaccent Transliterator - const char* lUnaccentTransliteratorID = "Accents-Any"; - // const char* lUnaccentTransliteratorID = "Any-NFC"; - Transliterator* lUnaccentTransliterator = - Transliterator::createInstance (lUnaccentTransliteratorID, UTRANS_FORWARD, - status); - if (lUnaccentTransliterator == 0) { + // Register the Transliterator object, so that the ICU library + // manages the corresponding memory. + Transliterator::registerInstance (lLatinTransliterator); + + // RuleBasedTransliterator rules to transform alternate forms of + // quotes, so that they can be removed by the transliterator removing + // punctuation. 
+ // For instance, ʹ (\u02B9) is transformed into ' (\u0027) + // (see + UnicodeString lUnquoteRules ("[\\u02B9] > \\u0027;"); + + // Create a transformation of alternate forms of quotes into + // standard quotes + UParseError pError; + Transliterator* lPunctuationTransliterator = + Transliterator::createFromRules ("RBTUnaccent", lUnquoteRules, + UTRANS_FORWARD, pError, status); + if (lPunctuationTransliterator == NULL || U_FAILURE (status)) { std::cerr << "ERROR: Transliterator::createInstance() failed for " - << lUnaccentTransliteratorID << std::endl; - return -1; + << toUTF8String (lUnquoteRules) << std::endl; + return -1; } + assert (lPunctuationTransliterator != NULL); - // Create a Unaccent Transliterator - const char* lNFCTransliteratorID = "Any-NFC"; - Transliterator* lNFCTransliterator = - Transliterator::createInstance (lNFCTransliteratorID, UTRANS_FORWARD, - status); - if (lNFCTransliterator == 0) { + // Register the Transliterator object, so that the ICU library + // manages the corresponding memory. + Transliterator::registerInstance (lPunctuationTransliterator); + + // Create a punctuation-remover Transliterator + const char* lUnpunctuateTransliteratorID = "[:P:] Remove;"; + Transliterator* lUnpunctuateTransliterator = + Transliterator::createInstance (lUnpunctuateTransliteratorID, + UTRANS_FORWARD, status); + + if (lUnpunctuateTransliterator == NULL || U_FAILURE (status)) { std::cerr << "ERROR: Transliterator::createInstance() failed for " - << lNFCTransliteratorID << std::endl; - return -1; + << lUnpunctuateTransliteratorID << std::endl; + return -1; } + assert (lUnpunctuateTransliterator != NULL); - // Create a custom Transliterator - UParseError pError; - Transliterator* rbtUnaccent = - Transliterator::createFromRules ("RBTUnaccent", UNACCENT_RULES, - UTRANS_FORWARD, pError, status); - check (status, "Transliterator::createFromRules"); + // Register the Transliterator object, so that the ICU library + // manages the corresponding memory. 
+ Transliterator::registerInstance (lUnpunctuateTransliterator); - // Create a custom Transliterator - Transliterator* unaccent = new UnaccentTransliterator(); - // Loop over various months for (int32_t month = Calendar::JANUARY; month <= Calendar::DECEMBER; @@ -148,31 +150,23 @@ // Transliterate result lLatinTransliterator->transliterate (str); - //lUnaccentTransliterator->transliterate (str); - lNFCTransliterator->transliterate (str); - std::cout << "Transliterated via " << lLatinTransliteratorID - << " and " << lUnaccentTransliteratorID << ": "; + std::cout << "Transliterated via " << lLatinTransliteratorID << ": "; std::cout << toUTF8String (str) << std::endl; // Transliterate result - UnicodeString str2; - str2 = str; - rbtUnaccent->transliterate(str); + lPunctuationTransliterator->transliterate (str); + lLatinTransliterator->transliterate (str); + lUnpunctuateTransliterator->transliterate (str); std::cout << "Transliterated via RBT unaccent: "; std::cout << toUTF8String (str) << std::endl; - - unaccent->transliterate(str2); - std::cout << "Transliterated via normalizer unaccent: "; - std::cout << toUTF8String (str2) << std::endl; } // Clean up delete fmt; fmt = NULL; delete cal; cal = NULL; - delete lLatinTransliterator; lLatinTransliterator = NULL; - delete unaccent; unaccent = NULL; - delete rbtUnaccent; rbtUnaccent = NULL; + // delete lLatinTransliterator; lLatinTransliterator = NULL; + // delete lPunctuationTransliterator; lPunctuationTransliterator = NULL; std::cout << "Exiting successfully" << std::endl; This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-21 15:47:01
|
Revision: 183 http://opentrep.svn.sourceforge.net/opentrep/?rev=183&view=rev Author: denis_arnaud Date: 2009-08-21 15:46:51 +0000 (Fri, 21 Aug 2009) Log Message: ----------- [i18n] Work on transliteration is on-going. Modified Paths: -------------- trunk/opentrep/test/i18n/icu/icutranslit.cpp trunk/opentrep/test/i18n/icu/icutranslit_util.cpp trunk/opentrep/test/i18n/icu/icutranslit_util.hpp Modified: trunk/opentrep/test/i18n/icu/icutranslit.cpp =================================================================== --- trunk/opentrep/test/i18n/icu/icutranslit.cpp 2009-08-19 18:17:01 UTC (rev 182) +++ trunk/opentrep/test/i18n/icu/icutranslit.cpp 2009-08-21 15:46:51 UTC (rev 183) @@ -4,8 +4,9 @@ * others. All Rights Reserved. ********************************************************************/ // STL -#include <cstdio> -#include <cstdlib> +//#include <cstdio> +//#include <cstdio> +#include <iostream> // ICU #include <unicode/translit.h> //#include <unicode/rbt.h> @@ -27,101 +28,153 @@ "[\\u00EC-\\u00EF] > i;" ); +/** Display the available Transliterators. 
*/ +void displayTransliterators () { + UErrorCode status = U_ZERO_ERROR; + StringEnumeration* lStringEnumeration = + Transliterator::getAvailableIDs (status); + check (status, "Transliterator::getAvailableIDs()"); + + const UnicodeString* lString = lStringEnumeration->snext (status); + check (status, "StringEnumeration::snext()"); + + if (lString != NULL) { + std::cout << "Available transliterators:" << std::endl; + } + + while (lString != NULL) { + std::cout << toUTF8String (*lString) << std::endl; + + lString = lStringEnumeration->snext (status); + check (status, "StringEnumeration::snext()"); + } + + delete lStringEnumeration; lStringEnumeration = NULL; +} + // ////////////////////////// M A I N ////////////////////////////// -int main(int argc, char **argv) { +int main (int argc, char* argv[]) { - Calendar *cal; - DateFormat *fmt; - DateFormat *defFmt; - Transliterator *greek_latin; - Transliterator *rbtUnaccent; - Transliterator *unaccent; - UParseError pError; + // Create a calendar in the Russian locale UErrorCode status = U_ZERO_ERROR; - Locale greece("el", "GR"); - UnicodeString str, str2; - // Create a calendar in the Greek locale - cal = Calendar::createInstance(greece, status); - check(status, "Calendar::createInstance"); + // DEBUG + // displayTransliterators(); + // return 0; + + Locale russia ("ru", "RU"); + Calendar* cal = Calendar::createInstance (russia, status); + check (status, "Calendar::createInstance"); // Create a formatter - fmt = DateFormat::createDateInstance(DateFormat::kFull, greece); - fmt->setCalendar(*cal); + DateFormat* fmt = DateFormat::createDateInstance(DateFormat::kFull, russia); + fmt->setCalendar (*cal); // Create a default formatter - defFmt = DateFormat::createDateInstance(DateFormat::kFull); - defFmt->setCalendar(*cal); + DateFormat* defFmt = DateFormat::createDateInstance (DateFormat::kFull); + defFmt->setCalendar (*cal); - // Create a Greek-Latin Transliterator - greek_latin = Transliterator::createInstance("Greek-Latin", 
UTRANS_FORWARD, status); - if (greek_latin == 0) { - printf("ERROR: Transliterator::createInstance() failed\n"); - exit(1); + // Create a Any-Latin Transliterator + const char* lLatinTransliteratorID = "Any-Latin"; + Transliterator* lLatinTransliterator = + Transliterator::createInstance (lLatinTransliteratorID, UTRANS_FORWARD, + status); + if (lLatinTransliterator == 0) { + std::cerr << "ERROR: Transliterator::createInstance() failed for " + << lLatinTransliteratorID << std::endl; + return -1; } + // Create a Unaccent Transliterator + const char* lUnaccentTransliteratorID = "Accents-Any"; + // const char* lUnaccentTransliteratorID = "Any-NFC"; + Transliterator* lUnaccentTransliterator = + Transliterator::createInstance (lUnaccentTransliteratorID, UTRANS_FORWARD, + status); + if (lUnaccentTransliterator == 0) { + std::cerr << "ERROR: Transliterator::createInstance() failed for " + << lUnaccentTransliteratorID << std::endl; + return -1; + } + + // Create a Unaccent Transliterator + const char* lNFCTransliteratorID = "Any-NFC"; + Transliterator* lNFCTransliterator = + Transliterator::createInstance (lNFCTransliteratorID, UTRANS_FORWARD, + status); + if (lNFCTransliterator == 0) { + std::cerr << "ERROR: Transliterator::createInstance() failed for " + << lNFCTransliteratorID << std::endl; + return -1; + } + // Create a custom Transliterator - rbtUnaccent = Transliterator::createFromRules("RBTUnaccent", - UNACCENT_RULES, - UTRANS_FORWARD, - pError, - status); - check(status, "Transliterator::createFromRules"); + UParseError pError; + Transliterator* rbtUnaccent = + Transliterator::createFromRules ("RBTUnaccent", UNACCENT_RULES, + UTRANS_FORWARD, pError, status); + check (status, "Transliterator::createFromRules"); // Create a custom Transliterator - unaccent = new UnaccentTransliterator(); + Transliterator* unaccent = new UnaccentTransliterator(); // Loop over various months for (int32_t month = Calendar::JANUARY; month <= Calendar::DECEMBER; ++month) { + // const 
int32_t month = Calendar::JUNE; - // Set the calendar to a date - cal->clear(); - cal->set(1999, month, 4); - - // Format the date in default locale - str.remove(); - defFmt->format(cal->getTime(status), str, status); - check(status, "DateFormat::format"); - printf("Date: "); - uprintf(escape(str)); - printf("\n"); - - // Format the date for Greece - str.remove(); - fmt->format(cal->getTime(status), str, status); - check(status, "DateFormat::format"); - printf("Greek formatted date: "); - uprintf(escape(str)); - printf("\n"); - - // Transliterate result - greek_latin->transliterate(str); - printf("Transliterated via Greek-Latin: "); - uprintf(escape(str)); - printf("\n"); - - // Transliterate result - str2 = str; - rbtUnaccent->transliterate(str); - printf("Transliterated via RBT unaccent: "); - uprintf(escape(str)); - printf("\n"); - - unaccent->transliterate(str2); - printf("Transliterated via normalizer unaccent: "); - uprintf(escape(str2)); - printf("\n\n"); + // Set the calendar to a date + cal->clear(); + cal->set (2009, month, 21); + + // Format the date in default locale + UnicodeString str; + str.remove(); + defFmt->format (cal->getTime (status), str, status); + check (status, "DateFormat::format"); + + std::cout << std::endl << "-----------------" << std::endl + << "Date in default format: "; + std::cout << toUTF8String (str) << std::endl; + + // Format the date for Russia + str.remove(); + fmt->format (cal->getTime(status), str, status); + check (status, "DateFormat::format"); + + std::cout << "Russian formatted date: "; + std::cout << toUTF8String (str) << std::endl; + + // Transliterate result + lLatinTransliterator->transliterate (str); + //lUnaccentTransliterator->transliterate (str); + lNFCTransliterator->transliterate (str); + + std::cout << "Transliterated via " << lLatinTransliteratorID + << " and " << lUnaccentTransliteratorID << ": "; + std::cout << toUTF8String (str) << std::endl; + + // Transliterate result + UnicodeString str2; + str2 = str; 
+ rbtUnaccent->transliterate(str); + std::cout << "Transliterated via RBT unaccent: "; + std::cout << toUTF8String (str) << std::endl; + + unaccent->transliterate(str2); + std::cout << "Transliterated via normalizer unaccent: "; + std::cout << toUTF8String (str2) << std::endl; } // Clean up - delete fmt; - delete cal; - delete greek_latin; - delete unaccent; - delete rbtUnaccent; + delete fmt; fmt = NULL; + delete cal; cal = NULL; + delete lLatinTransliterator; lLatinTransliterator = NULL; + delete unaccent; unaccent = NULL; + delete rbtUnaccent; rbtUnaccent = NULL; - printf("Exiting successfully\n"); + std::cout << "Exiting successfully" << std::endl; + return 0; } Modified: trunk/opentrep/test/i18n/icu/icutranslit_util.cpp =================================================================== --- trunk/opentrep/test/i18n/icu/icutranslit_util.cpp 2009-08-19 18:17:01 UTC (rev 182) +++ trunk/opentrep/test/i18n/icu/icutranslit_util.cpp 2009-08-21 15:46:51 UTC (rev 183) @@ -3,61 +3,109 @@ * Copyright (c) 1999-2002, International Business Machines Corporation and * others. All Rights Reserved. 
********************************************************************/ +// STL +#include <cassert> +#include <iostream> +#include <sstream> // ICU -#include <unicode/unistr.h> -#include <cstdio> -#include <cstdlib> +#include <unicode/unistr.h> // UnicodeString +#include <unicode/ucnv.h> // Converter +// OpenTrep +#include "icutranslit_util.hpp" // Verify that a UErrorCode is successful; exit(1) if not -void check(UErrorCode& status, const char* msg) { - if (U_FAILURE(status)) { - printf("ERROR: %s (%s)\n", u_errorName(status), msg); - exit(1); +void check (UErrorCode& status, const char* msg) { + if (U_FAILURE (status)) { + std::cerr << "ERROR: " << u_errorName(status) << " (" << msg << ")" + << std::endl; } - // printf("Ok: %s\n", msg); + // std::cout << "Ok: " << msg << std::endl; } // Append a hex string to the target -static UnicodeString& appendHex(uint32_t number, - int8_t digits, - UnicodeString& target) { - static const UnicodeString DIGIT_STRING("0123456789ABCDEF"); +static UnicodeString& appendHex (uint32_t number, int8_t digits, + UnicodeString& target) { + static const UnicodeString DIGIT_STRING ("0123456789ABCDEF"); while (digits > 0) { - target += DIGIT_STRING[(number >> ((--digits) * 4)) & 0xF]; + target += DIGIT_STRING[(number >> ((--digits) * 4)) & 0xF]; } + return target; } // Replace nonprintable characters with unicode escapes -UnicodeString escape(const UnicodeString &source) { - int32_t i; - UnicodeString target; - target += "\""; - for (i=0; i<source.length(); ++i) { - UChar ch = source[i]; - if (ch < 0x09 || (ch > 0x0A && ch < 0x20) || ch > 0x7E) { - target += "\\u"; - appendHex(ch, 4, target); - } else { - target += ch; - } +UnicodeString escape (const UnicodeString& source) { + int32_t i; + UnicodeString target; + target += "\""; + for (i=0; i<source.length(); ++i) { + UChar ch = source[i]; + + if (ch < 0x09 || (ch > 0x0A && ch < 0x20) || ch > 0x7E) { + target += "\\u"; + appendHex (ch, 4, target); + + } else { + target += ch; } - target += 
"\""; - return target; + } + target += "\""; + return target; } // Print the given string to stdout -void uprintf(const UnicodeString &str) { - char *buf = 0; - int32_t len = str.length(); - // int32_t bufLen = str.extract(0, len, buf); // Preflight - /* Preflighting seems to be broken now, so assume 1-1 conversion, - plus some slop. */ - int32_t bufLen = len + 16; - int32_t actualLen; - buf = new char[bufLen + 1]; - actualLen = str.extract(0, len, buf/*, bufLen*/); // Default codepage conversion - buf[actualLen] = 0; - printf("%s", buf); - delete buf; +std::string uprintf (const UnicodeString& str) { + std::ostringstream oStr; + + const int32_t len = str.length(); + // int32_t bufLen = str.extract(0, len, buf); // Preflight + /* Preflighting seems to be broken now, so assume 1-1 conversion, + plus some slop. */ + const int32_t bufLen = len + 16; + + // Default codepage conversion + char* buf = new char[bufLen + 1]; + const int32_t actualLen = str.extract (0, len, buf/*, bufLen*/); + buf[actualLen] = '\0'; + + oStr << buf; + delete buf; buf = NULL; + + return oStr.str(); } + +// ///////////////////////////////////////////////////////////////////// +std::string toUTF8String (const UnicodeString& iString) { + std::ostringstream oStr; + + // String length + // const int32_t lLength = iString.length(); + + // Default codepage conversion + const int32_t lCapacity = 1000; + UChar lUCharString[lCapacity]; + UErrorCode status = U_ZERO_ERROR; + const int32_t actualLen = iString.extract (lUCharString, lCapacity, status); + assert (U_SUCCESS (status)); + lUCharString[actualLen] = '\0'; + + // UTF-8 converter + UConverter* cnv = ucnv_open ("UTF-8", &status); + assert (U_SUCCESS (status)); + + char lCharString[1000]; + // const int32_t nbOfConvertedChars = + ucnv_fromUChars (cnv, lCharString, 1000, lUCharString, -1, &status); + assert (U_SUCCESS (status)); + + // DEBUG + /* + std::cout << "toUTF8String(): converted " << nbOfConvertedChars + << " for the UnicodeString '" << 
uprintf(iString) + << "' (of length " << lLength << std::endl; + */ + + oStr << lCharString; + + return oStr.str(); +} Modified: trunk/opentrep/test/i18n/icu/icutranslit_util.hpp =================================================================== --- trunk/opentrep/test/i18n/icu/icutranslit_util.hpp 2009-08-19 18:17:01 UTC (rev 182) +++ trunk/opentrep/test/i18n/icu/icutranslit_util.hpp 2009-08-21 15:46:51 UTC (rev 183) @@ -3,14 +3,19 @@ * Copyright (c) 1999-2002, International Business Machines Corporation and * others. All Rights Reserved. ********************************************************************/ +// STL +#include <string> // ICU #include <unicode/unistr.h> -// Verify that a UErrorCode is successful; exit(1) if not -void check(UErrorCode& status, const char* msg); +/** Verify that a UErrorCode is successful; exit(1) if not. */ +void check (UErrorCode& status, const char* msg); -// Replace nonprintable characters with unicode escapes -UnicodeString escape(const UnicodeString &source); +/** Replace nonprintable characters with unicode escapes. */ +UnicodeString escape (const UnicodeString &source); -// Print the given string to stdout -void uprintf(const UnicodeString &str); +/** Print the given string to stdout. */ +std::string uprintf (const UnicodeString& str); + +/** Print the given Unicode string as a UTF8-encoded string. */ +std::string toUTF8String (const UnicodeString& str); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-19 18:17:13
|
Revision: 182 http://opentrep.svn.sourceforge.net/opentrep/?rev=182&view=rev Author: denis_arnaud Date: 2009-08-19 18:17:01 +0000 (Wed, 19 Aug 2009) Log Message: ----------- [GUI] Added some charset-related elements in the produced HTML document. Modified Paths: -------------- trunk/opentrep/gui/psp/opentrep.psp Modified: trunk/opentrep/gui/psp/opentrep.psp =================================================================== --- trunk/opentrep/gui/psp/opentrep.psp 2009-08-19 16:15:17 UTC (rev 181) +++ trunk/opentrep/gui/psp/opentrep.psp 2009-08-19 18:17:01 UTC (rev 182) @@ -1,13 +1,16 @@ +<% print "Content-Type: text/html; charset=utf-8\n\n" %> + <% import os +from mod_python import apache + local_path = '/var/www/opentrep' -from mod_python import apache localize = apache.import_module('localize', path=[local_path]) log_service = apache.import_module('log_service', path=[local_path]) # defaults -msg, head, form_value, unrecognized = '', '', '', '' -#body_declaration = '<body>' +msg, form_value, original_form_value, unrecognized = '', '', '', '' +language = 'en' quiet = True # parsing: recognize sequence of three-letter codes @@ -16,6 +19,7 @@ queryStringForm = form if queryStringForm.has_key('data'): form_value = queryStringForm['data'] + original_form_value = form_value quiet = False if form_value.rstrip(' ') == '': pass @@ -47,12 +51,12 @@ %> -<?xml version="1.0" encoding="UTF-8"?> -<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> -<html xmlns="http://www.w3.org/1999/xhtml"> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" + "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="<%= language %>" lang="<%= language %>"> <head> -<title>OpenTREP</title> -<%= head %> + <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/> + <title>OpenTREP</title> </head> <body> @@ -74,7 +78,14 @@ </table> </div> -<p 
style="font-size:small;"><%= msg %></p> +<p style="font-size:small;"> + <%= msg %> +</p> +<p style="font-size:small;"> + Test to display UTF8-encoded text: Ивано-Франковск (ru_RU) - Івано-Франківськ (uk_UA) +<br>Original form value: <%= original_form_value %> +</p> + </body> </html> This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-19 16:15:24
|
Revision: 181 http://opentrep.svn.sourceforge.net/opentrep/?rev=181&view=rev Author: denis_arnaud Date: 2009-08-19 16:15:17 +0000 (Wed, 19 Aug 2009) Log Message: ----------- [i18n] Added sample code from the ICU library for transliteration. Modified Paths: -------------- trunk/opentrep/test/i18n/icu/Makefile.am Added Paths: ----------- trunk/opentrep/test/i18n/icu/icutranslit.cpp trunk/opentrep/test/i18n/icu/icutranslit_unaccent.cpp trunk/opentrep/test/i18n/icu/icutranslit_unaccent.hpp trunk/opentrep/test/i18n/icu/icutranslit_util.cpp trunk/opentrep/test/i18n/icu/icutranslit_util.hpp Property Changed: ---------------- trunk/opentrep/test/i18n/icu/ Property changes on: trunk/opentrep/test/i18n/icu ___________________________________________________________________ Modified: svn:ignore - .libs .deps Makefile.in Makefile icufmt icucharsetdetector icuustring icuconv icuutext + .libs .deps Makefile.in Makefile icufmt icucharsetdetector icuustring icuconv icuutext icutranslit Modified: trunk/opentrep/test/i18n/icu/Makefile.am =================================================================== --- trunk/opentrep/test/i18n/icu/Makefile.am 2009-08-16 15:07:35 UTC (rev 180) +++ trunk/opentrep/test/i18n/icu/Makefile.am 2009-08-19 16:15:17 UTC (rev 181) @@ -3,7 +3,8 @@ MAINTAINERCLEANFILES = Makefile.in -check_PROGRAMS = icufmt icuustring icucharsetdetector icuconv icuutext +check_PROGRAMS = icufmt icuustring icucharsetdetector icuconv \ + icuutext icutranslit icufmt_SOURCES = icufmt.cpp icufmt_CXXFLAGS = $(ICU_CFLAGS) @@ -25,4 +26,9 @@ icuutext_CXXFLAGS = $(ICU_CFLAGS) icuutext_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB) +icutranslit_SOURCES = icutranslit_unaccent.hpp icutranslit_unaccent.cpp \ + icutranslit_util.hpp icutranslit_util.cpp icutranslit.cpp +icutranslit_CXXFLAGS = $(ICU_CFLAGS) +icutranslit_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB) + EXTRA_DIST = Added: trunk/opentrep/test/i18n/icu/icutranslit.cpp =================================================================== --- 
trunk/opentrep/test/i18n/icu/icutranslit.cpp (rev 0) +++ trunk/opentrep/test/i18n/icu/icutranslit.cpp 2009-08-19 16:15:17 UTC (rev 181) @@ -0,0 +1,127 @@ +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 1999-2003, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ +// STL +#include <cstdio> +#include <cstdlib> +// ICU +#include <unicode/translit.h> +//#include <unicode/rbt.h> +#include <unicode/unistr.h> +#include <unicode/calendar.h> +#include <unicode/datefmt.h> +// +#include "icutranslit_util.hpp" +#include "icutranslit_unaccent.hpp" + +// RuleBasedTransliterator rules to remove accents from characters +// so they can be displayed as ASCIIx +UnicodeString UNACCENT_RULES( + "[\\u00C0-\\u00C5] > A;" + "[\\u00C8-\\u00CB] > E;" + "[\\u00CC-\\u00CF] > I;" + "[\\u00E0-\\u00E5] > a;" + "[\\u00E8-\\u00EB] > e;" + "[\\u00EC-\\u00EF] > i;" + ); + +// ////////////////////////// M A I N ////////////////////////////// +int main(int argc, char **argv) { + + Calendar *cal; + DateFormat *fmt; + DateFormat *defFmt; + Transliterator *greek_latin; + Transliterator *rbtUnaccent; + Transliterator *unaccent; + UParseError pError; + UErrorCode status = U_ZERO_ERROR; + Locale greece("el", "GR"); + UnicodeString str, str2; + + // Create a calendar in the Greek locale + cal = Calendar::createInstance(greece, status); + check(status, "Calendar::createInstance"); + + // Create a formatter + fmt = DateFormat::createDateInstance(DateFormat::kFull, greece); + fmt->setCalendar(*cal); + + // Create a default formatter + defFmt = DateFormat::createDateInstance(DateFormat::kFull); + defFmt->setCalendar(*cal); + + // Create a Greek-Latin Transliterator + greek_latin = Transliterator::createInstance("Greek-Latin", UTRANS_FORWARD, status); + if (greek_latin == 0) { + printf("ERROR: Transliterator::createInstance() failed\n"); + exit(1); + } + + 
// Create a custom Transliterator + rbtUnaccent = Transliterator::createFromRules("RBTUnaccent", + UNACCENT_RULES, + UTRANS_FORWARD, + pError, + status); + check(status, "Transliterator::createFromRules"); + + // Create a custom Transliterator + unaccent = new UnaccentTransliterator(); + + // Loop over various months + for (int32_t month = Calendar::JANUARY; + month <= Calendar::DECEMBER; + ++month) { + + // Set the calendar to a date + cal->clear(); + cal->set(1999, month, 4); + + // Format the date in default locale + str.remove(); + defFmt->format(cal->getTime(status), str, status); + check(status, "DateFormat::format"); + printf("Date: "); + uprintf(escape(str)); + printf("\n"); + + // Format the date for Greece + str.remove(); + fmt->format(cal->getTime(status), str, status); + check(status, "DateFormat::format"); + printf("Greek formatted date: "); + uprintf(escape(str)); + printf("\n"); + + // Transliterate result + greek_latin->transliterate(str); + printf("Transliterated via Greek-Latin: "); + uprintf(escape(str)); + printf("\n"); + + // Transliterate result + str2 = str; + rbtUnaccent->transliterate(str); + printf("Transliterated via RBT unaccent: "); + uprintf(escape(str)); + printf("\n"); + + unaccent->transliterate(str2); + printf("Transliterated via normalizer unaccent: "); + uprintf(escape(str2)); + printf("\n\n"); + } + + // Clean up + delete fmt; + delete cal; + delete greek_latin; + delete unaccent; + delete rbtUnaccent; + + printf("Exiting successfully\n"); + return 0; +} Added: trunk/opentrep/test/i18n/icu/icutranslit_unaccent.cpp =================================================================== --- trunk/opentrep/test/i18n/icu/icutranslit_unaccent.cpp (rev 0) +++ trunk/opentrep/test/i18n/icu/icutranslit_unaccent.cpp 2009-08-19 16:15:17 UTC (rev 181) @@ -0,0 +1,56 @@ +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 1999-2003, International Business Machines Corporation and + * others. 
All Rights Reserved. + ********************************************************************/ +// ICU +#include "icutranslit_unaccent.hpp" + +const char UnaccentTransliterator::fgClassID = 0; + +/** + * Constructor + */ +UnaccentTransliterator::UnaccentTransliterator() : + Transliterator ("Unaccent", 0), + normalizer ("", UNORM_NFD) { +} + +/** + * Destructor + */ +UnaccentTransliterator::~UnaccentTransliterator() { +} + +/** + * Remove accents from a character using Normalizer. + */ +UChar UnaccentTransliterator::unaccent(UChar c) const { + UnicodeString str(c); + UErrorCode status = U_ZERO_ERROR; + UnaccentTransliterator* t = (UnaccentTransliterator*)this; + + t->normalizer.setText(str, status); + if (U_FAILURE(status)) { + return c; + } + return (UChar) t->normalizer.next(); +} + +/** + * Implement Transliterator API + */ +void UnaccentTransliterator::handleTransliterate(Replaceable& text, + UTransPosition& index, + UBool incremental) const { + UnicodeString str("a"); + while (index.start < index.limit) { + UChar c = text.charAt(index.start); + UChar d = unaccent(c); + if (c != d) { + str.setCharAt(0, d); + text.handleReplaceBetween(index.start, index.start+1, str); + } + index.start++; + } +} Added: trunk/opentrep/test/i18n/icu/icutranslit_unaccent.hpp =================================================================== --- trunk/opentrep/test/i18n/icu/icutranslit_unaccent.hpp (rev 0) +++ trunk/opentrep/test/i18n/icu/icutranslit_unaccent.hpp 2009-08-19 16:15:17 UTC (rev 181) @@ -0,0 +1,89 @@ +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 1999-2003, International Business Machines Corporation and + * others. All Rights Reserved. 
+ ********************************************************************/ +// ICU +#include <unicode/translit.h> +#include <unicode/normlzr.h> + +class UnaccentTransliterator : public Transliterator { + + public: + + /** + * Constructor + */ + UnaccentTransliterator(); + + /** + * Destructor + */ + virtual ~UnaccentTransliterator(); + + protected: + + /** + * Implement Transliterator API + */ + virtual void handleTransliterate(Replaceable& text, + UTransPosition& index, + UBool incremental) const; + + private: + + /** + * Unaccent a single character using normalizer. + */ + UChar unaccent(UChar c) const; + + Normalizer normalizer; + +public: + + /** + * Return the class ID for this class. This is useful only for + * comparing to a return value from getDynamicClassID(). For example: + * <pre> + * . Base* polymorphic_pointer = createPolymorphicObject(); + * . if (polymorphic_pointer->getDynamicClassID() == + * . Derived::getStaticClassID()) ... + * </pre> + * @return The class ID for all objects of this class. + * @stable ICU 2.0 + */ + static inline UClassID getStaticClassID(void) { return (UClassID)&fgClassID; }; + + /** + * Returns a unique class ID <b>polymorphically</b>. This method + * is to implement a simple version of RTTI, since not all C++ + * compilers support genuine RTTI. Polymorphic operator==() and + * clone() methods call this method. + * + * <p>Concrete subclasses of Transliterator that wish clients to + * be able to identify them should implement getDynamicClassID() + * and also a static method and data member: + * + * <pre> + * static UClassID getStaticClassID() { return (UClassID)&fgClassID; } + * static char fgClassID; + * </pre> + * + * Subclasses that do not implement this method will have a + * dynamic class ID of Transliterator::getStatisClassID(). + * + * @return The class ID for this object. All objects of a given + * class have the same class ID. Objects of other classes have + * different class IDs. 
+ * @stable ICU 2.0 + */ + virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); }; + +private: + + /** + * Class identifier for subclasses of Transliterator that do not + * define their class (anonymous subclasses). + */ + static const char fgClassID; +}; Added: trunk/opentrep/test/i18n/icu/icutranslit_util.cpp =================================================================== --- trunk/opentrep/test/i18n/icu/icutranslit_util.cpp (rev 0) +++ trunk/opentrep/test/i18n/icu/icutranslit_util.cpp 2009-08-19 16:15:17 UTC (rev 181) @@ -0,0 +1,63 @@ +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 1999-2002, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ +// ICU +#include <unicode/unistr.h> +#include <cstdio> +#include <cstdlib> + +// Verify that a UErrorCode is successful; exit(1) if not +void check(UErrorCode& status, const char* msg) { + if (U_FAILURE(status)) { + printf("ERROR: %s (%s)\n", u_errorName(status), msg); + exit(1); + } + // printf("Ok: %s\n", msg); +} + +// Append a hex string to the target +static UnicodeString& appendHex(uint32_t number, + int8_t digits, + UnicodeString& target) { + static const UnicodeString DIGIT_STRING("0123456789ABCDEF"); + while (digits > 0) { + target += DIGIT_STRING[(number >> ((--digits) * 4)) & 0xF]; + } + return target; +} + +// Replace nonprintable characters with unicode escapes +UnicodeString escape(const UnicodeString &source) { + int32_t i; + UnicodeString target; + target += "\""; + for (i=0; i<source.length(); ++i) { + UChar ch = source[i]; + if (ch < 0x09 || (ch > 0x0A && ch < 0x20) || ch > 0x7E) { + target += "\\u"; + appendHex(ch, 4, target); + } else { + target += ch; + } + } + target += "\""; + return target; +} + +// Print the given string to stdout +void uprintf(const UnicodeString &str) { + char *buf = 0; + int32_t len = 
str.length(); + // int32_t bufLen = str.extract(0, len, buf); // Preflight + /* Preflighting seems to be broken now, so assume 1-1 conversion, + plus some slop. */ + int32_t bufLen = len + 16; + int32_t actualLen; + buf = new char[bufLen + 1]; + actualLen = str.extract(0, len, buf/*, bufLen*/); // Default codepage conversion + buf[actualLen] = 0; + printf("%s", buf); + delete buf; +} Added: trunk/opentrep/test/i18n/icu/icutranslit_util.hpp =================================================================== --- trunk/opentrep/test/i18n/icu/icutranslit_util.hpp (rev 0) +++ trunk/opentrep/test/i18n/icu/icutranslit_util.hpp 2009-08-19 16:15:17 UTC (rev 181) @@ -0,0 +1,16 @@ +/******************************************************************** + * COPYRIGHT: + * Copyright (c) 1999-2002, International Business Machines Corporation and + * others. All Rights Reserved. + ********************************************************************/ +// ICU +#include <unicode/unistr.h> + +// Verify that a UErrorCode is successful; exit(1) if not +void check(UErrorCode& status, const char* msg); + +// Replace nonprintable characters with unicode escapes +UnicodeString escape(const UnicodeString &source); + +// Print the given string to stdout +void uprintf(const UnicodeString &str); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-16 15:07:48
|
Revision: 180 http://opentrep.svn.sourceforge.net/opentrep/?rev=180&view=rev Author: denis_arnaud Date: 2009-08-16 15:07:35 +0000 (Sun, 16 Aug 2009) Log Message: ----------- [i18n] The conversion to lower cases for UTF-8 strings now works (with support from the ICU library). Modified Paths: -------------- trunk/opentrep/TODO Added Paths: ----------- trunk/opentrep/test/i18n/icu/icuconvaliastable.txt trunk/opentrep/test/i18n/icu/icuutext.cpp Property Changed: ---------------- trunk/opentrep/test/i18n/icu/ Modified: trunk/opentrep/TODO =================================================================== --- trunk/opentrep/TODO 2009-08-16 14:08:09 UTC (rev 179) +++ trunk/opentrep/TODO 2009-08-16 15:07:35 UTC (rev 180) @@ -8,15 +8,20 @@ locations). * [14/08/2009] With the ICU library, check the encoding of the input, - and convert in Unicode if needed (see the test/i18n/icuustring and - test/i18n/icuconv} for example). First detect and convert hard-coded - strings, then do it on the output of PSP pages. + and convert it to Unicode if needed (see the test/i18n/icuustring + and test/i18n/icuconv samples for example). First detect and convert + hard-coded strings, then do it on the output of PSP pages. * [14/08/2009] Write a transliterator, taking UTF-8 Cyrillic input (e.g., Russian and/or Ukrainian) and romanising/transliterating - it. Note that, with the ICU library, UTex may be used advantageously - (to take UTF-8 input). + it. Note that there is good transliterator sample within the ICU + Subversion repository + (http://source.icu-project.org/repos/icu/icu/trunk/source/samples/translit). +* [14/08/2009] With the ICU library, convert UTF-8 encoded strings to + lower cases. +OK + * [01/08/2009] Finish the work on bringing extra and additional Location objects into the API. 
OK Property changes on: trunk/opentrep/test/i18n/icu ___________________________________________________________________ Modified: svn:ignore - .libs .deps Makefile.in Makefile icufmt icucharsetdetector icuustring icuconv + .libs .deps Makefile.in Makefile icufmt icucharsetdetector icuustring icuconv icuutext Added: trunk/opentrep/test/i18n/icu/icuconvaliastable.txt =================================================================== --- trunk/opentrep/test/i18n/icu/icuconvaliastable.txt (rev 0) +++ trunk/opentrep/test/i18n/icu/icuconvaliastable.txt 2009-08-16 15:07:35 UTC (rev 180) @@ -0,0 +1,1167 @@ +# ****************************************************************************** +# * +# * Copyright (C) 1995-2009, International Business Machines +# * Corporation and others. All Rights Reserved. +# * +# ****************************************************************************** + +# If this converter alias table looks very confusing, a much easier to +# understand view can be found at this demo: +# http://demo.icu-project.org/icu-bin/convexp + +# IMPORTANT NOTE +# +# This file is not read directly by ICU. If you change it, you need to +# run gencnval, and eventually run pkgdata to update the representation that +# ICU uses for aliases. The gencnval tool will normally compile this file into +# cnvalias.icu. The gencnval -v verbose option will help you when you edit +# this file. + +# Please be friendly to the rest of us that edit this table by +# keeping this table free of tabs. + +# This is an alias file used by the character set converter. +# A lot of converter information can be found in unicode/ucnv.h, but here +# is more information about this file. 
+# +# Here is the file format using BNF-like syntax: +# +# converterTable ::= tags { converterLine* } +# converterLine ::= converterName [ tags ] { taggedAlias* }'\n' +# taggedAlias ::= alias [ tags ] +# tags ::= '{' { tag+ } '}' +# tag ::= standard['*'] +# converterName ::= [0-9a-zA-Z:_'-']+ +# alias ::= converterName +# +# Except for the converter name, aliases are case insensitive. +# Names are separated by whitespace. +# Line continuation and comment sytax are similar to the GNU make syntax. +# Any lines beginning with whitespace (e.g. U+0020 SPACE or U+0009 HORIZONTAL +# TABULATION) are presumed to be a continuation of the previous line. +# The # symbol starts a comment and the comment continues till the end of +# the line. +# +# The converter +# +# All names can be tagged by including a space-separated list of tags in +# curly braces, as in ISO_8859-1:1987{IANA*} iso-8859-1 { MIME* } or +# some-charset{MIME* IANA*}. The order of tags does not matter, and +# whitespace is allowed between the tagged name and the tags list. +# +# The tags can be used to get standard names using ucnv_getStandardName(). +# +# The complete list of recognized tags used in this file is defined in +# the affinity list near the beginning of the file. +# +# The * after the standard tag denotes that the previous alias is the +# preferred (default) charset name for that standard. There can only +# be one of these default charset names per converter. + + + +# The world is getting more complicated... +# Supporting XML parsers, HTML, MIME, and similar applications +# that mark encodings with a charset name can be difficult. +# Many of these applications and operating systems will update +# their codepages over time. + +# It means that a new codepage, one that differs from an +# old one by changing a code point, e.g., to the Euro sign, +# must not get an old alias, because it would mean that +# old files with this alias would be interpreted differently. 
+ +# If an codepage gets updated by assigning characters to previously +# unassigned code points, then a new name is not necessary. +# Also, some codepages map unassigned codepage byte values +# to the same numbers in Unicode for roundtripping. It may be +# industry practice to keep the encoding name in such a case, too +# (example: Windows codepages). + +# The aliases listed in the list of character sets +# that is maintained by the IANA (http://www.iana.org/) must +# not be changed to mean encodings different from what this +# list shows. Currently, the IANA list is at +# http://www.iana.org/assignments/character-sets +# It should also be mentioned that the exact mapping table used for each +# IANA names usually isn't specified. This means that some other applications +# and operating systems are left to interpret the exact mappings for the +# underspecified aliases. For instance, Shift-JIS on a Solaris platform +# may be different from Shift-JIS on a Windows platform. This is why +# some of the aliases can be tagged to differentiate different mapping +# tables with the same alias. If an alias is given to more than one converter, +# it is considered to be an ambiguous alias, and the affinity list will +# choose the converter to use when a standard isn't specified with the alias. + +# Name matching is case-insensitive. Also, dashes '-', underscores '_' +# and spaces ' ' are ignored in names (thus cs-iso_latin-1, csisolatin1 +# and "cs iso latin 1" are the same). +# However, the names in the left column are directly file names +# or names of algorithmic converters, and their case must not +# be changed - or else code and/or file names must also be changed. +# For example, the converter ibm-921 is expected to be the file ibm-921.cnv. + + + +# The immediately following list is the affinity list of supported standard tags. 
+# When multiple converters have the same alias under different standards, +# the standard nearest to the top of this list with that alias will +# be the first converter that will be opened. The ordering of the aliases +# after this affinity list does not affect the preferred alias, but it may +# affect the order of the returned list of aliases for a given converter. +# +# The general ordering is from specific and frequently used to more general +# or rarely used at the bottom. +{ UTR22 # Name format specified by http://www.unicode.org/unicode/reports/tr22/ + # ICU # Can also use ICU_FEATURE + IBM # The IBM CCSID number is specified by ibm-* + WINDOWS # The Microsoft code page identifier number is specified by windows-*. The rest are recognized IE names. + JAVA # Source: Sun JDK. Alias name case is ignored, but dashes are not ignored. + # GLIBC + # AIX + # DB2 + # SOLARIS + # APPLE + # HPUX + IANA # Source: http://www.iana.org/assignments/character-sets + MIME # Source: http://www.iana.org/assignments/character-sets + # MSIE # MSIE is Internet Explorer, which can be different from Windows (From the IMultiLanguage COM interface) + # ZOS_USS # z/OS (os/390) Unix System Services (USS), which has NL<->LF swapping. They have the same format as the IBM tag. + } + + + +# Fully algorithmic converters + +UTF-8 { IANA* MIME* JAVA* WINDOWS } + ibm-1208 { IBM* } # UTF-8 with IBM PUA + ibm-1209 { IBM } # UTF-8 + ibm-5304 { IBM } # Unicode 2.0, UTF-8 with IBM PUA + ibm-5305 { IBM } # Unicode 2.0, UTF-8 + ibm-13496 { IBM } # Unicode 3.0, UTF-8 with IBM PUA + ibm-13497 { IBM } # Unicode 3.0, UTF-8 + ibm-17592 { IBM } # Unicode 4.0, UTF-8 with IBM PUA + ibm-17593 { IBM } # Unicode 4.0, UTF-8 + windows-65001 { WINDOWS* } + cp1208 + +# The ICU 2.2 UTF-16/32 converters detect and write a BOM. 
+UTF-16 { IANA* MIME* JAVA* } ISO-10646-UCS-2 { IANA } + ibm-1204 { IBM* } # UTF-16 with IBM PUA and BOM sensitive + ibm-1205 { IBM } # UTF-16 BOM sensitive + unicode + csUnicode + ucs-2 +# The following Unicode CCSIDs (IBM) are not valid in ICU because they are +# considered pure DBCS (exactly 2 bytes) of Unicode, +# and they are a subset of Unicode. ICU does not support their encoding structures. +# 1400 1401 1402 1410 1414 1415 1446 1447 1448 1449 64770 64771 65520 5496 5497 5498 9592 13688 +UTF-16BE { IANA* MIME* JAVA* } x-utf-16be { JAVA } + UnicodeBigUnmarked { JAVA } # java.io name + ibm-1200 { IBM* } # UTF-16 BE with IBM PUA + ibm-1201 { IBM } # UTF-16 BE + ibm-13488 { IBM } # Unicode 2.0, UTF-16 BE with IBM PUA + ibm-13489 { IBM } # Unicode 2.0, UTF-16 BE + ibm-17584 { IBM } # Unicode 3.0, UTF-16 BE with IBM PUA + ibm-17585 { IBM } # Unicode 3.0, UTF-16 BE + ibm-21680 { IBM } # Unicode 4.0, UTF-16 BE with IBM PUA + ibm-21681 { IBM } # Unicode 4.0, UTF-16 BE + ibm-25776 { IBM } # Unicode 4.1, UTF-16 BE with IBM PUA + ibm-25777 { IBM } # Unicode 4.1, UTF-16 BE + ibm-29872 { IBM } # Unicode 5.0, UTF-16 BE with IBM PUA + ibm-29873 { IBM } # Unicode 5.0, UTF-16 BE + ibm-61955 { IBM } # UTF-16BE with Gaidai University (Japan) PUA + ibm-61956 { IBM } # UTF-16BE with Microsoft HKSCS-Big 5 PUA + windows-1201 { WINDOWS* } + cp1200 + cp1201 + UTF16_BigEndian + # ibm-5297 { IBM } # Unicode 2.0, UTF-16 (BE) (reserved, never used) + # iso-10646-ucs-2 { JAVA } # This is ambiguous + # ibm-61952 is not a valid CCSID because it's Unicode 1.1 + # ibm-61953 is not a valid CCSID because it's Unicode 1.0 +UTF-16LE { IANA* MIME* JAVA* } x-utf-16le { JAVA } + UnicodeLittleUnmarked { JAVA } # java.io name + ibm-1202 { IBM* } # UTF-16 LE with IBM PUA + ibm-1203 { IBM } # UTF-16 LE + ibm-13490 { IBM } # Unicode 2.0, UTF-16 LE with IBM PUA + ibm-13491 { IBM } # Unicode 2.0, UTF-16 LE + ibm-17586 { IBM } # Unicode 3.0, UTF-16 LE with IBM PUA + ibm-17587 { IBM } # Unicode 3.0, UTF-16 
LE + ibm-21682 { IBM } # Unicode 4.0, UTF-16 LE with IBM PUA + ibm-21683 { IBM } # Unicode 4.0, UTF-16 LE + ibm-25778 { IBM } # Unicode 4.1, UTF-16 LE with IBM PUA + ibm-25779 { IBM } # Unicode 4.1, UTF-16 LE + ibm-29874 { IBM } # Unicode 5.0, UTF-16 LE with IBM PUA + ibm-29875 { IBM } # Unicode 5.0, UTF-16 LE + UTF16_LittleEndian + windows-1200 { WINDOWS* } + +UTF-32 { IANA* MIME* } ISO-10646-UCS-4 { IANA } + ibm-1236 { IBM* } # UTF-32 with IBM PUA and BOM sensitive + ibm-1237 { IBM } # UTF-32 BOM sensitive + csUCS4 + ucs-4 +UTF-32BE { IANA* } UTF32_BigEndian + ibm-1232 { IBM* } # UTF-32 BE with IBM PUA + ibm-1233 { IBM } # UTF-32 BE + ibm-9424 { IBM } # Unicode 4.1, UTF-32 BE with IBM PUA +UTF-32LE { IANA* } UTF32_LittleEndian + ibm-1234 { IBM* } # UTF-32 LE, with IBM PUA + ibm-1235 { IBM } # UTF-32 LE + +# ICU-specific names for special uses +UTF16_PlatformEndian +UTF16_OppositeEndian + +UTF32_PlatformEndian +UTF32_OppositeEndian + + +# Java-specific, non-Unicode-standard UTF-16 variants. +# These are in the Java "Basic Encoding Set (contained in lib/rt.jar)". +# See the "Supported Encodings" at +# http://java.sun.com/javase/6/docs/technotes/guides/intl/encoding.doc.html +# or a newer version of this document. +# +# Aliases marked with { JAVA* } are canonical names for java.io and java.lang APIs. +# Aliases marked with { JAVA } are canonical names for the java.nio API. +# +# "BOM" means the Unicode Byte Order Mark, which is the encoding-scheme-specific +# byte sequence for U+FEFF. +# "Reverse BOM" means the BOM for the sibling encoding scheme with the +# opposite endianness. (LE<->BE) + +# "Sixteen-bit Unicode (or UCS) Transformation Format, big-endian byte order, +# with byte-order mark" +# +# From Unicode: Writes BOM. +# To Unicode: Detects and consumes BOM. +# If there is a "reverse BOM", Java throws +# MalformedInputException: Incorrect byte-order mark. 
+# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value +# and a UCNV_ILLEGAL UConverterCallbackReason. +UTF-16BE,version=1 UnicodeBig { JAVA* } + +# "Sixteen-bit Unicode (or UCS) Transformation Format, little-endian byte order, +# with byte-order mark" +# +# From Unicode: Writes BOM. +# To Unicode: Detects and consumes BOM. +# If there is a "reverse BOM", Java throws +# MalformedInputException: Incorrect byte-order mark. +# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value +# and a UCNV_ILLEGAL UConverterCallbackReason. +UTF-16LE,version=1 UnicodeLittle { JAVA* } x-UTF-16LE-BOM { JAVA } + +# This one is not mentioned on the "Supported Encodings" page +# but is available in Java. +# In Java, this is called "Unicode" but we cannot give it that alias +# because the standard UTF-16 converter already has a "unicode" alias. +# +# From Unicode: Writes BOM. +# To Unicode: Detects and consumes BOM. +# If there is no BOM, rather than defaulting to BE, Java throws +# MalformedInputException: Missing byte-order mark. +# In this case, ICU4C sets a U_ILLEGAL_ESCAPE_SEQUENCE UErrorCode value +# and a UCNV_ILLEGAL UConverterCallbackReason. +UTF-16,version=1 + +# Note: ICU does not currently support Java-specific, non-Unicode-standard UTF-32 variants. +# Presumably, these behave analogously to the UTF-16 variants with similar names. +# UTF_32BE_BOM x-UTF-32BE-BOM +# UTF_32LE_BOM x-UTF-32LE-BOM + +# End of Java-specific, non-Unicode-standard UTF variants. + + +# On UTF-7: +# RFC 2152 (http://www.imc.org/rfc2152) allows to encode some US-ASCII +# characters directly or in base64. Especially, the characters in set O +# as defined in the RFC (!"#$%&*;<=>@[]^_`{|}) may be encoded directly +# but are not allowed in, e.g., email headers. +# By default, the ICU UTF-7 converter encodes set O directly. +# By choosing the option "version=1", set O will be escaped instead. 
+# For example: +# utf7Converter=ucnv_open("UTF-7,version=1"); +# +# For details about email headers see RFC 2047. +UTF-7 { IANA* MIME* WINDOWS } windows-65000 { WINDOWS* } + +# UTF-EBCDIC doesn't exist in ICU, but the aliases are here for reference. +#UTF-EBCDIC ibm-1210 { IBM* } ibm-1211 { IBM } + +# IMAP-mailbox-name is an ICU-specific name for the encoding of IMAP mailbox names. +# It is a substantially modified UTF-7 encoding. See the specification in: +# +# RFC 2060: INTERNET MESSAGE ACCESS PROTOCOL - VERSION 4rev1 +# (http://www.ietf.org/rfc/rfc2060.txt) +# Section 5.1.3. Mailbox International Naming Convention +IMAP-mailbox-name + +SCSU { IANA* } + ibm-1212 { IBM } # SCSU with IBM PUA + ibm-1213 { IBM* } # SCSU +BOCU-1 { IANA* } + csBOCU-1 { IANA } + ibm-1214 { IBM } # BOCU-1 with IBM PUA + ibm-1215 { IBM* } # BOCU-1 + +# See http://www.unicode.org/unicode/reports/tr26 for this Compatibility Encoding Scheme for UTF-16 +# The Unicode Consortium does not encourage the use of CESU-8 +CESU-8 { IANA* } ibm-9400 { IBM* } + +# Standard iso-8859-1, which does not have the Euro update. +# See iso-8859-15 (latin9) for the Euro update +ISO-8859-1 { MIME* IANA JAVA* } + ibm-819 { IBM* JAVA } # This is not truly ibm-819 because it's missing the fallbacks. + IBM819 { IANA } + cp819 { IANA JAVA } + latin1 { IANA JAVA } + 8859_1 { JAVA } + csISOLatin1 { IANA JAVA } + iso-ir-100 { IANA JAVA } + ISO_8859-1:1987 { IANA* JAVA } + l1 { IANA JAVA } + 819 { JAVA } + # windows-28591 { WINDOWS* } # This has odd behavior because it has the Euro update, which isn't correct. + # LATIN_1 # Old ICU name + # ANSI_X3.110-1983 # This is for a different IANA alias. This isn't iso-8859-1. 
+ +US-ASCII { MIME* IANA JAVA WINDOWS } + ASCII { JAVA* IANA WINDOWS } + ANSI_X3.4-1968 { IANA* WINDOWS } + ANSI_X3.4-1986 { IANA WINDOWS } + ISO_646.irv:1991 { IANA WINDOWS } + iso_646.irv:1983 { JAVA } + ISO646-US { JAVA IANA WINDOWS } + us { IANA } + csASCII { IANA WINDOWS } + iso-ir-6 { IANA } + cp367 { IANA WINDOWS } + ascii7 { JAVA } + 646 { JAVA } + windows-20127 { WINDOWS* } + ibm-367 { IBM* } IBM367 { IANA WINDOWS } # This is not truly ibm-367 because it's missing the fallbacks. + +# GB 18030 is partly algorithmic, using the MBCS converter +gb18030 { IANA* } ibm-1392 { IBM* } windows-54936 { WINDOWS* } GB18030 { MIME* } + +# Table-based interchange codepages + +# Central Europe +ibm-912_P100-1995 { UTR22* } + ibm-912 { IBM* JAVA } + ISO-8859-2 { MIME* IANA JAVA* WINDOWS } + ISO_8859-2:1987 { IANA* WINDOWS JAVA } + latin2 { IANA WINDOWS JAVA } + csISOLatin2 { IANA WINDOWS JAVA } + iso-ir-101 { IANA WINDOWS JAVA } + l2 { IANA WINDOWS JAVA } + 8859_2 { JAVA } + cp912 { JAVA } + 912 { JAVA } + windows-28592 { WINDOWS* } + +# Maltese Esperanto +ibm-913_P100-2000 { UTR22* } + ibm-913 { IBM* JAVA } + ISO-8859-3 { MIME* IANA WINDOWS JAVA* } + ISO_8859-3:1988 { IANA* WINDOWS JAVA } + latin3 { IANA JAVA WINDOWS } + csISOLatin3 { IANA WINDOWS } + iso-ir-109 { IANA WINDOWS JAVA } + l3 { IANA WINDOWS JAVA } + 8859_3 { JAVA } + cp913 { JAVA } + 913 { JAVA } + windows-28593 { WINDOWS* } + +# Baltic +ibm-914_P100-1995 { UTR22* } + ibm-914 { IBM* JAVA } + ISO-8859-4 { MIME* IANA WINDOWS JAVA* } + latin4 { IANA WINDOWS JAVA } + csISOLatin4 { IANA WINDOWS JAVA } + iso-ir-110 { IANA WINDOWS JAVA } + ISO_8859-4:1988 { IANA* WINDOWS JAVA } + l4 { IANA WINDOWS JAVA } + 8859_4 { JAVA } + cp914 { JAVA } + 914 { JAVA } + windows-28594 { WINDOWS* } + +# Cyrillic +ibm-915_P100-1995 { UTR22* } + ibm-915 { IBM* JAVA } + ISO-8859-5 { MIME* IANA WINDOWS JAVA* } + cyrillic { IANA WINDOWS JAVA } + csISOLatinCyrillic { IANA WINDOWS JAVA } + iso-ir-144 { IANA WINDOWS JAVA } + 
ISO_8859-5:1988 { IANA* WINDOWS JAVA } + 8859_5 { JAVA } + cp915 { JAVA } + 915 { JAVA } + windows-28595 { WINDOWS* } + +# Arabic +# ISO_8859-6-E and ISO_8859-6-I are similar to this charset, but BiDi is done differently +# From a narrow mapping point of view, there is no difference. +# -E means explicit. -I means implicit. +# -E requires the client to handle the ISO 6429 bidirectional controls +ibm-1089_P100-1995 { UTR22* } + ibm-1089 { IBM* JAVA } + ISO-8859-6 { MIME* IANA WINDOWS JAVA* } + arabic { IANA WINDOWS JAVA } + csISOLatinArabic { IANA WINDOWS JAVA } + iso-ir-127 { IANA WINDOWS JAVA } + ISO_8859-6:1987 { IANA* WINDOWS JAVA } + ECMA-114 { IANA JAVA } + ASMO-708 { IANA JAVA } + 8859_6 { JAVA } + cp1089 { JAVA } + 1089 { JAVA } + windows-28596 { WINDOWS* } + ISO-8859-6-I { IANA MIME } # IANA considers this alias different and BiDi needs to be applied. + ISO-8859-6-E { IANA MIME } # IANA considers this alias different and BiDi needs to be applied. + +# ISO Greek (with euro update). This is really ISO_8859-7:2003 +ibm-9005_X110-2007 { UTR22* } + ibm-9005 { IBM* } + ISO-8859-7 { MIME* IANA WINDOWS } + greek { IANA WINDOWS } + greek8 { IANA WINDOWS } + ELOT_928 { IANA WINDOWS } + ECMA-118 { IANA WINDOWS } + csISOLatinGreek { IANA WINDOWS } + iso-ir-126 { IANA WINDOWS } + ISO_8859-7:1987 { IANA* WINDOWS } + windows-28597 { WINDOWS* } + sun_eu_greek # For Solaris + +# ISO Greek (w/o euro update) +# JDK 1.5 has these aliases. +ibm-813_P100-1995 { UTR22* } + ibm-813 { IBM* JAVA } + ISO-8859-7 { JAVA* } + greek { JAVA } + greek8 { JAVA } + ELOT_928 { JAVA } + ECMA-118 { JAVA } + csISOLatinGreek { JAVA } + iso-ir-126 { JAVA } + ISO_8859-7:1987 { JAVA } + 8859_7 { JAVA } + cp813 { JAVA } + 813 { JAVA } + +# hebrew +# ISO_8859-8-E and ISO_8859-8-I are similar to this charset, but BiDi is done differently +# From a narrow mapping point of view, there is no difference. +# -E means explicit. -I means implicit. 
+# -E requires the client to handle the ISO 6429 bidirectional controls +# This matches the official mapping on unicode.org +ibm-5012_P100-1999 { UTR22* } + ibm-5012 { IBM* } + ISO-8859-8 { MIME* IANA WINDOWS JAVA* } + hebrew { IANA WINDOWS JAVA } + csISOLatinHebrew { IANA WINDOWS JAVA } + iso-ir-138 { IANA WINDOWS JAVA } + ISO_8859-8:1988 { IANA* WINDOWS JAVA } + ISO-8859-8-I { IANA MIME } # IANA and Windows consider this alias different and BiDi needs to be applied. + ISO-8859-8-E { IANA MIME } # IANA and Windows consider this alias different and BiDi needs to be applied. + 8859_8 { JAVA } + windows-28598 { WINDOWS* } # Hebrew (ISO-Visual). A hybrid between ibm-5012 and ibm-916 with extra PUA mappings. + hebrew8 # Reflect HP-UX code page update + +# Unfortunately, the Java aliases are split across ibm-916 and ibm-5012 +# Also many platforms are a combination between ibm-916 and ibm-5012 behaviors +ibm-916_P100-1995 { UTR22* } + ibm-916 { IBM* JAVA* } + cp916 { JAVA } + 916 { JAVA } + +# Turkish +ibm-920_P100-1995 { UTR22* } + ibm-920 { IBM* JAVA } + ISO-8859-9 { MIME* IANA WINDOWS JAVA* } + latin5 { IANA WINDOWS JAVA } + csISOLatin5 { IANA JAVA } + iso-ir-148 { IANA WINDOWS JAVA } + ISO_8859-9:1989 { IANA* WINDOWS } + l5 { IANA WINDOWS JAVA } + 8859_9 { JAVA } + cp920 { JAVA } + 920 { JAVA } + windows-28599 { WINDOWS* } + ECMA-128 # IANA doesn't have this alias 6/24/2002 + turkish8 # Reflect HP-UX codepage update 8/1/2008 + turkish # Reflect HP-UX codepage update 8/1/2008 + +# Nordic languages +iso-8859_10-1998 { UTR22* } ISO-8859-10 { MIME* IANA* } + iso-ir-157 { IANA } + l6 { IANA } + ISO_8859-10:1992 { IANA } + csISOLatin6 { IANA } + latin6 { IANA } + +# Thai +# Be warned. There are several iso-8859-11 codepage variants, and they are all incompatible. +# ISO-8859-11 is a superset of TIS-620. The difference is that ISO-8859-11 contains the C1 control codes. +iso-8859_11-2001 { UTR22* } ISO-8859-11 + thai8 # HP-UX alias. 
HP-UX says TIS-620, but it's closer to ISO-8859-11. + +# iso-8859-13, PC Baltic (w/o euro update) +ibm-921_P100-1995 { UTR22* } + ibm-921 { IBM* } + ISO-8859-13 { IANA* MIME* JAVA* } + 8859_13 { JAVA } + windows-28603 { WINDOWS* } + cp921 + 921 + +# Celtic +iso-8859_14-1998 { UTR22* } ISO-8859-14 { IANA* } + iso-ir-199 { IANA } + ISO_8859-14:1998 { IANA } + latin8 { IANA } + iso-celtic { IANA } + l8 { IANA } + +# Latin 9 +ibm-923_P100-1998 { UTR22* } + ibm-923 { IBM* JAVA } + ISO-8859-15 { IANA* MIME* WINDOWS JAVA* } + Latin-9 { IANA WINDOWS } + l9 { WINDOWS } + 8859_15 { JAVA } + latin0 { JAVA } + csisolatin0 { JAVA } + csisolatin9 { JAVA } + iso8859_15_fdis { JAVA } + cp923 { JAVA } + 923 { JAVA } + windows-28605 { WINDOWS* } + +# CJK encodings + +ibm-942_P12A-1999 { UTR22* } # ibm-942_P120 is a rarely used alternate mapping (sjis78 is already old) + ibm-942 { IBM* } + ibm-932 { IBM } + cp932 + shift_jis78 + sjis78 + ibm-942_VSUB_VPUA + ibm-932_VSUB_VPUA + # Is this "JIS_C6226-1978"? + +# ibm-943_P15A-2003 differs from windows-932-2000 only in a few roundtrip mappings: +# - the usual IBM PC control code rotation (1A-1C-7F) +# - the Windows table has roundtrips for bytes 80, A0, and FD-FF to U+0080 and PUA +ibm-943_P15A-2003 { UTR22* } + ibm-943 # Leave untagged because this isn't the default + Shift_JIS { IANA* MIME* WINDOWS JAVA } + MS_Kanji { IANA WINDOWS JAVA } + csShiftJIS { IANA WINDOWS JAVA } + windows-31j { IANA JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13) + csWindows31J { IANA WINDOWS JAVA } # A further extension of Shift_JIS to include NEC special characters (Row 13) + x-sjis { WINDOWS JAVA } + x-ms-cp932 { WINDOWS } + cp932 { WINDOWS } + windows-932 { WINDOWS* } + cp943c { JAVA* } # This is slightly different, but the backslash mapping is the same. 
+ IBM-943C #{ AIX* } # Add this tag once AIX aliases becomes available + ms932 + pck # Probably SOLARIS + sjis # This might be for ibm-1351 + ibm-943_VSUB_VPUA + # cp943 # This isn't Windows, and no one else uses it. + # IANA says that Windows-31J is an extension to csshiftjis ibm-932 +ibm-943_P130-1999 { UTR22* } + ibm-943 { IBM* JAVA } + Shift_JIS # Leave untagged because this isn't the default + cp943 { JAVA* } # This is slightly different, but the backslash mapping is the same. + 943 { JAVA } + ibm-943_VASCII_VSUB_VPUA + # japanese. Unicode name is \u30b7\u30d5\u30c8\u7b26\u53f7\u5316\u8868\u73fe +ibm-33722_P12A_P12A-2004_U2 { UTR22* } + ibm-33722 # Leave untagged because this isn't the default + ibm-5050 # Leave untagged because this isn't the default, and yes this alias is correct + EUC-JP { IANA MIME* WINDOWS } + Extended_UNIX_Code_Packed_Format_for_Japanese { IANA* WINDOWS } + csEUCPkdFmtJapanese { IANA WINDOWS } + X-EUC-JP { WINDOWS } # Japan EUC. x-euc-jp is a MIME name + windows-51932 { WINDOWS* } + ibm-33722_VPUA + IBM-eucJP +ibm-33722_P120-1999 { UTR22* } # Japan EUC with \ <-> Yen mapping + ibm-33722 { IBM* JAVA } + ibm-5050 { IBM } # Yes this is correct + cp33722 { JAVA* } + 33722 { JAVA } + ibm-33722_VASCII_VPUA +# ibm-954 seems to be almost a superset of ibm-33722 and ibm-1350 +# ibm-1350 seems to be almost a superset of ibm-33722 +# ibm-954 contains more PUA characters than the others. +ibm-954_P101-2007 { UTR22* } + ibm-954 { IBM* } + EUC-JP { JAVA* } # Matches more closely with ibm-1350 + Extended_UNIX_Code_Packed_Format_for_Japanese { JAVA } + csEUCPkdFmtJapanese { JAVA } + X-EUC-JP { JAVA } # Japan EUC. x-euc-jp is a MIME name + eucjis { JAVA } + ujis # Linux sometimes uses this name. This is an unfortunate generic and rarely used name. Its use is discouraged. + # eucJP # This is closest to Solaris EUC-JP. 
+ +# Here are various interpretations and extensions of Big5 +ibm-1373_P100-2002 { UTR22* } # IBM's interpretation of Windows' Taiwan Big-5 without HKSCS extensions + ibm-1373 { IBM* } + windows-950 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage. +windows-950-2000 { UTR22* } + Big5 { IANA* MIME* JAVA* WINDOWS } + csBig5 { IANA WINDOWS } + windows-950 { WINDOWS* } + x-big5 +ibm-950_P110-1999 { UTR22* } # Taiwan Big-5 (w/o euro update) + ibm-950 { IBM* JAVA } + cp950 { JAVA* } + 950 { JAVA } +ibm-1375_P100-2007 { UTR22* } # Big5-HKSCS-2004 with Unicode 3.1 mappings. This uses supplementary characters. + ibm-1375 { IBM* } + Big5-HKSCS { IANA* JAVA* } + big5hk { JAVA } + HKSCS-BIG5 # From http://www.openi18n.org/localenameguide/ +ibm-5471_P100-2006 { UTR22* } # Big5-HKSCS-2001 with Unicode 3.0 mappings. This uses many PUA characters. + ibm-5471 { IBM* } + Big5-HKSCS + MS950_HKSCS { JAVA* } + hkbig5 # from HP-UX 11i, which can't handle supplementary characters. + big5-hkscs:unicode3.0 + # windows-950 # Windows-950 can be w/ or w/o HKSCS extensions. By default it's not. + # windows-950_hkscs + +# GBK +ibm-1386_P100-2001 { UTR22* } + ibm-1386 { IBM* } + cp1386 + windows-936 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage. + ibm-1386_VSUB_VPUA +windows-936-2000 { UTR22* } + GBK { IANA* WINDOWS JAVA* } + CP936 { IANA JAVA } + MS936 { IANA } # In JDK 1.5, this goes to x-mswin-936. This is an IANA name split. + windows-936 { IANA WINDOWS* JAVA } + +# Java has two different tables for ibm-1383 and gb2312. We pick closest set for tagging. +ibm-1383_P110-1999 { UTR22* } # China EUC. + ibm-1383 { IBM* JAVA } + GB2312 { IANA* MIME* } + csGB2312 { IANA } + cp1383 { JAVA* } + 1383 { JAVA } + EUC-CN # According to other platforms, windows-20936 looks more like euc-cn. x-euc-cn is also a MIME name + ibm-eucCN + hp15CN # From HP-UX? + ibm-1383_VPUA + # gb # This is not an IANA name. 
gb in IANA means Great Britain. + +ibm-5478_P100-1995 { UTR22* } ibm-5478 { IBM* } # This gb_2312_80 DBCS mapping is needed by iso-2022. + GB_2312-80 { IANA* } # Windows maps this alias incorrectly + chinese { IANA } + iso-ir-58 { IANA } + csISO58GB231280 { IANA } + gb2312-1980 + GB2312.1980-0 # From X11R6 + +ibm-964_P110-1999 { UTR22* } # Taiwan EUC. x-euc-tw is a MIME name + ibm-964 { IBM* JAVA } + EUC-TW + ibm-eucTW + cns11643 + cp964 { JAVA* } + 964 { JAVA } + ibm-964_VPUA + +# ISO-2022 needs one, and other people may need others. +ibm-949_P110-1999 { UTR22* } + ibm-949 { IBM* JAVA } + cp949 { JAVA* } + 949 { JAVA } + ibm-949_VASCII_VSUB_VPUA +ibm-949_P11A-1999 { UTR22* } + ibm-949 # Leave untagged because this isn't the default + cp949c { JAVA* } + ibm-949_VSUB_VPUA + +# Korean EUC. +# +# <quote from="Jungshik Shin"> +# EUC-KR = KS X 1003/ISO 646-KR or ISO 646-IRV/US-ASCII in GL and KS X 1001:1998 (formerly KS C 5601-1987) in GR. +# +# Although widely spread on MS Windows, using +# KS C 5601 or related names to denote EUC-KR or +# windows-949 is very much misleading. KS C 5601-1987 +# is NOT suitable as a designation for MIME charset +# and MBCS. It's just the name of a 94 x 94 Korean +# coded character set standard which can be invoked +# on either GL (with MSB reset) or GR (with MSB set). +# Note that JOHAB (windows-1361) specified in +# KS X 1001:1998 annex 3 (KS C 5601-1992 annex 3) +# is a _seprate_ MBCS with a _completely different_ +# mapping. +# </quote> +# +# The following aliases tries to mirror the poor state of alias recognition +# on these platforms. +# +# ibm-970 is almost a subset of ibm-1363. +# Java, Solaris and AIX use euc-kr to also mean ksc5601. +# Java has both ibm-970 and EUC-KR as separate converters. 
+ibm-970_P110_P110-2006_U2 { UTR22* } + ibm-970 { IBM* JAVA } + EUC-KR { IANA* MIME* WINDOWS JAVA } + KS_C_5601-1987 { JAVA } + windows-51949 { WINDOWS* } + csEUCKR { IANA WINDOWS } # x-euc-kr is also a MIME name + ibm-eucKR { JAVA } + KSC_5601 { JAVA } # Needed by iso-2022 + 5601 { JAVA } + cp970 { JAVA* } + 970 { JAVA } + ibm-970_VPUA + +# ibm-971 is almost the set of DBCS mappings of ibm-970 +ibm-971_P100-1995 ibm-971 { IBM* } ibm-971_VPUA + +# Java, Solaris and AIX use euc-kr to also mean ksc5601, and _sometimes_ for Windows too. +# ibm-1363 is almost a superset of ibm-970. +ibm-1363_P11B-1998 { UTR22* } + ibm-1363 # Leave untagged because this isn't the default + KS_C_5601-1987 { IANA* } + KS_C_5601-1989 { IANA } + KSC_5601 { IANA } + csKSC56011987 { IANA } + korean { IANA } + iso-ir-149 { IANA } + cp1363 { MIME* } + 5601 + ksc + windows-949 # Alternate mapping. Leave untagged. This is the IBM interpretation of a Windows codepage. + ibm-1363_VSUB_VPUA + # ks_x_1001:1992 + # ksc5601-1992 + +ibm-1363_P110-1997 { UTR22* } # Korean KSC MBCS with \ <-> Won mapping + ibm-1363 { IBM* } + ibm-1363_VASCII_VSUB_VPUA + +windows-949-2000 { UTR22* } + windows-949 { JAVA* WINDOWS* } + KS_C_5601-1987 { WINDOWS } + KS_C_5601-1989 { WINDOWS } + KSC_5601 { MIME WINDOWS } # Needed by iso-2022 + csKSC56011987 { WINDOWS } + korean { WINDOWS } + iso-ir-149 { WINDOWS } + ms949 { JAVA } + +windows-874-2000 { UTR22* } # Thai (w/ euro update) + TIS-620 { WINDOWS } + windows-874 { JAVA* WINDOWS* } + MS874 { JAVA } + # iso-8859-11 { WINDOWS } # iso-8859-11 is similar to TIS-620. ibm-13162 is a closer match. + +ibm-874_P100-1995 { UTR22* } # Thai PC (w/o euro update). + ibm-874 { IBM* JAVA } + ibm-9066 { IBM } # Yes ibm-874 == ibm-9066. ibm-1161 has the euro update. 
+ cp874 { JAVA* } + TIS-620 { IANA* JAVA } # This is actually separate from ibm-874, which is similar to this table + tis620.2533 { JAVA } # This is actually separate from ibm-874, which is similar to this table + eucTH # eucTH is an unusual alias from Solaris. eucTH has fewer mappings than TIS620 + +ibm-1162_P100-1999 { UTR22* } # Thai (w/ euro update) + ibm-1162 { IBM* } + +# Platform codepages +# If Java supports the IBM prefix, it should also support the ibm- prefix too. +ibm-437_P100-1995 { UTR22* } ibm-437 { IBM* } IBM437 { IANA* WINDOWS JAVA } cp437 { IANA WINDOWS JAVA* } 437 { IANA WINDOWS JAVA } csPC8CodePage437 { IANA JAVA } windows-437 { WINDOWS* } # PC US +ibm-720_P100-1997 { UTR22* } ibm-720 { IBM* } windows-720 { WINDOWS* } DOS-720 { WINDOWS } # PC Arabic +ibm-737_P100-1997 { UTR22* } ibm-737 { IBM* } IBM737 { WINDOWS JAVA } cp737 { JAVA* } windows-737 { WINDOWS* } 737 { JAVA } # PC Greek +ibm-775_P100-1996 { UTR22* } ibm-775 { IBM* } IBM775 { IANA* WINDOWS JAVA } cp775 { IANA WINDOWS JAVA* } csPC775Baltic { IANA } windows-775 { WINDOWS* } 775 { JAVA } # PC Baltic +ibm-850_P100-1995 { UTR22* } ibm-850 { IBM* } IBM850 { IANA* MIME* WINDOWS JAVA } cp850 { IANA MIME WINDOWS JAVA* } 850 { IANA JAVA } csPC850Multilingual { IANA JAVA } windows-850 { WINDOWS* } # PC latin1 +ibm-851_P100-1995 { UTR22* } ibm-851 { IBM* } IBM851 { IANA* } cp851 { IANA MIME* } 851 { IANA } csPC851 { IANA } # PC DOS Greek (w/o euro) +ibm-852_P100-1995 { UTR22* } ibm-852 { IBM* } IBM852 { IANA* WINDOWS JAVA } cp852 { IANA WINDOWS JAVA* } 852 { IANA WINDOWS JAVA } csPCp852 { IANA JAVA } windows-852 { WINDOWS* } # PC latin2 (w/o euro update) +ibm-855_P100-1995 { UTR22* } ibm-855 { IBM* } IBM855 { IANA* JAVA } cp855 { IANA JAVA* } 855 { IANA } csIBM855 { IANA } csPCp855 { JAVA } windows-855 { WINDOWS* } # PC cyrillic (w/o euro update) +ibm-856_P100-1995 { UTR22* } ibm-856 { IBM* } IBM856 { JAVA } cp856 { JAVA* } 856 { JAVA } # PC Hebrew implicit order +ibm-857_P100-1995 { UTR22* } 
ibm-857 { IBM* } IBM857 { IANA* MIME* WINDOWS JAVA } cp857 { IANA MIME JAVA* } 857 { IANA JAVA } csIBM857 { IANA JAVA } windows-857 { WINDOWS* } # PC Latin 5 (w/o euro update) +ibm-858_P100-1997 { UTR22* } ibm-858 { IBM* } IBM00858 { IANA* MIME* JAVA } CCSID00858 { IANA JAVA } CP00858 { IANA JAVA } PC-Multilingual-850+euro { IANA } cp858 { MIME JAVA* } windows-858 { WINDOWS* } # PC latin1 with Euro +ibm-860_P100-1995 { UTR22* } ibm-860 { IBM* } IBM860 { IANA* MIME* JAVA } cp860 { IANA MIME JAVA* } 860 { IANA JAVA } csIBM860 { IANA JAVA } # PC Portugal +ibm-861_P100-1995 { UTR22* } ibm-861 { IBM* } IBM861 { IANA* MIME* WINDOWS JAVA } cp861 { IANA MIME JAVA* } 861 { IANA JAVA } cp-is { IANA JAVA } csIBM861 { IANA JAVA } windows-861 { WINDOWS* } # PC Iceland +ibm-862_P100-1995 { UTR22* } ibm-862 { IBM* } IBM862 { IANA* MIME* JAVA } cp862 { IANA MIME JAVA* } 862 { IANA JAVA } csPC862LatinHebrew { IANA JAVA } DOS-862 { WINDOWS } windows-862 { WINDOWS* } # PC Hebrew visual order (w/o euro update) +ibm-863_P100-1995 { UTR22* } ibm-863 { IBM* } IBM863 { IANA* MIME* JAVA } cp863 { IANA MIME JAVA* } 863 { IANA JAVA } csIBM863 { IANA JAVA } # PC Canadian French +ibm-864_X110-1999 { UTR22* } ibm-864 { IBM* } IBM864 { IANA* MIME* JAVA } cp864 { IANA MIME JAVA* } csIBM864 { IANA JAVA } # PC Arabic (w/o euro update) +ibm-865_P100-1995 { UTR22* } ibm-865 { IBM* } IBM865 { IANA* MIME* JAVA } cp865 { IANA MIME JAVA* } 865 { IANA JAVA } csIBM865 { IANA JAVA } # PC Nordic +ibm-866_P100-1995 { UTR22* } ibm-866 { IBM* } IBM866 { IANA* MIME* JAVA } cp866 { IANA MIME WINDOWS JAVA* } 866 { IANA JAVA } csIBM866 { IANA JAVA } windows-866 { WINDOWS* } # PC Russian (w/o euro update) +ibm-867_P100-1998 { UTR22* } ibm-867 { IBM* } # PC Hebrew (w/ euro update) Updated version of ibm-862 +ibm-868_P100-1995 { UTR22* } ibm-868 { IBM* } IBM868 { IANA* MIME* JAVA } CP868 { IANA MIME JAVA* } 868 { JAVA } csIBM868 { IANA } cp-ar { IANA } # PC Urdu +ibm-869_P100-1995 { UTR22* } ibm-869 { IBM* } IBM869 { 
IANA* MIME* WINDOWS JAVA } cp869 { IANA MIME JAVA* } 869 { IANA JAVA } cp-gr { IANA JAVA } csIBM869 { IANA JAVA } windows-869 { WINDOWS* } # PC Greek (w/o euro update) +ibm-878_P100-1996 { UTR22* } ibm-878 { IBM* } KOI8-R { IANA* MIME* WINDOWS JAVA* } koi8 { WINDOWS JAVA } csKOI8R { IANA WINDOWS JAVA } windows-20866 { WINDOWS* } cp878 # Russian internet +ibm-901_P100-1999 { UTR22* } ibm-901 { IBM* } # PC Baltic (w/ euro update), update of ibm-921 +ibm-902_P100-1999 { UTR22* } ibm-902 { IBM* } # PC Estonian (w/ euro update), update of ibm-922 +ibm-922_P100-1999 { UTR22* } ibm-922 { IBM* } IBM922 { JAVA } cp922 { JAVA* } 922 { JAVA } # PC Estonian (w/o euro update) +ibm-1168_P100-2002 { UTR22* } ibm-1168 { IBM* } KOI8-U { IANA* WINDOWS } windows-21866 { WINDOWS* } # Ukrainian KOI8. koi8-ru != KOI8-U and Microsoft is wrong for aliasing them as the same. +ibm-4909_P100-1999 { UTR22* } ibm-4909 { IBM* } # ISO Greek (w/ euro update), update of ibm-813 + +# The cp aliases in this section aren't really windows aliases, but it was used by ICU for Windows. +# cp is usually used to denote IBM in Java, and that is why we don't do that anymore. +# The windows-* aliases mean windows codepages. +ibm-5346_P100-1998 { UTR22* } ibm-5346 { IBM* } windows-1250 { IANA* JAVA* WINDOWS* } cp1250 { WINDOWS JAVA } # Windows Latin2 (w/ euro update) +ibm-5347_P100-1998 { UTR22* } ibm-5347 { IBM* } windows-1251 { IANA* JAVA* WINDOWS* } cp1251 { WINDOWS JAVA } ANSI1251 # Windows Cyrillic (w/ euro update). 
ANSI1251 is from Solaris +ibm-5348_P100-1997 { UTR22* } ibm-5348 { IBM* } windows-1252 { IANA* JAVA* WINDOWS* } cp1252 { JAVA } # Windows Latin1 (w/ euro update) +ibm-5349_P100-1998 { UTR22* } ibm-5349 { IBM* } windows-1253 { IANA* JAVA* WINDOWS* } cp1253 { JAVA } # Windows Greek (w/ euro update) +ibm-5350_P100-1998 { UTR22* } ibm-5350 { IBM* } windows-1254 { IANA* JAVA* WINDOWS* } cp1254 { JAVA } # Windows Turkish (w/ euro update) +ibm-9447_P100-2002 { UTR22* } ibm-9447 { IBM* } windows-1255 { IANA* JAVA* WINDOWS* } cp1255 { JAVA } # Windows Hebrew (w/ euro update) +ibm-9448_X100-2005 { UTR22* } ibm-9448 { IBM* } windows-1256 { IANA* JAVA* WINDOWS* } cp1256 { WINDOWS JAVA } # Windows Arabic (w/ euro update) +ibm-9449_P100-2002 { UTR22* } ibm-9449 { IBM* } windows-1257 { IANA* JAVA* WINDOWS* } cp1257 { JAVA } # Windows Baltic (w/ euro update) +ibm-5354_P100-1998 { UTR22* } ibm-5354 { IBM* } windows-1258 { IANA* JAVA* WINDOWS* } cp1258 { JAVA } # Windows Vietnamese (w/ euro update) + +# These tables are out of date, and most don't have the Euro +# Leave the windows- variants untagged. They are alternate tables of the newer ones above. 
+ibm-1250_P100-1995 { UTR22* } ibm-1250 { IBM* } windows-1250 # Old Windows Latin2 (w/o euro update) +ibm-1251_P100-1995 { UTR22* } ibm-1251 { IBM* } windows-1251 # Old Windows Cyrillic (w/o euro update) +ibm-1252_P100-2000 { UTR22* } ibm-1252 { IBM* } windows-1252 # Old Windows Latin 1 without Euro +ibm-1253_P100-1995 { UTR22* } ibm-1253 { IBM* } windows-1253 # Old Windows Greek (w/o euro update) +ibm-1254_P100-1995 { UTR22* } ibm-1254 { IBM* } windows-1254 # Old Windows Turkish (w/o euro update) +ibm-1255_P100-1995 { UTR22* } ibm-1255 { IBM* } # Very old Windows Hebrew (w/o euro update) +ibm-5351_P100-1998 { UTR22* } ibm-5351 { IBM* } windows-1255 # Old Windows Hebrew (w/ euro update) +ibm-1256_P110-1997 { UTR22* } ibm-1256 { IBM* } # Old Windows Arabic (w/o euro update) +ibm-5352_P100-1998 { UTR22* } ibm-5352 { IBM* } windows-1256 # Somewhat old Windows Arabic (w/ euro update) +ibm-1257_P100-1995 { UTR22* } ibm-1257 { IBM* } # Old Windows Baltic (w/o euro update) +ibm-5353_P100-1998 { UTR22* } ibm-5353 { IBM* } windows-1257 # Somewhat old Windows Baltic (w/ euro update) +ibm-1258_P100-1997 { UTR22* } ibm-1258 { IBM* } windows-1258 # Old Windows Vietnamese (w/o euro update) + +macos-0_2-10.2 { UTR22* } macintosh { IANA* MIME* WINDOWS } mac { IANA } csMacintosh { IANA } windows-10000 { WINDOWS* } macroman { JAVA } x-macroman { JAVA* } # Apple latin 1 +macos-6_2-10.4 { UTR22* } x-mac-greek { MIME* WINDOWS } windows-10006 { WINDOWS* } macgr # Apple Greek +macos-7_3-10.2 { UTR22* } x-mac-cyrillic { MIME* WINDOWS } windows-10007 { WINDOWS* } mac-cyrillic maccy # Apple Cyrillic +macos-29-10.2 { UTR22* } x-mac-centraleurroman { MIME* } windows-10029 { WINDOWS* } x-mac-ce { WINDOWS } macce maccentraleurope # Apple Central Europe +macos-35-10.2 { UTR22* } x-mac-turkish { MIME* WINDOWS } windows-10081 { WINDOWS* } mactr # Apple Turkish + +ibm-1051_P100-1995 { UTR22* } ibm-1051 { IBM* } hp-roman8 { IANA* } roman8 { IANA } r8 { IANA } csHPRoman8 { IANA } # HP Latin1 
+ibm-1276_P100-1995 { UTR22* } ibm-1276 { IBM* } Adobe-Standard-Encoding { IANA* } csAdobeStandardEncoding { IANA } # Different from ISO-Unicode-IBM-1276 (GCSGID: 1276) + +ibm-1006_P100-1995 { UTR22* } ibm-1006 { IBM* } IBM1006 { JAVA } cp1006 { JAVA* } 1006 { JAVA } # Urdu +ibm-1098_P100-1995 { UTR22* } ibm-1098 { IBM* } IBM1098 { JAVA } cp1098 { JAVA* } 1098 { JAVA } # PC Farsi +ibm-1124_P100-1996 { UTR22* } ibm-1124 { IBM* JAVA } cp1124 { JAVA* } 1124 { JAVA } # ISO Cyrillic Ukraine +ibm-1125_P100-1997 { UTR22* } ibm-1125 { IBM* } cp1125 # Cyrillic Ukraine PC +ibm-1129_P100-1997 { UTR22* } ibm-1129 { IBM* } # ISO Vietnamese +ibm-1131_P100-1997 { UTR22* } ibm-1131 { IBM* } cp1131 # Cyrillic Belarus PC +ibm-1133_P100-1997 { UTR22* } ibm-1133 { IBM* } # ISO Lao + + +# Partially algorithmic converters + +# [U_ENABLE_GENERIC_ISO_2022] +# The _generic_ ISO-2022 converter is disabled starting 2003-dec-03 (ICU 2.8). +# For details see the icu mailing list from 2003-dec-01 and the ucnv2022.c file. +# Language-specific variants of ISO-2022 continue to be available as listed below. +# ISO_2022 ISO-2022 + +ISO_2022,locale=ja,version=0 ISO-2022-JP { IANA* MIME* JAVA* } csISO2022JP { IANA JAVA } +ISO_2022,locale=ja,version=1 ISO-2022-JP-1 { MIME* } JIS_Encoding { IANA* } csJISEncoding { IANA } ibm-5054 { IBM* } JIS +ISO_2022,locale=ja,version=2 ISO-2022-JP-2 { IANA* MIME* } csISO2022JP2 { IANA } +ISO_2022,locale=ja,version=3 JIS7 +ISO_2022,locale=ja,version=4 JIS8 +ISO_2022,locale=ko,version=0 ISO-2022-KR { IANA* MIME* JAVA* } csISO2022KR { IANA JAVA } # This uses ibm-949 +ISO_2022,locale=ko,version=1 ibm-25546 { IBM* } +ISO_2022,locale=zh,version=0 ISO-2022-CN { IANA* JAVA* } csISO2022CN { JAVA } +ISO_2022,locale=zh,version=1 ISO-2022-CN-EXT { IANA* } +HZ HZ-GB-2312 { IANA* } + +ISCII,version=0 x-iscii-de { WINDOWS } windows-57002 { WINDOWS* } iscii-dev ibm-4902 { IBM* } # ibm-806 contains non-standard box drawing symbols. 
+ISCII,version=1 x-iscii-be { WINDOWS } windows-57003 { WINDOWS* } iscii-bng windows-57006 { WINDOWS } x-iscii-as { WINDOWS } # be is different from as on Windows. +ISCII,version=2 x-iscii-pa { WINDOWS } windows-57011 { WINDOWS* } iscii-gur +ISCII,version=3 x-iscii-gu { WINDOWS } windows-57010 { WINDOWS* } iscii-guj +ISCII,version=4 x-iscii-or { WINDOWS } windows-57007 { WINDOWS* } iscii-ori +ISCII,version=5 x-iscii-ta { WINDOWS } windows-57004 { WINDOWS* } iscii-tml +ISCII,version=6 x-iscii-te { WINDOWS } windows-57005 { WINDOWS* } iscii-tlg +ISCII,version=7 x-iscii-ka { WINDOWS } windows-57008 { WINDOWS* } iscii-knd +ISCII,version=8 x-iscii-ma { WINDOWS } windows-57009 { WINDOWS* } iscii-mlm + +# Lotus specific +LMBCS-1 lmbcs ibm-65025 { IBM* } + +# These Lotus specific converters still work, but they aren't advertised in this alias table. +# These are almost never used outside of Lotus software, +# and they take a lot of time when creating the available converter list. +# Also Lotus doesn't really use them anyway. It was a mistake to create these LMBCS variant converters in ICU. 
+#LMBCS-2 +#LMBCS-3 +#LMBCS-4 +#LMBCS-5 +#LMBCS-6 +#LMBCS-8 +#LMBCS-11 +#LMBCS-16 +#LMBCS-17 +#LMBCS-18 +#LMBCS-19 + +# EBCDIC codepages according to the CDRA + +# without Euro +ibm-37_P100-1995 { UTR22* } # EBCDIC US + ibm-37 { IBM* } + IBM037 { IANA* JAVA } + ibm-037 # { JAVA } + ebcdic-cp-us { IANA JAVA } + ebcdic-cp-ca { IANA JAVA } + ebcdic-cp-wt { IANA JAVA } + ebcdic-cp-nl { IANA JAVA } + csIBM037 { IANA JAVA } + cp037 { JAVA* } + 037 { JAVA } + cpibm37 { JAVA } + cp37 + +ibm-273_P100-1995 { UTR22* } ibm-273 { IBM* } IBM273 { IANA* JAVA } CP273 { IANA JAVA* } csIBM273 { IANA } ebcdic-de 273 { JAVA } # EBCDIC Germany, Austria +ibm-277_P100-1995 { UTR22* } ibm-277 { IBM* } IBM277 { IANA* JAVA } cp277 { JAVA* } EBCDIC-CP-DK { IANA } EBCDIC-CP-NO { IANA } csIBM277 { IANA } ebcdic-dk 277 { JAVA } # EBCDIC Denmark +ibm-278_P100-1995 { UTR22* } ibm-278 { IBM* } IBM278 { IANA* JAVA } cp278 { JAVA* } ebcdic-cp-fi { IANA } ebcdic-cp-se { IANA } csIBM278 { IANA } ebcdic-sv { JAVA } 278 { JAVA } # EBCDIC Sweden +ibm-280_P100-1995 { UTR22* } ibm-280 { IBM* } IBM280 { IANA* JAVA } CP280 { IANA JAVA* } ebcdic-cp-it { IANA } csIBM280 { IANA } 280 { JAVA } # EBCDIC Italy +ibm-284_P100-1995 { UTR22* } ibm-284 { IBM* } IBM284 { IANA* JAVA } CP284 { IANA JAVA* } ebcdic-cp-es { IANA } csIBM284 { IANA } cpibm284 { JAVA } 284 { JAVA } # EBCDIC Spain +ibm-285_P100-1995 { UTR22* } ibm-285 { IBM* } IBM285 { IANA* JAVA } CP285 { IANA JAVA* } ebcdic-cp-gb { IANA } csIBM285 { IANA } cpibm285 { JAVA } ebcdic-gb { JAVA } 285 { JAVA } # EBCDIC UK Ireland +ibm-290_P100-1995 { UTR22* } ibm-290 { IBM* } IBM290 { IANA* } cp290 { IANA } EBCDIC-JP-kana { IANA } csIBM290 { IANA } # host SBCS (Katakana) +ibm-297_P100-1995 { UTR22* } ibm-297 { IBM* } IBM297 { IANA* JAVA } cp297 { IANA JAVA* } ebcdic-cp-fr { IANA } csIBM297 { IANA } cpibm297 { JAVA } 297 { JAVA } # EBCDIC France +ibm-420_X120-1999 { UTR22* } ibm-420 { IBM* } IBM420 { IANA* JAVA } cp420 { IANA JAVA* } ebcdic-cp-ar1 { IANA } csIBM420 
{ IANA } 420 { JAVA } # EBCDIC Arabic (all presentation shapes) +ibm-424_P100-1995 { UTR22* } ibm-424 { IBM* } IBM424 { IANA* JAVA } cp424 { IANA JAVA* } ebcdic-cp-he { IANA } csIBM424 { IANA } 424 { JAVA } # EBCDIC Hebrew +ibm-500_P100-1995 { UTR22* } ibm-500 { IBM* } IBM500 { IANA* JAVA } CP500 { IANA JAVA* } ebcdic-cp-be { IANA } csIBM500 { IANA } ebcdic-cp-ch { IANA } 500 # EBCDIC International Latin1 +ibm-803_P100-1999 { UTR22* } ibm-803 { IBM* } cp803 # Old EBCDIC Hebrew +ibm-838_P100-1995 { UTR22* } ibm-838 { IBM* } IBM838 { JAVA } IBM-Thai { IANA* JAVA } csIBMThai { IANA } cp838 { JAVA* } 838 { JAVA } ibm-9030 { IBM } # EBCDIC Thai. Yes ibm-9030 is an alias. +ibm-870_P100-1995 { UTR22* } ibm-870 { IBM* } IBM870 { IANA* JAVA } CP870 { IANA JAVA* } ebcdic-cp-roece { IANA } ebcdic-cp-yu { IANA } csIBM870 { IANA } # EBCDIC Latin 2 +ibm-871_P100-1995 { UTR22* } ibm-871 { IBM* } IBM871 { IANA* JAVA } ebcdic-cp-is { IANA JAVA } csIBM871 { IANA JAVA } CP871 { IANA JAVA* } ebcdic-is { JAVA } 871 { JAVA } # EBCDIC Iceland +ibm-875_P100-1995 { UTR22* } ibm-875 { IBM* } IBM875 { JAVA } cp875 { JAVA* } 875 { JAVA } # EBCDIC Greek +ibm-918_P100-1995 { UTR22* } ibm-918 { IBM* } IBM918 { IANA* JAVA } CP918 { IANA JAVA* } ebcdic-cp-ar2 { IANA } csIBM918 { IANA } # EBCDIC Urdu +ibm-930_P120-1999 { UTR22* } # EBCDIC_STATEFUL Katakana-Kanji Host Mixed. + ibm-930 { IBM* } + ibm-5026 { IBM } # Yes this is correct + IBM930 { JAVA } + cp930 { JAVA* } + 930 { JAVA } +ibm-933_P110-1995 { UTR22* } ibm-933 { IBM* JAVA } cp933 { JAVA* } 933 { JAVA } # Korea EBCDIC MIXED +ibm-935_P110-1999 { UTR22* } ibm-935 { IBM* JAVA } cp935 { JAVA* } 935 { JAVA } # China EBCDIC MIXED. Need to use Unicode, ibm-1388 or gb18030 instead because it is required by the government of China. +ibm-937_P110-1999 { UTR22* } ibm-937 { IBM* JAVA } cp937 { JAVA* } 937 { JAVA } # Taiwan EBCDIC MIXED +ibm-939_P120-1999 { UTR22* } # EBCDIC_STATEFUL Latin-Kanji Host Mixed. 
+ ibm-939 { IBM* } + ibm-931 { IBM } # Yes this is correct + ibm-5035 { IBM } # Yes this is also correct + IBM939 { JAVA } + cp939 { JAVA* } + 939 { JAVA } +ibm-1025_P100-1995 { UTR22* } ibm-1025 { IBM* JAVA } cp1025 { JAVA* } 1025 { JAVA } # EBCDIC Cyrillic +ibm-1026_P100-1995 { UTR22* } ibm-1026 { IBM* } IBM1026 { IANA* JAVA } CP1026 { IANA JAVA* } csIBM1026 { IANA } 1026 { JAVA } # EBCDIC Turkey +ibm-1047_P100-1995 { UTR22* } ibm-1047 { IBM* } IBM1047 { IANA* JAVA } cp1047 { JAVA* } 1047 { JAVA } # EBCDIC Open systems Latin1 +ibm-1097_P100-1995 { UTR22* } ibm-1097 { IBM* JAVA } cp1097 { JAVA* } 1097 { JAVA } # EBCDIC Farsi +ibm-1112_P100-1995 { UTR22* } ibm-1112 { IBM* JAVA } cp1112 { JAVA* } 1112 { JAVA } # EBCDIC Baltic +ibm-1122_P100-1999 { UTR22* } ibm-1122 { IBM* JAVA } cp1122 { JAVA* } 1122 { JAVA } # EBCDIC Estonia +ibm-1123_P100-1995 { UTR22* } ibm-1123 { IBM* JAVA } cp1123 { JAVA* } 1123 { JAVA } # EBCDIC Cyrillic Ukraine +ibm-1130_P100-1997 { UTR22* } ibm-1130 { IBM* } # EBCDIC Vietnamese +ibm-1132_P100-1998 { UTR22* } ibm-1132 { IBM* } # EBCDIC Lao +ibm-1137_P100-1999 { UTR22* } ibm-1137 { IBM* } # Devanagari EBCDIC (based on Unicode character set) +ibm-4517_P100-2005 { UTR22* } ibm-4517 { IBM* } # EBCDIC Arabic. 
Update of ibm-421 + +# with Euro +ibm-1140_P100-1997 { UTR22* } ibm-1140 { IBM* } IBM01140 { IANA* JAVA } CCSID01140 { IANA JAVA } CP01140 { IANA JAVA } cp1140 { JAVA* } ebcdic-us-37+euro { IANA } # EBCDIC US +ibm-1141_P100-1997 { UTR22* } ibm-1141 { IBM* } IBM01141 { IANA* JAVA } CCSID01141 { IANA JAVA } CP01141 { IANA JAVA } cp1141 { JAVA* } ebcdic-de-273+euro { IANA } # EBCDIC Germanay, Austria +ibm-1142_P100-1997 { UTR22* } ibm-1142 { IBM* } IBM01142 { IANA* JAVA } CCSID01142 { IANA JAVA } CP01142 { IANA JAVA } cp1142 { JAVA* } ebcdic-dk-277+euro { IANA } ebcdic-no-277+euro { IANA } # EBCDIC Denmark +ibm-1143_P100-1997 { UTR22* } ibm-1143 { IBM* } IBM01143 { IANA* JAVA } CCSID01143 { IANA JAVA } CP01143 { IANA JAVA } cp1143 { JAVA* } ebcdic-fi-278+euro { IANA } ebcdic-se-278+euro { IANA } # EBCDIC Sweden +ibm-1144_P100-1997 { UTR22* } ibm-1144 { IBM* } IBM01144 { IANA* JAVA } CCSID01144 { IANA JAVA } CP01144 { IANA JAVA } cp1144 { JAVA* } ebcdic-it-280+euro { IANA } # EBCDIC Italy +ibm-1145_P100-1997 { UTR22* } ibm-1145 { IBM* } IBM01145 { IANA* JAVA } CCSID01145 { IANA JAVA } CP01145 { IANA JAVA } cp1145 { JAVA* } ebcdic-es-284+euro { IANA } # EBCDIC Spain +ibm-1146_P100-1997 { UTR22* } ibm-1146 { IBM* } IBM01146 { IANA* JAVA } CCSID01146 { IANA JAVA } CP01146 { IANA JAVA } cp1146 { JAVA* } ebcdic-gb-285+euro { IANA } # EBCDIC UK Ireland +ibm-1147_P100-1997 { UTR22* } ibm-1147 { IBM* } IBM01147 { IANA* JAVA } CCSID01147 { IANA JAVA } CP011... [truncated message content] |
From: <den...@us...> - 2009-08-16 14:08:17
|
Revision: 179 http://opentrep.svn.sourceforge.net/opentrep/?rev=179&view=rev Author: denis_arnaud Date: 2009-08-16 14:08:09 +0000 (Sun, 16 Aug 2009) Log Message: ----------- [i18n] Added a utility class for conversion from/to UTF8 strings to/from wide-character strings. Modified Paths: -------------- trunk/opentrep/opentrep/basic/sources.mk trunk/opentrep/test/i18n/icu/Makefile.am trunk/opentrep/test/i18n/stdlocru.cpp trunk/opentrep/test/i18n/utf8/Makefile.am trunk/opentrep/test/i18n/utf8/utf8.cpp trunk/opentrep/test/i18n/utf8/utf8.hpp trunk/opentrep/test/i18n/utf8/utf8string.cpp Added Paths: ----------- trunk/opentrep/opentrep/basic/UTF8Handler.cpp trunk/opentrep/opentrep/basic/UTF8Handler.hpp Added: trunk/opentrep/opentrep/basic/UTF8Handler.cpp =================================================================== --- trunk/opentrep/opentrep/basic/UTF8Handler.cpp (rev 0) +++ trunk/opentrep/opentrep/basic/UTF8Handler.cpp 2009-08-16 14:08:09 UTC (rev 179) @@ -0,0 +1,183 @@ +// ////////////////////////////////////////////////////////////////////// +// Import section +// ////////////////////////////////////////////////////////////////////// +// STL +#include <cassert> +#include <sstream> +#include <string> +// OpenTrep +#include <opentrep/basic/UTF8Handler.hpp> + +namespace OPENTREP { + + // ////////////////////////////////////////////////////////////////////// + static const wchar_t offsetsFromUTF8[6] = { + 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL + }; + + // ////////////////////////////////////////////////////////////////////// + static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 + }; + + // ////////////////////////////////////////////////////////////////////// + std::wstring UTF8Handler::toWideString (const std::string& iSrc) { + std::basic_ostringstream<wchar_t> oStr; + + // Length of the source string + const size_t lStringSize = iSrc.size(); + + // Transform the source string in a regular C-string (char*) + const char* src = iSrc.c_str(); + + // + typedef unsigned char uchar_t; + + size_t idx = 0; + while (idx != lStringSize) { + + uchar_t lCurrentChar = static_cast<uchar_t> (src[idx]); + + // When there are multi-byte characters (e.g., for UTF-8 encoded + // STL strings), the size of the STL string corresponds to the + // total number of bytes. For instance, "München" has a size of 8 + // bytes (and not 7 characters). However, the iteration is made on + // the number of characters (idx); when the end of the string is + // reached, the loop must therefore be exited. 
+ if (lCurrentChar == '\0') { + break; + } + + const int nb = trailingBytesForUTF8[lCurrentChar]; + + wchar_t tmpChar = 0; + switch (nb) { + // These fall through deliberately + case 3: { + lCurrentChar = static_cast<uchar_t> (src[idx]); ++idx; + tmpChar += lCurrentChar; tmpChar <<= 6; + } + case 2: { + lCurrentChar = static_cast<uchar_t> (src[idx]); ++idx; + tmpChar += lCurrentChar; tmpChar <<= 6; + } + case 1: { + lCurrentChar = static_cast<uchar_t> (src[idx]); ++idx; + tmpChar += lCurrentChar; tmpChar <<= 6; + } + case 0: { + lCurrentChar = static_cast<uchar_t> (src[idx]); ++idx; + tmpChar += lCurrentChar; + } + } + + tmpChar -= offsetsFromUTF8[nb]; + oStr << tmpChar; + } + + oStr << '\0'; + return oStr.str(); + } + + // ////////////////////////////////////////////////////////////////////// + std::string UTF8Handler::toSimpleString (const std::wstring& iStr) { + std::ostringstream oStr; + + const wchar_t* src = iStr.c_str(); + size_t idx = 0; + size_t i = 0; + + while (src[i] != 0) { + wchar_t ch = src[i]; + + if (ch < 0x80) { + const char tmpChar = static_cast<const char> (ch); + oStr << tmpChar; ++idx; + + } else if (ch < 0x800) { + char tmpChar = static_cast<const char> ((ch >> 6) | 0xC0); + oStr << tmpChar; ++idx; + + tmpChar = static_cast<const char> ((ch & 0x3F) | 0x80); + oStr << tmpChar; ++idx; + + } else if (ch < 0x10000) { + char tmpChar = static_cast<const char> ((ch>>12) | 0xE0); + oStr << tmpChar; ++idx; + + tmpChar = static_cast<const char> (((ch>>6) & 0x3F) | 0x80); + oStr << tmpChar; ++idx; + + tmpChar = static_cast<const char> ((ch & 0x3F) | 0x80); + oStr << tmpChar; ++idx; + + } else if (ch < 0x110000) { + char tmpChar = static_cast<const char> ((ch>>18) | 0xF0); + oStr << tmpChar; ++idx; + + tmpChar = static_cast<const char> (((ch>>12) & 0x3F) | 0x80); + oStr << tmpChar; ++idx; + + tmpChar = static_cast<const char> (((ch>>6) & 0x3F) | 0x80); + oStr << tmpChar; ++idx; + + tmpChar = static_cast<const char> ((ch & 0x3F) | 0x80); + oStr << 
tmpChar; ++idx; + } + i++; + } + + oStr << '\0'; + + return oStr.str(); + } + + // ////////////////////////////////////////////////////////////////////// + std::string UTF8Handler::displayCharString (const char* iString) { + std::ostringstream oStr; + + bool hasReachedEnd = false; + for (size_t idx = 0; hasReachedEnd == false; ++idx) { + if (idx != 0) { + oStr << "; "; + } + const unsigned char lChar = iString[idx]; + // const wchar_t lChar = iString[idx]; + if (lChar == '\0') { + hasReachedEnd = true; + } + oStr << "[" << idx << "]: " << std::hex << lChar; + } + oStr << std::endl; + + return oStr.str(); + } + + // ////////////////////////////////////////////////////////////////////// + std::string UTF8Handler::displaySTLWString (const std::wstring& iString) { + std::ostringstream oStr; + + size_t idx = 0; + for (std::wstring::const_iterator itChar = iString.begin(); + itChar != iString.end(); ++itChar, ++idx) { + if (idx != 0) { + oStr << "; "; + } + const wchar_t lChar = *itChar; + oStr << "[" << idx << "]: " << std::hex << lChar; + } + oStr << std::endl; + + return oStr.str(); + } + +} + Added: trunk/opentrep/opentrep/basic/UTF8Handler.hpp =================================================================== --- trunk/opentrep/opentrep/basic/UTF8Handler.hpp (rev 0) +++ trunk/opentrep/opentrep/basic/UTF8Handler.hpp 2009-08-16 14:08:09 UTC (rev 179) @@ -0,0 +1,53 @@ +#ifndef __OPENTREP_BAS_UTF8HANDLER_HPP +#define __OPENTREP_BAS_UTF8HANDLER_HPP + +// ////////////////////////////////////////////////////////////////////// +// Import section +// ////////////////////////////////////////////////////////////////////// +// STL +#include <string> + +namespace OPENTREP { + + /** Utility class for basic handling of UTF-8 encoded strings. + <br>Most of the methods have taken their inspiration from Jeff + Bezanson's work in the Wikix project + (see http://meta.wikimedia.org/wiki/Wikix for further details), + and have been "C++-ified". 
*/ + class UTF8Handler { + public: + /* Conversion from a UTF-8-encoded "simple character" (though + potentially multi-byte) STL string into a wide character STL + string. + <br>Note that as there is no checks of appropriate encoding, it + only works for valid UTF-8, i.e. no 5- or 6-byte sequences. + <br>Note that the "simple characters", within a STL string, may be + multi-byte (e.g., if they are UTF-8-encoded). + @param std::string The "simple character" (though potentially + multi-byte) STL string. + @return std::wstring The wide character STL string. + */ + static std::wstring toWideString (const std::string& iSrc); + + /* Conversion from a wide character STL string into a UTF-8-encoded + "simple character" (though potentially multi-byte) STL string. + <br>Note that as there is no checks of appropriate encoding, it + only works for valid UTF-8, i.e. no 5- or 6-byte sequences. + <br>Note that the "simple characters", within a STL string, may be + multi-byte (e.g., if they are UTF-8-encoded). + @param std::wstring The wide character STL string. + @return std::string The "simple character" (though potentially + multi-byte) STL string. + */ + static std::string toSimpleString (const std::wstring& iStr); + + /** Display the sequence of characters for the simple C-string. */ + static std::string displayCharString (const char* iString); + + /** Display the sequence of characters (one by one) for the given + STL wide character string. 
*/ + static std::string displaySTLWString (const std::wstring& iString); + }; + +} +#endif // __OPENTREP_BAS_UTF8HANDLER_HPP Modified: trunk/opentrep/opentrep/basic/sources.mk =================================================================== --- trunk/opentrep/opentrep/basic/sources.mk 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/opentrep/basic/sources.mk 2009-08-16 14:08:09 UTC (rev 179) @@ -1,5 +1,7 @@ bas_h_sources = $(top_srcdir)/opentrep/basic/BasConst_General.hpp \ $(top_srcdir)/opentrep/basic/BasConst_OPENTREP_Service.hpp \ - $(top_srcdir)/opentrep/basic/BasChronometer.hpp + $(top_srcdir)/opentrep/basic/BasChronometer.hpp \ + $(top_srcdir)/opentrep/basic/UTF8Handler.hpp bas_cc_sources = $(top_srcdir)/opentrep/basic/BasConst.cpp \ - $(top_srcdir)/opentrep/basic/BasChronometer.cpp + $(top_srcdir)/opentrep/basic/BasChronometer.cpp \ + $(top_srcdir)/opentrep/basic/UTF8Handler.cpp Modified: trunk/opentrep/test/i18n/icu/Makefile.am =================================================================== --- trunk/opentrep/test/i18n/icu/Makefile.am 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/test/i18n/icu/Makefile.am 2009-08-16 14:08:09 UTC (rev 179) @@ -3,7 +3,7 @@ MAINTAINERCLEANFILES = Makefile.in -check_PROGRAMS = icufmt icuustring icucharsetdetector icuconv +check_PROGRAMS = icufmt icuustring icucharsetdetector icuconv icuutext icufmt_SOURCES = icufmt.cpp icufmt_CXXFLAGS = $(ICU_CFLAGS) @@ -21,4 +21,8 @@ icuconv_CXXFLAGS = $(ICU_CFLAGS) icuconv_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB) +icuutext_SOURCES = icuutext.cpp +icuutext_CXXFLAGS = $(ICU_CFLAGS) +icuutext_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB) + EXTRA_DIST = Modified: trunk/opentrep/test/i18n/stdlocru.cpp =================================================================== --- trunk/opentrep/test/i18n/stdlocru.cpp 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/test/i18n/stdlocru.cpp 2009-08-16 14:08:09 UTC (rev 179) @@ -59,7 +59,7 @@ std::cout << "de: " << mucDEWCharString << std::endl; 
std::cout << "ru: " << mucRUWCharString << std::endl; - // STL ctypes on char* + // STL ctypes on wchar_t std::use_facet<std::ctype<wchar_t> > (langLocale).toupper(mucDEWCharString, mucDEWCharString+7); std::use_facet<std::ctype<wchar_t> > (langLocale).toupper(mucRUWCharString, Modified: trunk/opentrep/test/i18n/utf8/Makefile.am =================================================================== --- trunk/opentrep/test/i18n/utf8/Makefile.am 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/test/i18n/utf8/Makefile.am 2009-08-16 14:08:09 UTC (rev 179) @@ -11,6 +11,8 @@ utf8string_SOURCES = utf8string.cpp utf8string_CXXFLAGS = -utf8string_LDFLAGS = +utf8string_LDFLAGS = \ + $(BOOST_LIBS) $(SOCI_LIBS) $(CPPUNIT_LIBS) \ + $(top_builddir)/@PACKAGE@/lib@PACKAGE@.la EXTRA_DIST = Modified: trunk/opentrep/test/i18n/utf8/utf8.cpp =================================================================== --- trunk/opentrep/test/i18n/utf8/utf8.cpp 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/test/i18n/utf8/utf8.cpp 2009-08-16 14:08:09 UTC (rev 179) @@ -55,10 +55,10 @@ for all the characters. if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space. */ -int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz) +int u8_toucs(u_int32_t *dest, int sz, const char *src, int srcsz) { u_int32_t ch; - char *src_end = src + srcsz; + const char* src_end = src + srcsz; int nb; int i=0; @@ -100,7 +100,7 @@ the NUL as well. the destination string will never be bigger than the source string. 
*/ -int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz) +int u8_toutf8(char *dest, int sz, const u_int32_t *src, int srcsz) { u_int32_t ch; int i = 0; Modified: trunk/opentrep/test/i18n/utf8/utf8.hpp =================================================================== --- trunk/opentrep/test/i18n/utf8/utf8.hpp 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/test/i18n/utf8/utf8.hpp 2009-08-16 14:08:09 UTC (rev 179) @@ -5,10 +5,10 @@ #define isutf(c) (((c)&0xC0)!=0x80) /* convert UTF-8 data to wide character */ -int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz); +int u8_toucs(u_int32_t *dest, int sz, const char *src, int srcsz); /* the opposite conversion */ -int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz); +int u8_toutf8(char *dest, int sz, const u_int32_t *src, int srcsz); /* single character to UTF-8 */ int u8_wc_toutf8(char *dest, u_int32_t ch); Modified: trunk/opentrep/test/i18n/utf8/utf8string.cpp =================================================================== --- trunk/opentrep/test/i18n/utf8/utf8string.cpp 2009-08-15 18:24:37 UTC (rev 178) +++ trunk/opentrep/test/i18n/utf8/utf8string.cpp 2009-08-16 14:08:09 UTC (rev 179) @@ -1,113 +1,43 @@ // STL #include <iostream> -#include <locale> -#include <string> -#include <cstring> +// OpenTrep +#include <opentrep/basic/UTF8Handler.hpp> -// /////////////////////////////////////////////// -void displayCharString (const char* iString) { - // Store current formatting flags of std::cout - std::ios::fmtflags oldFlags = std::cout.flags(); - - const size_t lLength = std::strlen (iString); - for (size_t idx = 0; idx != lLength; ++idx) { - if (idx != 0) { - std::cout << "; "; - } - const unsigned short lChar = iString[idx]; - // const wchar_t lChar = iString[idx]; - std::cout << "[" << idx << "]: " << std::hex << lChar; - } - std::cout << std::endl; - - // Reset formatting flags of std::cout - std::cout.flags (oldFlags); -} - -// /////////////////////////////////////////////// -void 
displayWCharString (const wchar_t* iString, const size_t iLength) { - // Store current formatting flags of std::cout - std::ios::fmtflags oldFlags = std::cout.flags(); - - for (size_t idx = 0; idx != iLength; ++idx) { - if (idx != 0) { - std::cout << "; "; - } - const wchar_t lChar = iString[idx]; - std::cout << "[" << idx << "]: " << std::hex << lChar; - } - std::cout << std::endl; - - // Reset formatting flags of std::cout - std::cout.flags (oldFlags); -} - -// /////////////////////////////////////////////// -void displaySTLString (const std::string& iString) { - // Store current formatting flags of std::cout - std::ios::fmtflags oldFlags = std::cout.flags(); - - unsigned short idx = 0; - for (std::string::const_iterator itChar = iString.begin(); - itChar != iString.end(); ++itChar, ++idx) { - if (idx != 0) { - std::cout << "; "; - } - const unsigned short lChar = *itChar; - // const char lChar = *itChar; - // const wchar_t lChar = *itChar; - std::cout << "[" << idx << "]: " << std::hex << lChar; - } - std::cout << std::endl; - - // Reset formatting flags of std::cout - std::cout.flags (oldFlags); -} - // //////////////////////// M A I N ///////////////////////// int main (int argc, char* argv[]) { - // Single char strings - const char mucDECharString[] = ("München"); - const char mucRUCharString[] = ("Мюнхен"); + // STL strings + std::string mucDESTLString ("München"); + std::string mucRUSTLString ("Мюнхен"); - std::cout << "--------" << std::endl << "Single char strings" << std::endl; - std::cout << "Deutsch ('" << mucDECharString << "'): " << std::endl; - displayCharString (mucDECharString); + std::cout << "--------" << std::endl + << "STL strings without processing" << std::endl; + std::cout << "Deutsch: '" << mucDESTLString << "'" << std::endl; + std::cout << "Russian: '" << mucRUSTLString << "'" << std::endl; - std::cout << "Russian ('" << mucRUCharString << "'): " << std::endl; - displayCharString (mucRUCharString); - - // Wide char strings - wchar_t 
mucDEWCharString[7]; - wchar_t mucRUWCharString[6]; - - // Conversion from char* to wchar_t thanks to the STL locale - std::locale lLocale; - std::use_facet<std::ctype<wchar_t> > (lLocale).widen (mucDECharString, - mucDECharString+7, - mucDEWCharString); - std::use_facet<std::ctype<wchar_t> > (lLocale).widen (mucRUCharString, - mucRUCharString+6, - mucRUWCharString); + // + std::wstring mucDESTLWString = + OPENTREP::UTF8Handler::toWideString (mucDESTLString); + std::wstring mucRUSTLWString = + OPENTREP::UTF8Handler::toWideString (mucRUSTLString); - std::cout << "--------" << std::endl << "Wide char strings" << std::endl; - std::cout << "Deutsch ('" << mucDEWCharString << "'): " << std::endl; - displayWCharString (mucDEWCharString, 7); + std::cout << "--------" << std::endl + << "UTF-8 decoded wide char strings" << std::endl; + std::cout << "Deutsch: " << std::endl; + // std::cout << "Deutsch: '" << mucDESTLWString << "'" << std::endl; + std::cout << OPENTREP::UTF8Handler::displaySTLWString (mucDESTLWString); - std::cout << "Russian ('" << mucRUWCharString << "'): " << std::endl; - displayWCharString (mucRUWCharString, 6); + std::cout << "Russian: " << std::endl; + // std::cout << "Russian: '" << mucRUSTLWString << "'" << std::endl; + std::cout << OPENTREP::UTF8Handler::displaySTLWString (mucRUSTLWString); - // STL strings - std::string mucDESTLString ("München"); - std::string mucRUSTLString ("Мюнхен"); - - std::cout << "--------" << std::endl << "STL strings" << std::endl; - std::cout << "Deutsch ('" << mucDESTLString << "'): " << std::endl; - displaySTLString (mucDESTLString); + mucDESTLString = OPENTREP::UTF8Handler::toSimpleString (mucDESTLWString); + mucRUSTLString = OPENTREP::UTF8Handler::toSimpleString (mucRUSTLWString); - std::cout << "Russian ('" << mucRUSTLString << "'): " << std::endl; - displaySTLString (mucRUSTLString); + std::cout << "--------" << std::endl + << "STL strings after processing" << std::endl; + std::cout << "Deutsch: '" << mucDESTLString 
<< "'" << std::endl; + std::cout << "Russian: '" << mucRUSTLString << "'" << std::endl; return 0; } This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-15 18:24:45
|
Revision: 178 http://opentrep.svn.sourceforge.net/opentrep/?rev=178&view=rev Author: denis_arnaud Date: 2009-08-15 18:24:37 +0000 (Sat, 15 Aug 2009) Log Message: ----------- [i18n] Some small changes in the test programs of UTF-8. Modified Paths: -------------- trunk/opentrep/test/i18n/stdlocru.cpp trunk/opentrep/test/i18n/utf8/Makefile.am Added Paths: ----------- trunk/opentrep/test/i18n/utf8/utf8string.cpp Property Changed: ---------------- trunk/opentrep/test/i18n/utf8/ Modified: trunk/opentrep/test/i18n/stdlocru.cpp =================================================================== --- trunk/opentrep/test/i18n/stdlocru.cpp 2009-08-15 16:04:38 UTC (rev 177) +++ trunk/opentrep/test/i18n/stdlocru.cpp 2009-08-15 18:24:37 UTC (rev 178) @@ -24,28 +24,61 @@ std::cout << "ru: Мюнхен" << std::endl; // With STL strings - std::string mucDE ("München"); - std::string mucRU ("Мюнхен"); + std::string mucDESTLString ("München"); + std::string mucRUSTLString ("Мюнхен"); // Display the STL strings std::cout << "STL strings without processing:" << std::endl; - std::cout << "de: " << mucDE << std::endl; - std::cout << "ru: " << mucRU << std::endl; + std::cout << "de: " << mucDESTLString << std::endl; + std::cout << "ru: " << mucRUSTLString << std::endl; - // With char* - char mucDEStr[] = "München"; - char mucRUStr[] = "Мюнхен"; + // Single char strings + char mucDECharString[] = "München"; + char mucRUCharString[] = "Мюнхен"; + // Display the altered single char strings + std::cout << "Single character literals without processing:" << std::endl; + std::cout << "de: " << mucDECharString << std::endl; + std::cout << "ru: " << mucRUCharString << std::endl; + + // Wide char strings + wchar_t mucDEWCharString[7]; + wchar_t mucRUWCharString[6]; + + + // Conversion from char* to wchar_t thanks to the STL locale + std::use_facet<std::ctype<wchar_t> > (langLocale).widen (mucDECharString, + mucDECharString+7, + mucDEWCharString); + std::use_facet<std::ctype<wchar_t> > (langLocale).widen 
(mucRUCharString, + mucRUCharString+6, + mucRUWCharString); + + // Display the wide char strings + std::cout << "Wide character literals after widening:" << std::endl; + std::cout << "de: " << mucDEWCharString << std::endl; + std::cout << "ru: " << mucRUWCharString << std::endl; + // STL ctypes on char* - std::use_facet<std::ctype<char> > (langLocale).toupper (mucDEStr, - mucDEStr+8); - std::use_facet<std::ctype<char> > (langLocale).toupper (mucRUStr, - mucRUStr+8); + std::use_facet<std::ctype<wchar_t> > (langLocale).toupper(mucDEWCharString, + mucDEWCharString+7); + std::use_facet<std::ctype<wchar_t> > (langLocale).toupper(mucRUWCharString, + mucRUWCharString+6); - // Display the altered STL strings + // Conversion from wchar_t to char thanks to the STL locale + std::use_facet<std::ctype<wchar_t> > (langLocale).narrow (mucDEWCharString, + mucDEWCharString+7, + ' ', + mucDECharString); + std::use_facet<std::ctype<wchar_t> > (langLocale).narrow (mucRUWCharString, + mucRUWCharString+6, + ' ', + mucRUCharString); + + // Display the altered single char strings std::cout << "Character literals with STL locale processing:" << std::endl; - std::cout << "de: " << mucDEStr << std::endl; - std::cout << "ru: " << mucRUStr << std::endl; + std::cout << "de: " << mucDECharString << std::endl; + std::cout << "ru: " << mucRUCharString << std::endl; return 0; } Property changes on: trunk/opentrep/test/i18n/utf8 ___________________________________________________________________ Modified: svn:ignore - .deps .libs Makefile Makefile.in utf8 + .deps .libs Makefile Makefile.in utf8 utf8string Modified: trunk/opentrep/test/i18n/utf8/Makefile.am =================================================================== --- trunk/opentrep/test/i18n/utf8/Makefile.am 2009-08-15 16:04:38 UTC (rev 177) +++ trunk/opentrep/test/i18n/utf8/Makefile.am 2009-08-15 18:24:37 UTC (rev 178) @@ -3,10 +3,14 @@ MAINTAINERCLEANFILES = Makefile.in -check_PROGRAMS = utf8 +check_PROGRAMS = utf8 utf8string 
utf8_SOURCES = utf8.cpp utf8_CXXFLAGS = utf8_LDFLAGS = +utf8string_SOURCES = utf8string.cpp +utf8string_CXXFLAGS = +utf8string_LDFLAGS = + EXTRA_DIST = Added: trunk/opentrep/test/i18n/utf8/utf8string.cpp =================================================================== --- trunk/opentrep/test/i18n/utf8/utf8string.cpp (rev 0) +++ trunk/opentrep/test/i18n/utf8/utf8string.cpp 2009-08-15 18:24:37 UTC (rev 178) @@ -0,0 +1,113 @@ +// STL +#include <iostream> +#include <locale> +#include <string> +#include <cstring> + +// /////////////////////////////////////////////// +void displayCharString (const char* iString) { + // Store current formatting flags of std::cout + std::ios::fmtflags oldFlags = std::cout.flags(); + + const size_t lLength = std::strlen (iString); + for (size_t idx = 0; idx != lLength; ++idx) { + if (idx != 0) { + std::cout << "; "; + } + const unsigned short lChar = iString[idx]; + // const wchar_t lChar = iString[idx]; + std::cout << "[" << idx << "]: " << std::hex << lChar; + } + std::cout << std::endl; + + // Reset formatting flags of std::cout + std::cout.flags (oldFlags); +} + +// /////////////////////////////////////////////// +void displayWCharString (const wchar_t* iString, const size_t iLength) { + // Store current formatting flags of std::cout + std::ios::fmtflags oldFlags = std::cout.flags(); + + for (size_t idx = 0; idx != iLength; ++idx) { + if (idx != 0) { + std::cout << "; "; + } + const wchar_t lChar = iString[idx]; + std::cout << "[" << idx << "]: " << std::hex << lChar; + } + std::cout << std::endl; + + // Reset formatting flags of std::cout + std::cout.flags (oldFlags); +} + +// /////////////////////////////////////////////// +void displaySTLString (const std::string& iString) { + // Store current formatting flags of std::cout + std::ios::fmtflags oldFlags = std::cout.flags(); + + unsigned short idx = 0; + for (std::string::const_iterator itChar = iString.begin(); + itChar != iString.end(); ++itChar, ++idx) { + if (idx != 0) { + 
std::cout << "; "; + } + const unsigned short lChar = *itChar; + // const char lChar = *itChar; + // const wchar_t lChar = *itChar; + std::cout << "[" << idx << "]: " << std::hex << lChar; + } + std::cout << std::endl; + + // Reset formatting flags of std::cout + std::cout.flags (oldFlags); +} + +// //////////////////////// M A I N ///////////////////////// +int main (int argc, char* argv[]) { + + // Single char strings + const char mucDECharString[] = ("München"); + const char mucRUCharString[] = ("Мюнхен"); + + std::cout << "--------" << std::endl << "Single char strings" << std::endl; + std::cout << "Deutsch ('" << mucDECharString << "'): " << std::endl; + displayCharString (mucDECharString); + + std::cout << "Russian ('" << mucRUCharString << "'): " << std::endl; + displayCharString (mucRUCharString); + + // Wide char strings + wchar_t mucDEWCharString[7]; + wchar_t mucRUWCharString[6]; + + // Conversion from char* to wchar_t thanks to the STL locale + std::locale lLocale; + std::use_facet<std::ctype<wchar_t> > (lLocale).widen (mucDECharString, + mucDECharString+7, + mucDEWCharString); + std::use_facet<std::ctype<wchar_t> > (lLocale).widen (mucRUCharString, + mucRUCharString+6, + mucRUWCharString); + + std::cout << "--------" << std::endl << "Wide char strings" << std::endl; + std::cout << "Deutsch ('" << mucDEWCharString << "'): " << std::endl; + displayWCharString (mucDEWCharString, 7); + + std::cout << "Russian ('" << mucRUWCharString << "'): " << std::endl; + displayWCharString (mucRUWCharString, 6); + + // STL strings + std::string mucDESTLString ("München"); + std::string mucRUSTLString ("Мюнхен"); + + std::cout << "--------" << std::endl << "STL strings" << std::endl; + std::cout << "Deutsch ('" << mucDESTLString << "'): " << std::endl; + displaySTLString (mucDESTLString); + + std::cout << "Russian ('" << mucRUSTLString << "'): " << std::endl; + displaySTLString (mucRUSTLString); + + return 0; +} This was sent by the SourceForge.net collaborative 
development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-15 16:04:46
|
Revision: 177 http://opentrep.svn.sourceforge.net/opentrep/?rev=177&view=rev Author: denis_arnaud Date: 2009-08-15 16:04:38 +0000 (Sat, 15 Aug 2009) Log Message: ----------- [i18n] Added a tool on UTF-8 string handling, by Jeff Bezanson (Wikix). Modified Paths: -------------- trunk/opentrep/configure.ac trunk/opentrep/test/i18n/Makefile.am Added Paths: ----------- trunk/opentrep/test/i18n/utf8/ trunk/opentrep/test/i18n/utf8/Makefile.am trunk/opentrep/test/i18n/utf8/utf8.cpp trunk/opentrep/test/i18n/utf8/utf8.hpp Modified: trunk/opentrep/configure.ac =================================================================== --- trunk/opentrep/configure.ac 2009-08-14 17:51:06 UTC (rev 176) +++ trunk/opentrep/configure.ac 2009-08-15 16:04:38 UTC (rev 177) @@ -265,6 +265,7 @@ test/parsers/Makefile test/i18n/Makefile test/i18n/icu/Makefile + test/i18n/utf8/Makefile test/python/Makefile test/iterator/Makefile test/Makefile Modified: trunk/opentrep/test/i18n/Makefile.am =================================================================== --- trunk/opentrep/test/i18n/Makefile.am 2009-08-14 17:51:06 UTC (rev 176) +++ trunk/opentrep/test/i18n/Makefile.am 2009-08-15 16:04:38 UTC (rev 177) @@ -1,6 +1,8 @@ ## command sub-directory include $(top_srcdir)/Makefile.common +SUBDIRS = icu utf8 + MAINTAINERCLEANFILES = Makefile.in check_PROGRAMS = boost_string loc2 stdlocru simple_io Property changes on: trunk/opentrep/test/i18n/utf8 ___________________________________________________________________ Added: svn:ignore + .deps .libs Makefile Makefile.in utf8 Added: trunk/opentrep/test/i18n/utf8/Makefile.am =================================================================== --- trunk/opentrep/test/i18n/utf8/Makefile.am (rev 0) +++ trunk/opentrep/test/i18n/utf8/Makefile.am 2009-08-15 16:04:38 UTC (rev 177) @@ -0,0 +1,12 @@ +## command sub-directory +include $(top_srcdir)/Makefile.common + +MAINTAINERCLEANFILES = Makefile.in + +check_PROGRAMS = utf8 + +utf8_SOURCES = utf8.cpp +utf8_CXXFLAGS = 
+utf8_LDFLAGS = + +EXTRA_DIST = Added: trunk/opentrep/test/i18n/utf8/utf8.cpp =================================================================== --- trunk/opentrep/test/i18n/utf8/utf8.cpp (rev 0) +++ trunk/opentrep/test/i18n/utf8/utf8.cpp 2009-08-15 16:04:38 UTC (rev 177) @@ -0,0 +1,483 @@ +/* + Basic UTF-8 manipulation routines + by Jeff Bezanson + placed in the public domain Fall 2005 + + This code is designed to provide the utilities you need to manipulate + UTF-8 as an internal string encoding. These functions do not perform the + error checking normally needed when handling UTF-8 data, so if you happen + to be from the Unicode Consortium you will want to flay me alive. + I do this because error checking can be performed at the boundaries (I/O), + with these routines reserved for higher performance on data known to be + valid. +*/ +#include <cstdlib> +#include <cstdio> +#include <cstring> +#include <cstdarg> +#ifdef WIN32 +#include <malloc.h> +#else +#include <alloca.h> +#endif + +#include "utf8.hpp" + +static const u_int32_t offsetsFromUTF8[6] = { + 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL +}; + +static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + +/* returns length of next utf-8 sequence */ +int u8_seqlen(char *s) +{ + return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1; +} + +/* conversions without error checking + only works for valid UTF-8, i.e. 
no 5- or 6-byte sequences + srcsz = source size in bytes, or -1 if 0-terminated + sz = dest size in # of wide characters + + returns # characters converted + dest will always be L'\0'-terminated, even if there isn't enough room + for all the characters. + if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space. +*/ +int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz) +{ + u_int32_t ch; + char *src_end = src + srcsz; + int nb; + int i=0; + + while (i < sz-1) { + nb = trailingBytesForUTF8[(unsigned char)*src]; + if (srcsz == -1) { + if (*src == 0) + goto done_toucs; + } + else { + if (src + nb >= src_end) + goto done_toucs; + } + ch = 0; + switch (nb) { + /* these fall through deliberately */ + case 3: ch += (unsigned char)*src++; ch <<= 6; + case 2: ch += (unsigned char)*src++; ch <<= 6; + case 1: ch += (unsigned char)*src++; ch <<= 6; + case 0: ch += (unsigned char)*src++; + } + ch -= offsetsFromUTF8[nb]; + dest[i++] = ch; + } + done_toucs: + dest[i] = 0; + return i; +} + +/* srcsz = number of source characters, or -1 if 0-terminated + sz = size of dest buffer in bytes + + returns # characters converted + dest will only be '\0'-terminated if there is enough space. this is + for consistency; imagine there are 2 bytes of space left, but the next + character requires 3 bytes. in this case we could NUL-terminate, but in + general we can't when there's insufficient space. therefore this function + only NUL-terminates if all the characters fit, and there's space for + the NUL as well. + the destination string will never be bigger than the source string. +*/ +int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz) +{ + u_int32_t ch; + int i = 0; + char *dest_end = dest + sz; + + while (srcsz<0 ? 
src[i]!=0 : i < srcsz) { + ch = src[i]; + if (ch < 0x80) { + if (dest >= dest_end) + return i; + *dest++ = (char)ch; + } + else if (ch < 0x800) { + if (dest >= dest_end-1) + return i; + *dest++ = (ch>>6) | 0xC0; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x10000) { + if (dest >= dest_end-2) + return i; + *dest++ = (ch>>12) | 0xE0; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + else if (ch < 0x110000) { + if (dest >= dest_end-3) + return i; + *dest++ = (ch>>18) | 0xF0; + *dest++ = ((ch>>12) & 0x3F) | 0x80; + *dest++ = ((ch>>6) & 0x3F) | 0x80; + *dest++ = (ch & 0x3F) | 0x80; + } + i++; + } + if (dest < dest_end) + *dest = '\0'; + return i; +} + +int u8_wc_toutf8(char *dest, u_int32_t ch) +{ + if (ch < 0x80) { + dest[0] = (char)ch; + return 1; + } + if (ch < 0x800) { + dest[0] = (ch>>6) | 0xC0; + dest[1] = (ch & 0x3F) | 0x80; + return 2; + } + if (ch < 0x10000) { + dest[0] = (ch>>12) | 0xE0; + dest[1] = ((ch>>6) & 0x3F) | 0x80; + dest[2] = (ch & 0x3F) | 0x80; + return 3; + } + if (ch < 0x110000) { + dest[0] = (ch>>18) | 0xF0; + dest[1] = ((ch>>12) & 0x3F) | 0x80; + dest[2] = ((ch>>6) & 0x3F) | 0x80; + dest[3] = (ch & 0x3F) | 0x80; + return 4; + } + return 0; +} + +/* charnum => byte offset */ +int u8_offset(char *str, int charnum) +{ + int offs=0; + + while (charnum > 0 && str[offs]) { + (void)(isutf(str[++offs]) || isutf(str[++offs]) || + isutf(str[++offs]) || ++offs); + charnum--; + } + return offs; +} + +/* byte offset => charnum */ +int u8_charnum(char *s, int offset) +{ + int charnum = 0, offs=0; + + while (offs < offset && s[offs]) { + (void)(isutf(s[++offs]) || isutf(s[++offs]) || + isutf(s[++offs]) || ++offs); + charnum++; + } + return charnum; +} + +/* number of characters */ +int u8_strlen(char *s) +{ + int count = 0; + int i = 0; + + while (u8_nextchar(s, &i) != 0) + count++; + + return count; +} + +/* reads the next utf-8 sequence out of a string, updating an index */ +u_int32_t u8_nextchar(char *s, int *i) +{ + 
u_int32_t ch = 0; + int sz = 0; + + do { + ch <<= 6; + ch += (unsigned char)s[(*i)++]; + sz++; + } while (s[*i] && !isutf(s[*i])); + ch -= offsetsFromUTF8[sz-1]; + + return ch; +} + +void u8_inc(char *s, int *i) +{ + (void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) || + isutf(s[++(*i)]) || ++(*i)); +} + +void u8_dec(char *s, int *i) +{ + (void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) || + isutf(s[--(*i)]) || --(*i)); +} + +int octal_digit(char c) +{ + return (c >= '0' && c <= '7'); +} + +int hex_digit(char c) +{ + return ((c >= '0' && c <= '9') || + (c >= 'A' && c <= 'F') || + (c >= 'a' && c <= 'f')); +} + +/* assumes that src points to the character after a backslash + returns number of input characters processed */ +int u8_read_escape_sequence(char *str, u_int32_t *dest) +{ + u_int32_t ch; + char digs[10]="\0\0\0\0\0\0\0\0\0"; + int dno=0, i=1; + + ch = (u_int32_t)str[0]; /* take literal character */ + if (str[0] == 'n') + ch = L'\n'; + else if (str[0] == 't') + ch = L'\t'; + else if (str[0] == 'r') + ch = L'\r'; + else if (str[0] == 'b') + ch = L'\b'; + else if (str[0] == 'f') + ch = L'\f'; + else if (str[0] == 'v') + ch = L'\v'; + else if (str[0] == 'a') + ch = L'\a'; + else if (octal_digit(str[0])) { + i = 0; + do { + digs[dno++] = str[i++]; + } while (octal_digit(str[i]) && dno < 3); + ch = strtol(digs, NULL, 8); + } + else if (str[0] == 'x') { + while (hex_digit(str[i]) && dno < 2) { + digs[dno++] = str[i++]; + } + if (dno > 0) + ch = strtol(digs, NULL, 16); + } + else if (str[0] == 'u') { + while (hex_digit(str[i]) && dno < 4) { + digs[dno++] = str[i++]; + } + if (dno > 0) + ch = strtol(digs, NULL, 16); + } + else if (str[0] == 'U') { + while (hex_digit(str[i]) && dno < 8) { + digs[dno++] = str[i++]; + } + if (dno > 0) + ch = strtol(digs, NULL, 16); + } + *dest = ch; + + return i; +} + +/* convert a string with literal \uxxxx or \Uxxxxxxxx characters to UTF-8 + example: u8_unescape(mybuf, 256, "hello\\u220e") + note the double backslash is needed if called on a C 
string literal */ +int u8_unescape(char *buf, int sz, char *src) +{ + int c=0, amt; + u_int32_t ch; + char temp[4]; + + while (*src && c < sz) { + if (*src == '\\') { + src++; + amt = u8_read_escape_sequence(src, &ch); + } + else { + ch = (u_int32_t)*src; + amt = 1; + } + src += amt; + amt = u8_wc_toutf8(temp, ch); + if (amt > sz-c) + break; + memcpy(&buf[c], temp, amt); + c += amt; + } + if (c < sz) + buf[c] = '\0'; + return c; +} + +int u8_escape_wchar(char *buf, int sz, u_int32_t ch) +{ + if (ch == L'\n') + return snprintf(buf, sz, "\\n"); + else if (ch == L'\t') + return snprintf(buf, sz, "\\t"); + else if (ch == L'\r') + return snprintf(buf, sz, "\\r"); + else if (ch == L'\b') + return snprintf(buf, sz, "\\b"); + else if (ch == L'\f') + return snprintf(buf, sz, "\\f"); + else if (ch == L'\v') + return snprintf(buf, sz, "\\v"); + else if (ch == L'\a') + return snprintf(buf, sz, "\\a"); + else if (ch == L'\\') + return snprintf(buf, sz, "\\\\"); + else if (ch < 32 || ch == 0x7f) + return snprintf(buf, sz, "\\x%hhX", (unsigned char)ch); + else if (ch > 0xFFFF) + return snprintf(buf, sz, "\\U%.8X", (u_int32_t)ch); + else if (ch >= 0x80 && ch <= 0xFFFF) + return snprintf(buf, sz, "\\u%.4hX", (unsigned short)ch); + + return snprintf(buf, sz, "%c", (char)ch); +} + +int u8_escape(char *buf, int sz, char *src, int escape_quotes) +{ + int c=0, i=0, amt; + + while (src[i] && c < sz) { + if (escape_quotes && src[i] == '"') { + amt = snprintf(buf, sz - c, "\\\""); + i++; + } + else { + amt = u8_escape_wchar(buf, sz - c, u8_nextchar(src, &i)); + } + c += amt; + buf += amt; + } + if (c < sz) + *buf = '\0'; + return c; +} + +char *u8_strchr(char *s, u_int32_t ch, int *charn) +{ + int i = 0, lasti=0; + u_int32_t c; + + *charn = 0; + while (s[i]) { + c = u8_nextchar(s, &i); + if (c == ch) { + return &s[lasti]; + } + lasti = i; + (*charn)++; + } + return NULL; +} + +char *u8_memchr(char *s, u_int32_t ch, size_t sz, int *charn) +{ + int lasti=0; + size_t i =0; + u_int32_t c; + 
int csz; + + *charn = 0; + while (i < sz) { + c = csz = 0; + do { + c <<= 6; + c += (unsigned char)s[i++]; + csz++; + } while (i < sz && !isutf(s[i])); + c -= offsetsFromUTF8[csz-1]; + + if (c == ch) { + return &s[lasti]; + } + lasti = i; + (*charn)++; + } + return NULL; +} + +int u8_is_locale_utf8(char *locale) +{ + /* this code based on libutf8 */ + const char* cp = locale; + + for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++) { + if (*cp == '.') { + const char* encoding = ++cp; + for (; *cp != '\0' && *cp != '@' && *cp != '+' && *cp != ','; cp++) + ; + if ((cp-encoding == 5 && !strncmp(encoding, "UTF-8", 5)) + || (cp-encoding == 4 && !strncmp(encoding, "utf8", 4))) + return 1; /* it's UTF-8 */ + break; + } + } + return 0; +} + +int u8_vprintf(char *fmt, va_list ap) +{ + int cnt, sz=0; + char *buf; + u_int32_t *wcs; + + sz = 512; + buf = (char*)alloca(sz); + try_print: + cnt = vsnprintf(buf, sz, fmt, ap); + if (cnt >= sz) { + buf = (char*)alloca(cnt - sz + 1); + sz = cnt + 1; + goto try_print; + } + wcs = (u_int32_t*)alloca((cnt+1) * sizeof(u_int32_t)); + cnt = u8_toucs(wcs, cnt+1, buf, cnt); + printf("%ls", (wchar_t*)wcs); + return cnt; +} + +int u8_printf(char *fmt, ...) +{ + int cnt; + va_list args; + + va_start(args, fmt); + + cnt = u8_vprintf(fmt, args); + + va_end(args); + return cnt; +} + +// ////////////////// M A I N /////////////////// +int main (int argc, char* argv[]) { + + return 0; +} Added: trunk/opentrep/test/i18n/utf8/utf8.hpp =================================================================== --- trunk/opentrep/test/i18n/utf8/utf8.hpp (rev 0) +++ trunk/opentrep/test/i18n/utf8/utf8.hpp 2009-08-15 16:04:38 UTC (rev 177) @@ -0,0 +1,72 @@ +// +#include <cstdarg> + +/* is c the start of a utf8 sequence? 
*/ +#define isutf(c) (((c)&0xC0)!=0x80) + +/* convert UTF-8 data to wide character */ +int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz); + +/* the opposite conversion */ +int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz); + +/* single character to UTF-8 */ +int u8_wc_toutf8(char *dest, u_int32_t ch); + +/* character number to byte offset */ +int u8_offset(char *str, int charnum); + +/* byte offset to character number */ +int u8_charnum(char *s, int offset); + +/* return next character, updating an index variable */ +u_int32_t u8_nextchar(char *s, int *i); + +/* move to next character */ +void u8_inc(char *s, int *i); + +/* move to previous character */ +void u8_dec(char *s, int *i); + +/* returns length of next utf-8 sequence */ +int u8_seqlen(char *s); + +/* assuming src points to the character after a backslash, read an + escape sequence, storing the result in dest and returning the number of + input characters processed */ +int u8_read_escape_sequence(char *src, u_int32_t *dest); + +/* given a wide character, convert it to an ASCII escape sequence stored in + buf, where buf is "sz" bytes. returns the number of characters output. */ +int u8_escape_wchar(char *buf, int sz, u_int32_t ch); + +/* convert a string "src" containing escape sequences to UTF-8 */ +int u8_unescape(char *buf, int sz, char *src); + +/* convert UTF-8 "src" to ASCII with escape sequences. + if escape_quotes is nonzero, quote characters will be preceded by + backslashes as well. */ +int u8_escape(char *buf, int sz, char *src, int escape_quotes); + +/* utility predicates used by the above */ +int octal_digit(char c); +int hex_digit(char c); + +/* return a pointer to the first occurrence of ch in s, or NULL if not + found. character index of found character returned in *charn. */ +char *u8_strchr(char *s, u_int32_t ch, int *charn); + +/* same as the above, but searches a buffer of a given size instead of + a NUL-terminated string. 
*/ +char *u8_memchr(char *s, u_int32_t ch, size_t sz, int *charn); + +/* count the number of characters in a UTF-8 string */ +int u8_strlen(char *s); + +int u8_is_locale_utf8(char *locale); + +/* printf where the format string and arguments may be in UTF-8. + you can avoid this function and just use ordinary printf() if the current + locale is UTF-8. */ +int u8_vprintf(char *fmt, va_list ap); +int u8_printf(char *fmt, ...); This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-14 17:51:16
|
Revision: 176 http://opentrep.svn.sourceforge.net/opentrep/?rev=176&view=rev Author: denis_arnaud Date: 2009-08-14 17:51:06 +0000 (Fri, 14 Aug 2009) Log Message: ----------- [i18n] Added a few examples for the development around the ICU library. Modified Paths: -------------- trunk/opentrep/TODO trunk/opentrep/configure.ac trunk/opentrep/test/IndexBuildingTestSuite.cpp trunk/opentrep/test/i18n/Makefile.am Added Paths: ----------- trunk/opentrep/test/i18n/icu/ trunk/opentrep/test/i18n/icu/Makefile.am trunk/opentrep/test/i18n/icu/icucharsetdetector.cpp trunk/opentrep/test/i18n/icu/icuconv.cpp trunk/opentrep/test/i18n/icu/icuconvref.cpp trunk/opentrep/test/i18n/icu/icufmt.cpp trunk/opentrep/test/i18n/icu/icuustring.cpp trunk/opentrep/test/i18n/icu/icuustringref.cpp Removed Paths: ------------- trunk/opentrep/test/i18n/icufmt.cpp Property Changed: ---------------- trunk/opentrep/test/i18n/ Modified: trunk/opentrep/TODO =================================================================== --- trunk/opentrep/TODO 2009-08-14 15:18:55 UTC (rev 175) +++ trunk/opentrep/TODO 2009-08-14 17:51:06 UTC (rev 176) @@ -1,6 +1,22 @@ Todo list for the OpenTrep project ---------------------------------- +* [01/08/2009] Write a (Python-based) PSP page, in order to test the + different locales of the browsers. +The Python (PSP) page has been created, but there is still some work +to do in order to adapt it to the new API (with extra and alternate +locations). + +* [14/08/2009] With the ICU library, check the encoding of the input, + and convert in Unicode if needed (see the test/i18n/icuustring and + test/i18n/icuconv} for example). First detect and convert hard-coded + strings, then do it on the output of PSP pages. + +* [14/08/2009] Write a transliterator, taking UTF-8 Cyrillic input + (e.g., Russian and/or Ukrainian) and romanising/transliterating + it. Note that, with the ICU library, UTex may be used advantageously + (to take UTF-8 input). 
+ * [01/08/2009] Finish the work on bringing extra and additional Location objects into the API. OK @@ -12,9 +28,3 @@ corresponding result details within the database. The easiest way is to extract the first three letters of the Xapian document data. OK - -* [01/08/2009] Write a (Python-based) PSP page, in order to test the - different locales of the browsers. -The Python (PSP) page has been created, but there is still some work -to do in order to adapt it to the new API (with extra and alternate -locations). \ No newline at end of file Modified: trunk/opentrep/configure.ac =================================================================== --- trunk/opentrep/configure.ac 2009-08-14 15:18:55 UTC (rev 175) +++ trunk/opentrep/configure.ac 2009-08-14 17:51:06 UTC (rev 176) @@ -264,6 +264,7 @@ test/com/Makefile test/parsers/Makefile test/i18n/Makefile + test/i18n/icu/Makefile test/python/Makefile test/iterator/Makefile test/Makefile Modified: trunk/opentrep/test/IndexBuildingTestSuite.cpp =================================================================== --- trunk/opentrep/test/IndexBuildingTestSuite.cpp 2009-08-14 15:18:55 UTC (rev 175) +++ trunk/opentrep/test/IndexBuildingTestSuite.cpp 2009-08-14 17:51:06 UTC (rev 176) @@ -5,6 +5,7 @@ #include <test/com/CppUnitCore.hpp> // OpenTrep #include <opentrep/OPENTREP_Service.hpp> +#include <opentrep/Location.hpp> // OpenTrep Test Suite #include <test/IndexBuildingTestSuite.hpp> @@ -39,7 +40,7 @@ // Query the Xapian database (index) OPENTREP::WordList_T lNonMatchedWordList; OPENTREP::LocationList_T lLocationList; - const OPENTREP::NbOfMatches_T nbOfMatches = + // const OPENTREP::NbOfMatches_T nbOfMatches = opentrepService.interpretTravelRequest (lTravelQuery, lLocationList, lNonMatchedWordList); Property changes on: trunk/opentrep/test/i18n ___________________________________________________________________ Modified: svn:ignore - .libs .deps Makefile.in Makefile boost_string loc2 stdlocru icufmt simple_io + .libs .deps 
Makefile.in Makefile boost_string loc2 stdlocru simple_io Modified: trunk/opentrep/test/i18n/Makefile.am =================================================================== --- trunk/opentrep/test/i18n/Makefile.am 2009-08-14 15:18:55 UTC (rev 175) +++ trunk/opentrep/test/i18n/Makefile.am 2009-08-14 17:51:06 UTC (rev 176) @@ -3,7 +3,7 @@ MAINTAINERCLEANFILES = Makefile.in -check_PROGRAMS = boost_string loc2 stdlocru icufmt simple_io +check_PROGRAMS = boost_string loc2 stdlocru simple_io boost_string_SOURCES = boost_string.cpp boost_string_CXXFLAGS = $(BOOST_CFLAGS) @@ -18,10 +18,6 @@ stdlocru_CXXFLAGS = $(BOOST_CFLAGS) stdlocru_LDFLAGS = $(BOOST_LIBS) -icufmt_SOURCES = icufmt.cpp -icufmt_CXXFLAGS = $(ICU_CFLAGS) -icufmt_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB) - simple_io_SOURCES = simple_io.cpp simple_io_CXXFLAGS = $(BOOST_CFLAGS) simple_io_LDFLAGS = $(BOOST_LIBS) Property changes on: trunk/opentrep/test/i18n/icu ___________________________________________________________________ Added: svn:ignore + .libs .deps Makefile.in Makefile icufmt icucharsetdetector icuustring icuconv Added: trunk/opentrep/test/i18n/icu/Makefile.am =================================================================== --- trunk/opentrep/test/i18n/icu/Makefile.am (rev 0) +++ trunk/opentrep/test/i18n/icu/Makefile.am 2009-08-14 17:51:06 UTC (rev 176) @@ -0,0 +1,24 @@ +## command sub-directory +include $(top_srcdir)/Makefile.common + +MAINTAINERCLEANFILES = Makefile.in + +check_PROGRAMS = icufmt icuustring icucharsetdetector icuconv + +icufmt_SOURCES = icufmt.cpp +icufmt_CXXFLAGS = $(ICU_CFLAGS) +icufmt_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB) + +icuustring_SOURCES = icuustring.cpp +icuustring_CXXFLAGS = $(ICU_CFLAGS) +icuustring_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB) + +icucharsetdetector_SOURCES = icucharsetdetector.cpp +icucharsetdetector_CXXFLAGS = $(ICU_CFLAGS) +icucharsetdetector_LDFLAGS = $(ICU_LIBS) $(ICU_IO_LIB) + +icuconv_SOURCES = icuconv.cpp +icuconv_CXXFLAGS = $(ICU_CFLAGS) +icuconv_LDFLAGS = 
$(ICU_LIBS) $(ICU_IO_LIB) + +EXTRA_DIST = Added: trunk/opentrep/test/i18n/icu/icucharsetdetector.cpp =================================================================== --- trunk/opentrep/test/i18n/icu/icucharsetdetector.cpp (rev 0) +++ trunk/opentrep/test/i18n/icu/icucharsetdetector.cpp 2009-08-14 17:51:06 UTC (rev 176) @@ -0,0 +1,21 @@ +// STL +#include <iostream> +#include <string> +// ICU +#include <unicode/utypes.h> +#include <unicode/ucsdet.h> + +int main (int argc, char* argv[]) { + + UErrorCode status = U_ZERO_ERROR; + UCharsetDetector* csd = ucsdet_open (&status); + static char buffer[11] = "0123456789"; + int32_t inputLength = 10; + ucsdet_setText (csd, buffer, inputLength, &status); + const UCharsetMatch* ucm = ucsdet_detect (csd, &status); + const std::string name = ucsdet_getName (ucm, &status); + + std::cout << "Character set encoding: " << name << std::endl; + + return 0; +} Added: trunk/opentrep/test/i18n/icu/icuconv.cpp =================================================================== --- trunk/opentrep/test/i18n/icu/icuconv.cpp (rev 0) +++ trunk/opentrep/test/i18n/icu/icuconv.cpp 2009-08-14 17:51:06 UTC (rev 176) @@ -0,0 +1,506 @@ + +// STL +#include <cstdio> +#include <ctype.h> /* for isspace, etc. */ +#include <cassert> +#include <cstring> +#include <cstdlib> /* malloc */ + +#define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */ + +#include "unicode/utypes.h" /* Basic ICU data types */ +#include "unicode/ucnv.h" /* C Converter API */ +#include "unicode/ustring.h" /* some more string fcns*/ +#include "unicode/uchar.h" /* char names */ +#include "unicode/uloc.h" +#include "unicode/unistr.h" + +/* Some utility functions */ + +static const UChar kNone[] = { 0x0000 }; + +#define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }} + +/* Print a UChar if possible, in seven characters. 
*/ +void prettyPrintUChar(UChar c) +{ + if( (c <= 0x007F) && + (isgraph(c)) ) { + printf(" '%c' ", (char)(0x00FF&c)); + } else if ( c > 0x007F ) { + char buf[1000]; + UErrorCode status = U_ZERO_ERROR; + int32_t o; + + o = u_charName(c, U_UNICODE_CHAR_NAME, buf, 1000, &status); + if(U_SUCCESS(status) && (o>0) ) { + buf[80] = 0; + printf("%7s", buf); + } else { + o = u_charName(c, U_UNICODE_10_CHAR_NAME, buf, 1000, &status); + if(U_SUCCESS(status) && (o>0)) { + buf[5] = 0; + printf("~%6s", buf); + } + else { + printf(" ??????"); + } + } + } else { + switch((char)(c & 0x007F)) { + case ' ': + printf(" ' ' "); + break; + case '\t': + printf(" \\t "); + break; + case '\n': + printf(" \\n "); + break; + default: + printf(" _ "); + break; + } + } +} + + +void printUChars(const char *name = "?", + const UChar *uch = kNone, + int32_t len = -1 ) +{ + int32_t i; + + if( (len == -1) && (uch) ) { + len = u_strlen(uch); + } + + printf("%5s: ", name); + for( i = 0; i <len; i++) { + printf("%-6d ", i); + } + printf("\n"); + + printf("%5s: ", "uni"); + for( i = 0; i <len; i++) { + printf("\\u%04X ", (int)uch[i]); + } + printf("\n"); + + printf("%5s:", "ch"); + for( i = 0; i <len; i++) { + prettyPrintUChar(uch[i]); + } + printf("\n"); +} + +void printBytes(const char *name = "?", + const char *uch = "", + int32_t len = -1 ) +{ + int32_t i; + + if( (len == -1) && (uch) ) { + len = strlen(uch); + } + + printf("%5s: ", name); + for( i = 0; i <len; i++) { + printf("%-4d ", i); + } + printf("\n"); + + printf("%5s: ", "uni"); + for( i = 0; i <len; i++) { + printf("\\x%02X ", 0x00FF & (int)uch[i]); + } + printf("\n"); + + printf("%5s:", "ch"); + for( i = 0; i <len; i++) { + if(isgraph(0x00FF & (int)uch[i])) { + printf(" '%c' ", (char)uch[i]); + } else { + printf(" "); + } + } + printf("\n"); +} + +void printUChar(UChar32 ch32) +{ + if(ch32 > 0xFFFF) { + printf("ch: U+%06X\n", ch32); + } + else { + UChar ch = (UChar)ch32; + printUChars("C", &ch, 1); + } +} + 
+/******************************************************************* + Very simple C sample to convert the word 'Moscow' in Russian in Unicode, + followed by an exclamation mark (!) into the KOI8-R Russian code page. + + This example first creates a UChar String out of the Unicode chars. + + targetSize must be set to the amount of space available in the target + buffer. After fromUChars is called, + len will contain the number of bytes in target[] which were + used in the resulting codepage. In this case, there is a 1:1 mapping + between the input and output characters. The exclamation mark has the + same value in both KOI8-R and Unicode. + + src: 0 1 2 3 4 5 6 + uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021 + ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!' + + targ: 0 1 2 3 4 5 6 + uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21 + ch: '!' + + +Converting FROM unicode + to koi8-r. + You must call ucnv_close to clean up the memory used by the + converter. + + 'len' returns the number of OUTPUT bytes resulting from the + conversion. 
+ */ + +UErrorCode convsample_02() +{ + printf("\n\n==============================================\n" + "Sample 02: C: simple Unicode -> koi8-r conversion\n"); + + + // **************************** START SAMPLE ******************* + // "cat<cat>OK" + UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432, + 0x0430, 0x0021, 0x0000 }; + char target[100]; + UErrorCode status = U_ZERO_ERROR; + UConverter *conv; + int32_t len; + + // set up the converter + conv = ucnv_open("koi8-r", &status); + assert(U_SUCCESS(status)); + + // convert to koi8-r + len = ucnv_fromUChars(conv, target, 100, source, -1, &status); + assert(U_SUCCESS(status)); + + // close the converter + ucnv_close(conv); + + // ***************************** END SAMPLE ******************** + + // Print it out + printUChars("src", source); + printf("\n"); + printBytes("targ", target, len); + + return U_ZERO_ERROR; +} + + +UErrorCode convsample_03() +{ + printf("\n\n==============================================\n" + "Sample 03: C: print out all converters\n"); + + int32_t count; + int32_t i; + + // **************************** START SAMPLE ******************* + count = ucnv_countAvailable(); + printf("Available converters: %d\n", count); + + for(i=0;i<count;i++) + { + printf("%s ", ucnv_getAvailableName(i)); + } + + // ***************************** END SAMPLE ******************** + + printf("\n"); + + return U_ZERO_ERROR; +} + + + +#define BUFFERSIZE 17 /* make it interesting :) */ + +/* + Converting from a codepage to Unicode in bulk.. + What is the best way to determine the buffer size? + + The 'buffersize' is in bytes of input. + For a given converter, divinding this by the minimum char size + give you the maximum number of Unicode characters that could be + expected for a given number of input bytes. + see: ucnv_getMinCharSize() + + For example, a single byte codepage like 'Latin-3' has a + minimum char size of 1. (It takes at least 1 byte to represent + each Unicode char.) 
So the unicode buffer has the same number of + UChars as the input buffer has bytes. + + In a strictly double byte codepage such as cp1362 (Windows + Korean), the minimum char size is 2. So, only half as many Unicode + chars as bytes are needed. + + This work to calculate the buffer size is an optimization. Any + size of input and output buffer can be used, as long as the + program handles the following cases: If the input buffer is empty, + the source pointer will be equal to sourceLimit. If the output + buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned. + */ + +UErrorCode convsample_05() +{ + printf("\n\n==============================================\n" + "Sample 05: C: count the number of letters in a UTF-8 document\n"); + + FILE *f; + int32_t count; + char inBuf[BUFFERSIZE]; + const char *source; + const char *sourceLimit; + UChar *uBuf; + UChar *target; + UChar *targetLimit; + UChar *p; + int32_t uBufSize = 0; + UConverter *conv; + UErrorCode status = U_ZERO_ERROR; + uint32_t letters=0, total=0; + + f = fopen("ref/ref_text_ru.txt", "r"); + if(!f) + { + fprintf(stderr, "Couldn't open file 'ref/ref_text_ru.txt' (UTF-8 data file).\n"); + return U_FILE_ACCESS_ERROR; + } + + // **************************** START SAMPLE ******************* + conv = ucnv_open("utf-8", &status); + assert(U_SUCCESS(status)); + + uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); + printf("input bytes %d / min chars %d = %d UChars\n", + BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); + uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); + assert(uBuf!=NULL); + + // grab another buffer's worth + while((!feof(f)) && + ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) + { + // Convert bytes to unicode + source = inBuf; + sourceLimit = inBuf + count; + + do + { + target = uBuf; + targetLimit = uBuf + uBufSize; + + ucnv_toUnicode(conv, &target, targetLimit, + &source, sourceLimit, NULL, + feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ + /* is true (when no more data will come) */ 
+ &status); + + if(status == U_BUFFER_OVERFLOW_ERROR) + { + // simply ran out of space - we'll reset the target ptr the next + // time through the loop. + status = U_ZERO_ERROR; + } + else + { + // Check other errors here. + assert(U_SUCCESS(status)); + // Break out of the loop (by force) + } + + // Process the Unicode + // Todo: handle UTF-16/surrogates + + for(p = uBuf; p<target; p++) + { + if(u_isalpha(*p)) + letters++; + total++; + } + } while (source < sourceLimit); // while simply out of space + } + + printf("%d letters out of %d total UChars.\n", letters, total); + + // ***************************** END SAMPLE ******************** + ucnv_close(conv); + + printf("\n"); + + return U_ZERO_ERROR; +} +#undef BUFFERSIZE + +#define BUFFERSIZE 1024 +typedef struct +{ + UChar32 codepoint; + uint32_t frequency; +} CharFreqInfo; + +UErrorCode convsample_06() +{ + printf("\n\n==============================================\n" + "Sample 06: C: frequency distribution of letters in a UTF-8 document\n"); + + FILE *f; + int32_t count; + char inBuf[BUFFERSIZE]; + const char *source; + const char *sourceLimit; + UChar *uBuf; + int32_t uBufSize = 0; + UConverter *conv; + UErrorCode status = U_ZERO_ERROR; + uint32_t letters=0, total=0; + + CharFreqInfo *info; + UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. 
*/ + UChar32 p; + + uint32_t ie = 0; + uint32_t gh = 0; + UChar32 l = 0; + + f = fopen("ref/ref_text_ru.txt", "r"); + if(!f) + { + fprintf(stderr, "Couldn't open file 'ref/ref_text_ru.txt' (UTF-8 data file).\n"); + return U_FILE_ACCESS_ERROR; + } + + info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount); + if(!info) + { + fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount); + } + + /* reset frequencies */ + for(p=0;p<charCount;p++) + { + info[p].codepoint = p; + info[p].frequency = 0; + } + + // **************************** START SAMPLE ******************* + conv = ucnv_open("utf-8", &status); + assert(U_SUCCESS(status)); + + uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); + printf("input bytes %d / min chars %d = %d UChars\n", + BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); + uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); + assert(uBuf!=NULL); + + // grab another buffer's worth + while((!feof(f)) && + ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) + { + // Convert bytes to unicode + source = inBuf; + sourceLimit = inBuf + count; + + while(source < sourceLimit) + { + p = ucnv_getNextUChar(conv, &source, sourceLimit, &status); + if(U_FAILURE(status)) + { + fprintf(stderr, "%s @ %d\n", u_errorName(status), total); + status = U_ZERO_ERROR; + continue; + } + U_ASSERT(status); + total++; + + if(u_isalpha(p)) + letters++; + + if((u_tolower(l) == 'i') && (u_tolower(p) == 'e')) + ie++; + + if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127)) + gh++; + + if(p>charCount) + { + fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p); + return U_UNSUPPORTED_ERROR; + } + info[p].frequency++; + l = p; + } + } + + fclose(f); + ucnv_close(conv); + + printf("%d letters out of %d total UChars.\n", letters, total); + printf("%d ie digraphs, %d gh digraphs.\n", ie, gh); + + // now, we could sort it.. 
+ + // qsort(info, charCount, sizeof(info[0]), charfreq_compare); + + for(p=0;p<charCount;p++) + { + if(info[p].frequency) + { + printf("% 5d U+%06X ", info[p].frequency, p); + if(p <= 0xFFFF) + { + prettyPrintUChar((UChar)p); + } + printf("\n"); + } + } + free(info); + // ***************************** END SAMPLE ******************** + + printf("\n"); + + return U_ZERO_ERROR; +} +#undef BUFFERSIZE + +#define BUFFERSIZE 219 + + +/* main */ + +int main() { + + printf("Default Converter=%s\n", ucnv_getDefaultName() ); + + convsample_02(); // C , u->koi8r, conv + convsample_03(); // C, iterate + + convsample_05(); // C, utf8->u, getNextUChar + convsample_06(); // C freq counter thingy + + printf("End of converter samples.\n"); + + fflush(stdout); + fflush(stderr); + + return 0; +} Added: trunk/opentrep/test/i18n/icu/icuconvref.cpp =================================================================== --- trunk/opentrep/test/i18n/icu/icuconvref.cpp (rev 0) +++ trunk/opentrep/test/i18n/icu/icuconvref.cpp 2009-08-14 17:51:06 UTC (rev 176) @@ -0,0 +1,1102 @@ +/************************************************************************** +* +* Copyright (C) 2000-2003, International Business Machines +* Corporation and others. All Rights Reserved. +* +*************************************************************************** +* file name: convsamp.c +* encoding: ASCII (7-bit) +* +* created on: 2000may30 +* created by: Steven R. Loomis +* +* Sample code for the ICU conversion routines. +* +* Note: Nothing special is needed to build this sample. Link with +* the icu UC and icu I18N libraries. +* +* I use 'assert' for error checking, you probably will want +* something more flexible. '***BEGIN SAMPLE***' and +* '***END SAMPLE***' mark pieces suitable for stand alone +* code snippets. +* +* +* Each test can define it's own BUFFERSIZE +* +*/ + +#define DEBUG_TMI 0 /* define to 1 to enable Too Much Information */ + +#include <stdio.h> +#include <ctype.h> /* for isspace, etc. 
*/ +#include <assert.h> +#include <string.h> +#include <stdlib.h> /* malloc */ + +#include "unicode/utypes.h" /* Basic ICU data types */ +#include "unicode/ucnv.h" /* C Converter API */ +#include "unicode/ustring.h" /* some more string fcns*/ +#include "unicode/uchar.h" /* char names */ +#include "unicode/uloc.h" +#include "unicode/unistr.h" + +#include "flagcb.h" + +/* Some utility functions */ + +static const UChar kNone[] = { 0x0000 }; + +#define U_ASSERT(x) { if(U_FAILURE(x)) {fflush(stdout);fflush(stderr); fprintf(stderr, #x " == %s\n", u_errorName(x)); assert(U_SUCCESS(x)); }} + +/* Print a UChar if possible, in seven characters. */ +void prettyPrintUChar(UChar c) +{ + if( (c <= 0x007F) && + (isgraph(c)) ) { + printf(" '%c' ", (char)(0x00FF&c)); + } else if ( c > 0x007F ) { + char buf[1000]; + UErrorCode status = U_ZERO_ERROR; + int32_t o; + + o = u_charName(c, U_UNICODE_CHAR_NAME, buf, 1000, &status); + if(U_SUCCESS(status) && (o>0) ) { + buf[6] = 0; + printf("%7s", buf); + } else { + o = u_charName(c, U_UNICODE_10_CHAR_NAME, buf, 1000, &status); + if(U_SUCCESS(status) && (o>0)) { + buf[5] = 0; + printf("~%6s", buf); + } + else { + printf(" ??????"); + } + } + } else { + switch((char)(c & 0x007F)) { + case ' ': + printf(" ' ' "); + break; + case '\t': + printf(" \\t "); + break; + case '\n': + printf(" \\n "); + break; + default: + printf(" _ "); + break; + } + } +} + + +void printUChars(const char *name = "?", + const UChar *uch = kNone, + int32_t len = -1 ) +{ + int32_t i; + + if( (len == -1) && (uch) ) { + len = u_strlen(uch); + } + + printf("%5s: ", name); + for( i = 0; i <len; i++) { + printf("%-6d ", i); + } + printf("\n"); + + printf("%5s: ", "uni"); + for( i = 0; i <len; i++) { + printf("\\u%04X ", (int)uch[i]); + } + printf("\n"); + + printf("%5s:", "ch"); + for( i = 0; i <len; i++) { + prettyPrintUChar(uch[i]); + } + printf("\n"); +} + +void printBytes(const char *name = "?", + const char *uch = "", + int32_t len = -1 ) +{ + int32_t i; + + if( (len 
== -1) && (uch) ) { + len = strlen(uch); + } + + printf("%5s: ", name); + for( i = 0; i <len; i++) { + printf("%-4d ", i); + } + printf("\n"); + + printf("%5s: ", "uni"); + for( i = 0; i <len; i++) { + printf("\\x%02X ", 0x00FF & (int)uch[i]); + } + printf("\n"); + + printf("%5s:", "ch"); + for( i = 0; i <len; i++) { + if(isgraph(0x00FF & (int)uch[i])) { + printf(" '%c' ", (char)uch[i]); + } else { + printf(" "); + } + } + printf("\n"); +} + +void printUChar(UChar32 ch32) +{ + if(ch32 > 0xFFFF) { + printf("ch: U+%06X\n", ch32); + } + else { + UChar ch = (UChar)ch32; + printUChars("C", &ch, 1); + } +} + +/******************************************************************* + Very simple C sample to convert the word 'Moscow' in Russian in Unicode, + followed by an exclamation mark (!) into the KOI8-R Russian code page. + + This example first creates a UChar String out of the Unicode chars. + + targetSize must be set to the amount of space available in the target + buffer. After fromUChars is called, + len will contain the number of bytes in target[] which were + used in the resulting codepage. In this case, there is a 1:1 mapping + between the input and output characters. The exclamation mark has the + same value in both KOI8-R and Unicode. + + src: 0 1 2 3 4 5 6 + uni: \u041C \u043E \u0441 \u043A \u0432 \u0430 \u0021 + ch: CYRILL CYRILL CYRILL CYRILL CYRILL CYRILL '!' + + targ: 0 1 2 3 4 5 6 + uni: \xED \xCF \xD3 \xCB \xD7 \xC1 \x21 + ch: '!' + + +Converting FROM unicode + to koi8-r. + You must call ucnv_close to clean up the memory used by the + converter. + + 'len' returns the number of OUTPUT bytes resulting from the + conversion. 
+ */ + +UErrorCode convsample_02() +{ + printf("\n\n==============================================\n" + "Sample 02: C: simple Unicode -> koi8-r conversion\n"); + + + // **************************** START SAMPLE ******************* + // 'Moscow' in Russian (Cyrillic), followed by '!' + UChar source[] = { 0x041C, 0x043E, 0x0441, 0x043A, 0x0432, + 0x0430, 0x0021, 0x0000 }; + char target[100]; + UErrorCode status = U_ZERO_ERROR; + UConverter *conv; + int32_t len; + + // set up the converter + conv = ucnv_open("koi8-r", &status); + assert(U_SUCCESS(status)); + + // convert to koi8-r + len = ucnv_fromUChars(conv, target, 100, source, -1, &status); + assert(U_SUCCESS(status)); + + // close the converter + ucnv_close(conv); + + // ***************************** END SAMPLE ******************** + + // Print it out + printUChars("src", source); + printf("\n"); + printBytes("targ", target, len); + + return U_ZERO_ERROR; +} + + +UErrorCode convsample_03() +{ + printf("\n\n==============================================\n" + "Sample 03: C: print out all converters\n"); + + int32_t count; + int32_t i; + + // **************************** START SAMPLE ******************* + count = ucnv_countAvailable(); + printf("Available converters: %d\n", count); + + for(i=0;i<count;i++) + { + printf("%s ", ucnv_getAvailableName(i)); + } + + // ***************************** END SAMPLE ******************** + + printf("\n"); + + return U_ZERO_ERROR; +} + + + +#define BUFFERSIZE 17 /* make it interesting :) */ + +/* + Converting from a codepage to Unicode in bulk.. + What is the best way to determine the buffer size? + + The 'buffersize' is in bytes of input. + For a given converter, dividing this by the minimum char size + gives you the maximum number of Unicode characters that could be + expected for a given number of input bytes. + see: ucnv_getMinCharSize() + + For example, a single byte codepage like 'Latin-3' has a + minimum char size of 1. (It takes at least 1 byte to represent + each Unicode char.) 
So the unicode buffer has the same number of + UChars as the input buffer has bytes. + + In a strictly double byte codepage such as cp1362 (Windows + Korean), the minimum char size is 2. So, only half as many Unicode + chars as bytes are needed. + + This work to calculate the buffer size is an optimization. Any + size of input and output buffer can be used, as long as the + program handles the following cases: If the input buffer is empty, + the source pointer will be equal to sourceLimit. If the output + buffer has overflowed, U_BUFFER_OVERFLOW_ERROR will be returned. + */ + +UErrorCode convsample_05() +{ + printf("\n\n==============================================\n" + "Sample 05: C: count the number of letters in a UTF-8 document\n"); + + FILE *f; + int32_t count; + char inBuf[BUFFERSIZE]; + const char *source; + const char *sourceLimit; + UChar *uBuf; + UChar *target; + UChar *targetLimit; + UChar *p; + int32_t uBufSize = 0; + UConverter *conv; + UErrorCode status = U_ZERO_ERROR; + uint32_t letters=0, total=0; + + f = fopen("data01.txt", "r"); + if(!f) + { + fprintf(stderr, "Couldn't open file 'data01.txt' (UTF-8 data file).\n"); + return U_FILE_ACCESS_ERROR; + } + + // **************************** START SAMPLE ******************* + conv = ucnv_open("utf-8", &status); + assert(U_SUCCESS(status)); + + uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); + printf("input bytes %d / min chars %d = %d UChars\n", + BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); + uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); + assert(uBuf!=NULL); + + // grab another buffer's worth + while((!feof(f)) && + ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) + { + // Convert bytes to unicode + source = inBuf; + sourceLimit = inBuf + count; + + do + { + target = uBuf; + targetLimit = uBuf + uBufSize; + + ucnv_toUnicode(conv, &target, targetLimit, + &source, sourceLimit, NULL, + feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ + /* is true (when no more data will come) */ + &status); + + 
if(status == U_BUFFER_OVERFLOW_ERROR) + { + // simply ran out of space - we'll reset the target ptr the next + // time through the loop. + status = U_ZERO_ERROR; + } + else + { + // Check other errors here. + assert(U_SUCCESS(status)); + // Break out of the loop (by force) + } + + // Process the Unicode + // Todo: handle UTF-16/surrogates + + for(p = uBuf; p<target; p++) + { + if(u_isalpha(*p)) + letters++; + total++; + } + } while (source < sourceLimit); // while simply out of space + } + + printf("%d letters out of %d total UChars.\n", letters, total); + + // ***************************** END SAMPLE ******************** + ucnv_close(conv); + + printf("\n"); + + return U_ZERO_ERROR; +} +#undef BUFFERSIZE + +#define BUFFERSIZE 1024 +typedef struct +{ + UChar32 codepoint; + uint32_t frequency; +} CharFreqInfo; + +UErrorCode convsample_06() +{ + printf("\n\n==============================================\n" + "Sample 06: C: frequency distribution of letters in a UTF-8 document\n"); + + FILE *f; + int32_t count; + char inBuf[BUFFERSIZE]; + const char *source; + const char *sourceLimit; + UChar *uBuf; + int32_t uBufSize = 0; + UConverter *conv; + UErrorCode status = U_ZERO_ERROR; + uint32_t letters=0, total=0; + + CharFreqInfo *info; + UChar32 charCount = 0x10000; /* increase this if you want to handle non bmp.. todo: automatically bump it.. 
*/ + UChar32 p; + + uint32_t ie = 0; + uint32_t gh = 0; + UChar32 l = 0; + + f = fopen("data06.txt", "r"); + if(!f) + { + fprintf(stderr, "Couldn't open file 'data06.txt' (UTF-8 data file).\n"); + return U_FILE_ACCESS_ERROR; + } + + info = (CharFreqInfo*)malloc(sizeof(CharFreqInfo) * charCount); + if(!info) + { + fprintf(stderr, " Couldn't allocate %d bytes for freq counter\n", sizeof(CharFreqInfo)*charCount); + } + + /* reset frequencies */ + for(p=0;p<charCount;p++) + { + info[p].codepoint = p; + info[p].frequency = 0; + } + + // **************************** START SAMPLE ******************* + conv = ucnv_open("utf-8", &status); + assert(U_SUCCESS(status)); + + uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); + printf("input bytes %d / min chars %d = %d UChars\n", + BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); + uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); + assert(uBuf!=NULL); + + // grab another buffer's worth + while((!feof(f)) && + ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) + { + // Convert bytes to unicode + source = inBuf; + sourceLimit = inBuf + count; + + while(source < sourceLimit) + { + p = ucnv_getNextUChar(conv, &source, sourceLimit, &status); + if(U_FAILURE(status)) + { + fprintf(stderr, "%s @ %d\n", u_errorName(status), total); + status = U_ZERO_ERROR; + continue; + } + U_ASSERT(status); + total++; + + if(u_isalpha(p)) + letters++; + + if((u_tolower(l) == 'i') && (u_tolower(p) == 'e')) + ie++; + + if((u_tolower(l) == 'g') && (u_tolower(p) == 0x0127)) + gh++; + + if(p>charCount) + { + fprintf(stderr, "U+%06X: oh.., we only handle BMP characters so far.. redesign!\n", p); + return U_UNSUPPORTED_ERROR; + } + info[p].frequency++; + l = p; + } + } + + fclose(f); + ucnv_close(conv); + + printf("%d letters out of %d total UChars.\n", letters, total); + printf("%d ie digraphs, %d gh digraphs.\n", ie, gh); + + // now, we could sort it.. 
+ + // qsort(info, charCount, sizeof(info[0]), charfreq_compare); + + for(p=0;p<charCount;p++) + { + if(info[p].frequency) + { + printf("% 5d U+%06X ", info[p].frequency, p); + if(p <= 0xFFFF) + { + prettyPrintUChar((UChar)p); + } + printf("\n"); + } + } + free(info); + // ***************************** END SAMPLE ******************** + + printf("\n"); + + return U_ZERO_ERROR; +} +#undef BUFFERSIZE + + +/****************************************************** + You must call ucnv_close to clean up the memory used by the + converter. + + 'len' returns the number of OUTPUT bytes resulting from the + conversion. + */ + +UErrorCode convsample_12() +{ + printf("\n\n==============================================\n" + "Sample 12: C: simple sjis -> unicode conversion\n"); + + + // **************************** START SAMPLE ******************* + + char source[] = { 0x63, 0x61, 0x74, (char)0x94, 0x4C, (char)0x82, 0x6E, (char)0x82, 0x6A, 0x00 }; + UChar target[100]; + UErrorCode status = U_ZERO_ERROR; + UConverter *conv; + int32_t len; + + // set up the converter + conv = ucnv_open("shift_jis", &status); + assert(U_SUCCESS(status)); + + // convert to Unicode + // Note: we can use strlen, we know it's an 8 bit null terminated codepage + target[6] = 0xFDCA; + len = ucnv_toUChars(conv, target, 100, source, strlen(source), &status); + U_ASSERT(status); + // close the converter + ucnv_close(conv); + + // ***************************** END SAMPLE ******************** + + // Print it out + printBytes("src", source, strlen(source) ); + printf("\n"); + printUChars("targ", target, len); + + return U_ZERO_ERROR; +} + +/****************************************************************** + C: Convert from codepage to Unicode one at a time. 
+*/ + +UErrorCode convsample_13() +{ + printf("\n\n==============================================\n" + "Sample 13: C: simple Big5 -> unicode conversion, char at a time\n"); + + + const char sourceChars[] = { 0x7a, 0x68, 0x3d, (char)0xa4, (char)0xa4, (char)0xa4, (char)0xe5, (char)0x2e }; + // const char sourceChars[] = { 0x7a, 0x68, 0x3d, 0xe4, 0xb8, 0xad, 0xe6, 0x96, 0x87, 0x2e }; + const char *source, *sourceLimit; + UChar32 target; + UErrorCode status = U_ZERO_ERROR; + UConverter *conv = NULL; + int32_t srcCount=0; + int32_t dstCount=0; + + srcCount = sizeof(sourceChars); + + conv = ucnv_open("Big5", &status); + U_ASSERT(status); + + source = sourceChars; + sourceLimit = sourceChars + sizeof(sourceChars); + + // **************************** START SAMPLE ******************* + + + printBytes("src",source,sourceLimit-source); + + while(source < sourceLimit) + { + puts(""); + target = ucnv_getNextUChar (conv, + &source, + sourceLimit, + &status); + + // printBytes("src",source,sourceLimit-source); + U_ASSERT(status); + printUChar(target); + dstCount++; + } + + + // ************************** END SAMPLE ************************* + + printf("src=%d bytes, dst=%d uchars\n", srcCount, dstCount); + ucnv_close(conv); + + return U_ZERO_ERROR; +} + + + + +UBool convsample_20_didSubstitute(const char *source) +{ + UChar uchars[100]; + char bytes[100]; + UConverter *conv = NULL; + UErrorCode status = U_ZERO_ERROR; + uint32_t len, len2; + UBool flagVal; + + FromUFLAGContext * context = NULL; + + printf("\n\n==============================================\n" + "Sample 20: C: Test for substitution using callbacks\n"); + + /* print out the original source */ + printBytes("src", source); + printf("\n"); + + /* First, convert from UTF8 to unicode */ + conv = ucnv_open("utf-8", &status); + U_ASSERT(status); + + len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); + U_ASSERT(status); + + printUChars("uch", uchars, len); + printf("\n"); + + /* Now, close the 
converter */ + ucnv_close(conv); + + /* Now, convert to windows-1252 */ + conv = ucnv_open("windows-1252", &status); + U_ASSERT(status); + + /* Converter starts out with the SUBSTITUTE callback set. */ + + /* initialize our callback */ + context = flagCB_fromU_openContext(); + + /* Set our special callback */ + ucnv_setFromUCallBack(conv, + flagCB_fromU, + context, + &(context->subCallback), + &(context->subContext), + &status); + + U_ASSERT(status); + + len2 = ucnv_fromUChars(conv, bytes, 100, uchars, len, &status); + U_ASSERT(status); + + flagVal = context->flag; /* it's about to go away when we close the cnv */ + + ucnv_close(conv); + + /* print out the original source */ + printBytes("bytes", bytes, len2); + + return flagVal; /* true if callback was called */ +} + +UErrorCode convsample_20() +{ + const char *sample1 = "abc\xdf\xbf"; + const char *sample2 = "abc_def"; + + + if(convsample_20_didSubstitute(sample1)) + { + printf("DID substitute.\n******\n"); + } + else + { + printf("Did NOT substitute.\n*****\n"); + } + + if(convsample_20_didSubstitute(sample2)) + { + printf("DID substitute.\n******\n"); + } + else + { + printf("Did NOT substitute.\n*****\n"); + } + + return U_ZERO_ERROR; +} + +// 21 - C, callback, with clone and debug + + + +UBool convsample_21_didSubstitute(const char *source) +{ + UChar uchars[100]; + char bytes[100]; + UConverter *conv = NULL, *cloneCnv = NULL; + UErrorCode status = U_ZERO_ERROR; + uint32_t len, len2; + int32_t cloneLen; + UBool flagVal = FALSE; + UConverterFromUCallback junkCB; + + FromUFLAGContext *flagCtx = NULL, + *cloneFlagCtx = NULL; + + debugCBContext *debugCtx1 = NULL, + *debugCtx2 = NULL, + *cloneDebugCtx = NULL; + + printf("\n\n==============================================\n" + "Sample 21: C: Test for substitution w/ callbacks & clones \n"); + + /* print out the original source */ + printBytes("src", source); + printf("\n"); + + /* First, convert from UTF8 to unicode */ + conv = ucnv_open("utf-8", &status); + 
U_ASSERT(status); + + len = ucnv_toUChars(conv, uchars, 100, source, strlen(source), &status); + U_ASSERT(status); + + printUChars("uch", uchars, len); + printf("\n"); + + /* Now, close the converter */ + ucnv_close(conv); + + /* Now, convert to windows-1252 */ + conv = ucnv_open("windows-1252", &status); + U_ASSERT(status); + + /* Converter starts out with the SUBSTITUTE callback set. */ + + /* initialize our callback */ + /* from the 'bottom' innermost, out + * CNV -> debugCtx1[debug] -> flagCtx[flag] -> debugCtx2[debug] */ + +#if DEBUG_TMI + printf("flagCB_fromU = %p\n", &flagCB_fromU); + printf("debugCB_fromU = %p\n", &debugCB_fromU); +#endif + + debugCtx1 = debugCB_openContext(); + flagCtx = flagCB_fromU_openContext(); + debugCtx2 = debugCB_openContext(); + + debugCtx1->subCallback = flagCB_fromU; /* debug1 -> flag */ + debugCtx1->subContext = flagCtx; + + flagCtx->subCallback = debugCB_fromU; /* flag -> debug2 */ + flagCtx->subContext = debugCtx2; + + debugCtx2->subCallback = UCNV_FROM_U_CALLBACK_SUBSTITUTE; + debugCtx2->subContext = NULL; + + /* Set our special callback */ + + ucnv_setFromUCallBack(conv, + debugCB_fromU, + debugCtx1, + &(debugCtx2->subCallback), + &(debugCtx2->subContext), + &status); + + U_ASSERT(status); + +#if DEBUG_TMI + printf("Callback chain now: Converter %p -> debug1:%p-> (%p:%p)==flag:%p -> debug2:%p -> cb %p\n", + conv, debugCtx1, debugCtx1->subCallback, + debugCtx1->subContext, flagCtx, debugCtx2, debugCtx2->subCallback); +#endif + + cloneLen = 1; /* but passing in null so it will clone */ + cloneCnv = ucnv_safeClone(conv, NULL, &cloneLen, &status); + + U_ASSERT(status); + +#if DEBUG_TMI + printf("Cloned converter from %p -> %p. 
Closing %p.\n", conv, cloneCnv, conv); +#endif + + ucnv_close(conv); + +#if DEBUG_TMI + printf("%p closed.\n", conv); +#endif + + U_ASSERT(status); + /* Now, we have to extract the context */ + cloneDebugCtx = NULL; + cloneFlagCtx = NULL; + + ucnv_getFromUCallBack(cloneCnv, &junkCB, (const void **)&cloneDebugCtx); + if(cloneDebugCtx != NULL) { + cloneFlagCtx = (FromUFLAGContext*) cloneDebugCtx -> subContext; + } + + printf("Cloned converter chain: %p -> %p[debug1] -> %p[flag] -> %p[debug2] -> substitute\n", + cloneCnv, cloneDebugCtx, cloneFlagCtx, cloneFlagCtx?cloneFlagCtx->subContext:NULL ); + + len2 = ucnv_fromUChars(cloneCnv, bytes, 100, uchars, len, &status); + U_ASSERT(status); + + if(cloneFlagCtx != NULL) { + flagVal = cloneFlagCtx->flag; /* it's about to go away when we close the cnv */ + } else { + printf("** Warning, couldn't get the subcallback \n"); + } + + ucnv_close(cloneCnv); + + /* print out the original source */ + printBytes("bytes", bytes, len2); + + return flagVal; /* true if callback was called */ +} + +UErrorCode convsample_21() +{ + const char *sample1 = "abc\xdf\xbf"; + const char *sample2 = "abc_def"; + + if(convsample_21_didSubstitute(sample1)) + { + printf("DID substitute.\n******\n"); + } + else + { + printf("Did NOT substitute.\n*****\n"); + } + + if(convsample_21_didSubstitute(sample2)) + { + printf("DID substitute.\n******\n"); + } + else + { + printf("Did NOT substitute.\n*****\n"); + } + + return U_ZERO_ERROR; +} + + +// 40- C, cp37 -> UTF16 [data02.bin -> data40.utf16] + +#define BUFFERSIZE 17 /* make it interesting :) */ + +UErrorCode convsample_40() +{ + printf("\n\n==============================================\n" + "Sample 40: C: convert data02.bin from cp37 to UTF16 [data40.utf16]\n"); + + FILE *f; + FILE *out; + int32_t count; + char inBuf[BUFFERSIZE]; + const char *source; + const char *sourceLimit; + UChar *uBuf; + UChar *target; + UChar *targetLimit; + int32_t uBufSize = 0; + UConverter *conv = NULL; + UErrorCode status = 
U_ZERO_ERROR; + uint32_t inbytes=0, total=0; + + f = fopen("data02.bin", "rb"); + if(!f) + { + fprintf(stderr, "Couldn't open file 'data02.bin' (cp37 data file).\n"); + return U_FILE_ACCESS_ERROR; + } + + out = fopen("data40.utf16", "wb"); + if(!out) + { + fprintf(stderr, "Couldn't create file 'data40.utf16'.\n"); + return U_FILE_ACCESS_ERROR; + } + + // **************************** START SAMPLE ******************* + conv = ucnv_openCCSID(37, UCNV_IBM, &status); + assert(U_SUCCESS(status)); + + uBufSize = (BUFFERSIZE/ucnv_getMinCharSize(conv)); + printf("input bytes %d / min chars %d = %d UChars\n", + BUFFERSIZE, ucnv_getMinCharSize(conv), uBufSize); + uBuf = (UChar*)malloc(uBufSize * sizeof(UChar)); + assert(uBuf!=NULL); + + // grab another buffer's worth + while((!feof(f)) && + ((count=fread(inBuf, 1, BUFFERSIZE , f)) > 0) ) + { + inbytes += count; + + // Convert bytes to unicode + source = inBuf; + sourceLimit = inBuf + count; + + do + { + target = uBuf; + targetLimit = uBuf + uBufSize; + + ucnv_toUnicode( conv, &target, targetLimit, + &source, sourceLimit, NULL, + feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ + /* is true (when no more data will come) */ + &status); + + if(status == U_BUFFER_OVERFLOW_ERROR) + { + // simply ran out of space - we'll reset the target ptr the next + // time through the loop. + status = U_ZERO_ERROR; + } + else + { + // Check other errors here. 
+ assert(U_SUCCESS(status)); + // Break out of the loop (by force) + } + + // Process the Unicode + // Todo: handle UTF-16/surrogates + assert(fwrite(uBuf, sizeof(uBuf[0]), (target-uBuf), out) == + (size_t)(target-uBuf)); + total += (target-uBuf); + } while (source < sourceLimit); // while simply out of space + } + + printf("%d bytes in, %d UChars out.\n", inbytes, total); + + // ***************************** END SAMPLE ******************** + ucnv_close(conv); + + fclose(f); + fclose(out); + printf("\n"); + + return U_ZERO_ERROR; +} +#undef BUFFERSIZE + + + +// 46- C, UTF16 -> latin2 [data40.utf16 -> data46.out] + +#define BUFFERSIZE 24 /* make it interesting :) */ + +UErrorCode convsample_46() +{ + printf("\n\n==============================================\n" + "Sample 46: C: convert data40.utf16 from UTF16 to latin2 [data46.out]\n"); + + FILE *f; + FILE *out; + int32_t count; + UChar inBuf[BUFFERSIZE]; + const UChar *source; + const UChar *sourceLimit; + char *buf; + char *target; + char *targetLimit; + + int32_t bufSize = 0; + UConverter *conv = NULL; + UErrorCode status = U_ZERO_ERROR; + uint32_t inchars=0, total=0; + + f = fopen("data40.utf16", "rb"); + if(!f) + { + fprintf(stderr, "Couldn't open file 'data40.utf16' (did you run convsample_40() ?)\n"); + return U_FILE_ACCESS_ERROR; + } + + out = fopen("data46.out", "wb"); + if(!out) + { + fprintf(stderr, "Couldn't create file 'data46.out'.\n"); + return U_FILE_ACCESS_ERROR; + } + + // **************************** START SAMPLE ******************* + conv = ucnv_open( "iso-8859-2", &status); + assert(U_SUCCESS(status)); + + bufSize = (BUFFERSIZE*ucnv_getMaxCharSize(conv)); + printf("input UChars[16] %d * max charsize %d = %d bytes output buffer\n", + BUFFERSIZE, ucnv_getMaxCharSize(conv), bufSize); + buf = (char*)malloc(bufSize * sizeof(char)); + assert(buf!=NULL); + + // grab another buffer's worth + while((!feof(f)) && + ((count=fread(inBuf, sizeof(UChar), BUFFERSIZE , f)) > 0) ) + { + inchars += count; + + // 
Convert UChars to bytes + source = inBuf; + sourceLimit = inBuf + count; + + do + { + target = buf; + targetLimit = buf + bufSize; + + ucnv_fromUnicode( conv, &target, targetLimit, + &source, sourceLimit, NULL, + feof(f)?TRUE:FALSE, /* pass 'flush' when eof */ + /* is true (when no more data will come) */ + &status); + + if(status == U_BUFFER_OVERFLOW_ERROR) + { + // simply ran out of space - we'll reset the target ptr the next + // time through the loop. + status = U_ZERO_ERROR; + } + else + { + // Check other errors here. + assert(U_SUCCESS(status)); + // Break out of the loop (by force) + } + + // Process the Unicode + assert(fwrite(buf, sizeof(buf[0]), (target-buf), out) == + (size_t)(target-buf)); + total += (target-buf); + } while (source < sourceLimit); // while simply out of space + } + + printf("%d Uchars (%d bytes) in, %d chars out.\n", inchars, inchars * sizeof(UChar), total); + + // ***************************** END SAMPLE ******************** + ucnv_close(conv); + + fclose(f); + fclose(out); + printf("\n"); + + return U_ZERO_ERROR; +} +#undef BUFFERSIZE + +#define BUFFERSIZE 219 + + +/* main */ + +int main() +{ + + printf("Default Converter=%s\n", ucnv_getDefaultName() ); + + convsample_02(); // C , u->koi8r, conv + convsample_03(); // C, iterate + + convsample_05(); // C, utf8->u, getNextUChar + convsample_06(); // C freq counter thingy + + convsample_12(); // C, sjis->u, conv + convsample_13(); // C, big5->u, getNextU + + convsample_20(); // C, callback + convsample_21(); // C, callback debug + + convsample_40(); // C, cp37 -> UTF16 [data02.bin -> data40.utf16] + + convsample_46(); // C, UTF16 -> latin2 [data40.utf16 -> data46.out] + + printf("End of converter samples.\n"); + + fflush(stdout); + fflush(stderr); + + return 0; +} Copied: trunk/opentrep/test/i18n/icu/icufmt.cpp (from rev 173, trunk/opentrep/test/i18n/icufmt.cpp) =================================================================== --- trunk/opentrep/test/i18n/icu/icufmt.cpp (rev 0) +++ 
trunk/opentrep/test/i18n/icu/icufmt.cpp 2009-08-14 17:51:06 UTC (rev 176) @@ -0,0 +1,27 @@ +// STL +#include <iostream> +// ICU +#include <unicode/choicfmt.h> +#include <unicode/unistr.h> +#include <unicode/ustream.h> + +// //////////// M A I N ///////////// +int main (int argc, char *argv[]) { + double limits[] = {1,2,3,4,5,6,7}; + + UnicodeString weekDayNames[] = { + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"}; + + ChoiceFormat fmt (limits, weekDayNames, 7); + + UnicodeString str; + for (double x = 1.0; x != 8.0; x += 1.0) { + fmt.format(x, str); + std::cout << x << " -> " << str << std::endl; + } + + std::cout << std::endl; + + return 0; +} + Added: trunk/opentrep/test/i18n/icu/icuustring.cpp =================================================================== --- trunk/opentrep/test/i18n/icu/icuustring.cpp (rev 0) +++ trunk/opentrep/test/i18n/icu/icuustring.cpp 2009-08-14 17:51:06 UTC (rev 176) @@ -0,0 +1,116 @@ +// STL +#include <cstdio> +#include <iostream> +// ICU +#include <unicode/utypes.h> +#include <unicode/uchar.h> +#include <unicode/locid.h> +#include <unicode/ustring.h> +#include <unicode/ucnv.h> +#include <unicode/unistr.h> + +#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) + +// helper functions -------------------------------------------------------- *** + +// default converter for the platform encoding +static UConverter* cnv = NULL; + +static void +printUnicodeString(const char *announce, const UnicodeString &s) { + static char out[200]; + int32_t i, length; + + // output the string, converted to the platform encoding + + // Note for Windows: The "platform encoding" defaults to the "ANSI codepage", + // which is different from the "OEM codepage" in the console window. + // However, if you pipe the output into a file and look at it with Notepad + // or similar, then "ANSI" characters will show correctly. 
+ // Production code should be aware of what encoding is required, + // and use a UConverter or at least a charset name explicitly. + out[s.extract(0, 99, out)]=0; + printf("%s%s {", announce, out); + + // output the code units (not code points) + length=s.length(); + for(i=0; i<length; ++i) { + printf(" %04x", s.charAt(i)); + } + printf(" }\n"); +} + +static void demoCaseMapInCPlusPlus() { + /* + * input= + * "<Cyrillic Capital Letter BE>" + * "<Cyrillic Capital Letter GHE>" + */ + static const UChar input[]={ + 0x411, 0x413, 0 + }; + + std::cout << std::endl << "* demoCaseMapInCPlusPlus() --------- ***" + << std::endl << std::endl; + + UnicodeString s(input), t; + const Locale& en = Locale::getEnglish(); + Locale ru ("ru"); + + /* + * Full case mappings as in demoCaseMapInC(), using UnicodeString functions. + * These functions modify the string object itself. + * Since we want to keep the input string around, we copy it each time + * and case-map the copy. + */ + printUnicodeString("input string: ", s); + + /* lowercase/English */ + printUnicodeString("full-lowercased/en: ", (t=s).toLower(en)); + /* lowercase/Russian */ + printUnicodeString("full-lowercased/ru: ", (t=s).toLower(ru)); + /* uppercase/English */ + printUnicodeString("full-uppercased/en: ", (t=s).toUpper(en)); + /* uppercase/Russian */ + printUnicodeString("full-uppercased/ru: ", (t=s).toUpper(ru)); + /* titlecase/English */ + printUnicodeString("full-titlecased/en: ", (t=s).toTitle(NULL, en)); + /* titlecase/Russian */ + printUnicodeString("full-titlecased/ru: ", (t=s).toTitle(NULL, ru)); + /* case-folded/default */ + printUnicodeString("full-case-folded/default: ", (t=s).foldCase(U_FOLD_CASE_DEFAULT)); + /* case-folded/Russian */ + printUnicodeString("full-case-folded/Russian: ", (t=s).foldCase(U_FOLD_CASE_EXCLUDE_SPECIAL_I)); +} + +extern int +main(int argc, const char *argv[]) { + UErrorCode errorCode=U_ZERO_ERROR; + + // Note: Using a global variable for any object is not exactly + // 
thread-safe... + // You can change this call to e.g. ucnv_open("UTF-8", &errorCode) + // if you pipe the output to a file and look at it with a + // Unicode-capable editor. This will currently affect only the + // printUString() function, see the code above. + // printUnicodeString() could use this, too, by changing to an + // extract() overload that takes a UConverter argument. + // cnv = ucnv_open(NULL, &errorCode); + cnv = ucnv_open("UTF-8", &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "error %s opening the default converter\n", u_errorName(errorCode)); + return errorCode; + } + + ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_C, NULL, NULL, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "error %s setting the escape callback in the default converter\n", u_errorName(errorCode)); + ucnv_close(cnv); + return errorCode; + } + + demoCaseMapInCPlusPlus(); + + ucnv_close(cnv); + return 0; +} Added: trunk/opentrep/test/i18n/icu/icuustringref.cpp =================================================================== --- trunk/opentrep/test/i18n/icu/icuustringref.cpp (rev 0) +++ trunk/opentrep/test/i18n/icu/icuustringref.cpp 2009-08-14 17:51:06 UTC (rev 176) @@ -0,0 +1,609 @@ +/* +******************************************************************************* +* +* Copyright (C) 2000-2002, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: ustring.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2000aug15 +* created by: Markus W. Scherer +* +* This file contains sample code that illustrates the use of Unicode strings +* with ICU. 
+*/ + +#include <stdio.h> +#include "unicode/utypes.h" +#include "unicode/uchar.h" +#include "unicode/locid.h" +#include "unicode/ustring.h" +#include "unicode/ucnv.h" +#include "unicode/unistr.h" + +#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) + +// helper functions -------------------------------------------------------- *** + +// default converter for the platform encoding +static UConverter *cnv=NULL; + +static void +printUString(const char *announce, const UChar *s, int32_t length) { + static char out[200]; + UChar32 c; + int32_t i; + UErrorCode errorCode=U_ZERO_ERROR; + + /* + * Convert to the "platform encoding". See notes in printUnicodeString(). + * ucnv_fromUChars(), like most ICU APIs understands length==-1 + * to mean that the string is NUL-terminated. + */ + ucnv_fromUChars(cnv, out, sizeof(out), s, length, &errorCode); + if(U_FAILURE(errorCode) || errorCode==U_STRING_NOT_TERMINATED_WARNING) { + printf("%sproblem converting string from Unicode: %s\n", announce, u_errorName(errorCode)); + return; + } + + printf("%s%s {", announce, out); + + /* output the code points (not code units) */ + if(length>=0) { + /* s is not NUL-terminated */ + for(i=0; i<length; /* U16_NEXT post-increments */) { + U16_NEXT(s, i, length, c); + printf(" %04x", c); + } + } else { + /* s is NUL-terminated */ + for(i=0; /* condition in loop body */; /* U16_NEXT post-increments */) { + U16_NEXT(s, i, length, c); + if(c==0) { + break; + } + printf(" %04x", c); + } + } + printf(" }\n"); +} + +static void +printUnicodeString(const char *announce, const UnicodeString &s) { + static char out[200]; + int32_t i, length; + + // output the string, converted to the platform encoding + + // Note for Windows: The "platform encoding" defaults to the "ANSI codepage", + // which is different from the "OEM codepage" in the console window. + // However, if you pipe the output into a file and look at it with Notepad + // or similar, then "ANSI" characters will show correctly. 
+ // Production code should be aware of what encoding is required, + // and use a UConverter or at least a charset name explicitly. + out[s.extract(0, 99, out)]=0; + printf("%s%s {", announce, out); + + // output the code units (not code points) + length=s.length(); + for(i=0; i<length; ++i) { + printf(" %04x", s.charAt(i)); + } + printf(" }\n"); +} + +// sample code for utf.h macros -------------------------------------------- *** + +static void +demo_utf_h_macros() { + static UChar input[]={ 0x0061, 0xd800, 0xdc00, 0xdbff, 0xdfff, 0x0062 }; + UChar32 c; + int32_t i; + UBool isError; + + printf("\n* demo_utf_h_macros() -------------- ***\n\n"); + + printUString("iterate forward through: ", input, LENGTHOF(input)); + for(i=0; i<LENGTHOF(input); /* U16_NEXT post-increments */) { + /* Iterating forwards + Codepoint at offset 0: U+0061 + Codepoint at offset 1: U+10000 + Codepoint at offset 3: U+10ffff + Codepoint at offset 5: U+0062 + */ + printf("Codepoint at offset %d: U+", i); + U16_NEXT(input, i, LENGTHOF(input), c); + printf("%04x\n", c); + } + + puts(""); + + isError=FALSE; + i=1; /* write position, gets post-incremented so needs to be in an l-value */ + U16_APPEND(input, i, LENGTHOF(input), 0x0062, isError); + + printUString("iterate backward through: ", input, LENGTHOF(input)); + for(i=LENGTHOF(input); i>0; /* U16_PREV pre-decrements */) { + U16_PREV(input, 0, i, c); + /* Iterating backwards + Codepoint at offset 5: U+0062 + Codepoint at offset 3: U+10ffff + Codepoint at offset 2: U+dc00 -- unpaired surrogate because lead surr. overwritten + Codepoint at offset 1: U+0062 -- by this BMP code point + Codepoint at offset 0: U+0061 + */ + printf("Codepoint at offset %d: U+%04x\n", i, c); + } +} + +// sample code for Unicode strings in C ------------... [truncated message content] |
From: <den...@us...> - 2009-08-14 15:19:03
|
Revision: 175 http://opentrep.svn.sourceforge.net/opentrep/?rev=175&view=rev Author: denis_arnaud Date: 2009-08-14 15:18:55 +0000 (Fri, 14 Aug 2009) Log Message: ----------- [i18n] Just added a readme file on transliteration to the test/i18n/ref sub-directory. Added Paths: ----------- trunk/opentrep/test/i18n/ref/README Added: trunk/opentrep/test/i18n/ref/README =================================================================== --- trunk/opentrep/test/i18n/ref/README (rev 0) +++ trunk/opentrep/test/i18n/ref/README 2009-08-14 15:18:55 UTC (rev 175) @@ -0,0 +1,5 @@ + +Romanisation of Cyrillic letters: +For Ukrainian: http://en.wikipedia.org/wiki/Romanization_of_Ukrainian +For Russian: http://en.wikipedia.org/wiki/Romanization_of_Russian + This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-14 13:56:46
|
Revision: 174 http://opentrep.svn.sourceforge.net/opentrep/?rev=174&view=rev Author: denis_arnaud Date: 2009-08-14 13:56:37 +0000 (Fri, 14 Aug 2009) Log Message: ----------- [Test] Added a proof-of-concept code sample for STL iterator on BOM classes. Modified Paths: -------------- trunk/opentrep/configure.ac Added Paths: ----------- trunk/opentrep/test/iterator/ trunk/opentrep/test/iterator/Makefile.am trunk/opentrep/test/iterator/pocIterator.cpp Modified: trunk/opentrep/configure.ac =================================================================== --- trunk/opentrep/configure.ac 2009-08-10 16:23:33 UTC (rev 173) +++ trunk/opentrep/configure.ac 2009-08-14 13:56:37 UTC (rev 174) @@ -265,6 +265,7 @@ test/parsers/Makefile test/i18n/Makefile test/python/Makefile + test/iterator/Makefile test/Makefile win32/Makefile) AC_OUTPUT Property changes on: trunk/opentrep/test/iterator ___________________________________________________________________ Added: svn:ignore + .deps .libs Makefile.in Makefile pocIterator Added: trunk/opentrep/test/iterator/Makefile.am =================================================================== --- trunk/opentrep/test/iterator/Makefile.am (rev 0) +++ trunk/opentrep/test/iterator/Makefile.am 2009-08-14 13:56:37 UTC (rev 174) @@ -0,0 +1,12 @@ +## command sub-directory +include $(top_srcdir)/Makefile.common + +MAINTAINERCLEANFILES = Makefile.in + +check_PROGRAMS = pocIterator + +pocIterator_SOURCES = pocIterator.cpp +pocIterator_CXXFLAGS = +pocIterator_LDFLAGS = + +EXTRA_DIST = Added: trunk/opentrep/test/iterator/pocIterator.cpp =================================================================== --- trunk/opentrep/test/iterator/pocIterator.cpp (rev 0) +++ trunk/opentrep/test/iterator/pocIterator.cpp 2009-08-14 13:56:37 UTC (rev 174) @@ -0,0 +1,144 @@ +// ////////////////////////////////////////////////////////////////////////// +// Proof-of-concept for STL iterators on Business Object Model (BOM) objects +// 
////////////////////////////////////////////////////////////////////////// +// STL +#include <cassert> +#include <iostream> +#include <sstream> +#include <iterator> +#include <vector> + +/** Base class. */ +class BaseClass { +public: + /** Constructor. */ + BaseClass (const std::string& iName) : _name (iName) {} + /** Destructor. */ + ~BaseClass () {} + /** Get the serialised version of the Object. */ + virtual void toStream (std::ostream& ioOut) const = 0; +protected: + /** Name. */ + std::string _name; +}; + +/** Standard display function. */ +template <class charT, class traits> +inline +std::basic_ostream<charT, traits>& +operator<< (std::basic_ostream<charT, traits>& ioOut, + const BaseClass& iBaseClass) { + std::basic_ostringstream<charT,traits> ostr; + ostr.copyfmt (ioOut); + ostr.width (0); + // Fill string stream + iBaseClass.toStream (ostr); + // Print string stream + ioOut << ostr.str(); + return ioOut; +} + +/** Child class. */ +class Child : public BaseClass { +public: + /** Constructor. */ + Child (const std::string& iName) : BaseClass (iName) {} + /** Destructor. */ + ~Child () {} + /** Get the serialised version of the Object. */ + void toStream (std::ostream& ioOut) const { ioOut << "Child: " << _name; } +}; + +/** List of pointers on children objects. */ +typedef std::vector<Child*> ChildList_T; + +/** Parent class. */ +class Parent : public BaseClass { +public: + /** STL iterators on the list of (pointers on) children objects. */ + typedef ChildList_T::const_iterator const_iterator; + typedef ChildList_T::iterator iterator; + typedef ChildList_T::reverse_iterator reverse_iterator; + typedef ChildList_T::const_reverse_iterator const_reverse_iterator; + + /** Constructor. */ + Parent (const std::string& iName) : BaseClass (iName) {} + /** Destructor. */ + ~Parent () {} + + /** Get the serialised version of the Object. */ + void toStream (std::ostream& ioOut) const { ioOut << "Parent: " << _name; } + + /** Add a child in the dedicated list. 
*/ + void push_back (Child& ioChild) { _childList.push_back (&ioChild); } + + /** Return the iterator instantiated on the first element of the + list of children objects. */ + const_iterator begin() const { return _childList.begin(); } + + /** Return the iterator instantiated beyond the last element of the + list of children objects. */ + const_iterator end() const { return _childList.end(); } + + /** Return the iterator instantiated on the last element of the + list of children objects. */ + const_reverse_iterator rbegin() const { return _childList.rbegin(); } + + /** Return the iterator instantiated beyond the first element of the + list of children objects. */ + const_reverse_iterator rend() const { return _childList.rend(); } + +private: + /** List of pointers on children objects. */ + ChildList_T _childList; +}; + +// ///////////// M A I N ///////////// +int main (int argc, char* argv[]) { + + // Initialisation + Parent* lParent_ptr = new Parent ("parent"); + + Child* lChild1_ptr = new Child ("child1"); + lParent_ptr->push_back (*lChild1_ptr); + + Child* lChild2_ptr = new Child ("child2"); + lParent_ptr->push_back (*lChild2_ptr); + + // ///////////// Usage (as a proof of concept) ///////////// + // + // Ascending order + std::cout << *lParent_ptr << " in the ascending order:" << std::endl; + unsigned short idx = 1; + for (Parent::const_iterator itChild = lParent_ptr->begin(); + itChild != lParent_ptr->end(); ++itChild, ++idx) { + if (idx != 1) { + std::cout << "; "; + } + + const Child* lChild_ptr = *itChild; + assert (lChild_ptr != NULL); + + std::cout << *lChild_ptr; + } + std::cout << std::endl; + + // + // Descending order + std::cout << *lParent_ptr << " in the descending order:" << std::endl; + idx = 1; + for (Parent::const_reverse_iterator itChild = lParent_ptr->rbegin(); + itChild != lParent_ptr->rend(); ++itChild, ++idx) { + if (idx != 1) { + std::cout << "; "; + } + + const Child* lChild_ptr = *itChild; + assert (lChild_ptr != NULL); + + std::cout << 
*lChild_ptr; + } + std::cout << std::endl; + + return 0; +} This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-10 16:23:43
|
Revision: 173 http://opentrep.svn.sourceforge.net/opentrep/?rev=173&view=rev Author: denis_arnaud Date: 2009-08-10 16:23:33 +0000 (Mon, 10 Aug 2009) Log Message: ----------- [Dev] The edit distance figures are now reported within the interface (Location structure). That work is finished, but not fully tested yet. Modified Paths: -------------- trunk/opentrep/opentrep/Location.hpp trunk/opentrep/opentrep/bom/Place.cpp trunk/opentrep/opentrep/bom/Place.hpp trunk/opentrep/opentrep/bom/ResultHolder.cpp trunk/opentrep/opentrep/bom/StringMatcher.cpp trunk/opentrep/opentrep/bom/StringMatcher.hpp trunk/opentrep/opentrep/bom/sources.mk trunk/opentrep/opentrep/command/RequestInterpreter.cpp trunk/opentrep/opentrep/python/pyopentrep.cpp trunk/opentrep/opentrep/python/pyopentrep.py Added Paths: ----------- trunk/opentrep/opentrep/bom/Levenshtein.cpp trunk/opentrep/opentrep/bom/Levenshtein.hpp Modified: trunk/opentrep/opentrep/Location.hpp =================================================================== --- trunk/opentrep/opentrep/Location.hpp 2009-08-10 12:17:27 UTC (rev 172) +++ trunk/opentrep/opentrep/Location.hpp 2009-08-10 16:23:33 UTC (rev 173) @@ -84,6 +84,12 @@ return _editDistance; } + /** Get the maximal allowable edit distance/error, with which the + matching has been made. */ + const NbOfErrors_T& getAllowableEditDistance () const { + return _allowableEditDistance; + } + /** Get the list of extra matching (similar) locations. */ const LocationList_T& getExtraLocationList() const { return _extraLocationList; @@ -156,6 +162,12 @@ _editDistance = iEditDistance; } + /** Set the maximal allowable edit distance/error, with which the + matching has been made. */ + void setAllowableEditDistance (const NbOfErrors_T& iAllowableEditDistance) { + _allowableEditDistance = iAllowableEditDistance; + } + /** Add an extra matching location. 
*/ void addExtraLocation (const Location& iExtraLocation) { _extraLocationList.push_back (iExtraLocation); @@ -186,8 +198,8 @@ oStr << _locationCode << ", " << _cityCode << ", " << _stateCode << ", " << _countryCode << ", " << _regionCode << ", " << _continentCode << ", " << _timeZoneGroup - << ", " << _longitude << ", " << _latitude - << ", " << _percentage << ", " << _editDistance; + << ", " << _longitude << ", " << _latitude << ", " << _percentage + << ", " << _editDistance << ", " << _allowableEditDistance; if (_extraLocationList.empty() == false) { oStr << " " << _extraLocationList.size() << " extra match(es)"; @@ -251,13 +263,15 @@ const double iLongitude, const double iLatitude, const LocationNameList_T& iNameList, const MatchingPercentage_T& iPercentage, - const NbOfErrors_T& iEditDistance) + const NbOfErrors_T& iEditDistance, + const NbOfErrors_T& iAllowableEditDistance) : _locationCode (iPlaceCode), _cityCode (iCityCode), _stateCode (iStateCode), _countryCode (iCountryCode), _regionCode (iRegionCode), _continentCode (iContinentCode), _timeZoneGroup (iTimeZoneGroup), _longitude (iLongitude), _latitude (iLatitude), _nameList (iNameList), - _percentage (iPercentage), _editDistance (iEditDistance) { + _percentage (iPercentage), _editDistance (iEditDistance), + _allowableEditDistance (iAllowableEditDistance) { } /** Default Constructor. */ @@ -298,6 +312,10 @@ /** Allowed edit error/distance. */ NbOfErrors_T _editDistance; + /** Maximum allowable edit distance/error, with which the matching + has been made. */ + NbOfErrors_T _allowableEditDistance; + /** List of extra matching (similar) locations. 
*/ LocationList_T _extraLocationList; Added: trunk/opentrep/opentrep/bom/Levenshtein.cpp =================================================================== --- trunk/opentrep/opentrep/bom/Levenshtein.cpp (rev 0) +++ trunk/opentrep/opentrep/bom/Levenshtein.cpp 2009-08-10 16:23:33 UTC (rev 173) @@ -0,0 +1,111 @@ +// ////////////////////////////////////////////////////////////////////// +// Import section +// ////////////////////////////////////////////////////////////////////// +// STL +#include <string> +#include <vector> +// OpenTREP +#include <opentrep/bom/Levenshtein.hpp> + +namespace OPENTREP { + + // ////////////////////////////////////////////////////////////////// + int Levenshtein::getDistance (const std::string& iSource, + const std::string& iTarget) { + + // Step 1 + + const int n = iSource.length(); + const int m = iTarget.length(); + + if (n == 0) { + return m; + } + + if (m == 0) { + return n; + } + + // Definition of Matrix Type + typedef std::vector<std::vector<int> > Matrix_T; + + Matrix_T matrix (n+1); + + // Size the vectors in the 2.nd dimension. Unfortunately C++ doesn't + // allow for allocation on declaration of 2.nd dimension of vec of vec + + for (int i = 0; i <= n; i++) { + matrix[i].resize(m+1); + } + + // Step 2 + + for (int i = 0; i <= n; i++) { + matrix[i][0]=i; + } + + for (int j = 0; j <= m; j++) { + matrix[0][j]=j; + } + + // Step 3 + + for (int i = 1; i <= n; i++) { + + const char s_i = iSource[i-1]; + + // Step 4 + + for (int j = 1; j <= m; j++) { + + const char t_j = iTarget[j-1]; + + // Step 5 + + int cost; + if (s_i == t_j) { + cost = 0; + + } else { + cost = 1; + } + + // Step 6 + + const int above = matrix[i-1][j]; + const int left = matrix[i][j-1]; + const int diag = matrix[i-1][j-1]; + int cell = std::min ( above + 1, std::min (left + 1, diag + cost)); + + // Step 6A: Cover transposition, in addition to deletion, + // insertion and substitution. 
This step is taken from: + // Berghel, Hal ; Roach, David : "An Extension of Ukkonen's + // Enhanced Dynamic Programming ASM Algorithm" + // (http://www.acm.org/~hlb/publications/asm/asm.html) + + if (i>2 && j>2) { + int trans = matrix[i-2][j-2] + 1; + + if (iSource[i-2] != t_j) { + trans++; + } + + if (s_i != iTarget[j-2]) { + trans++; + } + + if (cell > trans) { + cell = trans; + } + } + + matrix[i][j] = cell; + } + } + + // Step 7 + + return matrix[n][m]; + } + +} Added: trunk/opentrep/opentrep/bom/Levenshtein.hpp =================================================================== --- trunk/opentrep/opentrep/bom/Levenshtein.hpp (rev 0) +++ trunk/opentrep/opentrep/bom/Levenshtein.hpp 2009-08-10 16:23:33 UTC (rev 173) @@ -0,0 +1,28 @@ +// +// Levenshtein Distance Algorithm: C++ Implementation by Anders Sewerin Johansen +// +#ifndef __OPENTREP_BOM_LEVENSHTEIN_HPP +#define __OPENTREP_BOM_LEVENSHTEIN_HPP + +// ////////////////////////////////////////////////////////////////////// +// Import section +// ////////////////////////////////////////////////////////////////////// +// STL +#include <string> +// OpenTREP +#include <opentrep/OPENTREP_Types.hpp> +#include <opentrep/bom/BomAbstract.hpp> + +namespace OPENTREP { + + /** Class aggregating utilities around the Levenshtein edit + distance/error. */ + class Levenshtein : public BomAbstract { + public: + /** Calculate the edit distance between two strings. 
*/ + static int getDistance (const std::string& iSource, + const std::string& iTarget); + }; + +} +#endif // __OPENTREP_BOM_LEVENSHTEIN_HPP Modified: trunk/opentrep/opentrep/bom/Place.cpp =================================================================== --- trunk/opentrep/opentrep/bom/Place.cpp 2009-08-10 12:17:27 UTC (rev 172) +++ trunk/opentrep/opentrep/bom/Place.cpp 2009-08-10 16:23:33 UTC (rev 173) @@ -23,7 +23,8 @@ _timeZoneGroup (iPlace._timeZoneGroup), _longitude (iPlace._longitude), _latitude (iPlace._latitude), _nameMatrix (iPlace._nameMatrix), _docID (iPlace._docID), _percentage (iPlace._percentage), - _editDistance (iPlace._editDistance) { + _editDistance (iPlace._editDistance), + _allowableEditDistance (iPlace._allowableEditDistance) { } // ////////////////////////////////////////////////////////////////////// @@ -80,7 +81,8 @@ << ", " << _continentCode << ", " << _timeZoneGroup << ", " << _longitude << ", " << _latitude << ", " << _docID << ", " << _percentage - << ", " << _editDistance << ". "; + << ", " << _editDistance << ", " << _allowableEditDistance + << ". 
"; for (NameMatrix_T::const_iterator itNameList = _nameMatrix.begin(); itNameList != _nameMatrix.end(); ++itNameList) { @@ -136,7 +138,7 @@ << ", " << _continentCode << ", " << _timeZoneGroup << ", " << _longitude << ", " << _latitude << ", " << _docID << ", " << _percentage - << ", " << _editDistance; + << ", " << _editDistance << ", " << _allowableEditDistance; NameMatrix_T::const_iterator itNameHolder = _nameMatrix.begin(); if (itNameHolder != _nameMatrix.end()) { @@ -189,6 +191,7 @@ << ", docID = " << _docID << ", percentage = " << _percentage << "%" << ", edit distance = " << _editDistance + << ", allowable edit distance = " << _allowableEditDistance << std::endl; return oStr.str(); } @@ -262,7 +265,7 @@ Location oLocation (_placeCode, lCityCode, _stateCode, _countryCode, _regionCode, _continentCode, _timeZoneGroup, _longitude, _latitude, lNameList, - _percentage, _editDistance); + _percentage, _editDistance, _allowableEditDistance); // Add extra matching locations, whenever they exist if (_extraPlaceList.empty() == false) { Modified: trunk/opentrep/opentrep/bom/Place.hpp =================================================================== --- trunk/opentrep/opentrep/bom/Place.hpp 2009-08-10 12:17:27 UTC (rev 172) +++ trunk/opentrep/opentrep/bom/Place.hpp 2009-08-10 16:23:33 UTC (rev 173) @@ -91,6 +91,12 @@ return _editDistance; } + /** Get the maximal allowable edit distance/error, with which the + matching has been made. */ + const NbOfErrors_T& getAllowableEditDistance () const { + return _allowableEditDistance; + } + /** Get the map of name lists. */ const NameMatrix_T& getNameMatrix () const { return _nameMatrix; @@ -176,6 +182,12 @@ _editDistance = iEditDistance; } + /** Set the maximal allowable edit distance/error, with which the + matching has been made. 
*/ + void setAllowableEditDistance (const NbOfErrors_T& iAllowableEditDistance) { + _allowableEditDistance = iAllowableEditDistance; + } + public: // ////////// Setters in underlying names //////// @@ -277,6 +289,10 @@ /** Allowed edit error/distance. */ NbOfErrors_T _editDistance; + /** Maximum allowable edit distance/error, with which the matching + has been made. */ + NbOfErrors_T _allowableEditDistance; + /** List of extra matching (similar) places. */ PlaceOrderedList_T _extraPlaceList; Modified: trunk/opentrep/opentrep/bom/ResultHolder.cpp =================================================================== --- trunk/opentrep/opentrep/bom/ResultHolder.cpp 2009-08-10 12:17:27 UTC (rev 172) +++ trunk/opentrep/opentrep/bom/ResultHolder.cpp 2009-08-10 16:23:33 UTC (rev 173) @@ -69,95 +69,6 @@ } // ////////////////////////////////////////////////////////////////////// - std::string ResultHolder::searchString (Xapian::MSet& ioMatchingSet, - TravelQuery_T& ioPartialQueryString, - Document& ioMatchingDocument) { - std::string oMatchedString; - - // Catch any Xapian::Error exceptions thrown - try { - - /** - The query string must first be checked, without allowing any - spelling errors, but by removing the furthest right word at - every step. - <br>If no match is found, the maximal allowable edit - distance/error becomes 1, and the process (trying to match - the whole sentence, then by removing the furthest right word, - etc.) is re-performed. - <br>If no match is found, the maximal allowable edit - distance/error becomes 2. - <br>And so on until the maximum of the edit distance/error - becomes greater than the maximal allowable distance/error. - reached. - - <br>NOTE: that feature is de-activated, as it seems it does - not bring any added value. To re-activate it, just initialise - the lMaxEditDistance to 0, instead of to the positive infinite. 
- */ - // NbOfErrors_T lMaxEditDistance = 0; - NbOfErrors_T lMaxEditDistance = std::numeric_limits<NbOfErrors_T>::max(); - - bool hasReachedMaximalAllowableEditDistance = false; - bool shouldStop = false; - while (shouldStop == false) { - - // DEBUG - OPENTREP_LOG_DEBUG ("--------"); - OPENTREP_LOG_DEBUG ("Current query string: `" - << ioPartialQueryString - << "', with a maximal edit distance of " - << lMaxEditDistance << "."); - - // Retrieve the list of Xapian documents matching the query string - NbOfErrors_T lCalculatedEditDistance = 0; - oMatchedString = - StringMatcher::searchString(ioMatchingSet, ioPartialQueryString, - lCalculatedEditDistance, lMaxEditDistance, - hasReachedMaximalAllowableEditDistance, - _database); - - // DEBUG - OPENTREP_LOG_DEBUG ("---- Current query string: `" - << ioPartialQueryString << "' --- Kept query: `" - << oMatchedString - << "', with an edit distance of a maximum of " - << lCalculatedEditDistance << " (over " - << lMaxEditDistance << "), for " - << ioMatchingSet.size() << " matches."); - - if (ioMatchingSet.empty() == false) { - // Store the calculated (and applied) edit distance/erro - ioMatchingDocument.setEditDistance (lCalculatedEditDistance); - - // Since a result has been found, the search can be stopped - // for that part of the query. - shouldStop = true; - break; - } - - // Allow for one more spelling error - ++lMaxEditDistance; - - /** - Stop when it is no longer necessary to increase the maximal - allowable edit distance, as it is already greater than the - maximum of the calculated edit distance. 
- */ - if (hasReachedMaximalAllowableEditDistance == true) { - shouldStop = true; - } - } - - } catch (const Xapian::Error& error) { - OPENTREP_LOG_ERROR ("Exception: " << error.get_msg()); - throw XapianException(); - } - - return oMatchedString; - } - - // ////////////////////////////////////////////////////////////////////// std::string ResultHolder::searchString (TravelQuery_T& ioPartialQueryString, Document& ioMatchingDocument) { std::string oMatchedString; @@ -186,8 +97,12 @@ << lPartialQueryString << "'"); Xapian::MSet lMatchingSet; - oMatchedString = searchString (lMatchingSet, lPartialQueryString, - ioMatchingDocument); + NbOfErrors_T lEditDistance; + NbOfErrors_T lAllowableEditDistance; + oMatchedString = + StringMatcher::searchString (lMatchingSet, lPartialQueryString, + ioMatchingDocument, lEditDistance, + lAllowableEditDistance, _database); if (oMatchedString.empty() == false) { // Create the corresponding document (from the Xapian MSet object) @@ -195,6 +110,11 @@ extractBestMatchingDocumentFromMSet (lMatchingSet, ioMatchingDocument); + // Note: the allowable edit distance/error, as well as the + // effective (Levenshtein) edit distance/error, have been + // set, in the Document object, by the above call to the + // searchString() method. + // Since a result has been found, the search can be stopped // for that part of the query. 
ioPartialQueryString = lPartialQueryString; @@ -277,12 +197,18 @@ ioDocumentList.push_back (lMatchingDocument); // DEBUG - const NbOfMatches_T lNbOfMatches = + const NbOfMatches_T& lNbOfMatches = lMatchingDocument.notifyIfExtraMatch(); + const NbOfErrors_T& lEditDistance = + lMatchingDocument.getEditDistance(); + const NbOfErrors_T& lAllowableEditDistance = + lMatchingDocument.getAllowableEditDistance(); OPENTREP_LOG_DEBUG ("==> " << lNbOfMatches << " main matches for the query string: `" - << lMatchedString << "' (from `" - << lQueryString << "')"); + << lMatchedString << "' (from `" << lQueryString + << "' -> Levenshtein edit distance of " + << lEditDistance << " over allowable " + << lAllowableEditDistance << ")"); /** Remove, from the lRemainingQueryString string, the part Modified: trunk/opentrep/opentrep/bom/StringMatcher.cpp =================================================================== --- trunk/opentrep/opentrep/bom/StringMatcher.cpp 2009-08-10 12:17:27 UTC (rev 172) +++ trunk/opentrep/opentrep/bom/StringMatcher.cpp 2009-08-10 16:23:33 UTC (rev 173) @@ -18,23 +18,20 @@ // OpenTREP #include <opentrep/bom/WordHolder.hpp> #include <opentrep/bom/StringMatcher.hpp> +#include <opentrep/bom/Levenshtein.hpp> #include <opentrep/service/Logger.hpp> namespace OPENTREP { - // ///////////// Type definitions ////////// - /** Edit distance (e.g., 2 or 3). */ - typedef unsigned int EditDistance_T; - /** Given the size of the phrase, determine the allowed edit distance for spelling purpose. For instance, an edit distance of 1 will be allowed on a 4-letter word, while an edit distance of 3 will be allowed on an 11-letter word. 
*/ // ////////////////////////////////////////////////////////////////////// static unsigned int calculateEditDistance (const TravelQuery_T& iPhrase) { - EditDistance_T oEditDistance = 2; + NbOfErrors_T oEditDistance = 2; - const EditDistance_T lQueryStringSize = iPhrase.size(); + const NbOfErrors_T lQueryStringSize = iPhrase.size(); oEditDistance = lQueryStringSize / 4; return oEditDistance; @@ -46,11 +43,17 @@ // ////////////////////////////////////////////////////////////////////// static void createCorrectedWordList (const WordList_T& iOriginalWordList, WordList_T& ioCorrectedWordList, + NbOfErrors_T& ioEditDistance, + NbOfErrors_T& ioAllowableEditDistance, const Xapian::Database& iDatabase) { // Empty the target list ioCorrectedWordList.clear(); - + + // Re-set the edit distances/errors + ioEditDistance = 0; + ioAllowableEditDistance = 0; + // Catch any Xapian::Error exceptions thrown try { @@ -59,9 +62,13 @@ const std::string& lOriginalWord = *itWord; // Calculate the distance, depending on the length of the word - const EditDistance_T lCalculatedEditDistance = + const NbOfErrors_T lCalculatedEditDistance = calculateEditDistance (lOriginalWord); + // The allowable edit distance/error is considered to be the + // cumulated allowable edit distance/error over all the words + ioAllowableEditDistance += lCalculatedEditDistance; + // Get a spelling suggestion for that word const std::string& lSuggestedWord = iDatabase.get_spelling_suggestion (lOriginalWord, @@ -69,9 +76,16 @@ if (lSuggestedWord.empty() == true) { ioCorrectedWordList.push_back (lOriginalWord); - + // The edit distance is not modified (as the spelling was correct) + } else { ioCorrectedWordList.push_back (lSuggestedWord); + + // The edit distance/error increases from the Levenshtein + // edit distance/error + const NbOfErrors_T& lLevenshteinDistance = + Levenshtein::getDistance (lOriginalWord, lSuggestedWord); + ioAllowableEditDistance += lLevenshteinDistance; } // DEBUG @@ -114,7 +128,6 @@ // 
/////////////////////////////////////////////////////////////////// void checkAndAlterIfNeeded (TravelQuery_T& ioSuggestedString, const TravelQuery_T& iOriginalString, - NbOfErrors_T& ioCalculatedEditDistance, const NbOfErrors_T& iMaxEditDistance, const Xapian::Database& iDatabase) { @@ -129,15 +142,13 @@ Get a spell-corrected suggestion for the reduced original string. <br>Limit the edit distance to the given maximal one. */ - ioCalculatedEditDistance = calculateEditDistance (lOriginalStringCopy); + NbOfErrors_T lEditDistance = calculateEditDistance (lOriginalStringCopy); - ioCalculatedEditDistance = std::min (ioCalculatedEditDistance, - iMaxEditDistance); + lEditDistance = std::min (lEditDistance, iMaxEditDistance); std::string lSuggestionForReducedOriginalString = - iDatabase.get_spelling_suggestion (lOriginalStringCopy, - ioCalculatedEditDistance); - + iDatabase.get_spelling_suggestion (lOriginalStringCopy, lEditDistance); + /** Note that if the suggestion on the reduced-original string is empty, it normally means that the reduced-original string is @@ -153,9 +164,9 @@ // DEBUG OPENTREP_LOG_DEBUG ("The suggestion (`" << ioSuggestedString << "') for `" << iOriginalString - << "', with an edit distance/error of " - << ioCalculatedEditDistance - << " over " << iMaxEditDistance << " allowable" + << "', with an allowable edit distance/error of " + << lEditDistance + << " over a maximum of " << iMaxEditDistance << ", is the same as the suggestion for the reduced " << "original string (`" << lOriginalStringCopy << "') -> discarded."); @@ -174,7 +185,7 @@ } /** - Store a copy of the suggested string, as it will me altered by + Store a copy of the suggested string, as it will be altered by the below method. */ lOriginalStringCopy = iOriginalString; @@ -184,14 +195,13 @@ Get a spell-corrected suggestion for the reduced original string. <br>Limit the edit distance to the given maximal one. 
*/ - ioCalculatedEditDistance = calculateEditDistance (lOriginalStringCopy); + lEditDistance = calculateEditDistance (lOriginalStringCopy); - ioCalculatedEditDistance = std::min (ioCalculatedEditDistance, - iMaxEditDistance); + lEditDistance = std::min (lEditDistance, iMaxEditDistance); lSuggestionForReducedOriginalString = iDatabase.get_spelling_suggestion (lOriginalStringCopy, - ioCalculatedEditDistance); + lEditDistance); /** Note that if the suggestion on the reduced-original string is @@ -208,9 +218,9 @@ // DEBUG OPENTREP_LOG_DEBUG ("The suggestion (`" << ioSuggestedString << "') for `" << iOriginalString - << "', with an edit distance/error of " - << ioCalculatedEditDistance - << " over " << iMaxEditDistance << " allowable" + << "', with an allowable edit distance/error of " + << lEditDistance + << " over a maximum of " << iMaxEditDistance << ", is the same as the suggestion for the reduced " << "original string (`" << lOriginalStringCopy << "') -> discarded."); @@ -225,15 +235,16 @@ } // /////////////////////////////////////////////////////////////////// - std::string StringMatcher:: - searchString (Xapian::MSet& ioMatchingSet, - const TravelQuery_T& iSearchString, - NbOfErrors_T& ioCalculatedEditDistance, - NbOfErrors_T& ioMaxEditDistance, - bool& ioHasReachedMaximalAllowableEditDistance, - const Xapian::Database& iDatabase) { - NbOfErrors_T lMaxEditDistance = std::numeric_limits<EditDistance_T>::min(); + std::string searchStringIter (Xapian::MSet& ioMatchingSet, + const TravelQuery_T& iSearchString, + NbOfErrors_T& ioEditDistance, + NbOfErrors_T& ioAllowableEditDistance, + const NbOfErrors_T& iMaxEditDistance, + bool& ioHasReachedMaximalAllowableEditDistance, + const Xapian::Database& iDatabase) { + NbOfErrors_T lMaxEditDistance = std::numeric_limits<NbOfErrors_T>::min(); + // Initialisation std::string oMatchedString; // Catch any Xapian::Error exceptions thrown @@ -289,6 +300,13 @@ << "'"); */ + // By default, as there can be a match without changing + 
// anything, the edit distance is null, and the allowable edit + // distance could be anything. It makes sense, though, to set it + // at the maximum. + ioEditDistance = 0; + ioAllowableEditDistance = iMaxEditDistance; + // Start an enquire session Xapian::Enquire enquire (iDatabase); @@ -330,10 +348,10 @@ of the calculated edit distance, it becomes useless to go on increasing the maximal allowable edit distance. */ - if (lMaxEditDistance <= ioMaxEditDistance) { + if (lMaxEditDistance <= iMaxEditDistance) { ioHasReachedMaximalAllowableEditDistance = true; } - + oMatchedString = lOriginalQueryString; return oMatchedString; } @@ -350,6 +368,7 @@ */ WordList_T lCorrectedWordList; createCorrectedWordList (lOriginalWordList, lCorrectedWordList, + ioEditDistance, ioAllowableEditDistance, iDatabase); const std::string lCorrectedQueryString = @@ -394,7 +413,7 @@ of the calculated edit distance, it becomes useless to go on increasing the maximal allowable edit distance. */ - if (lMaxEditDistance <= ioMaxEditDistance) { + if (lMaxEditDistance <= iMaxEditDistance) { ioHasReachedMaximalAllowableEditDistance = true; } @@ -408,27 +427,40 @@ phrase/string. With the above example, 'sna francisco' yields the suggestion 'san francisco'. 
*/ - ioCalculatedEditDistance = calculateEditDistance (lOriginalQueryString); + ioEditDistance = calculateEditDistance (lOriginalQueryString); // Store the greatest edit distance/error - lMaxEditDistance = std::max (lMaxEditDistance, ioCalculatedEditDistance); + lMaxEditDistance = std::max (lMaxEditDistance, ioEditDistance); // Limit the edit distance to the given maximal one - ioCalculatedEditDistance = std::min (ioCalculatedEditDistance, - ioMaxEditDistance); + ioEditDistance = std::min (ioEditDistance, iMaxEditDistance); + + // Store the allowable edit distance/error + ioAllowableEditDistance = ioEditDistance; + // Let Xapian find a spelling correction (if any) std::string lFullWordCorrectedString = iDatabase.get_spelling_suggestion (lOriginalQueryString, - ioCalculatedEditDistance); + ioEditDistance); /** Check that the suggestion does not encompass extra words, which - will be otherwise/rather recognised in another step. + will be otherwise recognised in another step. + <br>See the comment of the checkAndAlterIfNeeded() function + for more details. */ checkAndAlterIfNeeded (lFullWordCorrectedString, lOriginalQueryString, - ioCalculatedEditDistance, ioMaxEditDistance, - iDatabase); + iMaxEditDistance, iDatabase); + + if (lFullWordCorrectedString.empty() == true) { + ioEditDistance = 0; + } else { + // Store the effective (Levenshtein) edit distance/error + ioEditDistance = Levenshtein::getDistance (lOriginalQueryString, + lFullWordCorrectedString); + } + /** Since there is still no match, we search on the string corrected as a whole. 
@@ -452,8 +484,12 @@ /* OPENTREP_LOG_DEBUG ("Query corrected as a full sentence `" << lFullWordCorrectedString - << "' with an allowable maximal edit distance of " - << ioMaxEditDistance + << "' with a Levenshtein edit distance of " + << ioEditDistance + << " over an allowable edit distance of " + << ioAllowableEditDistance + << " over a maximum of " + << iMaxEditDistance << " on a potential of " << lMaxEditDistance << ", i.e., `"<< lFullQueryCorrected.get_description() << "' => " << nbMatches @@ -480,7 +516,7 @@ of the calculated edit distance, it becomes useless to go on increasing the maximal allowable edit distance. */ - if (ioMaxEditDistance >= lMaxEditDistance) { + if (iMaxEditDistance >= lMaxEditDistance) { ioHasReachedMaximalAllowableEditDistance = true; } @@ -488,6 +524,104 @@ } // ////////////////////////////////////////////////////////////////////// + std::string StringMatcher:: + searchString (Xapian::MSet& ioMatchingSet, + const TravelQuery_T& iPartialQueryString, + Document& ioMatchingDocument, + NbOfErrors_T& ioEditDistance, + NbOfErrors_T& ioAllowableEditDistance, + const Xapian::Database& iDatabase) { + std::string oMatchedString; + + // Catch any Xapian::Error exceptions thrown + try { + + /** + The query string must first be checked, without allowing any + spelling errors, but by removing the furthest right word at + every step. + <br>If no match is found, the maximal allowable edit + distance/error becomes 1, and the process (trying to match + the whole sentence, then by removing the furthest right word, + etc.) is re-performed. + <br>If no match is found, the maximal allowable edit + distance/error becomes 2. + <br>And so on until the maximum of the edit distance/error + becomes greater than the maximal allowable distance/error. + reached. + + <br>NOTE: that feature is de-activated, as it seems it does + not bring any added value. To re-activate it, just initialise + the lMaxEditDistance to 0, instead of to the positive infinite. 
+ */ + // NbOfErrors_T lMaxEditDistance = 0; + NbOfErrors_T lMaxEditDistance = std::numeric_limits<NbOfErrors_T>::max(); + + bool hasReachedMaximalAllowableEditDistance = false; + bool shouldStop = false; + while (shouldStop == false) { + + // DEBUG + OPENTREP_LOG_DEBUG ("--------"); + OPENTREP_LOG_DEBUG ("Current query string: `" + << iPartialQueryString + << "', with a maximal edit distance of " + << lMaxEditDistance << "."); + + // Retrieve the list of Xapian documents matching the query string + NbOfErrors_T lEditDistance; + NbOfErrors_T lAllowableEditDistance; + oMatchedString = + searchStringIter (ioMatchingSet, iPartialQueryString, lEditDistance, + lAllowableEditDistance, lMaxEditDistance, + hasReachedMaximalAllowableEditDistance, iDatabase); + + // DEBUG + OPENTREP_LOG_DEBUG ("---- Current query string: `" + << iPartialQueryString << "' --- Kept query: `" + << oMatchedString + << "', with a Levenshtein edit distance of " + << lEditDistance + << " over an allowable edit distance of " + << lAllowableEditDistance << " (over a maximum of " + << lMaxEditDistance << "), for " + << ioMatchingSet.size() << " matches."); + + if (ioMatchingSet.empty() == false) { + // Store the effective (Levenshtein) edit distance/error + ioMatchingDocument.setEditDistance (lEditDistance); + + // Store the allowable edit distance/error + ioMatchingDocument.setAllowableEditDistance (lAllowableEditDistance); + + // Since a result has been found, the search can be stopped + // for that part of the query. + shouldStop = true; + break; + } + + // Allow for one more spelling error + ++lMaxEditDistance; + + /** + Stop when it is no longer necessary to increase the maximal + allowable edit distance, as it is already greater than the + maximum of the calculated edit distance. 
+ */ + if (hasReachedMaximalAllowableEditDistance == true) { + shouldStop = true; + } + } + + } catch (const Xapian::Error& error) { + OPENTREP_LOG_ERROR ("Exception: " << error.get_msg()); + throw XapianException(); + } + + return oMatchedString; + } + + // ////////////////////////////////////////////////////////////////////// void StringMatcher:: extractBestMatchingDocumentFromMSet (const Xapian::MSet& iMatchingSet, Document& ioMatchingDocument) { Modified: trunk/opentrep/opentrep/bom/StringMatcher.hpp =================================================================== --- trunk/opentrep/opentrep/bom/StringMatcher.hpp 2009-08-10 12:17:27 UTC (rev 172) +++ trunk/opentrep/opentrep/bom/StringMatcher.hpp 2009-08-10 16:23:33 UTC (rev 173) @@ -29,9 +29,13 @@ /** Search, within the Xapian database, for occurrences of the words of the search string. @param Xapian::MSet& The Xapian matching set. It can be empty. - @param const std::string& The query string. - @param NbOfErrors_T& The calculated (and applied) edit distance/error. - @param NbOfErrors_T& The maximal allowable edit distance/error. + @param const TravelQuery_T& The query string. + @param NbOfErrors_T& The Levenshtein edit distance/error. + @param NbOfErrors_T& The effective allowable edit distance/error, + as calculated as a function of the number of letters. + @param NbOfErrors_T& The maximal allowable edit distance/error for + that step/call. We always have: + ioEditDistance <= ioAllowableEditDistance <= iMaxEditDistance @param bool& Whether or not the maximal allowable edit distance/error has become greater than the maximum of the edit distance/errors calculated on the phrase. @@ -39,12 +43,12 @@ @return std::string The query string, potentially corrected, which has yielded matches. 
*/ static std::string searchString (Xapian::MSet&, - const std::string& iSearchString, - NbOfErrors_T& ioCalculatedEditDistance, - NbOfErrors_T& ioMaxEditDistance, - bool& ioHasReachedMaximalAllowableEditDistance, + const TravelQuery_T& iQueryString, + Document& ioMatchingDocument, + NbOfErrors_T& ioEditDistance, + NbOfErrors_T& ioAllowableEditDistance, const Xapian::Database&); - + /** Extract the best matching Xapian document. <br>If there are several such best matching documents (for instance, several at, say, 100%), one is taken randomly. Well, Modified: trunk/opentrep/opentrep/bom/sources.mk =================================================================== --- trunk/opentrep/opentrep/bom/sources.mk 2009-08-10 12:17:27 UTC (rev 172) +++ trunk/opentrep/opentrep/bom/sources.mk 2009-08-10 16:23:33 UTC (rev 173) @@ -13,6 +13,7 @@ $(top_srcdir)/opentrep/bom/Result.hpp \ $(top_srcdir)/opentrep/bom/ResultList.hpp \ $(top_srcdir)/opentrep/bom/ResultHolder.hpp \ + $(top_srcdir)/opentrep/bom/Levenshtein.hpp \ $(top_srcdir)/opentrep/bom/StringMatcher.hpp bom_cc_sources = $(top_srcdir)/opentrep/bom/BomAbstract.cpp \ $(top_srcdir)/opentrep/bom/BomType.cpp \ @@ -25,4 +26,5 @@ $(top_srcdir)/opentrep/bom/Document.cpp \ $(top_srcdir)/opentrep/bom/Result.cpp \ $(top_srcdir)/opentrep/bom/ResultHolder.cpp \ + $(top_srcdir)/opentrep/bom/Levenshtein.cpp \ $(top_srcdir)/opentrep/bom/StringMatcher.cpp Modified: trunk/opentrep/opentrep/command/RequestInterpreter.cpp =================================================================== --- trunk/opentrep/opentrep/command/RequestInterpreter.cpp 2009-08-10 12:17:27 UTC (rev 172) +++ trunk/opentrep/opentrep/command/RequestInterpreter.cpp 2009-08-10 16:23:33 UTC (rev 173) @@ -143,6 +143,15 @@ bool hasRetrievedPlace = retrieveAndFillPlace (lDocument, ioSociSession, lPlace); + // Retrieve the effective (Levenshtein) edit distance/error, as + // well as the allowable edit distance/error, and store them in + // the Place object. 
+ const NbOfErrors_T& lEditDistance = lDocument.getEditDistance(); + const NbOfErrors_T& lAllowableEditDistance = + lDocument.getAllowableEditDistance(); + lPlace.setEditDistance (lEditDistance); + lPlace.setAllowableEditDistance (lAllowableEditDistance); + // If there was no place corresponding to the place code with // the SQL database, an exception is thrown. Hence, here, by // construction, the place has been retrieved from the SQL @@ -181,6 +190,12 @@ // Same remark as above assert (hasRetrievedPlace == true); + // The extra matching Place object has the very same effective + // (Levenshtein) and allowable edit distances/errors as the + // main Place object. + lExtraPlace.setEditDistance (lEditDistance); + lExtraPlace.setAllowableEditDistance (lAllowableEditDistance); + // Insert the extra matching Place object within the dedicated // list within the main Place object FacPlace::initLinkWithExtraPlace (lPlace, lExtraPlace); @@ -214,6 +229,12 @@ // Same remark as above assert (hasRetrievedPlace == true); + // The extra matching Place object has the very same effective + // (Levenshtein) and allowable edit distances/errors as the + // main Place object. + lAlterPlace.setEditDistance (lEditDistance); + lAlterPlace.setAllowableEditDistance (lAllowableEditDistance); + // Insert the alternate matching Place object within the dedicated // list within the main Place object FacPlace::initLinkWithAlternatePlace (lPlace, lAlterPlace); Modified: trunk/opentrep/opentrep/python/pyopentrep.cpp =================================================================== --- trunk/opentrep/opentrep/python/pyopentrep.cpp 2009-08-10 12:17:27 UTC (rev 172) +++ trunk/opentrep/opentrep/python/pyopentrep.cpp 2009-08-10 16:23:33 UTC (rev 173) @@ -17,15 +17,30 @@ struct OpenTrepSearcher { public: - + /** Wrapper around the search use case. 
*/ std::string search (const std::string& iTravelQuery) { - std::ostringstream oStr; + const bool areFullDetailsRequired = false; + return searchImpl (iTravelQuery, areFullDetailsRequired); + } + /** Wrapper around the search use case. */ + std::string searchWithFullDetails (const std::string& iTravelQuery) { + const bool areFullDetailsRequired = true; + return searchImpl (iTravelQuery, areFullDetailsRequired); + } + + private: + /** Wrapper around the search use case. */ + std::string searchImpl (const std::string& iTravelQuery, + const bool areFullDetailsRequired) { + std::ostringstream oNoDetailedStr; + std::ostringstream oDetailedStr; + // Sanity check if (_logOutputStream == NULL) { - oStr << "The log filepath is not valid." << std::endl; - return oStr.str(); + oNoDetailedStr << "The log filepath is not valid." << std::endl; + return oNoDetailedStr.str(); } assert (_logOutputStream != NULL); @@ -36,11 +51,13 @@ << std::endl; if (_opentrepService == NULL) { - oStr << "The OpenTREP service has not been initialised, i.e., " - << "the init() method has not been called correctly on the " - << "OpenTrepSearcher object. Please check that all the " - << "parameters are not empty and point to actual files."; - return oStr.str(); + oNoDetailedStr << "The OpenTREP service has not been initialised, " + << "i.e., the init() method has not been called " + << "correctly on the OpenTrepSearcher object. 
Please " + << "check that all the parameters are not empty and " + << "point to actual files."; + *_logOutputStream << oNoDetailedStr.str(); + return oNoDetailedStr.str(); } assert (_opentrepService != NULL); @@ -57,59 +74,79 @@ if (nbOfMatches != 0) { NbOfMatches_T idx = 0; + for(LocationList_T::const_iterator itLocation = lLocationList.begin(); itLocation != lLocationList.end(); ++itLocation, ++idx) { const Location& lLocation = *itLocation; + if (idx != 0) { - oStr << ","; + oNoDetailedStr << ","; } - oStr << lLocation.getLocationCode(); + + oNoDetailedStr << lLocation.getLocationCode(); + oDetailedStr << idx+1 << ". " << lLocation.toShortString() + << std::endl; // List of extra matching locations (those with the same // matching weight/percentage) const LocationList_T& lExtraLocationList = lLocation.getExtraLocationList(); if (lExtraLocationList.empty() == false) { + oDetailedStr << " Extra matches: " << std::endl; + + NbOfMatches_T idxExtra = 0; for (LocationList_T::const_iterator itLoc = lExtraLocationList.begin(); - itLoc != lExtraLocationList.end(); ++itLoc) { - oStr << ":"; + itLoc != lExtraLocationList.end(); ++itLoc, ++idxExtra) { + oNoDetailedStr << ":"; + oDetailedStr << " " << idx+1 << "." << idxExtra+1 << ". 
"; + const Location& lExtraLocation = *itLoc; - oStr << lExtraLocation.getLocationCode(); + oNoDetailedStr << lExtraLocation.getLocationCode(); + oDetailedStr << lExtraLocation << std::endl; } } // The matching weight/percentage is the same for the main // and the extra matching locations - oStr << "/" << lLocation.getPercentage(); + oNoDetailedStr << "/" << lLocation.getPercentage(); // List of alternate matching locations (those with a lower // matching weight/percentage) const LocationList_T& lAlternateLocationList = lLocation.getAlternateLocationList(); if (lAlternateLocationList.empty() == false) { + oDetailedStr << " Alternate matches: " << std::endl; + + NbOfMatches_T idxAlter = 0; for (LocationList_T::const_iterator itLoc = lAlternateLocationList.begin(); - itLoc != lAlternateLocationList.end(); ++itLoc) { - oStr << "-"; + itLoc != lAlternateLocationList.end(); ++itLoc, ++idxAlter) { + oNoDetailedStr << "-"; + oDetailedStr << " " << idx+1 << "." << idxAlter+1 << ". "; + const Location& lAlternateLocation = *itLoc; - oStr << lAlternateLocation.getLocationCode() - << "/" << lAlternateLocation.getPercentage(); + oNoDetailedStr << lAlternateLocation.getLocationCode() + << "/" << lAlternateLocation.getPercentage(); + oDetailedStr << lAlternateLocation << std::endl; } } } } if (lNonMatchedWordList.empty() == false) { - oStr << ";"; + oNoDetailedStr << ";"; + oDetailedStr << "Not recognised words:" << std::endl; NbOfMatches_T idx = 0; for (WordList_T::const_iterator itWord = lNonMatchedWordList.begin(); itWord != lNonMatchedWordList.end(); ++itWord, ++idx) { const Word_T& lWord = *itWord; if (idx != 0) { - oStr << ","; + oNoDetailedStr << ","; + oDetailedStr << idx+1 << "." 
<< std::endl; } - oStr << lWord; + oNoDetailedStr << lWord; + oDetailedStr << lWord; } } @@ -118,7 +155,10 @@ << "' yielded:" << std::endl; // DEBUG - *_logOutputStream << oStr.str() << std::endl; + *_logOutputStream << "Short version: " + << oNoDetailedStr.str() << std::endl; + *_logOutputStream << "Long version: " + << oDetailedStr.str() << std::endl; } catch (const RootException& eOpenTrepError) { *_logOutputStream << "OpenTrep error: " << eOpenTrepError.what() @@ -130,8 +170,14 @@ } catch (...) { *_logOutputStream << "Unknown error" << std::endl; } - - return oStr.str(); + + // Return the string corresponding to the request (either with + // or without details). + if (areFullDetailsRequired == true) { + return oDetailedStr.str(); + } else { + return oNoDetailedStr.str(); + } } public: @@ -213,5 +259,6 @@ BOOST_PYTHON_MODULE(libpyopentrep) { boost::python::class_<OPENTREP::OpenTrepSearcher> ("OpenTrepSearcher") .def ("search", &OPENTREP::OpenTrepSearcher::search) + .def ("searchWithFullDetails", &OPENTREP::OpenTrepSearcher::searchWithFullDetails) .def ("init", &OPENTREP::OpenTrepSearcher::init); } Modified: trunk/opentrep/opentrep/python/pyopentrep.py =================================================================== --- trunk/opentrep/opentrep/python/pyopentrep.py 2009-08-10 12:17:27 UTC (rev 172) +++ trunk/opentrep/opentrep/python/pyopentrep.py 2009-08-10 16:23:33 UTC (rev 173) @@ -13,13 +13,14 @@ # If no search string was supplied as arguments of the command-line, # ask the user for some -searchString = sys.argv[1:] +searchString = ' '.join(sys.argv[1:]) if searchString == '': # Ask for the user input searchString = raw_input('Enter a search string, or just Enter for the default one (' + defaultSearchString + '): ') if searchString == '' : searchString = defaultSearchString # Call the OpenTrep C++ library -result = openTrepLibrary.search(searchString) +#result = openTrepLibrary.search(searchString) +result = 
openTrepLibrary.searchWithFullDetails(searchString) print 'Result:' print result This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-10 12:17:34
|
Revision: 172 http://opentrep.svn.sourceforge.net/opentrep/?rev=172&view=rev Author: denis_arnaud Date: 2009-08-10 12:17:27 +0000 (Mon, 10 Aug 2009) Log Message: ----------- [Dev] Just added the allowable edit distance to the Document class. Modified Paths: -------------- trunk/opentrep/opentrep/bom/Document.cpp trunk/opentrep/opentrep/bom/Document.hpp Modified: trunk/opentrep/opentrep/bom/Document.cpp =================================================================== --- trunk/opentrep/opentrep/bom/Document.cpp 2009-08-08 14:55:34 UTC (rev 171) +++ trunk/opentrep/opentrep/bom/Document.cpp 2009-08-10 12:17:27 UTC (rev 172) @@ -38,8 +38,8 @@ const Xapian::docid& lDocID = _document.get_docid(); oStr << " => Document ID " << lDocID << " matching at " << _percentage - << "% (edit distance of " << _editDistance << ") [" - << _document.get_data() << "]"; + << "% (edit distance of " << _editDistance << " over " + << _allowableEditDistance << ") [" << _document.get_data() << "]"; if (_documentList.empty() == false) { oStr << " along with " << _documentList.size() Modified: trunk/opentrep/opentrep/bom/Document.hpp =================================================================== --- trunk/opentrep/opentrep/bom/Document.hpp 2009-08-08 14:55:34 UTC (rev 171) +++ trunk/opentrep/opentrep/bom/Document.hpp 2009-08-10 12:17:27 UTC (rev 172) @@ -54,6 +54,17 @@ return _percentage; } + /** Get the edit distance/error, with which the matching has been made. */ + const NbOfErrors_T& getEditDistance () const { + return _editDistance; + } + + /** Get the maximal allowable edit distance/error, with which the + matching has been made. */ + const NbOfErrors_T& getAllowableEditDistance () const { + return _allowableEditDistance; + } + /** Get the extra list of matching Xapian documents (i.e., those having matched with the same weight as the main one). 
*/ const XapianDocumentList_T& getExtraDocumentList() const { @@ -93,6 +104,12 @@ _editDistance = iEditDistance; } + /** Set the maximal allowable edit distance/error, with which the + matching has been made. */ + void setAllowableEditDistance (const NbOfErrors_T& iAllowableEditDistance) { + _allowableEditDistance = iAllowableEditDistance; + } + /** Add a matching Xapian document (having the same matching percentage). */ void addExtraDocument (const Xapian::Document& iMatchingDocument) { _documentList.push_back (iMatchingDocument); @@ -172,6 +189,10 @@ /** Edit distance/error, with which the matching has been made. */ NbOfErrors_T _editDistance; + /** Maximum allowable edit distance/error, with which the matching + has been made. */ + NbOfErrors_T _allowableEditDistance; + /** List of Xapian documents having the same matching percentage. <br>Hence, any of those other Xapian documents could have been chosen, instead of the main one. */ This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-08-08 14:55:48
|
Revision: 171 http://opentrep.svn.sourceforge.net/opentrep/?rev=171&view=rev Author: denis_arnaud Date: 2009-08-08 14:55:34 +0000 (Sat, 08 Aug 2009) Log Message: ----------- 1. [Dev] Finished the work on bringing extra and additional Location objects into the API. 2. [DB] In the search batch, looking in the database is now based on the airport/city code, rather than on the Xapian document ID. That way, no database update is necessary when re-indexing, and any search from any Xapian index will find the corresponding result details within the database. The easiest way is to extract the first three letters of the Xapian document data. 3. [Dev] Wrote a (Python-based) PSP page in order to render in HTML the output of the search batch. There is still some work to do in order to adapt it to the new API (with extra and alternate locations). Modified Paths: -------------- trunk/opentrep/Makefile.am trunk/opentrep/config/soci.m4 trunk/opentrep/configure.ac trunk/opentrep/db/data/ref_place_names.csv trunk/opentrep/opentrep/Location.hpp trunk/opentrep/opentrep/OPENTREP_Service.hpp trunk/opentrep/opentrep/OPENTREP_Types.hpp trunk/opentrep/opentrep/batches/opentrep_indexer.cfg trunk/opentrep/opentrep/batches/opentrep_searcher.cfg trunk/opentrep/opentrep/batches/searcher.cpp trunk/opentrep/opentrep/bom/Place.cpp trunk/opentrep/opentrep/bom/Place.hpp trunk/opentrep/opentrep/bom/ResultHolder.cpp trunk/opentrep/opentrep/bom/StringMatcher.cpp trunk/opentrep/opentrep/bom/StringMatcher.hpp trunk/opentrep/opentrep/command/DBManager.cpp trunk/opentrep/opentrep/command/DBManager.hpp trunk/opentrep/opentrep/command/IndexBuilder.cpp trunk/opentrep/opentrep/command/RequestInterpreter.cpp trunk/opentrep/opentrep/factory/FacPlace.cpp trunk/opentrep/opentrep/python/pyopentrep.cpp trunk/opentrep/opentrep/python/pyopentrep.py trunk/opentrep/test/i18n/Makefile.am Added Paths: ----------- trunk/opentrep/TODO trunk/opentrep/config/ax_icu.m4 trunk/opentrep/gui/ trunk/opentrep/gui/Makefile.am 
trunk/opentrep/gui/icons/ trunk/opentrep/gui/icons/Makefile.am trunk/opentrep/gui/icons/opentrep.png trunk/opentrep/gui/icons/opentrep.xcf trunk/opentrep/gui/icons/sources.mk trunk/opentrep/gui/psp/ trunk/opentrep/gui/psp/Makefile.am trunk/opentrep/gui/psp/index.html trunk/opentrep/gui/psp/libpyopentrep_proxy.py trunk/opentrep/gui/psp/localize.py trunk/opentrep/gui/psp/log_service.py trunk/opentrep/gui/psp/opentrep.psp trunk/opentrep/gui/psp/result_parser.py trunk/opentrep/gui/psp/sources.mk trunk/opentrep/opentrep/LocationList.hpp trunk/opentrep/test/i18n/icufmt.cpp trunk/opentrep/test/i18n/ref/ trunk/opentrep/test/i18n/ref/ref_text_en.txt trunk/opentrep/test/i18n/ref/ref_text_ru.txt trunk/opentrep/test/i18n/ref/ref_text_ru_koi8r.txt trunk/opentrep/test/i18n/ref/ref_text_ru_koi8ru.txt trunk/opentrep/test/i18n/ref/ref_text_ru_windows_1251.txt trunk/opentrep/test/i18n/ref/ref_text_ua.txt trunk/opentrep/test/i18n/ref/ref_text_ua_koi8r.txt trunk/opentrep/test/i18n/ref/ref_text_ua_koi8u.txt trunk/opentrep/test/i18n/ref/ref_text_ua_windows_1251.txt trunk/opentrep/test/i18n/simple_io.cpp Property Changed: ---------------- trunk/opentrep/ trunk/opentrep/test/i18n/ Property changes on: trunk/opentrep ___________________________________________________________________ Modified: svn:ignore - configure config.log config.status autom4te.cache aclocal.m4 ABOUT-NLS INSTALL COPYING libtool Makefile.in Makefile opentrep.spec opentrep-config opentrep.m4 opentrep.pc opentrep-*.*.*.tar.* opentrep-html-doc-*.*.*.tar.* + configure config.log config.status autom4te.cache aclocal.m4 ABOUT-NLS INSTALL COPYING libtool Makefile.in Makefile opentrep.spec opentrep-config opentrep.m4 opentrep.pc opentrep-*.*.*.tar.* opentrep-html-doc-*.*.*.tar.* psp.tar.* Modified: trunk/opentrep/Makefile.am =================================================================== --- trunk/opentrep/Makefile.am 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/Makefile.am 2009-08-08 14:55:34 UTC (rev 171) @@ 
-24,7 +24,8 @@ EXTRA_DIST = @PACKAGE@.spec @PACKAGE@.m4 @PACKAGE@.pc Makefile.common # Build in these directories: -SUBDIRS = opentrep win32 po man $(INFO_DOC_DIR) $(HTML_DOC_DIR) db $(TEST_DIR) +SUBDIRS = @PACKAGE@ win32 po man $(INFO_DOC_DIR) $(HTML_DOC_DIR) db \ + gui $(TEST_DIR) # Configuration helpers @@ -43,8 +44,10 @@ dist-html: $(MAKE) -C doc dist-html +dist-gui: + $(MAKE) -C gui dist-gui -snapshot: snapshot-src snapshot-html +snapshot: snapshot-src snapshot-html snapshot-gui snapshot-src: @@ -53,8 +56,11 @@ snapshot-html: $(MAKE) -C doc dist-html html_tarname=@PACKAGE_TARNAME@-html-doc-`date +"%Y%m%d"` -upload: upload-src upload-html +snapshot-gui: + $(MAKE) -C gui dist-gui +upload: upload-src upload-html upload-gui + upload-src: dist @UPLOAD_COMMAND@ @PACKAGE_TARNAME@-@VERSION@.tar.gz \ @PACKAGE_TARNAME@-@VERSION@.tar.bz2 @@ -63,3 +69,6 @@ @UPLOAD_COMMAND@ @PACKAGE_TARNAME@-html-doc-@VERSION@.tar.gz \ @PACKAGE_TARNAME@-html-doc-@VERSION@.tar.bz2 +upload-gui: dist-gui + @UPLOAD_COMMAND@ @PACKAGE_TARNAME@-gui-@VERSION@.tar.gz \ + @PACKAGE_TARNAME@-gui-@VERSION@.tar.bz2 Added: trunk/opentrep/TODO =================================================================== --- trunk/opentrep/TODO (rev 0) +++ trunk/opentrep/TODO 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1,20 @@ +Todo list for the OpenTrep project +---------------------------------- + +* [01/08/2009] Finish the work on bringing extra and additional + Location objects into the API. +OK + +* [01/08/2009] In the search batch, when looking in the database, do + it based on the airport/city code, rather than on the Xapian + document ID. That way, no database update will be necessary when + re-indexing, and any search from any Xapian index will find the + corresponding result details within the database. The easiest way is + to extract the first three letters of the Xapian document data. +OK + +* [01/08/2009] Write a (Python-based) PSP page, in order to test the + different locales of the browsers. 
+The Python (PSP) page has been created, but there is still some work +to do in order to adapt it to the new API (with extra and alternate +locations). \ No newline at end of file Added: trunk/opentrep/config/ax_icu.m4 =================================================================== --- trunk/opentrep/config/ax_icu.m4 (rev 0) +++ trunk/opentrep/config/ax_icu.m4 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1,160 @@ +dnl @synopsis AX_ICU +dnl +dnl This macro tries to find Icu C API header and library locations. +dnl +dnl We define the following configure script flags: +dnl +dnl --with-icu: Give prefix for both library and headers, and try +dnl to guess subdirectory names for each. (e.g. Tack /lib and +dnl /include onto given dir name, and other common schemes.) +dnl --with-icu-lib: Similar to --with-icu, but for library only. +dnl --with-icu-include: Similar to --with-icu, but for headers +dnl only. +dnl +dnl @version 1.2, 2007/02/20 +dnl @author Warren Young <ic...@et...> + +AC_DEFUN([AX_ICU], +[ + # + # Set up configure script macros + # + AC_ARG_WITH(icu, + [ --with-icu=<path> root directory path of Icu installation], + [ICU_lib_check="$with_icu/lib64/icu $with_icu/lib/icu $with_icu/lib64 $with_icu/lib" + ICU_inc_check="$with_icu/include $with_icu/include/icu" + ICU_bin_check="$with_icu/bin"], + [ICU_lib_check="/usr/lib64 /usr/lib /usr/lib64/icu /usr/lib/icu /usr/local/lib64 /usr/local/lib /usr/local/lib/icu /usr/local/icu/lib /usr/local/icu/lib/icu /opt/icu/lib /opt/icu/lib/icu" + ICU_inc_check="/usr/include /usr/local/include /usr/local/icu/include /opt/icu/include" + ICU_bin_check="/usr/bin /usr/local/bin /usr/local/icu/bin"]) + + AC_ARG_WITH(icu-lib, + [ --with-icu-lib=<path> directory path of Icu library installation], + [ICU_lib_check="$with_icu_lib $with_icu_lib/lib64 $with_icu_lib/lib $with_icu_lib/lib64/icu $with_icu_lib/lib/icu"]) + + AC_ARG_WITH(icu-include, + [ --with-icu-include=<path> directory path of Icu header installation], + 
[ICU_inc_check="$with_icu_include $with_icu_include/include $with_icu_include/include/icu"]) + + + # + # Look for Icu Configuration Script + # + AC_MSG_CHECKING([for Icu configuration script]) + ICU_CONFIG= + ICU_bindir= + for m in $ICU_bin_check + do + if test -d "$m" && test -f "$m/icu-config" + then + ICU_CONFIG=$m/icu-config + ICU_bindir=$m + break + fi + done + + if test -z "$ICU_bindir" + then + AC_MSG_ERROR([Didn't find $ICU_CONFIG binary in '$ICU_bin_check']) + fi + + case "$ICU_bindir" in + /* ) ;; + * ) AC_MSG_ERROR([The Icu binary directory ($ICU_bindir) must be an absolute path.]) ;; + esac + + AC_MSG_RESULT([$ICU_bindir]) + + AC_PATH_PROG(ICU_CONFIG, icu-config, $ICU_bindir) + + if test "x${ICU_CONFIG+set}" != xset + then + ICU_VERSION=`${ICU_CONFIG} --version` + ICU_CFLAGS=`${ICU_CONFIG} --cppflags` + ICU_LIBS=`${ICU_CONFIG} --ldflags` + else + # + # Look for Icu C API library + # + AC_MSG_CHECKING([for Icu library directory]) + ICU_libdir= + ICU_IO_LIB=icuio + for m in $ICU_lib_check + do + if test -d "$m" && \ + (test -f "$m/lib$ICU_IO_LIB.so" \ + || test -f "$m/lib$ICU_IO_LIB.a") + then + ICU_libdir=$m + break + fi + done + + if test -z "$ICU_libdir" + then + AC_MSG_ERROR([Didn't find $ICU_IO_LIB library in '$ICU_lib_check']) + fi + + case "$ICU_libdir" in + /* ) ;; + * ) AC_MSG_ERROR([The Icu library directory ($ICU_libdir) must be an absolute path.]) ;; + esac + + AC_MSG_RESULT([$ICU_libdir]) + + case "$ICU_libdir" in + /usr/lib64) ;; + /usr/lib) ;; + *) LDFLAGS="$LDFLAGS -L${ICU_libdir}" ;; + esac + + # + # Look for Icu C API headers + # + AC_MSG_CHECKING([for Icu include directory]) + ICU_incdir= + for m in $ICU_inc_check + do + if test -d "$m" && test -f "$m/unicode/utf8.h" + then + ICU_incdir=$m + break + fi + done + + if test -z "$ICU_incdir" + then + AC_MSG_ERROR([Didn't find the Icu include dir in '$ICU_inc_check']) + fi + + case "$ICU_incdir" in + /* ) ;; + * ) AC_MSG_ERROR([The Icu include directory ($ICU_incdir) must be an absolute 
path.]) ;; + esac + + AC_MSG_RESULT([$ICU_incdir]) + + ICU_CFLAGS="-D_REENTRANT -I${ICU_incdir}" + ICU_LIBS="-licui18n -licuuc -licudata -lpthread -lm" + + case "$ICU_libdir" in + /usr/lib64) ;; + /usr/lib) ;; + *) ICU_LIBS="-L${ICU_libdir} $ICU_LIBS" ;; + esac + fi + + AC_SUBST(ICU_VERSION) + AC_SUBST(ICU_CFLAGS) + AC_SUBST(ICU_LIBS) + + save_LIBS="$LIBS" + LIBS="$LIBS $ICU_LIBS" +# AC_CHECK_LIB($ICU_IO_LIB, utext_isWritable, +# [], +# [AC_MSG_ERROR([Could not find working Icu client library!])] +# ) + ICU_IO_LIB="-l${ICU_IO_LIB}" + AC_SUBST(ICU_IO_LIB) + LIBS="$save_LIBS" +]) dnl AX_ICU Modified: trunk/opentrep/config/soci.m4 =================================================================== --- trunk/opentrep/config/soci.m4 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/config/soci.m4 2009-08-08 14:55:34 UTC (rev 171) @@ -60,9 +60,9 @@ SOCI_CORE_LIB=${SOCI_CORE_LIB}-${SOCI_LIB_SUFFIX} SOCI_MYSQL_LIB=${SOCI_MYSQL_LIB}-${SOCI_LIB_SUFFIX} SOCI_libdir=$m + break fi done - break fi done Modified: trunk/opentrep/configure.ac =================================================================== --- trunk/opentrep/configure.ac 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/configure.ac 2009-08-08 14:55:34 UTC (rev 171) @@ -147,7 +147,16 @@ AC_SUBST(XAPIAN_CFLAGS) AC_SUBST(XAPIAN_LIBS) +# -------------------------------------------------------------------- +# Support for ICU (i18n C API): http://www.icu-project.org +# -------------------------------------------------------------------- +AX_ICU +AC_SUBST(ICU_VERSION) +AC_SUBST(ICU_CFLAGS) +AC_SUBST(ICU_LIBS) +AC_SUBST(ICU_IO_LIB) + # ------------------------------------------------------------------- # Support for documentation # ------------------------------------------------------------------- @@ -249,6 +258,9 @@ db/maintenance/Makefile db/maintenance/tables/Makefile db/data/Makefile + gui/Makefile + gui/icons/Makefile + gui/psp/Makefile test/com/Makefile test/parsers/Makefile test/i18n/Makefile @@ -327,6 
+339,12 @@ o XAPIAN_CFLAGS ... : ${XAPIAN_CFLAGS} o XAPIAN_LIBS ..... : ${XAPIAN_LIBS} + - ICU ............... : + o ICU_version ..... : ${ICU_VERSION} + o ICU_CFLAGS ...... : ${ICU_CFLAGS} + o ICU_LIBS ........ : ${ICU_LIBS} + o ICU_IO_LIB ...... : ${ICU_IO_LIB} + - CPPUNIT ........... : o CPPUNIT_VERSION . : ${CPPUNIT_VERSION} o CPPUNIT_CFLAGS .. : ${CPPUNIT_CFLAGS} Modified: trunk/opentrep/db/data/ref_place_names.csv =================================================================== --- trunk/opentrep/db/data/ref_place_names.csv 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/db/data/ref_place_names.csv 2009-08-08 14:55:34 UTC (rev 171) @@ -6253,7 +6253,7 @@ en,yvo,val d'or,val d'or/qc/ca en,yvp,kuujjuaq,kuujjuaq/qc/ca en,yvq,norman wells,norman wells/nt/ca -en,yvr,vancouver int,vancouver/bc/ca:intl +en,yvr,vancouver int,vancouver/bc/ca:intl,vancouver en,yvs,ski rail station,ski/no:ski rail station en,yvt,buffalo narrows,buffalo narrows/sk/ca en,yvv,wiarton,wiarton/on/ca @@ -9446,7 +9446,7 @@ en,xdx,sarnia,sarnia/on/ca:railway station en,xdy,sudbury,sudbury/on/ca:junction rail st en,xdz,the pas,the pas/mb/ca:railway station -en,xea,vancouver,vancouver/bc/ca:railway statio +en,xea,vancouver railway,vancouver/bc/ca:railway statio en,xeb,evian les bains,evian les bains/fr:off- en,xec,windsor,windsor/on/ca:railway station en,xed,disneyland paris,paris/fr:disneyland paris Property changes on: trunk/opentrep/gui ___________________________________________________________________ Added: svn:ignore + .libs .deps Makefile Makefile.in Added: trunk/opentrep/gui/Makefile.am =================================================================== --- trunk/opentrep/gui/Makefile.am (rev 0) +++ trunk/opentrep/gui/Makefile.am 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1,27 @@ +# Python Server Pages (PSP) + +SUBDIRS = icons psp + +MAINTAINERCLEANFILES = Makefile.in Makefile + +datadir = @datadir@ +pkgdatadir = $(datadir)/@PACKAGE@ +guidir = $(pkgdatadir)/gui + +psp_sources = 
psp +psp_dests = $(foreach ext,.tar.gz .tar.bz2,$(addsuffix $(ext),$(psp_sources))) + +# Targets +$(top_builddir)/%.tar.gz $(builddir)/%.tar.gz: %/*.html %/*.py %/*.psp %/../icons/*.png + tar chof - $^ | gzip --best -c > $@ + +$(top_builddir)/%.tar.bz2 $(builddir)/%.tar.bz2: %/*.html %/*.py %/*.psp %/../icons/*.png + tar chof - $^ | bzip2 -9 -c > $@ + +dist-gui: $(addprefix $(top_builddir)/,$(psp_dests)) + +clean-local: + rm -f $(addprefix $(top_builddir)/,$(psp_dests)) + +snapshot-gui: + $(MAKE) dist-gui gui_tarname=@PACKAGE_TARNAME@-gui-`date +"%Y%m%d"` Property changes on: trunk/opentrep/gui/icons ___________________________________________________________________ Added: svn:ignore + .libs .deps Makefile Makefile.in Added: trunk/opentrep/gui/icons/Makefile.am =================================================================== --- trunk/opentrep/gui/icons/Makefile.am (rev 0) +++ trunk/opentrep/gui/icons/Makefile.am 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1,25 @@ +# gui/icons sub-directory: Images (.png, .gif, etc) +include $(srcdir)/sources.mk + +datadir = @datadir@ +pkgdatadir = $(datadir)/@PACKAGE@ +imgdir = $(pkgdatadir)/gui/icons + +MAINTAINERCLEANFILES = Makefile.in Makefile + +noinst_DATA = $(img_sources) + +EXTRA_DIST = $(noinst_DATA) + +# Targets +install-data-local: + $(mkinstalldirs) $(DESTDIR)$(imgdir); \ + for f in $(noinst_DATA); do \ + $(INSTALL_DATA) $$f $(DESTDIR)$(imgdir); \ + done + +uninstall-local: + rm -rf $(DESTDIR)$(imgdir) + +clean-local: + rm -rf *.log *.tag Added: trunk/opentrep/gui/icons/opentrep.png =================================================================== (Binary files differ) Property changes on: trunk/opentrep/gui/icons/opentrep.png ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: trunk/opentrep/gui/icons/opentrep.xcf =================================================================== (Binary files differ) Property changes on: 
trunk/opentrep/gui/icons/opentrep.xcf ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: trunk/opentrep/gui/icons/sources.mk =================================================================== --- trunk/opentrep/gui/icons/sources.mk (rev 0) +++ trunk/opentrep/gui/icons/sources.mk 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1 @@ +img_sources = $(top_srcdir)/gui/icons/opentrep.png Property changes on: trunk/opentrep/gui/psp ___________________________________________________________________ Added: svn:ignore + .libs .deps Makefile Makefile.in Added: trunk/opentrep/gui/psp/Makefile.am =================================================================== --- trunk/opentrep/gui/psp/Makefile.am (rev 0) +++ trunk/opentrep/gui/psp/Makefile.am 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1,25 @@ +# Python Server Pages (PSP) +include $(srcdir)/sources.mk + +datadir = @datadir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pspdir = $(pkgdatadir)/gui/psp + +MAINTAINERCLEANFILES = Makefile.in Makefile + +noinst_DATA = $(html_sources) $(py_sources) $(psp_sources) + +EXTRA_DIST = $(noinst_DATA) + +# Targets +install-data-local: + $(mkinstalldirs) $(DESTDIR)$(pspdir); \ + for f in $(noinst_DATA); do \ + $(INSTALL_DATA) $$f $(DESTDIR)$(pspdir); \ + done + +uninstall-local: + rm -rf $(DESTDIR)$(pspdir) + +clean-local: + rm -rf *.log *.tag Added: trunk/opentrep/gui/psp/index.html =================================================================== --- trunk/opentrep/gui/psp/index.html (rev 0) +++ trunk/opentrep/gui/psp/index.html 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1,14 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head> +<meta http-equiv="content-type" content="text/html; charset=UTF-8" /> +<meta http-equiv="refresh" + content="0; 
url=http://localhost/opentrep/opentrep.psp" /> +<title>Redirection</title> +<meta name="robots" content="noindex,follow" /> +</head> + +<body> +</body> +</html> Added: trunk/opentrep/gui/psp/libpyopentrep_proxy.py =================================================================== --- trunk/opentrep/gui/psp/libpyopentrep_proxy.py (rev 0) +++ trunk/opentrep/gui/psp/libpyopentrep_proxy.py 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1,8 @@ +#!/usr/bin/python + +import sys + +def import_libpyopentrep(libpyopentrep_path): + sys.path.append(libpyopentrep_path) + import libpyopentrep + return libpyopentrep Added: trunk/opentrep/gui/psp/localize.py =================================================================== --- trunk/opentrep/gui/psp/localize.py (rev 0) +++ trunk/opentrep/gui/psp/localize.py 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1,14 @@ +#!/usr/bin/python + +import socket + +www_log_filename = '/var/log/opentrep/www.log' +trep_log_filename = '/var/log/opentrep/opentrep.log' +tmp_trep_log_filename = '/var/log/opentrep/tmp_opentrep.log' + +hostname = socket.gethostname() +main_name = hostname.split('.')[0] + +traveldb_path = '/var/www/opentrep/traveldb' +libpyopentrep_path = '/tmp/opentrep/lib' +opentrep_dbparams = {'user': 'opentrep', 'password': 'opentrep', 'host': 'localhost', 'port': '3306', 'db': 'trep_opentrep'} Added: trunk/opentrep/gui/psp/log_service.py =================================================================== --- trunk/opentrep/gui/psp/log_service.py (rev 0) +++ trunk/opentrep/gui/psp/log_service.py 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1,25 @@ +#!/usr/bin/python + +import socket, os, datetime + +def log(filename, req, query, codes, unrecognized): + req.add_common_vars() + # determine ip + remote_client_ip = req.connection.remote_ip + # determine hostname + hostname = req.connection.remote_host + if hostname == None: hostname = 'localhost' + # determine time + str_time = datetime.datetime.now().strftime('%y%m%d%H%M%S') + # determine user 
agent + agent = '' + if req.subprocess_env.has_key("HTTP_USER_AGENT"): agent = req.subprocess_env["HTTP_USER_AGENT"] + # determine user allowed languages + languages = '' + if req.subprocess_env.has_key("HTTP_ACCEPT_LANGUAGE"): languages = req.subprocess_env["HTTP_ACCEPT_LANGUAGE"] + # determine user allowed character sets + charsets = '' + if req.subprocess_env.has_key("HTTP_ACCEPT_CHARSET"): charsets = req.subprocess_env["HTTP_ACCEPT_CHARSET"] + # write to file + str_out = '^'.join([str_time,remote_client_ip, hostname, query, ','.join(codes), unrecognized, agent, languages, charsets]) + os.system('echo "%s" >> %s' % (str_out, filename)) Added: trunk/opentrep/gui/psp/opentrep.psp =================================================================== --- trunk/opentrep/gui/psp/opentrep.psp (rev 0) +++ trunk/opentrep/gui/psp/opentrep.psp 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1,80 @@ +<% +import os +local_path = '/var/www/opentrep' +from mod_python import apache +localize = apache.import_module('localize', path=[local_path]) +log_service = apache.import_module('log_service', path=[local_path]) + +# defaults +msg, head, form_value, unrecognized = '', '', '', '' +#body_declaration = '<body>' +quiet = True + +# parsing: recognize sequence of three-letter codes +codes = [] +alter_locations = [] +queryStringForm = form +if queryStringForm.has_key('data'): + form_value = queryStringForm['data'] + quiet = False + if form_value.rstrip(' ') == '': + pass + else: + # Use opentrep + libpyopentrep_proxy = apache.import_module('libpyopentrep_proxy', path=[local_path]) + libpyopentrep = libpyopentrep_proxy.import_libpyopentrep(localize.libpyopentrep_path) + mySearch = libpyopentrep.OpenTrepSearcher() + mySearch.init(localize.traveldb_path, localize.tmp_trep_log_filename, localize.opentrep_dbparams['user'], localize.opentrep_dbparams['password'], localize.opentrep_dbparams['host'], localize.opentrep_dbparams['port'], localize.opentrep_dbparams['db']) + str_matches = 
mySearch.search(form_value) + if ';' in str_matches: + str_matches, unrecognized = str_matches.split(';') + msg = 'unrecognized: %s. ' % unrecognized + str_value = unrecognized + if str_matches != '': + alter_locations = [x for x in str_matches.split(',')] + for alter_location_list in alter_locations: + alter_location_list = [x for x in alter_location_list.split('-')] + for extra_location_list in alter_location_list: + extra_location_list = [x for x in extra_location_list.split(':')] + + codes = [x[0].upper() for x in alter_locations] + if len(codes)>0: form_value = ' '.join(codes) + if str_value != '': form_value += ' ' + str_value + + # Logging + log_service.log(localize.www_log_filename, req, queryStringForm['data'], codes, unrecognized) + os.system('cat %s >> %s' % (localize.tmp_trep_log_filename, localize.trep_log_filename)) + +%> + +<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml"> +<head> +<title>OpenTREP</title> +<%= head %> +</head> + +<body> +<div align="center"> +<a href="opentrep.psp"><img src="/icons/opentrep.png" height="80px" border=0></a> +</div> +<br> + +<div align="center"> +<table border="0"> + <tr> + <td> + <form value="queryStringForm" action="opentrep.psp" method="post"> + <input type="text" size=80% name="data" value="<%= form_value%>"> + <input type="submit" value="Send"> + </form> + </td> + </tr> +</table> +</div> + +<p style="font-size:small;"><%= msg %></p> + +</body> +</html> Added: trunk/opentrep/gui/psp/result_parser.py =================================================================== --- trunk/opentrep/gui/psp/result_parser.py (rev 0) +++ trunk/opentrep/gui/psp/result_parser.py 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1,64 @@ +#!/usr/bin/env python + +import sys + +# Default result string +defaultResultString = 'yvr:xea/98-xtw/87,sfo/100,led:dft:htl/96;niznayou' + +# If no result string 
was supplied as arguments of the command-line, +# ask the user for some +resultString = ','.join(sys.argv[1:]) +if resultString == '' : resultString = defaultResultString + +# Function to parse the result string +def parseResultString(iResultString): + form_value, unrecognized = '', '' + msg = '(parsing successful)' + str_matches = iResultString + alter_locations = [] + + if ';' in str_matches: + str_matches, unrecognized = str_matches.split(';') + msg = '(unrecognized: %s)' % unrecognized + str_value = unrecognized + + if str_matches != '': + alter_locations = str_matches.split(',') + + print 'alter_locations: ', alter_locations + + idx1 = 0 + while idx1 != len(alter_locations): + +# print 'Before - alter_locations['+str(idx1)+']: ', alter_locations[idx1] + alter_locations[idx1] = alter_locations[idx1].split('-') +# print 'After - alter_locations['+str(idx1)+']: ', alter_locations[idx1], alter_locations + + idx2 = 0 + while idx2 != len(alter_locations[idx1]): + + alter_locations[idx1][idx2] = alter_locations[idx1][idx2].split(':') + + idx3 = 0 + while idx3 != len(alter_locations[idx1][idx2]): + + alter_locations[idx1][idx2][idx3] = alter_locations[idx1][idx2][idx3].split('/') + idx3 += 1 + + idx2 += 1 + + idx1 += 1 + +# codes = [x.upper() for x in alter_locations] +# if len(codes) > 0: form_value = ' '.join(codes) + if str_value != '': form_value += ' ' + str_value + + print 'After - alter_locations: ', alter_locations + + print 'Result ' + msg + ':' + return form_value + +# Main +print 'Before: ' + resultString +resultString = parseResultString(resultString) +print 'After: ' + resultString Property changes on: trunk/opentrep/gui/psp/result_parser.py ___________________________________________________________________ Added: svn:executable + * Added: trunk/opentrep/gui/psp/sources.mk =================================================================== --- trunk/opentrep/gui/psp/sources.mk (rev 0) +++ trunk/opentrep/gui/psp/sources.mk 2009-08-08 14:55:34 UTC (rev 
171) @@ -0,0 +1,6 @@ +html_sources = $(top_srcdir)/gui/psp/index.html +psp_sources = $(top_srcdir)/gui/psp/opentrep.psp +py_sources = \ + $(top_srcdir)/gui/psp/localize.py \ + $(top_srcdir)/gui/psp/log_service.py \ + $(top_srcdir)/gui/psp/libpyopentrep_proxy.py Modified: trunk/opentrep/opentrep/Location.hpp =================================================================== --- trunk/opentrep/opentrep/Location.hpp 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/Location.hpp 2009-08-08 14:55:34 UTC (rev 171) @@ -11,6 +11,7 @@ #include <list> // OpenTrep #include <opentrep/OPENTREP_Types.hpp> +#include <opentrep/LocationList.hpp> #include <opentrep/OPENTREP_Abstract.hpp> namespace OPENTREP { @@ -73,6 +74,26 @@ return _nameList; } + /** Get the matching percentage. */ + const MatchingPercentage_T& getPercentage() const { + return _percentage; + } + + /** Get the allowed edit distance/error. */ + const NbOfErrors_T& getEditDistance() const { + return _editDistance; + } + + /** Get the list of extra matching (similar) locations. */ + const LocationList_T& getExtraLocationList() const { + return _extraLocationList; + } + + /** Get the list of alternate matching (less similar) locations. */ + const LocationList_T& getAlternateLocationList() const { + return _alternateLocationList; + } + // ///////// Setters ////////// /** Set the Location code. */ @@ -125,7 +146,27 @@ _nameList = iNameList; } + /** Set the Xapian matching percentage. */ + void setPercentage (const MatchingPercentage_T& iPercentage) { + _percentage = iPercentage; + } + + /** Set the allowed edit distance/error. */ + void setEditDistance (const NbOfErrors_T& iEditDistance) { + _editDistance = iEditDistance; + } + + /** Add an extra matching location. */ + void addExtraLocation (const Location& iExtraLocation) { + _extraLocationList.push_back (iExtraLocation); + } + /** Add an alternate matching location. 
*/ + void addAlternateLocation (const Location& iAlternateLocation) { + _alternateLocationList.push_back (iAlternateLocation); + } + + public: // ///////// Display methods //////// /** Dump a structure into an output stream. @@ -145,7 +186,17 @@ oStr << _locationCode << ", " << _cityCode << ", " << _stateCode << ", " << _countryCode << ", " << _regionCode << ", " << _continentCode << ", " << _timeZoneGroup - << ", " << _longitude << ", " << _latitude; + << ", " << _longitude << ", " << _latitude + << ", " << _percentage << ", " << _editDistance; + + if (_extraLocationList.empty() == false) { + oStr << " " << _extraLocationList.size() << " extra match(es)"; + } + + if (_alternateLocationList.empty() == false) { + oStr << " " << _alternateLocationList.size() << " alternate match(es)"; + } + return oStr.str(); } @@ -157,6 +208,36 @@ itName != _nameList.end(); ++itName) { oStr << ", " << *itName; } + + if (_extraLocationList.empty() == false) { + oStr << "; Extra matches: {"; + unsigned short idx = 0; + for (LocationList_T::const_iterator itLoc = _extraLocationList.begin(); + itLoc != _extraLocationList.end(); ++itLoc, ++idx) { + if (idx != 0) { + oStr << ", "; + } + const Location& lExtraLocation = *itLoc; + oStr << lExtraLocation.toShortString(); + } + oStr << "}"; + } + + if (_alternateLocationList.empty() == false) { + oStr << "; Alternate matches: {"; + unsigned short idx = 0; + for (LocationList_T::const_iterator itLoc = + _alternateLocationList.begin(); + itLoc != _alternateLocationList.end(); ++itLoc, ++idx) { + if (idx != 0) { + oStr << ", "; + } + const Location& lAlternateLocation = *itLoc; + oStr << lAlternateLocation.toShortString(); + } + oStr << "}"; + } + return oStr.str(); } @@ -168,12 +249,15 @@ const std::string& iRegionCode, const std::string& iContinentCode, const std::string& iTimeZoneGroup, const double iLongitude, const double iLatitude, - const LocationNameList_T& iNameList) + const LocationNameList_T& iNameList, + const MatchingPercentage_T& 
iPercentage, + const NbOfErrors_T& iEditDistance) : _locationCode (iPlaceCode), _cityCode (iCityCode), _stateCode (iStateCode), _countryCode (iCountryCode), _regionCode (iRegionCode), _continentCode (iContinentCode), _timeZoneGroup (iTimeZoneGroup), _longitude (iLongitude), - _latitude (iLatitude), _nameList (iNameList) { + _latitude (iLatitude), _nameList (iNameList), + _percentage (iPercentage), _editDistance (iEditDistance) { } /** Default Constructor. */ @@ -207,11 +291,19 @@ double _latitude; /** List of (American) English names. */ LocationNameList_T _nameList; - }; + /** Matching percentage. */ + MatchingPercentage_T _percentage; - /** List of (geographical) location structures. */ - typedef std::list<Location> LocationList_T; + /** Allowed edit error/distance. */ + NbOfErrors_T _editDistance; + /** List of extra matching (similar) locations. */ + LocationList_T _extraLocationList; + + /** List of alternate matching (less similar) locations. */ + LocationList_T _alternateLocationList; + }; + } #endif // __OPENTREP_LOCATION_HPP Added: trunk/opentrep/opentrep/LocationList.hpp =================================================================== --- trunk/opentrep/opentrep/LocationList.hpp (rev 0) +++ trunk/opentrep/opentrep/LocationList.hpp 2009-08-08 14:55:34 UTC (rev 171) @@ -0,0 +1,20 @@ +#ifndef __OPENTREP_LOCATIONLIST_HPP +#define __OPENTREP_LOCATIONLIST_HPP + +// ////////////////////////////////////////////////////////////////////// +// Import section +// ////////////////////////////////////////////////////////////////////// +// STL +#include <list> + +namespace OPENTREP { + + // Forward declaration + struct Location; + + /** List of (geographical) location structures. 
*/ + typedef std::list<Location> LocationList_T; + +} +#endif // __OPENTREP_LOCATIONLIST_HPP + Modified: trunk/opentrep/opentrep/OPENTREP_Service.hpp =================================================================== --- trunk/opentrep/opentrep/OPENTREP_Service.hpp 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/OPENTREP_Service.hpp 2009-08-08 14:55:34 UTC (rev 171) @@ -10,7 +10,7 @@ // OpenTREP #include <opentrep/OPENTREP_Types.hpp> #include <opentrep/DBParams.hpp> -#include <opentrep/Location.hpp> +#include <opentrep/LocationList.hpp> #include <opentrep/DistanceErrorRule.hpp> namespace OPENTREP { Modified: trunk/opentrep/opentrep/OPENTREP_Types.hpp =================================================================== --- trunk/opentrep/opentrep/OPENTREP_Types.hpp 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/OPENTREP_Types.hpp 2009-08-08 14:55:34 UTC (rev 171) @@ -45,6 +45,9 @@ class XapianTravelDatabaseEmptyException : public XapianException { }; + class XapianTravelDatabaseNotInSyncWithSQLDatabaseException : public XapianException { + }; + class SQLDatabaseException : public RootException { }; @@ -87,6 +90,9 @@ /** Xapian document ID. */ typedef int XapianDocID_T; + /** Xapian percentage. */ + typedef unsigned int MatchingPercentage_T; + /** Travel search query. 
*/ typedef std::string TravelQuery_T; Modified: trunk/opentrep/opentrep/batches/opentrep_indexer.cfg =================================================================== --- trunk/opentrep/opentrep/batches/opentrep_indexer.cfg 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/batches/opentrep_indexer.cfg 2009-08-08 14:55:34 UTC (rev 171) @@ -1,4 +1,4 @@ -database=../../test/traveldb +database=/tmp/opentrep/share/opentrep/traveldb log=opentrep_indexer.log user=opentrep passwd=opentrep Modified: trunk/opentrep/opentrep/batches/opentrep_searcher.cfg =================================================================== --- trunk/opentrep/opentrep/batches/opentrep_searcher.cfg 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/batches/opentrep_searcher.cfg 2009-08-08 14:55:34 UTC (rev 171) @@ -1,4 +1,4 @@ -database=../../test/traveldb +database=/tmp/opentrep/share/opentrep/traveldb log=opentrep_searcher.log user=opentrep passwd=opentrep Modified: trunk/opentrep/opentrep/batches/searcher.cpp =================================================================== --- trunk/opentrep/opentrep/batches/searcher.cpp 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/batches/searcher.cpp 2009-08-08 14:55:34 UTC (rev 171) @@ -13,6 +13,7 @@ #include <boost/program_options.hpp> // OpenTREP #include <opentrep/OPENTREP_Service.hpp> +#include <opentrep/Location.hpp> #include <opentrep/DBParams.hpp> #include <opentrep/config/opentrep-paths.hpp> Modified: trunk/opentrep/opentrep/bom/Place.cpp =================================================================== --- trunk/opentrep/opentrep/bom/Place.cpp 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/bom/Place.cpp 2009-08-08 14:55:34 UTC (rev 171) @@ -22,7 +22,8 @@ _regionCode (iPlace._regionCode), _continentCode (iPlace._continentCode), _timeZoneGroup (iPlace._timeZoneGroup), _longitude (iPlace._longitude), _latitude (iPlace._latitude), _nameMatrix (iPlace._nameMatrix), - _docID (iPlace._docID) { 
+ _docID (iPlace._docID), _percentage (iPlace._percentage), + _editDistance (iPlace._editDistance) { } // ////////////////////////////////////////////////////////////////////// @@ -77,7 +78,9 @@ oStr << ", " << lCityCode << ", " << _stateCode << ", " << _countryCode << ", " << _regionCode << ", " << _continentCode << ", " << _timeZoneGroup - << ", " << _longitude << ", " << _latitude << ", " << _docID << ". "; + << ", " << _longitude << ", " << _latitude + << ", " << _docID << ", " << _percentage + << ", " << _editDistance << ". "; for (NameMatrix_T::const_iterator itNameList = _nameMatrix.begin(); itNameList != _nameMatrix.end(); ++itNameList) { @@ -85,6 +88,37 @@ oStr << lNameList.toString(); } + if (_extraPlaceList.empty() == false) { + oStr << "; Extra matches: {"; + unsigned short idx = 0; + for (PlaceOrderedList_T::const_iterator itLoc = _extraPlaceList.begin(); + itLoc != _extraPlaceList.end(); ++itLoc, ++idx) { + if (idx != 0) { + oStr << "; "; + } + const Place* lExtraPlace_ptr = *itLoc; + assert (lExtraPlace_ptr != NULL); + oStr << lExtraPlace_ptr->toShortString(); + } + oStr << "}"; + } + + if (_alternatePlaceList.empty() == false) { + oStr << "; Alternate matches: {"; + unsigned short idx = 0; + for (PlaceOrderedList_T::const_iterator itLoc = + _alternatePlaceList.begin(); + itLoc != _alternatePlaceList.end(); ++itLoc, ++idx) { + if (idx != 0) { + oStr << "; "; + } + const Place* lAlternatePlace_ptr = *itLoc; + assert (lAlternatePlace_ptr != NULL); + oStr << lAlternatePlace_ptr->toShortString(); + } + oStr << "}"; + } + return oStr.str(); } @@ -100,7 +134,9 @@ oStr << ", " << lCityCode << ", " << _stateCode << ", " << _countryCode << ", " << _regionCode << ", " << _continentCode << ", " << _timeZoneGroup - << ", " << _longitude << ", " << _latitude << ", " << _docID; + << ", " << _longitude << ", " << _latitude + << ", " << _docID << ", " << _percentage + << ", " << _editDistance; NameMatrix_T::const_iterator itNameHolder = _nameMatrix.begin(); if 
(itNameHolder != _nameMatrix.end()) { @@ -113,6 +149,14 @@ } } + if (_extraPlaceList.empty() == false) { + oStr << " " << _extraPlaceList.size() << " extra match(es)"; + } + + if (_alternatePlaceList.empty() == false) { + oStr << " " << _alternatePlaceList.size() << " alternate match(es)"; + } + return oStr.str(); } @@ -143,6 +187,8 @@ << ", longitude = " << _longitude << ", latitude = " << _latitude << ", docID = " << _docID + << ", percentage = " << _percentage << "%" + << ", edit distance = " << _editDistance << std::endl; return oStr.str(); } @@ -215,7 +261,37 @@ // Copy the parameters from the Place object to the Location structure Location oLocation (_placeCode, lCityCode, _stateCode, _countryCode, _regionCode, _continentCode, _timeZoneGroup, - _longitude, _latitude, lNameList); + _longitude, _latitude, lNameList, + _percentage, _editDistance); + + // Add extra matching locations, whenever they exist + if (_extraPlaceList.empty() == false) { + for (PlaceOrderedList_T::const_iterator itLoc = _extraPlaceList.begin(); + itLoc != _extraPlaceList.end(); ++itLoc) { + const Place* lExtraPlace_ptr = *itLoc; + assert (lExtraPlace_ptr != NULL); + + // Add the extra matching location + const Location& lExtraLocation = lExtraPlace_ptr->createLocation(); + oLocation.addExtraLocation (lExtraLocation); + } + } + + // Add alternate matching locations, whenever they exist + if (_alternatePlaceList.empty() == false) { + for (PlaceOrderedList_T::const_iterator itLoc = + _alternatePlaceList.begin(); + itLoc != _alternatePlaceList.end(); ++itLoc) { + const Place* lAlternatePlace_ptr = *itLoc; + assert (lAlternatePlace_ptr != NULL); + + // Add the alternate matching location + const Location& lAlternateLocation = + lAlternatePlace_ptr->createLocation(); + oLocation.addAlternateLocation (lAlternateLocation); + } + } + return oLocation; } } Modified: trunk/opentrep/opentrep/bom/Place.hpp =================================================================== --- 
trunk/opentrep/opentrep/bom/Place.hpp 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/bom/Place.hpp 2009-08-08 14:55:34 UTC (rev 171) @@ -81,6 +81,16 @@ return _docID; } + /** Get the matching percentage. */ + const MatchingPercentage_T& getPercentage() const { + return _percentage; + } + + /** Get the allowed edit distance/error. */ + const NbOfErrors_T& getEditDistance() const { + return _editDistance; + } + /** Get the map of name lists. */ const NameMatrix_T& getNameMatrix () const { return _nameMatrix; @@ -156,6 +166,16 @@ _docID = iDocID; } + /** Set the Xapian matching percentage. */ + void setPercentage (const MatchingPercentage_T& iPercentage) { + _percentage = iPercentage; + } + + /** Set the allowed edit distance/error. */ + void setEditDistance (const NbOfErrors_T& iEditDistance) { + _editDistance = iEditDistance; + } + public: // ////////// Setters in underlying names //////// @@ -247,9 +267,16 @@ double _latitude; /** List of names, for each given language. */ NameMatrix_T _nameMatrix; + /** Xapian document ID. */ XapianDocID_T _docID; + /** Matching percentage. */ + MatchingPercentage_T _percentage; + + /** Allowed edit error/distance. */ + NbOfErrors_T _editDistance; + /** List of extra matching (similar) places. 
*/ PlaceOrderedList_T _extraPlaceList; Modified: trunk/opentrep/opentrep/bom/ResultHolder.cpp =================================================================== --- trunk/opentrep/opentrep/bom/ResultHolder.cpp 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/bom/ResultHolder.cpp 2009-08-08 14:55:34 UTC (rev 171) @@ -280,7 +280,7 @@ const NbOfMatches_T lNbOfMatches = lMatchingDocument.notifyIfExtraMatch(); OPENTREP_LOG_DEBUG ("==> " << lNbOfMatches - << " matches for the query string: `" + << " main matches for the query string: `" << lMatchedString << "' (from `" << lQueryString << "')"); Modified: trunk/opentrep/opentrep/bom/StringMatcher.cpp =================================================================== --- trunk/opentrep/opentrep/bom/StringMatcher.cpp 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/bom/StringMatcher.cpp 2009-08-08 14:55:34 UTC (rev 171) @@ -621,4 +621,21 @@ WordHolder::createStringFromWordList (lRemainingWordList); } + // ////////////////////////////////////////////////////////////////////// + std::string StringMatcher::getPlaceCode (const Xapian::Document& iDocument) { + // Retrieve the Xapian document data + const std::string& lDocumentData = iDocument.get_data(); + + // Tokenise the string into words + WordList_T lWordList; + WordHolder::tokeniseStringIntoWordList (lDocumentData, lWordList); + assert (lWordList.empty() == false); + + // By convention (within OpenTrep), the first word of the Xapian + // document data string is the place code + const std::string& lPlaceCode = lWordList.front(); + + return lPlaceCode; + } + } Modified: trunk/opentrep/opentrep/bom/StringMatcher.hpp =================================================================== --- trunk/opentrep/opentrep/bom/StringMatcher.hpp 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/bom/StringMatcher.hpp 2009-08-08 14:55:34 UTC (rev 171) @@ -15,6 +15,7 @@ namespace Xapian { class MSet; class Database; + class Document; } namespace 
OPENTREP { @@ -24,6 +25,7 @@ for more information. */ class StringMatcher : public BomAbstract { public: + // /////////////////////////////////////////////// /** Search, within the Xapian database, for occurrences of the words of the search string. @param Xapian::MSet& The Xapian matching set. It can be empty. @@ -63,6 +65,15 @@ static void subtractParsedToRemaining (const std::string& iAlreadyParsedQueryString, std::string& ioRemainingQueryString); + + + public: + // /////////////////////////////////////////////// + /** Extract the place code from the document data. + <br>The place code is the first 3-letter string of the Xapian + document data/content. */ + static std::string getPlaceCode (const Xapian::Document&); + }; } Modified: trunk/opentrep/opentrep/command/DBManager.cpp =================================================================== --- trunk/opentrep/opentrep/command/DBManager.cpp 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/command/DBManager.cpp 2009-08-08 14:55:34 UTC (rev 171) @@ -60,6 +60,52 @@ // ////////////////////////////////////////////////////////////////////// void DBManager:: + prepareSelectOnPlaceCodeStatement (soci::session& ioSociSession, + soci::statement& ioSelectStatement, + const std::string& iPlaceCode, + Place& ioPlace) { + + try { + + // Instanciate a SQL statement (no request is performed at that stage) + /** + select rpd.code AS code, city_code, xapian_docid, is_airport, is_city, + is_main, is_commercial, state_code, country_code, region_code, + continent_code, time_zone_grp, longitude, latitude, language_code, + classical_name, extended_name, alternate_name1, alternate_name2, + alternate_name3, alternate_name4, alternate_name5, alternate_name6, + alternate_name7, alternate_name8, alternate_name9, alternate_name10 + from ref_place_details rpd, ref_place_names rpn + where rpd.code = iPlaceCode + and rpn.code = rpd.code; + */ + + ioSelectStatement = + (ioSociSession.prepare + << "select rpd.code AS code, 
city_code, xapian_docid, is_airport, " + << "is_city, is_main, is_commercial, state_code, country_code, " + << "region_code, continent_code, time_zone_grp, longitude, latitude, " + << "language_code, classical_name, extended_name, " + << "alternate_name1, alternate_name2, alternate_name3, " + << "alternate_name4, alternate_name5, alternate_name6, " + << "alternate_name7, alternate_name8, alternate_name9, " + << "alternate_name10 " + << "from ref_place_details rpd, ref_place_names rpn " + << "where rpd.code = :place_code " + << "and rpn.code = rpd.code", + soci::into (ioPlace), soci::use (iPlaceCode)); + + // Execute the SQL query + ioSelectStatement.execute(); + + } catch (std::exception const& lException) { + OPENTREP_LOG_ERROR ("Error: " << lException.what()); + throw SQLDatabaseException(); + } + } + + // ////////////////////////////////////////////////////////////////////// + void DBManager:: prepareSelectOnDocIDStatement (soci::session& ioSociSession, soci::statement& ioSelectStatement, const XapianDocID_T& iDocID, @@ -164,7 +210,7 @@ // ////////////////////////////////////////////////////////////////////// bool DBManager::retrievePlace (soci::session& ioSociSession, - const XapianDocID_T& iDocID, + const std::string& iPlaceCode, Place& ioPlace) { bool oHasRetrievedPlace = false; @@ -172,8 +218,9 @@ // Prepare the SQL request corresponding to the select statement soci::statement lSelectStatement (ioSociSession); - DBManager::prepareSelectOnDocIDStatement (ioSociSession, lSelectStatement, - iDocID, ioPlace); + DBManager::prepareSelectOnPlaceCodeStatement (ioSociSession, + lSelectStatement, + iPlaceCode, ioPlace); const bool shouldDoReset = true; bool hasStillData = iterateOnStatement (lSelectStatement, ioPlace, shouldDoReset); Modified: trunk/opentrep/opentrep/command/DBManager.hpp =================================================================== --- trunk/opentrep/opentrep/command/DBManager.hpp 2009-07-27 05:56:43 UTC (rev 170) +++ 
trunk/opentrep/opentrep/command/DBManager.hpp 2009-08-08 14:55:34 UTC (rev 171) @@ -22,28 +22,41 @@ from the database. */ class DBManager { public: + /** Update the Xapian document ID field of the database row + corresponding to the given Place object. */ + static void updatePlaceInDB (soci::session&, const Place&); + + /** Retrieve, from the (MySQL) database, the row corresponding to + the given place code (e.g., 'sfo' for San Francisco Intl + airport), and fill the given Place object with that retrieved + data. */ + static bool retrievePlace (soci::session&, const std::string& iPlaceCode, + Place&); + + + public: /** Prepare (parse and put in cache) the SQL statement. */ static void prepareSelectStatement (soci::session&, soci::statement&, Place&); - /** Prepare (parse and put in cache) the SQL statement. */ - static void prepareSelectOnDocIDStatement (soci::session&, soci::statement&, - const XapianDocID_T&, Place&); - /** Iterate on the SQL statement. <br>The SQL has to be already prepared. @parameter const bool Tells whether the Place object should be reset. */ static bool iterateOnStatement (soci::statement&, Place&, const bool iShouldDoReset); - /** Update the Xapian document ID field of the database row - corresponding to the given Place object. */ - static void updatePlaceInDB (soci::session&, const Place&); + + private: + /** Prepare (parse and put in cache) the SQL statement. */ + static void prepareSelectOnPlaceCodeStatement(soci::session&, + soci::statement&, + const std::string& iPlaceCode, + Place&); + + /** Prepare (parse and put in cache) the SQL statement. */ + static void prepareSelectOnDocIDStatement (soci::session&, soci::statement&, + const XapianDocID_T&, Place&); - /** Retrieve, from the (MySQL) database, the row corresponding to the - given Xapian Document ID, and fill the given Place object with - that retrieved data. */ - static bool retrievePlace (soci::session&, const XapianDocID_T&, Place&); private: /** Constructors. 
*/ Modified: trunk/opentrep/opentrep/command/IndexBuilder.cpp =================================================================== --- trunk/opentrep/opentrep/command/IndexBuilder.cpp 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/command/IndexBuilder.cpp 2009-08-08 14:55:34 UTC (rev 171) @@ -170,16 +170,10 @@ lPlace, shouldDoReset); while (hasStillData == true) { - // Add the document corresponding to the Place object to the + // Add the document, corresponding to the Place object, to the // Xapian index IndexBuilder::addDocumentToIndex (lDatabase, lPlace); - // Update the row in (MySQL) database for the given Place object: - // The Xapian document ID is generated by Xapian when inserting - // the document into the index; that document ID has to be updated - // in the (MySQL) database. - DBManager::updatePlaceInDB (ioSociSession, lPlace); - // DEBUG OPENTREP_LOG_DEBUG ("[" << idx << "] " << lPlace); Modified: trunk/opentrep/opentrep/command/RequestInterpreter.cpp =================================================================== --- trunk/opentrep/opentrep/command/RequestInterpreter.cpp 2009-07-27 05:56:43 UTC (rev 170) +++ trunk/opentrep/opentrep/command/RequestInterpreter.cpp 2009-08-08 14:55:34 UTC (rev 171) @@ -14,6 +14,7 @@ #include <opentrep/bom/ResultHolder.hpp> #include <opentrep/bom/Result.hpp> #include <opentrep/bom/PlaceHolder.hpp> +#include <opentrep/bom/StringMatcher.hpp> #include <opentrep/factory/FacPlaceHolder.hpp> #include <opentrep/factory/FacPlace.hpp> #include <opentrep/factory/FacResultHolder.hpp> @@ -59,58 +60,171 @@ << "=========================================" << std::endl << std::endl); } + + /** Helper function. 
*/ + // ////////////////////////////////////////////////////////////////////// + bool retrieveAndFillPlace (const Xapian::Document& iDocument, + const Xapian::percent& iDocPercentage, + soci::session& ioSociSession, Place& ioPlace) { + bool hasRetrievedPlace = false; + + // Set the matching percentage + ioPlace.setPercentage (iDocPercentage); + + // Retrieve the parameters of the best matching document + const std::string& lPlaceCode = StringMatcher::getPlaceCode (iDocument); + + // DEBUG + const Xapian::docid& lDocID = iDocument.get_docid(); + const std::string& lDocData = iDocument.get_data(); + OPENTREP_LOG_DEBUG ("Place code: " << lPlaceCode << " - Document ID " + << lDocID << ", " << iDocPercentage + << "% [" << lDocData << "]"); + + // Fill the Place object with the row retrieved from the + // (MySQL) database and corresponding to the given place code + // (e.g., 'sfo' for the San Francisco Intl airport). + hasRetrievedPlace = DBManager::retrievePlace (ioSociSession, lPlaceCode, + ioPlace); + + if (hasRetrievedPlace == false) { + /** + The Xapian database/index should contain only places + available within the SQL database, as the first is built from + the latter. If that happens, it means that the user gave a + wrong Xapian database. + */ + OPENTREP_LOG_ERROR ("There is no document corresponding to " + << lPlaceCode << " (Xapian document ID" << lDocID + << " [" << lDocData << "]) in the SQL database. " + << "It usually means that the Xapian index/database " + << "is not synchronised with the SQL database. " + << "[Hint] Rebuild the Xapian index/database " + << "from the SQL database."); + throw XapianTravelDatabaseNotInSyncWithSQLDatabaseException(); + } + + return hasRetrievedPlace; + } + /** Helper function. 
*/ // ////////////////////////////////////////////////////////////////////// + bool retrieveAndFillPlace (const Document& iDocument, + soci::session& ioSociSession, Place& ioPlace) { + // Delegate + const Xapian::Document& lXapianDocument = iDocument.getXapianDocument(); + const Xapian::percent& lDocPercentage = iDocument.getXapianPercentage(); + return retrieveAndFillPlace (lXapianDocument, lDocPercentage, + ioSociSession, ioPlace); + } + + // ////////////////////////////////////////////////////////////////////// void createPlaces (const ResultHolder& iResultHolder, soci::session& ioSociSession, PlaceHolder& ioPlaceHolder) { - // Browse the list of result objects - const ResultList_T& lResultList = iResultHolder.getResultList(); - for (ResultList_T::const_iterator itResult = lResultList.begin(); - itResult != lResultList.end(); ++itResult) { - // Retrieve the result object - const Result* lResult_ptr = *itResult; - assert (lResult_ptr != NULL); + // Browse the list of result objects + const ResultList_T& lResultList = iResultHolder.getResultList(); + for (ResultList_T::const_iterator itResult = lResultList.begin(); + itResult != lResultList.end(); ++itResult) { + // Retrieve the result object + const Result* lResult_ptr = *itResult; + assert (lResult_ptr != NULL); - /** - TODO: Add a loop for retrieving both extra and alternate Documents - Use FacPlace::initLinkWithExtraPlace() and - FacPlace::initLinkWithAlternatePlace() - */ + // Retrieve the matching document + const Document& lDocument = lResult_ptr->getMatchingDocument(); + + // Instanciate an empty place object, which will be filled from the + // rows retrieved from the database. + Place& lPlace = FacPlace::instance().create(); + + // Retrieve, in the MySQL database, the place corresponding to + // the place code located as the first word of the Xapian + // document data. 
+ bool hasRetrievedPlace = retrieveAndFillPlace (lDocument, ioSociSession, + lPlace); + // If there was no place corresponding to the place code with + // the SQL database, an exception is thrown. Hence, here, by + // construction, the place has been retrieved from the SQL + // database. + assert (hasRetrievedPlace == true); + + // Insert the Place object within the PlaceHolder object + FacPlaceHolder::initLinkWithPlace (ioPlaceHolder, lPlace); + + // DEBUG + OPENTREP_LOG_DEBUG ("Retrieved Document: " << lPlace.toString()); + + // Retrieve the list of extra matching documents (documents + // matching with the same weight/percentage) + const Xapian::percent& lExtraDocPercentage = + lDocument.getXapianPercentage(); + const XapianDocumentList_T& lExtraDocumentList = + lDocument.getExtraDocumentList(); + for (XapianDocumentList_T::const_iterator itExtraDoc = + lExtraDocumentList.begin(); + itExtraDoc != lExtraDocumentList.end(); ++itExtraDoc) { + // Retrieve the extra matching Xapian document + const Xapian::Document& lExtraDocument = *itExtraDoc; - // Retrieve the parameters of the best matching document - const Xapian::Document& lDocument = lResult_ptr->getXapianDocument(); - const Xapian::percent& lDocPercentage = - lResult_ptr->getXapianPercentage(); - const Xapian::docid& lDocID = lDocument.get_docid(); - const std::string& lDocData = lDocument.get_data(); - + // Instanciate an empty place object, which will be filled from the + // ... [truncated message content] |
From: <den...@us...> - 2009-07-27 05:56:55
|
Revision: 170 http://opentrep.svn.sourceforge.net/opentrep/?rev=170&view=rev Author: denis_arnaud Date: 2009-07-27 05:56:43 +0000 (Mon, 27 Jul 2009) Log Message: ----------- [Dev] Prepared the code to dig out the edit distance and extra and alternate locations. Modified Paths: -------------- trunk/opentrep/opentrep/bom/Document.cpp trunk/opentrep/opentrep/bom/Document.hpp trunk/opentrep/opentrep/bom/Place.cpp trunk/opentrep/opentrep/bom/Place.hpp trunk/opentrep/opentrep/bom/ResultHolder.cpp trunk/opentrep/opentrep/bom/StringMatcher.cpp trunk/opentrep/opentrep/bom/StringMatcher.hpp trunk/opentrep/opentrep/command/RequestInterpreter.cpp trunk/opentrep/opentrep/factory/FacPlace.cpp trunk/opentrep/opentrep/factory/FacPlace.hpp Modified: trunk/opentrep/opentrep/bom/Document.cpp =================================================================== --- trunk/opentrep/opentrep/bom/Document.cpp 2009-07-25 22:27:45 UTC (rev 169) +++ trunk/opentrep/opentrep/bom/Document.cpp 2009-07-27 05:56:43 UTC (rev 170) @@ -25,7 +25,8 @@ std::ostringstream oStr; oStr << "`" << describeShortKey() << "'"; if (_correctedQueryString.empty() == false) { - oStr << " (corrected into `" << _correctedQueryString << "')"; + oStr << " (corrected into `" << _correctedQueryString + << "' with an edit distance/error of " << _editDistance << ")"; } return oStr.str(); } @@ -37,11 +38,12 @@ const Xapian::docid& lDocID = _document.get_docid(); oStr << " => Document ID " << lDocID << " matching at " << _percentage - << "% [" << _document.get_data() << "]"; + << "% (edit distance of " << _editDistance << ") [" + << _document.get_data() << "]"; if (_documentList.empty() == false) { oStr << " along with " << _documentList.size() - << " other matching document(s) ("; + << " other equivalent matching document(s) ("; unsigned short idx = 0; for (XapianDocumentList_T::const_iterator itDoc = _documentList.begin(); @@ -53,6 +55,25 @@ } oStr << lDocID; } + oStr << ")"; + } + + if (_alternateDocumentList.empty() == 
false) { + oStr << " and with still " << _alternateDocumentList.size() + << " other less matching document(s) ("; + + unsigned short idx = 0; + for (XapianAlternateDocumentList_T::const_iterator itDoc = + _alternateDocumentList.begin(); + itDoc != _alternateDocumentList.end(); ++itDoc, ++idx) { + const Xapian::percent& lPercentage = itDoc->first; + const Xapian::Document& lXapianDoc = itDoc->second; + const Xapian::docid& lDocID = lXapianDoc.get_docid(); + if (idx != 0) { + oStr << ", "; + } + oStr << lDocID << " / " << lPercentage << "%"; + } oStr << ")." << std::endl; } else { @@ -73,7 +94,7 @@ if (_documentList.empty() == false) { oStr << " along with " << _documentList.size() - << " other matching document(s) { "; + << " other equivalent matching document(s) { "; unsigned short idx = 0; for (XapianDocumentList_T::const_iterator itDoc = _documentList.begin(); @@ -85,6 +106,26 @@ } oStr << "Doc ID " << lDocID << " [" << lXapianDoc.get_data() << "]"; } + oStr << " }"; + } + + if (_alternateDocumentList.empty() == false) { + oStr << " and with still " << _alternateDocumentList.size() + << " other less matching document(s) { "; + + unsigned short idx = 0; + for (XapianAlternateDocumentList_T::const_iterator itDoc = + _alternateDocumentList.begin(); + itDoc != _alternateDocumentList.end(); ++itDoc, ++idx) { + const Xapian::percent& lPercentage = itDoc->first; + const Xapian::Document& lXapianDoc = itDoc->second; + const Xapian::docid& lDocID = lXapianDoc.get_docid(); + if (idx != 0) { + oStr << ", "; + } + oStr << lDocID << " / " << lPercentage << "% [" + << lXapianDoc.get_data() << "]"; + } oStr << " }." 
<< std::endl; } else { Modified: trunk/opentrep/opentrep/bom/Document.hpp =================================================================== --- trunk/opentrep/opentrep/bom/Document.hpp 2009-07-25 22:27:45 UTC (rev 169) +++ trunk/opentrep/opentrep/bom/Document.hpp 2009-07-27 05:56:43 UTC (rev 170) @@ -17,6 +17,12 @@ // //////////////// Type definitions ///////////////// /** List of Xapian documents. */ typedef std::list<Xapian::Document> XapianDocumentList_T; + + /** Pair of a Xapian document and its associated matching percentage. */ + typedef std::pair<Xapian::percent, Xapian::Document> XapianDocumentPair_T; + + /** List of Xapian documents. */ + typedef std::list<XapianDocumentPair_T> XapianAlternateDocumentList_T; // //////////////// Main Class ///////////////// @@ -48,12 +54,19 @@ return _percentage; } - /** Get the extra list of matching Xapian documents. */ + /** Get the extra list of matching Xapian documents (i.e., those + having matched with the same weight as the main one). */ const XapianDocumentList_T& getExtraDocumentList() const { return _documentList; } + /** Get the alternate list of matching Xapian documents (i.e., those + having matched with a lower weight than the main one). */ + const XapianAlternateDocumentList_T& getAlternateDocumentList() const { + return _alternateDocumentList; + } + // ////////////////// Setters //////////////// /** Set the query string. */ void setQueryString (const TravelQuery_T& iQueryString) { @@ -75,12 +88,25 @@ _percentage = iPercentage; } + /** Set the edit distance/error, with which the matching has been made. */ + void setEditDistance (const NbOfErrors_T& iEditDistance) { + _editDistance = iEditDistance; + } + /** Add a matching Xapian document (having the same matching percentage). */ void addExtraDocument (const Xapian::Document& iMatchingDocument) { _documentList.push_back (iMatchingDocument); } + /** Add a matching Xapian document (having a lower matching percentage). 
*/ + void addAlternateDocument (const Xapian::percent& iMatchingPercentage, + const Xapian::Document& iMatchingDocument) { + _alternateDocumentList. + push_back (XapianDocumentPair_T (iMatchingPercentage, + iMatchingDocument)); + } + public: // /////////// Business methods ///////// /** Retrieve the number of extra matches for the given query string, @@ -143,10 +169,18 @@ /** Matching document, as returned by the Xapian full text search. */ Xapian::Document _document; + /** Edit distance/error, with which the matching has been made. */ + NbOfErrors_T _editDistance; + /** List of Xapian documents having the same matching percentage. <br>Hence, any of those other Xapian documents could have been chosen, instead of the main one. */ XapianDocumentList_T _documentList; + + /** List of Xapian documents having the a lower matching percentage. + <br>Those alternate matches can be suggested (in the famous + "Did you mean Xxx?" question) to the end user. */ + XapianAlternateDocumentList_T _alternateDocumentList; }; } Modified: trunk/opentrep/opentrep/bom/Place.cpp =================================================================== --- trunk/opentrep/opentrep/bom/Place.cpp 2009-07-25 22:27:45 UTC (rev 169) +++ trunk/opentrep/opentrep/bom/Place.cpp 2009-07-27 05:56:43 UTC (rev 170) @@ -10,12 +10,13 @@ namespace OPENTREP { // ////////////////////////////////////////////////////////////////////// - Place::Place () : _world (NULL), _placeHolder (NULL) { + Place::Place () : _world (NULL), _placeHolder (NULL), _mainPlace (NULL) { } // ////////////////////////////////////////////////////////////////////// Place::Place (const Place& iPlace) : _world (iPlace._world), _placeHolder (iPlace._placeHolder), + _mainPlace (iPlace._mainPlace), _placeCode (iPlace._placeCode), _cityCode (iPlace._cityCode), _stateCode (iPlace._stateCode), _countryCode (iPlace._countryCode), _regionCode (iPlace._regionCode), _continentCode (iPlace._continentCode), Modified: trunk/opentrep/opentrep/bom/Place.hpp 
=================================================================== --- trunk/opentrep/opentrep/bom/Place.hpp 2009-07-25 22:27:45 UTC (rev 169) +++ trunk/opentrep/opentrep/bom/Place.hpp 2009-07-27 05:56:43 UTC (rev 170) @@ -14,6 +14,7 @@ #include <opentrep/Location.hpp> #include <opentrep/bom/BomAbstract.hpp> #include <opentrep/bom/Names.hpp> +#include <opentrep/bom/PlaceList.hpp> namespace OPENTREP { @@ -93,6 +94,16 @@ language. */ bool getNameList (const Language::EN_Language&, NameList_T&) const; + /** Get the list of extra matching (similar) places. */ + const PlaceOrderedList_T& getExtraPlaceList() const { + return _extraPlaceList; + } + + /** Get the list of alternate matching (less similar) places. */ + const PlaceOrderedList_T& getAlternatePlaceList() const { + return _alternatePlaceList; + } + // ///////// Setters //////// /** Set the Place code. */ @@ -206,9 +217,14 @@ /** Parent World. */ World* _world; - /** Parent PlaceHolder. */ + /** Parent PlaceHolder (not always defined,for instance if the + current Place object is an extra or alternate one). */ PlaceHolder* _placeHolder; + /** Parent (main) Place (not always defined,for instance if the + current Place object is itself a main one). */ + Place* _mainPlace; + private: // /////// Attributes ///////// /** Place code. */ @@ -233,6 +249,12 @@ NameMatrix_T _nameMatrix; /** Xapian document ID. */ XapianDocID_T _docID; + + /** List of extra matching (similar) places. */ + PlaceOrderedList_T _extraPlaceList; + + /** List of alternate matching (less similar) places. 
*/ + PlaceOrderedList_T _alternatePlaceList; }; } Modified: trunk/opentrep/opentrep/bom/ResultHolder.cpp =================================================================== --- trunk/opentrep/opentrep/bom/ResultHolder.cpp 2009-07-25 22:27:45 UTC (rev 169) +++ trunk/opentrep/opentrep/bom/ResultHolder.cpp 2009-07-27 05:56:43 UTC (rev 170) @@ -110,26 +110,26 @@ << lMaxEditDistance << "."); // Retrieve the list of Xapian documents matching the query string + NbOfErrors_T lCalculatedEditDistance = 0; oMatchedString = - StringMatcher::searchString (ioMatchingSet, ioPartialQueryString, - lMaxEditDistance, - hasReachedMaximalAllowableEditDistance, - _database); + StringMatcher::searchString(ioMatchingSet, ioPartialQueryString, + lCalculatedEditDistance, lMaxEditDistance, + hasReachedMaximalAllowableEditDistance, + _database); // DEBUG OPENTREP_LOG_DEBUG ("---- Current query string: `" << ioPartialQueryString << "' --- Kept query: `" << oMatchedString - << "', with a maximal edit distance of " - << lMaxEditDistance << ", for " + << "', with an edit distance of a maximum of " + << lCalculatedEditDistance << " (over " + << lMaxEditDistance << "), for " << ioMatchingSet.size() << " matches."); if (ioMatchingSet.empty() == false) { - // Create the corresponding list of documents - StringMatcher:: - extractBestMatchingDocumentFromMSet (ioMatchingSet, - ioMatchingDocument); - + // Store the calculated (and applied) edit distance/erro + ioMatchingDocument.setEditDistance (lCalculatedEditDistance); + // Since a result has been found, the search can be stopped // for that part of the query. 
shouldStop = true; Modified: trunk/opentrep/opentrep/bom/StringMatcher.cpp =================================================================== --- trunk/opentrep/opentrep/bom/StringMatcher.cpp 2009-07-25 22:27:45 UTC (rev 169) +++ trunk/opentrep/opentrep/bom/StringMatcher.cpp 2009-07-27 05:56:43 UTC (rev 170) @@ -114,12 +114,13 @@ // /////////////////////////////////////////////////////////////////// void checkAndAlterIfNeeded (TravelQuery_T& ioSuggestedString, const TravelQuery_T& iOriginalString, + NbOfErrors_T& ioCalculatedEditDistance, const NbOfErrors_T& iMaxEditDistance, const Xapian::Database& iDatabase) { /** - Store a copy of the suggested string, as it will me altered by - the below method. + Store a copy of the suggested string, as it will be altered by + the below method, i.e., removeFurthestLeftWord(). */ TravelQuery_T lOriginalStringCopy (iOriginalString); StringMatcher::removeFurthestLeftWord (lOriginalStringCopy); @@ -128,15 +129,14 @@ Get a spell-corrected suggestion for the reduced original string. <br>Limit the edit distance to the given maximal one. 
*/ - NbOfErrors_T lCalculatedEditDistance = - calculateEditDistance (lOriginalStringCopy); + ioCalculatedEditDistance = calculateEditDistance (lOriginalStringCopy); - lCalculatedEditDistance = std::min (lCalculatedEditDistance, - iMaxEditDistance); + ioCalculatedEditDistance = std::min (ioCalculatedEditDistance, + iMaxEditDistance); std::string lSuggestionForReducedOriginalString = iDatabase.get_spelling_suggestion (lOriginalStringCopy, - lCalculatedEditDistance); + ioCalculatedEditDistance); /** Note that if the suggestion on the reduced-original string is @@ -154,7 +154,7 @@ OPENTREP_LOG_DEBUG ("The suggestion (`" << ioSuggestedString << "') for `" << iOriginalString << "', with an edit distance/error of " - << lCalculatedEditDistance + << ioCalculatedEditDistance << " over " << iMaxEditDistance << " allowable" << ", is the same as the suggestion for the reduced " << "original string (`" << lOriginalStringCopy @@ -184,14 +184,14 @@ Get a spell-corrected suggestion for the reduced original string. <br>Limit the edit distance to the given maximal one. 
*/ - lCalculatedEditDistance = calculateEditDistance (lOriginalStringCopy); + ioCalculatedEditDistance = calculateEditDistance (lOriginalStringCopy); - lCalculatedEditDistance = std::min (lCalculatedEditDistance, - iMaxEditDistance); + ioCalculatedEditDistance = std::min (ioCalculatedEditDistance, + iMaxEditDistance); lSuggestionForReducedOriginalString = iDatabase.get_spelling_suggestion (lOriginalStringCopy, - lCalculatedEditDistance); + ioCalculatedEditDistance); /** Note that if the suggestion on the reduced-original string is @@ -209,7 +209,7 @@ OPENTREP_LOG_DEBUG ("The suggestion (`" << ioSuggestedString << "') for `" << iOriginalString << "', with an edit distance/error of " - << lCalculatedEditDistance + << ioCalculatedEditDistance << " over " << iMaxEditDistance << " allowable" << ", is the same as the suggestion for the reduced " << "original string (`" << lOriginalStringCopy @@ -228,6 +228,7 @@ std::string StringMatcher:: searchString (Xapian::MSet& ioMatchingSet, const TravelQuery_T& iSearchString, + NbOfErrors_T& ioCalculatedEditDistance, NbOfErrors_T& ioMaxEditDistance, bool& ioHasReachedMaximalAllowableEditDistance, const Xapian::Database& iDatabase) { @@ -407,26 +408,26 @@ phrase/string. With the above example, 'sna francisco' yields the suggestion 'san francisco'. 
*/ - NbOfErrors_T lCalculatedEditDistance = - calculateEditDistance (lOriginalQueryString); + ioCalculatedEditDistance = calculateEditDistance (lOriginalQueryString); // Store the greatest edit distance/error - lMaxEditDistance = std::max (lMaxEditDistance, lCalculatedEditDistance); + lMaxEditDistance = std::max (lMaxEditDistance, ioCalculatedEditDistance); // Limit the edit distance to the given maximal one - lCalculatedEditDistance = std::min (lCalculatedEditDistance, - ioMaxEditDistance); + ioCalculatedEditDistance = std::min (ioCalculatedEditDistance, + ioMaxEditDistance); std::string lFullWordCorrectedString = iDatabase.get_spelling_suggestion (lOriginalQueryString, - lCalculatedEditDistance); + ioCalculatedEditDistance); /** Check that the suggestion does not encompass extra words, which will be otherwise/rather recognised in another step. */ checkAndAlterIfNeeded (lFullWordCorrectedString, lOriginalQueryString, - ioMaxEditDistance, iDatabase); + ioCalculatedEditDistance, ioMaxEditDistance, + iDatabase); /** Since there is still no match, we search on the string @@ -528,7 +529,7 @@ NbOfMatches_T idx = 1; for ( ; itDoc != iMatchingSet.end(); ++itDoc, ++idx) { const Xapian::percent& lPercentage = itDoc.get_percent(); - // const Xapian::Document& lDocument = itDoc.get_document(); + const Xapian::Document& lDocument = itDoc.get_document(); // DEBUG /* @@ -536,12 +537,15 @@ << lDocument.get_docid() << " matching at " << lPercentage << "%."); */ - + + /** If the matching percentage is the same as for the main + (chosen) Xapian document, then add it to the dedicated + list. Otherwise, add it to the alternative choices. 
*/ if (lPercentage == lBestPercentage) { - ioMatchingDocument.addExtraDocument (itDoc.get_document()); + ioMatchingDocument.addExtraDocument (lDocument); } else { - break; + ioMatchingDocument.addAlternateDocument (lPercentage, lDocument); } } } Modified: trunk/opentrep/opentrep/bom/StringMatcher.hpp =================================================================== --- trunk/opentrep/opentrep/bom/StringMatcher.hpp 2009-07-25 22:27:45 UTC (rev 169) +++ trunk/opentrep/opentrep/bom/StringMatcher.hpp 2009-07-27 05:56:43 UTC (rev 170) @@ -28,6 +28,7 @@ words of the search string. @param Xapian::MSet& The Xapian matching set. It can be empty. @param const std::string& The query string. + @param NbOfErrors_T& The calculated (and applied) edit distance/error. @param NbOfErrors_T& The maximal allowable edit distance/error. @param bool& Whether or not the maximal allowable edit distance/error has become greater than the maximum of the edit distance/errors @@ -37,6 +38,7 @@ which has yielded matches. 
*/ static std::string searchString (Xapian::MSet&, const std::string& iSearchString, + NbOfErrors_T& ioCalculatedEditDistance, NbOfErrors_T& ioMaxEditDistance, bool& ioHasReachedMaximalAllowableEditDistance, const Xapian::Database&); Modified: trunk/opentrep/opentrep/command/RequestInterpreter.cpp =================================================================== --- trunk/opentrep/opentrep/command/RequestInterpreter.cpp 2009-07-25 22:27:45 UTC (rev 169) +++ trunk/opentrep/opentrep/command/RequestInterpreter.cpp 2009-07-27 05:56:43 UTC (rev 170) @@ -72,6 +72,13 @@ const Result* lResult_ptr = *itResult; assert (lResult_ptr != NULL); + /** + TODO: Add a loop for retrieving both extra and alternate Documents + Use FacPlace::initLinkWithExtraPlace() and + FacPlace::initLinkWithAlternatePlace() + */ + + // Retrieve the parameters of the best matching document const Xapian::Document& lDocument = lResult_ptr->getXapianDocument(); const Xapian::percent& lDocPercentage = Modified: trunk/opentrep/opentrep/factory/FacPlace.cpp =================================================================== --- trunk/opentrep/opentrep/factory/FacPlace.cpp 2009-07-25 22:27:45 UTC (rev 169) +++ trunk/opentrep/opentrep/factory/FacPlace.cpp 2009-07-27 05:56:43 UTC (rev 170) @@ -63,4 +63,26 @@ return *oPlace_ptr; } + // ////////////////////////////////////////////////////////////////////// + void FacPlace::initLinkWithExtraPlace (Place& ioMainPlace, + Place& ioExtraPlace) { + // Link the main Place to the extra Place, and vice versa + ioExtraPlace._mainPlace = &ioMainPlace; + + // Add the extra Place to the main Place internal map (of extra + // Place objects) + ioMainPlace._extraPlaceList.push_back (&ioExtraPlace); + } + + // ////////////////////////////////////////////////////////////////////// + void FacPlace::initLinkWithAlternatePlace (Place& ioMainPlace, + Place& ioAlternatePlace) { + // Link the main Place to the alternate Place, and vice versa + ioAlternatePlace._mainPlace = 
&ioMainPlace; + + // Add the alternate Place to the main Place internal map (of + // alternate Place objects) + ioMainPlace._extraPlaceList.push_back (&ioAlternatePlace); + } + } Modified: trunk/opentrep/opentrep/factory/FacPlace.hpp =================================================================== --- trunk/opentrep/opentrep/factory/FacPlace.hpp 2009-07-25 22:27:45 UTC (rev 169) +++ trunk/opentrep/opentrep/factory/FacPlace.hpp 2009-07-27 05:56:43 UTC (rev 170) @@ -36,6 +36,20 @@ @return Place& The newly created object. */ Place& clone (const Place&); + /** Initialise the link between a Place and an extra Place. + @param Place& Main Place object. + @param Place& Extra Place object. + @exception FacExceptionNullPointer + @exception FacException.*/ + static void initLinkWithExtraPlace (Place&, Place&); + + /** Initialise the link between a Place and an alternate Place. + @param Place& Main Place object. + @param Place& Alternate Place object. + @exception FacExceptionNullPointer + @exception FacException.*/ + static void initLinkWithAlternatePlace (Place&, Place&); + private: /** Default Constructor. <br>This constructor is private in order to ensure the singleton This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-07-25 22:27:55
|
Revision: 169 http://opentrep.svn.sourceforge.net/opentrep/?rev=169&view=rev Author: denis_arnaud Date: 2009-07-25 22:27:45 +0000 (Sat, 25 Jul 2009) Log Message: ----------- [DB] Added New Delhi (DEL) airport. Modified Paths: -------------- trunk/opentrep/db/data/ref_place_names.csv Modified: trunk/opentrep/db/data/ref_place_names.csv =================================================================== --- trunk/opentrep/db/data/ref_place_names.csv 2009-07-25 22:11:25 UTC (rev 168) +++ trunk/opentrep/db/data/ref_place_names.csv 2009-07-25 22:27:45 UTC (rev 169) @@ -378,7 +378,7 @@ en,def,dezful,dezful/ir:dezful en,deh,decorah,decorah/ia/us:municipal en,dei,denis island,denis island/sc -en,del,delhi,delhi/in:indira gandhi intl +en,del,delhi,delhi/in:indira gandhi intl,new delhi en,dem,dembidollo,dembidollo/et en,den,denver,denver/co/us:denver intl en,deo,dearborn,dearborn/mi/us:hyatt regency This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-07-25 22:11:33
|
Revision: 168 http://opentrep.svn.sourceforge.net/opentrep/?rev=168&view=rev Author: denis_arnaud Date: 2009-07-25 22:11:25 +0000 (Sat, 25 Jul 2009) Log Message: ----------- [DB] Altered Buenos Aeres, AR. Modified Paths: -------------- trunk/opentrep/db/data/ref_place_names.csv Modified: trunk/opentrep/db/data/ref_place_names.csv =================================================================== --- trunk/opentrep/db/data/ref_place_names.csv 2009-07-25 16:25:42 UTC (rev 167) +++ trunk/opentrep/db/data/ref_place_names.csv 2009-07-25 22:11:25 UTC (rev 168) @@ -4377,7 +4377,7 @@ en,bub,burwell,burwell/ne/us:municipal en,buc,burketown,burketown/ql/au en,bud,budapest,budapest/hu:ferihegy -en,bue,buenos aires,buenos aires/ba/ar +en,bue,buenos aires,buenos aires/ba/ar,buenos aires,buenos aeres en,buf,buffalo,buffalo/ny/us:niagara intl en,bug,benguela gen v de,benguela/ao:gen v deslandes en,buh,bucharest,bucharest/ro This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-07-25 16:25:48
|
Revision: 167 http://opentrep.svn.sourceforge.net/opentrep/?rev=167&view=rev Author: denis_arnaud Date: 2009-07-25 16:25:42 +0000 (Sat, 25 Jul 2009) Log Message: ----------- [DB] Corrected the Ukrainian names. Modified Paths: -------------- trunk/opentrep/db/data/ref_place_names.csv Modified: trunk/opentrep/db/data/ref_place_names.csv =================================================================== --- trunk/opentrep/db/data/ref_place_names.csv 2009-07-25 16:19:27 UTC (rev 166) +++ trunk/opentrep/db/data/ref_place_names.csv 2009-07-25 16:25:42 UTC (rev 167) @@ -2178,7 +2178,7 @@ en,ieg,zielona gora,zielona gora/pl:babimost en,iej,iejima,iejima/jp en,ies,riesa,riesa/de -en,iev,kiev zhulhany,kiev/ua:zhulhany,kyiv +en,iev,kiev zhulyany,kiev/ua:zhulyany,kyiv,zhulhany,julhany en,ifa,iowa falls,iowa falls/ia/us en,iff,iffley,iffley/ql/au en,ifh,hesa,hesa/ir:hesa @@ -6694,7 +6694,7 @@ en,mpt,maliana,maliana/tl en,mpu,mapua,mapua/pg en,mpv,montpelier,montpel/vt/us:edward f k state -en,mpw,mariupol,mariupol/ua,zhdanov,mariupolis +en,mpw,mariupol,mariupol/ua,zhdanov,jdanov,mariupolis en,mpx,miyanmin,miyanmin/pg en,mpy,maripasoula,maripasoula/gf en,mpz,mt pleasant,mt pleasant/ia/us:municipal @@ -8150,7 +8150,7 @@ en,udd,palm springs udd,palm springs/ca/us:bermuda en,ude,uden,uden/nl:volkel en,udi,uberlandia,uberlandia/mg/br:eduardo gomes -en,udj,uzhhorod,uzhhorod/ua,uzhgorod,ungwar +en,udj,uzhhorod,uzhhorod/ua,uzhgorod,ujgorod,ungwar en,udn,udine,udine/it:airfield en,udo,udomxay,udomxay/la en,udr,udaipur,udaipur/in:dabok This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-07-25 16:19:34
|
Revision: 166 http://opentrep.svn.sourceforge.net/opentrep/?rev=166&view=rev Author: denis_arnaud Date: 2009-07-25 16:19:27 +0000 (Sat, 25 Jul 2009) Log Message: ----------- [DB] Corrected the Ukrainian names. Modified Paths: -------------- trunk/opentrep/db/data/ref_place_names.csv Modified: trunk/opentrep/db/data/ref_place_names.csv =================================================================== --- trunk/opentrep/db/data/ref_place_names.csv 2009-07-24 10:01:54 UTC (rev 165) +++ trunk/opentrep/db/data/ref_place_names.csv 2009-07-25 16:19:27 UTC (rev 166) @@ -244,7 +244,7 @@ en,hre,harare,harare/zw en,hrg,hurghada,hurghada/eg en,hrj,chaurjhari,chaurjhari/np -en,hrk,kharkov,kharkov/ua +en,hrk,kharkiv,kharkiv/ua,kharkov en,hrl,harlingen,harlingen/tx/us:valley intl en,hrm,hassi r'mel,hassi r'mel/dz:tilrempt en,hrn,heron island,heron island/ql/au:heliport @@ -1228,7 +1228,7 @@ en,dng,doogan airport,doogan/wa/au:doogan airport en,dnh,dunhuang,dunhuang/cn en,dni,wad medani,wad medani/sd -en,dnk,dnepropetrovsk,dnepropetrovsk/ua +en,dnk,dnipropetrovsk,dnipropetrovsk/ua,dnepropetrovsk,yekaterinoslav,sicheslav,dnipro en,dnl,augusta daniel fd,augusta daniel/ga/us:daniel en,dnm,denham,denham/wa/au en,dnn,dalton,dalton/ga/us:municipal @@ -1986,7 +1986,7 @@ en,cju,jeju arpt,jeju/kr:jeju apt en,cka,cherokee,cherokee/ok/us:kegelman af en,ckb,clarksburg,clarksburg/wv/us:benedum -en,ckc,cherkassy,cherkassy/ua +en,ckc,cherkasy,cherkasy/ua,cherkassy en,ckd,crooked creek,crooked creek/ak/us en,cke,clear lake,clear lake/ca/us en,ckg,chongqing,chongqing/cn @@ -2066,7 +2066,7 @@ en,lwl,wells,wells/nv/us:harriet field en,lwm,lawrence,lawrence/ma/us en,lwn,gyoumri,gyoumri/am -en,lwo,lviv snilow,lviv/ua:snilow +en,lwo,lviv snilow,lviv/ua:snilow,lviv,lvov en,lwr,leeuwarden,leeuwarden/nl en,lws,lewiston clarkstn,lewiston/id/us:nez perce cnt en,lwt,lewistown,lewistown/mt/us:municipal @@ -2168,7 +2168,7 @@ en,kcm,kahramanmaras,kahramanmaras/tr en,kcn,chernofski,chernofski/ak/us:spb 
en,kco,kocaeli,kocaeli/tr:kocaeli -en,kcp,kamenets podolski,kamenets podolski/ua +en,kcp,kamyanets podilskyi,kamyanets podilskyi/ua,kamenets podolski en,kcq,chignik lake,chignik lake/ak/us en,idn,indagen,indagen/pg en,ido,santa isabel do m,santa isabel do m/to/br @@ -2178,14 +2178,14 @@ en,ieg,zielona gora,zielona gora/pl:babimost en,iej,iejima,iejima/jp en,ies,riesa,riesa/de -en,iev,kiev zhulhany,kiev/ua:zhulhany +en,iev,kiev zhulhany,kiev/ua:zhulhany,kyiv en,ifa,iowa falls,iowa falls/ia/us en,iff,iffley,iffley/ql/au en,ifh,hesa,hesa/ir:hesa en,ifj,isafjordur,isafjordur/is en,ifl,innisfail,innisfail/ql/au en,ifn,isfahan,isfahan/ir -en,ifo,ivano frankovsk,ivano frankovsk/ua +en,ifo,ivano frankivsk,ivano frankivsk/ua,ivano frankovsk en,ifp,bullhead city,bullhead city/az/us:laughlin en,iga,inagua,inagua/bs en,igb,ingeniero jacobac,ingeniero jacobacci/rn/ar @@ -2718,7 +2718,7 @@ en,hme,hassi messaoud,hassi messaoud/dz:oued irara en,hmg,hermannsburg,hermannsburg/nt/au en,hmi,hami,hami/cn -en,hmj,khmelnitskiy,khmelnitskiy/ua +en,hmj,khmelnytskyi,khmelnytskyi/ua,khmelnitskiy,ruzhichnaya en,hmn,alamogordo hmn,alamogordo/nm/us:holloman afb en,hmo,hermosillo,hermosillo/mx:gen pesqueir en,hmr,hamar,hamar/no:hamar apt @@ -3307,7 +3307,7 @@ en,khr,kharkhorin,kharkhorin/mn en,khs,khasab,khasab/om en,kht,khost,khost/af -en,khu,kremenchug,kremenchug/ua +en,khu,kremenchuk,kremenchuk/ua,kremenchug en,khv,khabarovsk,khabarovsk/ru:novyy en,khw,khwai river lodge,khwai river lodge/bw en,khy,khoy,khoy/ir @@ -3576,7 +3576,7 @@ en,ceg,chester,chester/gb en,ceh,chelinda,chelinda/mw:chelinda en,cei,chiang rai,chiang rai/th -en,cej,chernigov,chernigov/ua +en,cej,chernihiv,chernihiv/ua,chernigov en,cek,chelyabinsk,chelyabinsk/ru en,cel,cape eleuthera,cape eleuthera/bs en,cem,central,central/ak/us @@ -3984,7 +3984,7 @@ en,cvu,corvo isalnd,corvo island/pt en,cwa,wausau central,wausau/wi/us:central wisconsin en,cwb,curitiba,curitiba/pr/br:afonso pena -en,cwc,chernovtsy,chernovtsy/ua 
+en,cwc,chernivtsi,chernivtsi/ua,chernovtsy en,cwf,chennault intl,lake charles/la/us:chennault i en,cwg,callaway gardens,callaway gardens/ga/us en,cwi,clinton,clinton/ia/us @@ -4041,7 +4041,7 @@ en,kwd,kawadjia,kawadjia/cf en,kwe,guiyang,guiyang/cn en,kwf,waterfall,waterfall/ak/us:waterfall spb -en,kwg,krivoy rog,krivoy rog/ua +en,kwg,kryvyi rih,kryvyi rih/ua,krivoy rog en,kwh,khwahan,khwahan/af en,kwi,kuwait,kuwait/kw:international en,kwj,gwangju,gwangju/kr @@ -4481,7 +4481,7 @@ en,kgl,kigali,kigali/rw:gregoire kayibanda en,kgm,kungum,kungum/pg en,kgn,kasongo lunda,kasongo lunda/cd -en,kgo,kirovograd,kirovograd/ua +en,kgo,kirovohrad,kirovohrad/ua,kirovograd en,kgp,kogalym intl,kogalym/ru:kogalym intl en,kgr,kulgera,kulgera/nt/au en,kgs,kos,kos/gr @@ -4644,7 +4644,7 @@ en,tni,satna,satna/in en,tnj,tanjung pinang,tanjung pinang/id:kidjang en,tnk,tununak,tununak/ak/us -en,tnl,ternopol,ternopol/ua +en,tnl,ternopil,ternopil/ua,ternopol en,tnm,teniente r marsh,teniente r marsh/aq en,tnn,tainan,tainan/tw en,tno,tamarindo,tamarindo/cr @@ -4968,7 +4968,7 @@ en,nls,nicholson,nicholson/wa/au en,nlt,xinyuan city,xinyuan city/cn:nalati en,nlu,mexico city st lu,mexico city/mx:santa lucia -en,nlv,nikolaev,nikolaev/ua +en,nlv,nikolayev,nikolayev/ua,nikolaev en,nma,namangan,namangan/uz en,nmb,daman,daman/in en,nmc,norman's cay,norman's cay/bs @@ -5126,7 +5126,7 @@ en,oys,yosemite ntl park,yosemite ntl park/ca/us en,oza,ozona,ozona/tx/us en,ozc,ozamis city,ozamis city/ph:labo -en,ozh,zaporozhye,zaporozhye/ua +en,ozh,zaporizhia,zaporizhia/ua,zaporizhzhia,zaporozhye,zaporijia en,ozi,bobadilla,bobadilla/es en,ozp,moron,moron/es en,ozr,ozark cairns aaf,ozark/al/us:cairns aaf @@ -5569,7 +5569,7 @@ en,mxo,monticello,monticello/ia/us:municipal en,mxp,milan malpensa,milan/it:malpensa en,mxq,mitchell river,mitchell river/ql/au -en,mxr,mirgorod,mirgorod/ua +en,mxr,myrhorod,myrhorod/ua,mirgorod en,mxs,maota savaii is,maota savaii is/ws en,mxt,maintirano,maintirano/mg 
en,mxu,mullewa,mullewa/wa/au @@ -6694,7 +6694,7 @@ en,mpt,maliana,maliana/tl en,mpu,mapua,mapua/pg en,mpv,montpelier,montpel/vt/us:edward f k state -en,mpw,mariupol,mariupol/ua +en,mpw,mariupol,mariupol/ua,zhdanov,mariupolis en,mpx,miyanmin,miyanmin/pg en,mpy,maripasoula,maripasoula/gf en,mpz,mt pleasant,mt pleasant/ia/us:municipal @@ -6852,7 +6852,7 @@ en,ses,selma selfield,selma/al/us:selfield en,set,san esteban,san esteban/hn en,seu,seronera,seronera/tz -en,sev,severodoneck,severodoneck/ua +en,sev,sievierodonetsk,sievierodonetsk/ua,severodonetsk en,sew,siwa,siwa/eg en,sex,sembach,sembach/de en,sey,selibaby,selibaby/mr @@ -7767,7 +7767,7 @@ en,lea,learmonth,learmonth/wa/au en,leb,lebanon whiterjct,lebanon/nh/us:lebanon rgnl en,lec,lencois,lenco/ba/br:chapada diamantina -en,led,st petersburg,st petersburg/ru:pulkovo +en,led,st petersburg,st petersburg/ru:pulkovo,leningrad en,lee,leesburg,leesburg/fl/us en,lef,lebakeng,lebakeng/ls en,leg,aleg,aleg/mr @@ -8150,7 +8150,7 @@ en,udd,palm springs udd,palm springs/ca/us:bermuda en,ude,uden,uden/nl:volkel en,udi,uberlandia,uberlandia/mg/br:eduardo gomes -en,udj,uzhgorod,uzhgorod/ua +en,udj,uzhhorod,uzhhorod/ua,uzhgorod,ungwar en,udn,udine,udine/it:airfield en,udo,udomxay,udomxay/la en,udr,udaipur,udaipur/in:dabok @@ -9166,7 +9166,7 @@ en,rwf,redwood falls,redwood falls/mn/us:redwood en,rwi,rocky mount,rocky mount/nc/us:rocky mount en,rwl,rawlins,rawlins/wy/us -en,rwn,rovno,rovno/ua +en,rwn,rivne,rivne/ua,rovno en,rwp,rawalpindi,rawalpindi/pk:off- en,rws,sumare,sumare/sp/br:sumare en,rxa,raudha,raudha/ye @@ -9691,7 +9691,7 @@ en,zgh,copenhagen rail,copenhagen/dk:railway station en,vnx,vilanculos,vilanculos/mz en,vny,los angeles vny,los angeles/ca/us:van nuys -en,vog,volgograd,volgograd/ru +en,vog,volgograd,volgograd/ru,stalingrad en,voh,vohemar,vohemar/mg en,voi,voinjama,voinjama/lr en,vok,camp douglas,camp douglas/wi/us:volk field @@ -9718,7 +9718,7 @@ en,vsa,villahermosa,villahermosa/mx:cap c perez en,vse,viseu,viseu/pt 
en,vsf,springfield,springfield/vt/us:state -en,vsg,lugansk,lugansk/ua +en,vsg,luhansk,luhansk/ua,lugansk,luhanske en,vsk,kennewick,kennewick/wa/us:vista field en,vso,phuoclong,phuoclong/vn en,vst,vasteras hasslo,stockholm/se:vasteras hasslo @@ -10449,7 +10449,7 @@ en,zto,boston south,boston/ma/us:south railway sta en,ztp,itapetininga,itapetininga/sp/br en,ztq,sbb rail ch zone2,sbb rail ch zone 2/ch -en,ztr,zhitomir,zhitomir/ua +en,ztr,zhytomyr,zhytomyr/ua,zhitomir,jitomir en,zts,tahsis,tahsis/bc/ca en,ztt,cottbus,cottbus/de:railway station en,ztv,sturtevant rail,sturtevant/wi/us:railway stati @@ -10655,7 +10655,7 @@ en,vij,virgin gorda,virgin gorda/vg en,vik,kavik,kavik/ak/us:airstrip en,vil,dakhla,dakhla/ma -en,vin,vinnica,vinnica/ua +en,vin,vinnytsia,vinnytsia/ua,vinnica,vinnytsya,vinnitsa en,viq,viqueque,viqueque/tl en,vir,durban virginia,durban/za:virginia en,vis,visalia,visalia/ca/us This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |
From: <den...@us...> - 2009-07-24 10:02:08
|
Revision: 165 http://opentrep.svn.sourceforge.net/opentrep/?rev=165&view=rev Author: denis_arnaud Date: 2009-07-24 10:01:54 +0000 (Fri, 24 Jul 2009) Log Message: ----------- [db] Added new ref data set Added Paths: ----------- trunk/opentrep/db/data/ref_city2.csv Added: trunk/opentrep/db/data/ref_city2.csv =================================================================== --- trunk/opentrep/db/data/ref_city2.csv (rev 0) +++ trunk/opentrep/db/data/ref_city2.csv 2009-07-24 10:01:54 UTC (rev 165) @@ -0,0 +1,11135 @@ +aaa,,,y,y,n,y,,pf,pacif,itc3,pf087,-145.5,-17.42,-17.352606,-145.509956,AAA,Anaa,PF +aab,,,y,y,n,y,ql,au,austl,itc3,au131,141,-26.75,-26.75,141,AAB,Arrabury,AU +aac,,,y,y,n,n,,eg,afric,itc2,eg044,33.8,31.13,31.073333,33.835833,AAC,Al Arish,EG +aad,,,y,y,n,n,,sd,afric,itc2,sd172,30.95,18.05,18.05,30.95,AAD,Ad Dabbah,SD +aae,,,y,y,n,y,,dz,afric,itc2,dz,7.817,36.83,36.822225,7.809167,AAE,Annaba Les Salines,DZ +aaf,,,y,y,n,n,fl,us,namer,itc1,us105,-85.03,29.73,29.73,-85.03,AAF,Apalachicola Municipal,US +aag,,,y,y,n,n,pr,br,samer,itc1,br015,-43.27,-22.87,-22.87,-43.27,AAG,Arapoti,BR +aah,,,y,y,n,n,,de,europ,itc2,de040,6.133,50.75,50.75,6.133,AAH,Aachen Merzbruck,DE +aai,,,y,y,n,n,to,br,samer,itc1,br016,-46.93,-12.92,-12.92,-46.93,AAI,Arraias,BR +aaj,,,y,y,n,n,,sr,samer,itc1,sr,-55.37,3.9,3.9,-55.37,AAJ,Awaradam Cayana Airstrip,SR +aak,,,y,y,n,n,,ki,austl,itc3,ki069,174,0.2167,0.2167,174,AAK,Aranuka,KI +aal,,,y,y,n,y,,dk,europ,itc2,dk041,9.933,57.05,57.092789,9.849164,AAL,Aalborg,DK +aam,,,y,y,n,y,,za,afric,itc2,za,31.53,-24.8,-24.8,31.53,AAM,Mala Mala,ZA +aan,,,y,y,n,y,,ae,meast,itc2,ae,55.75,24.25,24.261667,55.609167,AAN,Al Ain,AE +aao,,,y,y,n,n,,ve,samer,itc1,ve194,-64.47,9.417,9.430225,-64.470725,AAO,Anaco,VE +aap,hou,,y,n,n,n,tx,us,namer,itc1,us107,-95.3,29.65,29.65,-95.3,HOU,Houston Hobby,US +aaq,,,y,y,n,y,,ru,euras,itc2,ru130,37.32,44.9,45.002097,37.347272,AAQ,Anapa,RU +aar,,,y,y,n,y,,dk,europ,itc2,dk041,10.62,56.3,56.300017,10.619008,AAR,Aarhus 
Aarhus,DK +aas,,,y,y,n,n,,id,seasi,itc3,id063,139.3,-3.917,-3.917,139.3,AAS,Apalapsili,ID +aat,,,y,y,n,y,,cn,asia,itc3,cn035,89.5,43.93,43.93,89.5,AAT,Altay,CN +aau,,,y,y,n,n,,ws,austl,itc3,ws,-172.6,-13.45,-13.45,-172.6,AAU,Asau,WS +aav,,,y,y,n,n,,ph,pacif,itc3,ph,124.8,6.333,6.333,124.8,AAV,Alah,PH +aaw,,,y,y,n,n,,pk,asia,itc3,pk163,73.25,34.2,34.2,73.25,AAW,Abbottabad,PK +aax,,,y,y,n,y,mg,br,samer,itc1,br015,-46.92,-19.57,-19.57,-46.92,AAX,Araxa,BR +aay,,,y,y,n,y,,ye,meast,itc2,ye,52.18,16.2,16.191667,52.175,AAY,Al Ghaydah,YE +aaz,,,y,y,n,n,,gt,camer,itc1,gt119,-91.5,14.86,14.86,-91.5,AAZ,Quetzaltenan Quetzaltenango,GT +aba,,,y,y,n,y,,ru,euras,itc2,ru143,91.43,53.72,-34.943333,117.808889,ABA,Abakan,RU +abb,,,y,y,n,n,,gb,europ,itc2,gb053,-0.3,51.53,51.53,-0.3,ABB,Abingdon Raf Station,GB +abc,,,y,y,n,y,,es,europ,itc2,es045,-15.18,38.95,38.95,-15.18,ABC,Albacete Los Llanos,ES +abd,,,y,y,n,y,,ir,meast,itc2,ir120,48.23,30.37,30.371111,48.228333,ABD,Abadan,IR +abe,,,y,y,n,y,pa,us,namer,itc1,us105,-75.43,40.65,40.652083,-75.440806,ABE,Allentown Bthlehm,US +abf,,,y,y,n,n,,ki,austl,itc3,ki069,172.8,1.417,1.417,172.8,ABF,Abaiang,KI +abg,,,y,y,n,n,ql,au,austl,itc3,au131,143.2,-17.67,-17.67,143.2,ABG,Abingdon,AU +abh,,,y,y,n,n,ql,au,austl,itc3,au131,146.6,-23.7,-23.646111,146.583611,ABH,Alpha,AU +abi,,,y,y,n,y,tx,us,namer,itc1,us107,-99.68,32.42,32.411319,-99.681897,ABI,Abilene Abilene Mnpl,US +abj,,,y,y,n,y,,ci,afric,itc2,ci,-3.933,5.25,5.261386,-3.926294,ABJ,Abidjan F Houphouet Boigny,CI +abk,,,y,y,n,y,,et,afric,itc2,et,44.27,6.733,6.733,44.27,ABK,Kabri Dar,ET +abl,,,y,y,n,y,ak,us,namer,itc1,us111,-157.8,67.1,67.1,-157.8,ABL,Ambler,US +abm,,,y,y,n,y,ql,au,austl,itc3,au131,142.4,-10.87,-10.950833,142.459444,ABM,Bamaga,AU +abn,,,y,y,n,n,,sr,samer,itc1,sr,-54.05,5.483,5.483,-54.05,ABN,Albina,SR +abo,,,y,y,n,n,,ci,afric,itc2,ci,-3.2,5.467,5.467,-3.2,ABO,Aboisso,CI +abp,,,y,y,n,n,,pg,austl,itc3,pg,141.1,-6.067,-6.067,141.1,ABP,Atkamba,PG 
+abq,,,y,y,n,y,nm,us,namer,itc1,us108,-106.6,35.05,35.05,-106.6,ABQ,Albuquerque Intl,US +abr,,,y,y,n,y,sd,us,namer,itc1,us107,-98.43,45.45,45.449056,-98.421833,ABR,Aberdeen Aberdeen Mnpl,US +abs,,,y,y,n,y,,eg,afric,itc2,eg044,31.62,22.37,22.375953,31.611722,ABS,Abu Simbel,EG +abt,,,y,y,n,y,,sa,meast,itc2,sa,42.22,22.83,20.296139,41.634277,ABT,Al Baha Al Aqiq,SA +abu,,,y,y,n,n,,id,seasi,itc3,id062,124.9,-9.333,-9.333,124.9,ABU,Atambua,ID +abv,,,y,y,n,y,,ng,afric,itc2,ng,7.183,9.2,9.006792,7.263172,ABV,Abuja International,NG +abw,,,y,y,n,n,,pg,austl,itc3,pg,148.7,-10.17,-10.17,148.7,ABW,Abau,PG +abx,,,y,y,n,y,ns,au,austl,itc3,au007,146.9,-36.05,-36.067778,146.958056,ABX,Albury,AU +aby,,,y,y,n,y,ga,us,namer,itc1,us105,-84.2,31.53,31.535515,-84.194473,ABY,Albany Dougherty County,US +abz,,,y,y,n,y,,gb,europ,itc2,gb053,-2.2,57.2,57.201944,-2.197778,ABZ,Aberdeen Dyce,GB +aca,,,y,y,n,y,,mx,namer,itc1,mx168,-99.8,16.78,16.757061,-99.753953,ACA,Acapulco Alvarez Intl,MX +acb,,,y,y,n,n,mi,us,namer,itc1,us105,-85.2,44.98,44.98,-85.2,ACB,Bellaire Antrim County,US +acc,,,y,y,n,y,,gh,afric,itc2,gh,-0.1667,5.6,5.605186,-0.166786,ACC,Accra Kotoka,GH +acd,,,y,y,n,n,,co,samer,itc1,co132,-77.3,8.517,8.517,-77.3,ACD,Acandi,CO +ace,,,y,y,n,y,,es,europ,itc2,es046,-13.6,28.95,28.945464,-13.605225,ACE,Lanzarote,ES +ach,,,y,y,n,y,,ch,europ,itc2,ch032,9.55,47.48,47.485033,9.560775,ACH,Altenrhein,CH +aci,,,y,y,n,y,,gb,europ,itc2,gb053,-2.217,49.7,49.706111,-2.214722,ACI,Alderney The Blaye,GB +acj,,,y,y,n,n,,lk,asia,itc3,lk187,80.43,8.3,8.3,80.43,ACJ,Anuradhapura Anuradhapura,LK +ack,,,y,y,n,y,ma,us,namer,itc1,us105,-70.07,41.27,41.253053,-70.060181,ACK,Nantucket Memorial,US +acl,,,y,y,n,n,,co,samer,itc1,co132,-73,4.75,4.75,-73,ACL,Aguaclara,CO +acm,,,y,y,n,n,,co,samer,itc1,co132,-71.78,-2.133,-2.133,-71.78,ACM,Arica,CO +acn,,,y,y,n,n,,mx,namer,itc1,mx168,-100.9,29.3,29.3,-100.9,ACN,Ciudad Ac Ciudad Acuna Intl,MX +aco,,,y,y,n,n,,ch,europ,itc2,ch032,8.767,46.15,46.15,8.767,ACO,Ascona,CH 
+acp,,,y,y,n,y,,ir,meast,itc2,ir120,46.15,37.35,37.35,46.15,ACP,Sahand Sahand Airport,IR +acr,,,y,y,n,n,,co,samer,itc1,co132,-72.3,-0.3833,-0.3833,-72.3,ACR,Araracuara,CO +acs,,,y,y,n,n,,ru,euras,itc2,ru143,90.57,56.27,56.27,90.57,ACS,Achinsk,RU +act,,,y,y,n,y,tx,us,namer,itc1,us107,-97.23,31.62,31.611289,-97.230519,ACT,Waco Waco Mnpl,US +acu,,,y,y,n,n,,pa,camer,itc1,pa,-77.97,8.433,8.433,-77.97,ACU,Achutupo,PA +acv,,,y,y,n,y,ca,us,namer,itc1,us110,-124.1,40.98,40.978111,-124.108611,ACV,Arcata Eureka,US +acx,,,y,y,n,y,,cn,asia,itc3,cn035,104.9,25.08,40.05,116.6,ACX,Xingyi Xingyi,CN +acy,aiy,,y,n,n,y,nj,us,namer,itc1,us105,-74.58,39.47,39.457583,-74.577167,AIY,Atlantic City Bader Fiel,US +acz,,,y,y,n,y,,ir,meast,itc2,ir120,61.54,31.09,31.09,61.54,ACZ,Zabol Zabol Airport,IR +ada,,,y,y,n,y,,tr,euras,itc2,tr101,35.28,36.98,36.982166,35.280388,ADA,Adana,TR +adb,izm,,y,n,n,y,,tr,euras,itc2,tr101,27.17,38.28,38.292392,27.156953,IZM,Izmir,TR +adc,,,n,y,n,n,,pg,austl,itc3,pg,154.7,-7.141,-7.141,154.7,ADC,Andakombe,PG +add,,,y,y,n,y,,et,afric,itc2,et,38.8,8.983,8.977889,38.799319,ADD,Addis Ababa Bole,ET +ade,,,y,y,n,y,,ye,meast,itc2,ye,45.03,12.83,12.829542,45.028792,ADE,Aden International,YE +adf,,,y,y,n,y,,tr,euras,itc2,tr101,38.46,37.73,37.73,38.46,ADF,Adiyaman Adiyaman,TR +adg,,,y,y,n,n,mi,us,namer,itc1,us105,-84.03,41.9,41.9,-84.03,ADG,Adrian Lenawee County,US +adh,,,y,y,n,n,,ru,euras,itc2,ru145,125.4,58.6,58.6,125.4,ADH,Aldan,RU +adi,,,y,y,n,n,,na,afric,itc2,na004,15,-22.4,-22.462223,14.98,ADI,Arandis,NA +adj,amm,,y,n,n,y,,jo,meast,itc2,jo068,35.98,31.97,31.972703,35.991569,AMM,Amman Queen Alia,JO +adk,,,y,y,n,y,ak,us,namer,itc1,us112,-176.7,51.87,51.87,-176.7,ADK,Adak Island Adak Island,US +adl,,,y,y,n,y,sa,au,austl,itc3,au009,138.5,-34.95,-34.945,138.530556,ADL,Adelaide,AU +adm,,,y,y,n,n,ok,us,namer,itc1,us107,-97.02,34.3,34.3,-97.02,ADM,Ardmore Municipal,US +adn,,,y,y,n,n,,co,samer,itc1,co132,-75.88,5.667,5.667,-75.88,ADN,Andes,CO 
+ado,,,y,y,n,n,sa,au,austl,itc3,au009,137.2,-31.02,-31.02,137.2,ADO,Andamooka,AU +adp,,,y,y,n,n,,lk,asia,itc3,lk187,81.63,7.336,8.301486,80.4279,ADP,Ampara Ampara,LK +adq,,,y,y,n,y,ak,us,namer,itc1,us111,-152.4,57.8,57.749967,-152.493856,ADQ,Kodiak Kodiak Apt,US +adr,,,y,y,n,n,sc,us,namer,itc1,us105,-79.57,33.45,33.45,-79.57,ADR,Andrews,US +ads,dfw,,y,n,n,n,tx,us,namer,itc1,us107,-96.85,32.85,32.85,-96.85,DFW,Dallas Dallas Ft Worth,US +adt,,,y,y,n,n,ok,us,namer,itc1,us107,-96.68,34.77,34.77,-96.68,ADT,Ada,US +adu,,,y,y,n,y,,ir,meast,itc2,ir120,48.3,30.25,38.325678,48.424356,ADU,Ardabil,IR +adv,,,y,y,n,n,,gb,europ,itc2,gb053,-1.467,51.22,51.22,-1.467,ADV,Andover,GB +adw,,,y,y,n,n,md,us,namer,itc1,us105,-76.92,38.8,38.810806,-76.867028,ADW,Camp Springs Andrews Afb,US +adx,,,y,y,n,y,,gb,europ,itc2,gb053,-2.867,56.37,56.372889,-2.868444,ADX,St Andrews Leuchars,GB +ady,,,y,y,n,n,,za,afric,itc2,za,29.1,-22.67,-22.67,29.1,ADY,Alldays,ZA +adz,,,y,y,n,y,,co,samer,itc1,co132,-81.7,12.58,12.583594,-81.711192,ADZ,San Andres Island,CO +aea,,,y,y,n,n,,ki,austl,itc3,ki069,173.9,0.4833,0.4833,173.9,AEA,Abemama Atoll,KI +aeb,,,y,y,n,n,,cn,asia,itc3,cn035,107,23.72,23.72,107,AEB,NA,NA +aed,,,y,y,n,n,ak,us,namer,itc1,us111,-152.9,58.03,58.03,-152.9,AED,Aleneva,US +aee,,,y,y,n,n,,sd,afric,itc2,sd172,32.95,10.06,10.06,32.95,AEE,NA,NA +aeg,,,y,y,n,n,,id,seasi,itc3,id061,99.45,1.383,1.383,99.45,AEG,Aek Godang,ID +aeh,,,y,y,n,y,,td,afric,itc2,td,20.85,13.85,13.847,20.844333,AEH,Abecher,TD +aei,,,y,y,n,n,,es,europ,itc2,es045,-5.45,36.18,36.18,-5.45,AEI,Algeciras,ES +aek,,,y,y,n,n,,pg,austl,itc3,pg,146.3,-7.367,-7.367,146.3,AEK,Aseki,PG +ael,,,y,y,n,n,mn,us,namer,itc1,us107,-93.37,43.68,43.68,-93.37,AEL,Albert Lea,US +aeo,,,y,y,n,n,,mr,afric,itc2,mr,-9.65,16.7,16.711294,-9.637883,AEO,Aioun El Atrouss,MR +aep,bue,,y,n,n,y,ba,ar,samer,itc1,ar003,-58.37,-34.57,-34.559175,-58.415606,BUE,Buenos Aires,AR +aer,,,y,y,n,y,,ru,euras,itc2,ru130,39.93,43.45,43.449928,39.956589,AER,Adler Sochi,RU 
+aes,,,y,y,n,y,,no,europ,itc2,no083,6.1,62.55,62.560372,6.110164,AES,Aalesund Vigra,NO +aet,,,y,y,n,y,ak,us,namer,itc1,us111,-152.7,66.55,66.55,-152.7,AET,Allakaket,US +aeu,,,y,y,n,n,,ir,meast,itc2,ir120,55.03,25.88,25.88,55.03,AEU,Abu Musa Abu Musa Airport,IR +aex,,,y,y,n,y,la,us,namer,itc1,us107,-92.3,31.4,31.3274,-92.549833,AEX,Alexandr Alexandria Intl,US +aey,,,y,y,n,y,,is,europ,itc2,is,-18.08,65.65,65.659994,-18.072703,AEY,Akureyri,IS +afa,,,y,y,n,y,md,ar,samer,itc1,ar003,-68.4,-34.58,-34.588314,-68.403854,AFA,San Rafael,AR +afd,,,y,y,n,n,,za,afric,itc2,za,26.88,-33.58,-33.58,26.88,AFD,Port Alfred,ZA +aff,cos,,y,n,n,n,co,us,namer,itc1,us108,-105.9,37.43,37.43,-105.9,COS,Colorado Springs Mnpl,US +afi,,,y,y,n,n,,co,samer,itc1,co132,-75.07,6.917,6.917,-75.07,AFI,Amalfi,CO +afk,adp,,y,n,n,n,,lk,asia,itc3,lk187,81.64,7.292,7.292,81.64,ADP,Ampara Ampara,LK +afl,,,y,y,n,y,mt,br,samer,itc1,br018,-56.1,-9.85,-9.866092,-56.106206,AFL,Alta Floresta,BR +afn,,,y,y,n,n,nh,us,namer,itc1,us105,-72,42.8,42.8,-72,AFN,Jaffrey Municipal,US +afo,,,y,y,n,n,wy,us,namer,itc1,us108,-110.9,42.72,42.72,-110.9,AFO,Afton Municipal,US +afr,,,y,y,n,n,,pg,austl,itc3,pg,148.4,-9.133,-9.133,148.4,AFR,Afore,PG +afs,,,y,y,n,y,,uz,euras,itc3,uz138,64.23,41.61,41.61,64.23,AFS,Zarafshan,UZ +aft,,,y,y,n,y,,sb,austl,itc3,sb,160.9,-9.183,-9.183,160.9,AFT,Afutara Afutara Aerodrome,SB +afw,dfw,,y,n,n,n,tx,us,namer,itc1,us107,-97.03,32.95,32.987639,-97.318806,DFW,Dallas Dallas Ft Worth,US +afy,,,y,y,n,n,,tr,euras,itc2,tr101,30.6,38.73,38.726425,30.601114,AFY,Afyon,TR +afz,,,y,y,n,y,,ir,meast,itc2,ir120,57.67,36.21,36.21,57.67,AFZ,Sabzevar,IR +aga,,,y,y,n,y,,ma,afric,itc2,ma196,-9.55,30.38,30.324997,-9.413067,AGA,Agadir Agadir Almassira,MA +agb,muc,,y,n,n,n,,de,europ,itc2,de040,10.9,48.38,48.425158,10.931764,MUC,Munich Franz J Strauss,DE +agc,pit,,y,n,n,n,pa,us,namer,itc1,us105,-79.93,40.35,40.354403,-79.930169,PIT,Pittsburgh Intl,US 
+agd,,,y,y,n,n,,id,seasi,itc3,id063,133.9,-1.383,-1.383,133.9,AGD,Anggi,ID +age,,,y,y,n,n,,de,europ,itc2,de040,7.917,53.78,53.78,7.917,AGE,Wangerooge Flugplatz,DE +agf,,,y,y,n,y,,fr,europ,itc2,fr052,0.6,44.18,44.174721,0.590556,AGF,Agen La Garenne,FR +agg,,,y,y,n,n,,pg,austl,itc3,pg,144.1,-4.067,-4.067,144.1,AGG,Angoram,PG +agh,,,y,y,n,y,,se,europ,itc2,se095,12.85,56.3,56.3,12.85,AGH,Angelholm Angelholm Airport,SE +agi,,,y,y,n,n,,sr,samer,itc1,sr,-56.68,5.767,5.767,-56.68,AGI,Wageningen,SR +agj,,,y,y,n,n,,jp,asia,itc3,jp,127.2,26.83,26.83,127.2,AGJ,Aguni,JP +agk,,,y,y,n,n,,pg,austl,itc3,pg,143.8,-6.333,-6.333,143.8,AGK,Kagua,PG +agl,,,y,y,n,n,,pg,austl,itc3,pg,149.2,-9.333,-9.333,149.2,AGL,Wanigela,PG +agm,,,y,y,n,y,,gl,europ,itc1,gl055,-38,65.58,65.58,-38,AGM,Tasiilaq,GL +agn,,,y,y,n,y,ak,us,namer,itc1,us111,-134.6,57.5,57.5,-134.6,AGN,Angoon,US +ago,,,y,y,n,n,ar,us,namer,itc1,us107,-93.22,33.23,33.23,-93.22,AGO,Magnolia Municipal,US +agp,,,y,y,n,y,,es,europ,itc2,es045,-4.5,36.67,36.6749,-4.499106,AGP,Malaga,ES +agq,,,y,y,n,n,,gr,europ,itc2,gr058,21.38,38.6,38.602022,21.351208,AGQ,Agrinion,GR +agr,,,y,y,n,y,,in,asia,itc3,in,77.97,27.15,27.155831,77.960892,AGR,Agra Kheria,IN +ags,,,y,y,n,y,ga,us,namer,itc1,us105,-81.97,33.37,33.369944,-81.9645,AGS,Augusta Bush Field,US +agt,,,y,y,n,y,,py,samer,itc1,py093,-54.52,-25.52,-25.4555,-54.843592,AGT,Ciudad Del Este Alejo Garci,PY +agu,,,y,y,n,y,,mx,namer,itc1,mx168,-102.3,21.87,21.705558,-102.317858,AGU,Aguascalientes,MX +agv,,,y,y,n,y,,ve,samer,itc1,ve194,-69.23,9.55,9.553422,-69.237536,AGV,Acarigua,VE +agw,,,y,y,n,n,ql,au,austl,itc3,au131,142.2,-12.15,-12.15,142.2,AGW,Agnew,AU +agx,,,y,y,n,y,,in,asia,itc3,in,72.2,10.83,10.823656,72.176042,AGX,Agatti Island,IN +agy,,,y,y,n,n,wa,au,austl,itc3,au011,128.8,-16.35,-16.35,128.8,AGY,Argyle Downs,AU +agz,,,y,y,n,n,,za,afric,itc2,za,18.85,-29.05,-29.281767,18.813869,AGZ,Aggeneys,ZA +aha,oka,,y,n,n,n,,jp,asia,itc3,jp,127.7,26.2,26.2,127.7,OKA,Okinawa Naha,JP 
+ahb,,,y,y,n,y,,sa,meast,itc2,sa,42.5,18.22,18.240367,42.656625,AHB,Abha,SA +ahc,,,y,y,n,n,ca,us,namer,itc1,us110,-120.2,40.27,40.27,-120.2,AHC,Herlong Amedee Aaf,US +ahd,adm,,y,n,n,n,ok,us,namer,itc1,us107,-97.15,34.15,34.15,-97.15,ADM,Ardmore Municipal,US +ahe,,,y,y,n,y,,pf,pacif,itc3,pf087,-146.3,-14.43,-14.43,-146.3,AHE,Ahe,PF +ahf,,,y,y,n,n,ne,us,namer,itc1,us107,-99.9,40.3,40.3,-99.9,AHF,Arapahoe Municipal,US +ahh,,,y,y,n,n,wi,us,namer,itc1,us107,-92.37,45.28,45.28,-92.37,AHH,Amery Municipal,US +ahi,,,y,y,n,n,,id,seasi,itc3,id063,128.9,-3.333,-3.333,128.9,AHI,Amahai,ID +ahl,,,y,y,n,n,,gy,samer,itc1,gy,-59.32,2.483,2.483,-59.32,AHL,Aishalton,GY +ahm,,,y,y,n,n,or,us,namer,itc1,us110,-122.7,42.19,42.19,-122.7,AHM,NA,NA +ahn,,,y,y,n,n,ga,us,namer,itc1,us105,-83.33,33.95,33.948594,-83.326347,AHN,Athens,US +aho,,,y,y,n,y,,it,europ,itc2,it067,8.283,40.63,40.632133,8.290772,AHO,Alghero Fertilia,IT +ahs,,,y,y,n,y,,hn,camer,itc1,hn,-84.42,15.43,15.43,-84.42,AHS,Ahuas,HN +aht,,,y,y,n,n,ak,us,namer,itc1,us112,179,51.5,51.5,179,AHT,Amchitka,US +ahu,,,y,y,n,y,,ma,afric,itc2,ma196,-3.833,35.18,35.177103,-3.839525,AHU,Al Hoceima Charif Al Idriss,MA +ahy,,,y,y,n,n,,mg,afric,itc2,mg,45.53,-20.02,-20.02,45.53,AHY,Ambatolahy,MG +ahz,,,y,y,n,n,,fr,europ,itc2,fr052,6.083,45.08,45.08,6.083,AHZ,Alpe D Huez,FR +aia,,,y,y,n,y,ne,us,namer,itc1,us108,-102.8,42.07,42.07,-102.8,AIA,Alliance,US +aib,,,y,y,n,n,ak,us,namer,itc1,us111,-132.4,56.23,56.23,-132.4,AIB,Anita Bay,US +aic,,,y,y,n,n,,mh,pacif,itc3,mh075,171.2,7.1,7.1,171.2,AIC,Airok,MH +aid,,,y,y,n,n,in,us,namer,itc1,us105,-85.68,40.17,40.17,-85.68,AID,Anderson Municipal,US +aie,,,y,y,n,n,,pg,austl,itc3,pg,144.7,-5.133,-5.133,144.7,AIE,Aiome,PG +aif,,,y,y,n,n,sp,br,samer,itc1,br015,-50.42,-22.67,-22.638564,-50.455914,AIF,Assis,BR +aig,,,y,y,n,n,,cf,afric,itc2,cf,23.25,6.517,6.517,23.25,AIG,Yalinga,CF +aih,,,n,y,n,n,,pg,austl,itc3,pg,141.3,-7.349,-7.349,141.3,AIH,Aiambak,PG 
+aii,,,y,y,n,n,,dj,afric,itc2,dj,42.72,11.15,11.15,42.72,AII,Alisabieh,DJ +aik,,,y,y,n,n,sc,us,namer,itc1,us105,-81.68,33.65,33.65,-81.68,AIK,Aiken Municipal,US +ail,,,y,y,n,n,,pa,camer,itc1,pa,-78.02,9.233,9.233,-78.02,AIL,Ailigandi,PA +aim,,,y,y,n,n,,mh,pacif,itc3,mh075,170,10.2,10.2,170,AIM,Ailuk Island,MH +ain,,,y,y,n,y,ak,us,namer,itc1,us111,-160,70.63,70.613378,-159.86035,AIN,Wainwright,US +aio,,,y,y,n,n,ia,us,namer,itc1,us107,-95.02,41.4,41.4,-95.02,AIO,Atlantic Municipal,US +aip,,,y,y,n,n,,mh,pacif,itc3,mh075,168.8,7.267,7.267,168.8,AIP,Ailinglapalap Isl,MH +air,,,y,y,n,n,mt,br,samer,itc1,br018,-56.45,-14.42,-14.42,-56.45,AIR,Aripuana,BR +ais,,,y,y,n,n,,ki,austl,itc3,ki069,176.8,-2.633,-2.633,176.8,AIS,Arorae Island,KI +ait,,,y,y,n,y,,ck,austl,itc3,ck,-159.8,-18.85,-18.830922,-159.764233,AIT,Aitutaki,CK +aiu,,,y,y,n,y,,ck,austl,itc3,ck,-158.1,-20.03,-20.03,-158.1,AIU,Atiu Island,CK +aiv,,,y,y,n,n,al,us,namer,itc1,us107,-88.2,33.1,33.1,-88.2,AIV,Aliceville George Downer,US +aiw,,,y,y,n,n,,na,afric,itc2,na004,17.58,-27.98,-27.98,17.58,AIW,Ai Ais,NA +aiy,,,y,y,n,n,nj,us,namer,itc1,us105,-74.45,39.37,39.37,-74.45,AIY,Atlantic City Bader Fiel,US +aiz,,,y,y,n,n,mo,us,namer,itc1,us107,-92.55,38.1,38.1,-92.55,AIZ,Kaiser Lake Ozark Lee C,US +aja,,,y,y,n,y,,fr,europ,itc2,fr052,8.8,41.92,41.923637,8.802917,AJA,Ajaccio Campo Dell Oro,FR +ajf,,,y,y,n,y,,sa,meast,itc2,sa,40.2,29.93,29.93,40.2,AJF,Jouf,SA +aji,,,y,y,n,y,,tr,euras,itc2,tr101,43.03,39.65,39.65,43.03,AJI,Agri,TR +ajj,,,y,y,n,n,,mr,afric,itc2,mr,-14.38,19.73,19.73,-14.38,AJJ,Akjoujt,MR +ajk,,,y,y,n,n,,ir,meast,itc2,ir120,49.68,34.09,34.09,49.68,AJK,Araak Araak,IR +ajl,,,y,y,n,y,,in,asia,itc3,in,92.72,23.73,23.746603,92.802767,AJL,Aizawl,IN +ajn,,,y,y,n,y,,km,iocea,itc2,km,44.43,-12.13,-12.131667,44.430279,AJN,Anjouan Ouani,KM +ajo,,,y,y,n,n,,ye,meast,itc2,ye,44.25,15.72,29.785133,40.100006,AJO,Aljouf,YE +ajr,,,y,y,n,y,,se,europ,itc2,se095,19.28,65.58,65.590278,19.281944,AJR,Arvidsjaur,SE 
+ajs,,,y,y,n,n,,mx,namer,itc1,mx080,-113.6,26.73,26.73,-113.6,AJS,Abreojos,MX +aju,,,y,y,n,y,se,br,samer,itc1,br016,-37.07,-10.9,-10.984,-37.070333,AJU,Aracaju Aracaju,BR +ajy,,,y,y,n,n,,ne,afric,itc2,ne,7.983,16.97,16.965997,8.000114,AJY,Agades,NE +aka,,,y,y,n,y,,cn,asia,itc3,cn035,109,32.5,32.5,109,AKA,Ankang,CN +akb,,,y,y,n,y,ak,us,namer,itc1,us112,-174.2,52.2,52.2,-174.2,AKB,Atka,US +akc,cak,,y,n,n,n,oh,us,namer,itc1,us105,-81.07,41.4,41.4,-81.07,CAK,Canton Akron Akron,US +akd,,,y,y,n,n,,in,asia,itc3,in,77.08,20.67,-26.871064,26.718003,AKD,Akola,IN +ake,,,y,y,n,n,,ga,afric,itc2,ga,13.92,-1.167,-1.167,13.92,AKE,Akieni,GA +akf,,,y,y,n,y,,ly,afric,itc2,ly122,23.32,24.2,24.178728,23.313958,AKF,Kufrah,LY +akg,,,y,y,n,n,,pg,austl,itc3,pg,142.3,-3.583,-3.583,142.3,AKG,Anguganak,PG +akh,,,y,y,n,n,,sa,meast,itc2,sa,47.58,24.06,24.06,47.58,AKH,Al K Prince Sultan Air Base,SA +aki,,,y,y,n,y,ak,us,namer,itc1,us111,-161.2,60.9,60.9,-161.2,AKI,Akiak,US +akj,,,y,y,n,y,,jp,asia,itc3,jp,142.5,43.5,43.670833,142.4475,AKJ,Asahikawa,JP +akk,,,y,y,n,y,ak,us,namer,itc1,us111,-154.2,56.95,56.95,-154.2,AKK,Akhiok Akhiok Spb,US +akl,,,y,y,n,y,,nz,austl,itc3,nz084,174.8,-37.02,-37.008056,174.791667,AKL,Auckland Auckland,NZ +akm,,,y,y,n,n,,td,afric,itc2,td,19.82,10.88,10.88,19.82,AKM,Zakouma,TD +akn,,,y,y,n,y,ak,us,namer,itc1,us111,-156.6,58.52,58.676778,-156.649278,AKN,King Salmon,US +ako,,,y,y,n,n,co,us,namer,itc1,us108,-103.2,40.17,40.17,-103.2,AKO,Akron Washington Co,US +akp,,,y,y,n,y,ak,us,namer,itc1,us111,-151.7,68.13,68.13,-151.7,AKP,Anaktuvuk,US +akq,,,y,y,n,n,,id,seasi,itc3,id061,105.2,-4.617,-4.617,105.2,AKQ,Astraksetra Gunung Batin,ID +akr,,,y,y,n,n,,ng,afric,itc2,ng,5.2,7.25,7.246739,5.301008,AKR,Akure,NG +aks,,,y,y,n,y,,sb,austl,itc3,sb,160.7,-8.667,-27.660617,27.315761,AKS,Auki Gwaunaru'u,SB +akt,,,y,y,n,n,,cy,europ,itc2,cy039,32.95,35.57,34.590416,32.987861,AKT,Akrotiri Akrotiri Raf,CY +aku,,,y,y,n,y,,cn,asia,itc3,cn035,75.67,38.67,38.67,75.67,AKU,Aksu,CN 
+akv,,,y,y,n,y,qc,ca,namer,itc1,ca025,-78.58,60.73,60.73,-78.58,AKV,Akulivik,CA +akw,,,y,y,n,n,,ir,meast,itc2,ir120,49.68,30.75,30.75,49.68,AKW,Aghajari Aghajari,IR +akx,,,y,y,n,y,,kz,euras,itc3,kz162,57.22,50.25,50.245833,57.206667,AKX,Aktyubinsk,KZ +aky,,,y,y,n,y,,mm,seasi,itc3,mm,92.87,20.12,20.132708,92.872628,AKY,Sittwe Civil,MM +ala,,,y,y,n,y,,kz,euras,itc3,kz164,76.92,43.32,43.352072,77.040508,ALA,Almaty,KZ +alb,,,y,y,n,y,ny,us,namer,itc1,us105,-73.8,42.73,42.748267,-73.801692,ALB,Albany Albany Intl Apt,US +alc,,,y,y,n,y,,es,europ,itc2,es045,-0.5,38.38,38.282169,-0.558156,ALC,Alicante,ES +ald,,,y,y,n,n,,pe,samer,itc1,pe155,-69.35,-11.7,-11.7,-69.35,ALD,Alerta,PE +ale,,,y,y,n,n,tx,us,namer,itc1,us107,-103.7,30.38,30.38,-103.7,ALE,Alpine,US +alf,,,y,y,n,y,,no,europ,itc2,no083,23.37,69.98,69.976111,23.371667,ALF,Alta,NO +alg,,,y,y,n,y,,dz,afric,itc2,dz,3.217,36.7,36.691014,3.215408,ALG,Algiers Houari Boumediene,DZ +alh,,,y,y,n,y,wa,au,austl,itc3,au011,117.8,-34.95,-34.95,117.8,ALH,Albany,AU +ali,,,y,y,n,n,tx,us,namer,itc1,us107,-98.02,27.73,27.740889,-98.026944,ALI,Alice International,US +alj,,,y,y,n,n,,za,afric,itc2,za,16.53,-28.62,-28.575001,16.533333,ALJ,Alexander Bay Kortdoorn,ZA +alk,,,y,y,n,n,,et,afric,itc2,et,39.12,7.967,7.967,39.12,ALK,Asela,ET +all,,,y,y,n,n,,it,europ,itc2,it067,8.117,44.03,44.050608,8.127428,ALL,Albenga,IT +alm,,,y,y,n,y,nm,us,namer,itc1,us108,-106,32.83,32.839944,-105.990583,ALM,Alamogordo Municipal,US +aln,,,y,y,n,n,il,us,namer,itc1,us107,-90.17,38.88,38.890292,-90.046044,ALN,Alton,US +alo,,,y,y,n,y,ia,us,namer,itc1,us107,-92.38,42.55,42.557081,-92.400345,ALO,Waterloo,US +alp,,,y,y,n,y,,sy,meast,itc2,sy098,37.23,36.18,36.180675,37.224358,ALP,Aleppo Nejrab,SY +alq,,,y,y,n,n,rs,br,samer,itc1,br015,-55.73,-29.8,-29.8,-55.73,ALQ,Alegrete Federal,BR +alr,,,y,y,n,n,,nz,austl,itc3,nz084,169.4,-45.22,-45.211666,169.373333,ALR,Alexandra,NZ +als,,,y,y,n,y,co,us,namer,itc1,us108,-105.9,37.43,37.43,-105.9,ALS,Alamosa Municipal,US 
+alt,,,y,y,n,n,pa,br,samer,itc1,br018,-54.73,-1.933,-1.933,-54.73,ALT,Alenquer,BR +alu,,,y,y,n,n,,so,afric,itc2,so,50.67,11.88,11.88,50.67,ALU,Alula,SO +alv,,,y,y,n,n,,ad,europ,itc2,ad001,0.45,40.98,40.98,0.45,ALV,Andorra La Andorra La Vella,AD +alw,,,y,y,n,y,wa,us,namer,itc1,us110,-118.3,46.1,46.094778,-118.289,ALW,Walla Walla,US +alx,,,y,y,n,n,al,us,namer,itc1,us107,-85.95,32.93,32.93,-85.95,ALX,Alexander City Tc Russel,US +aly,,,y,y,n,y,,eg,afric,itc2,eg044,29.95,31.18,31.183903,29.948889,ALY,Alexandria El Nohza,EG +alz,,,y,y,n,y,ak,us,namer,itc1,us111,-154.2,56.88,56.88,-154.2,ALZ,Alitak Alitak Spb,US +ama,,,y,y,n,y,tx,us,namer,itc1,us107,-101.7,35.23,35.219369,-101.705931,AMA,Amarillo International,US +amb,,,y,y,n,n,,mg,afric,itc2,mg,48.98,-13.18,-13.188431,48.987978,AMB,Ambilobe,MG +amc,,,y,y,n,n,,td,afric,itc2,td,20.28,11.03,11.03,20.28,AMC,Am Timan,TD +amd,,,y,y,n,y,,in,asia,itc3,in,72.63,23.07,23.077242,72.63465,AMD,Ahmedabad,IN +ame,,,y,y,n,n,,mz,afric,itc2,mz,37.68,-15.65,-15.65,37.68,AME,Alto Molocue,MZ +amf,,,y,y,n,n,,pg,austl,itc3,pg,141.7,-4.167,-4.167,141.7,AMF,Ama,PG +amg,,,y,y,n,n,,pg,austl,itc3,pg,143.5,-4.6,-4.6,143.5,AMG,Amboin,PG +amh,,,y,y,n,y,,et,afric,itc2,et,37.5,6,6.039389,37.590453,AMH,Arba Mintch,ET +ami,,,y,y,n,y,,id,seasi,itc3,id062,116.1,-8.567,-8.567,116.1,AMI,Mataram Selaparang,ID +amj,,,y,y,n,n,mg,br,samer,itc1,br015,-40.68,-16.18,-16.18,-40.68,AMJ,Almenara,BR +amk,dro,,y,n,n,n,co,us,namer,itc1,us108,-107.8,37.15,37.15,-107.8,DRO,Durango La Plata,US +aml,,,y,y,n,n,,pa,camer,itc1,pa,-82.87,8.3,8.3,-82.87,AML,Puerto Armuellas,PA +amm,,,y,y,n,y,,jo,meast,itc2,jo068,35.98,31.97,31.722556,35.993214,AMM,Amman Queen Alia,JO +amn,,,y,y,n,n,mi,us,namer,itc1,us105,-84.65,43.38,43.38,-84.65,AMN,Alma Gratiot Community,US +amo,,,y,y,n,n,,td,afric,itc2,td,15.32,14.12,14.12,15.32,AMO,Mao,TD +amp,,,y,y,n,n,,mg,afric,itc2,mg,44.73,-24.7,-24.7,44.73,AMP,Ampanihy,MG +amq,,,y,y,n,y,,id,seasi,itc3,id063,128.1,-3.7,-3.710264,128.089136,AMQ,Ambon 
Pattimura,ID +amr,,,y,y,n,n,,mh,pacif,itc3,mh075,171.7,7.083,7.083,171.7,AMR,Arno,MH +ams,,,y,y,n,y,,nl,europ,itc2,nl082,4.9,52.35,52.308613,4.763889,AMS,Amsterdam Schiphol,NL +amt,,,y,y,n,n,nt,au,austl,itc3,au010,131.1,-26.2,-26.2,131.1,AMT,Amata,AU +amu,,,y,y,n,n,,pg,austl,itc3,pg,141.3,-3.517,-3.517,141.3,AMU,Amanab,PG +amv,,,y,y,n,y,,ru,euras,itc2,ru130,61.55,69.77,69.77,61.55,AMV,Amderma,RU +amw,,,y,y,n,n,ia,us,namer,itc1,us107,-93.62,42,42,-93.62,AMW,Ames,US +amx,,,y,y,n,n,nt,au,austl,itc3,au010,135.2,-21.73,-21.73,135.2,AMX,Ammaroo,AU +amy,,,y,y,n,y,,mg,afric,itc2,mg,45.67,-17.68,-17.68,45.67,AMY,Ambatomainty,MG +amz,,,y,y,n,n,,nz,austl,itc3,nz084,175,-37.03,-37.029722,174.973333,AMZ,Ardmore,NZ +ana,,,y,y,n,n,ca,us,namer,itc1,us110,-118,33.85,33.85,-118,ANA,Anaheim,US +anb,,,y,y,n,n,al,us,namer,itc1,us107,-85.85,33.58,33.588167,-85.858111,ANB,Anniston County Arpt,US +anc,,,y,y,n,y,ak,us,namer,itc1,us111,-150,61.17,61.174361,-149.996361,ANC,Anchorage Anchorage Intl,US +and,,,y,y,n,n,sc,us,namer,itc1,us105,-82.72,34.5,34.494583,-82.709389,AND,Anderson,US +ane,,,y,y,n,y,,fr,europ,itc2,fr052,-0.55,47.47,47.47,-0.55,ANE,Angers Marce,FR +anf,,,y,y,n,y,,cl,samer,itc1,cl033,-70.43,-23.43,-23.444478,-70.4451,ANF,Antofagasta Cerro Moreno,CL +ang,,,y,y,n,n,,fr,europ,itc2,fr052,0.2,45.67,45.729247,0.221456,ANG,Angouleme Brie-champniers,FR +anh,,,y,y,n,n,,sb,austl,itc3,sb,160.5,-8.983,-8.983,160.5,ANH,Anuha Island Resort,SB +ani,,,y,y,n,y,ak,us,namer,itc1,us111,-159.5,61.58,61.58,-159.5,ANI,Aniak,US +anj,,,y,y,n,n,,cg,afric,itc2,cg,13.83,-2.85,-2.85,13.83,ANJ,Zanaga,CG +ank,,,y,y,n,n,,tr,euras,itc2,tr101,33,40.13,39.949831,32.688622,ANK,Ankara Etimesgut,TR +anl,,,y,y,n,n,,ao,afric,itc2,ao,16.75,-11.5,-11.5,16.75,ANL,Andulo,AO +anm,,,y,y,n,y,,mg,afric,itc2,mg,50.33,-15,-14.999411,50.320233,ANM,Antalaha Antsirabato,MG +ann,,,y,y,n,n,ak,us,namer,itc1,us111,-131.6,55.03,55.042436,-131.572233,ANN,Annette Island,US 
+ano,,,y,y,n,n,,mz,afric,itc2,mz,39.93,-16.17,-16.17,39.93,ANO,Angoche,MZ +anp,,,y,y,n,n,md,us,namer,itc1,us105,-76.5,38.98,38.98,-76.5,ANP,Annapolis Lee,US +anq,,,y,y,n,n,in,us,namer,itc1,us105,-85,41.63,41.63,-85,ANQ,Angola Tri State Steuben,US +anr,bru,,y,n,n,y,,be,europ,itc2,be012,4.467,51.18,51.189444,4.460278,BRU,Brussels National,BE +ans,,,y,y,n,y,,pe,samer,itc1,pe155,-73.42,-13.65,-13.706408,-73.350378,ANS,Andahuaylas,PE +ant,,,y,y,n,n,,at,europ,itc2,at005,10.28,47.15,47.15,10.28,ANT,St Anton,AT +anu,,,y,y,n,y,,ag,carib,itc1,ag,-61.75,17.33,17.136749,-61.792667,ANU,Antigua Vc Bird Intl,AG +anv,,,y,y,n,y,ak,us,namer,itc1,us111,-160.2,62.65,62.65,-160.2,ANV,Anvik,US +anw,,,y,y,n,n,ne,us,namer,itc1,us107,-99.98,42.58,42.58,-99.98,ANW,Ainsworth,US +anx,,,y,y,n,y,,no,europ,itc2,no083,16.12,69.32,69.2925,16.144167,ANX,Andenes,NO +any,,,y,y,n,n,ks,us,namer,itc1,us107,-98.03,37.15,37.15,-98.03,ANY,Anthony,US +anz,,,y,y,n,n,nt,au,austl,itc3,au010,132.2,-25.08,-25.08,132.2,ANZ,Angus Downs,AU +aoa,,,y,y,n,n,,pg,austl,itc3,pg,146.8,-9.033,-9.033,146.8,AOA,Aroa,PG +aob,,,y,y,n,n,,pg,austl,itc3,pg,144.7,-4.917,-4.917,144.7,AOB,Annanberg,PG +aoc,,,y,y,n,n,,de,europ,itc2,de040,12.47,50.98,50.981817,12.506361,AOC,Altenburg Altenburg Nobitz,DE +aod,,,y,y,n,n,,td,afric,itc2,td,19.28,11.47,11.47,19.28,AOD,Abou Deia,TD +aoe,esk,,y,n,n,y,,tr,euras,itc2,tr101,30.53,39.8,39.8,30.53,ESK,Eskisehir,TR +aog,,,y,y,n,n,,cn,asia,itc3,cn035,123,41.17,41.17,123,AOG,Anshan,CN +aoh,,,y,y,n,n,oh,us,namer,itc1,us105,-84.1,40.72,40.72,-84.1,AOH,Lima Allen County,US +aoi,,,y,y,n,y,,it,europ,itc2,it067,13.5,43.63,43.616342,13.362319,AOI,Ancona Falconara,IT +aoj,,,y,y,n,y,,jp,asia,itc3,jp,140.7,40.73,40.734722,140.690833,AOJ,Aomori,JP +aok,,,y,y,n,y,,gr,europ,itc2,gr058,27.23,35.5,35.421408,27.146008,AOK,Karpathos,GR +aol,,,y,y,n,n,cr,ar,samer,itc1,ar003,-57.15,-29.68,-29.689425,-57.152078,AOL,Paso De Los Libres,AR +aon,,,y,y,n,n,,pg,austl,itc3,pg,146,-6.25,-6.25,146,AON,Arona,PG 
+aoo,,,y,y,n,y,pa,us,namer,itc1,us105,-78.32,40.3,40.296372,-78.320022,AOO,Altoona,US +aor,,,y,y,n,y,,my,seasi,itc3,my,100.4,6.1,6.189667,100.398183,AOR,Alor Setar,MY +aos,,,y,y,n,y,ak,us,namer,itc1,us111,-153.8,57.47,57.47,-153.8,AOS,Amook,US +aot,,,y,y,n,n,,it,europ,itc2,it067,7.363,45.74,45.74,7.363,AOT,Aosta Corrado Gex Airport,IT +aou,,,y,y,n,n,,la,seasi,itc3,la,106.8,14.8,14.8,106.8,AOU,Attopeu,LA +apa,den,,y,n,n,n,co,us,namer,itc1,us108,-104.8,39.57,39.57,-104.8,DEN,Denver Denver Intl,US +apb,,,y,y,n,n,,bo,samer,itc1,bo,-68.5,-14.72,-14.72,-68.5,APB,Apolo,BO +apc,,,y,y,n,n,ca,us,namer,itc1,us110,-122.2,38.45,38.45,-122.2,APC,Napa Napa County,US +ape,,,y,y,n,n,,pe,samer,itc1,pe155,-75.17,-15.35,-15.35,-75.17,APE,San Juan Aposento,PE +apf,,,y,y,n,y,fl,us,namer,itc1,us105,-81.77,26.15,26.15,-81.77,APF,Naples,US +apg,,,y,y,n,n,md,us,namer,itc1,us105,-76.17,39.52,39.466219,-76.168808,APG,Aberdeen Phillips Aaf,US +aph,,,y,y,n,n,va,us,namer,itc1,us105,-77.35,38.05,38.05,-77.35,APH,Bowling Green Camp A P H,US +api,,,y,y,n,n,,co,samer,itc1,co132,-73.05,4.067,4.076069,-73.562731,API,Apiay,CO +apk,,,y,y,n,y,,pf,pacif,itc3,pf087,-145.3,-15.08,-15.08,-145.3,APK,Apataki,PF +apl,,,y,y,n,y,,mz,afric,itc2,mz,39.28,-15.1,-15.105611,39.2818,APL,Nampula,MZ +apn,,,y,y,n,y,mi,us,namer,itc1,us105,-83.55,45.08,45.078068,-83.560287,APN,Alpena County Regional,US +apo,,,y,y,n,y,,co,samer,itc1,co132,-77.2,7.033,7.033,-77.2,APO,Apartado,CO +app,,,y,y,n,n,,pg,austl,itc3,pg,148.1,-8.983,-8.983,148.1,APP,Asapa,PG +apq,,,y,y,n,n,al,br,samer,itc1,br016,-36.65,-9.75,-9.75,-36.65,APQ,Arapiraca,BR +apr,,,y,y,n,n,,pg,austl,itc3,pg,142.4,-4.3,-4.3,142.4,APR,April River,PG +aps,,,y,y,n,n,go,br,samer,itc1,br015,-48.97,-16.33,-16.229153,-48.964267,APS,Anapolis,BR +apt,,,y,y,n,n,tn,us,namer,itc1,us107,-85.63,35.07,35.07,-85.63,APT,Jasper Marion County,US +apu,,,y,y,n,n,pr,br,samer,itc1,br015,-51.48,-23.55,-23.55,-51.48,APU,Apucarana,BR 
+apv,,,y,y,n,n,ca,us,namer,itc1,us110,-117.2,34.52,34.52,-117.2,APV,Apple Valley,US +apw,,,y,y,n,y,,ws,austl,itc3,ws,-171.8,-13.83,-13.829969,-172.008336,APW,Apia Faleolo,WS +apx,,,y,y,n,n,pr,br,samer,itc1,br015,-51.95,-23.38,-23.38,-51.95,APX,Arapongas,BR +apy,,,y,y,n,n,ma,br,samer,itc1,br016,-45.97,-9.1,-9.1,-45.97,APY,Alto Parnaiba,BR +apz,,,y,y,n,n,ne,ar,samer,itc1,ar003,-70.08,-38.92,-38.9755,-70.113581,APZ,Zapala,AR +aqa,,,y,y,n,n,sp,br,samer,itc1,br015,-48.17,-21.78,-21.812,-48.133028,AQA,Araraquara,BR +aqb,,,y,y,n,n,,gt,camer,itc1,gt119,-91.15,15.01,15.01,-91.15,AQB,Quiche Quiche,GT +aqg,,,y,y,n,y,,cn,asia,itc3,cn035,117,30.52,30.52,117,AQG,Anqing,CN +aqi,,,y,y,n,y,,sa,meast,itc2,sa,46.12,28.32,28.335192,46.125069,AQI,Qaisumah,SA +aqj,,,y,y,n,y,,jo,meast,itc2,jo068,34.98,29.55,29.611619,35.018067,AQJ,Aqaba King Hussein Intl,JO +aqm,,,y,y,n,n,ro,br,samer,itc1,br018,-63.07,-9.933,-9.933,-63.07,AQM,Ariquemes,BR +aqp,,,y,y,n,y,,pe,samer,itc1,pe155,-71.57,-16.35,-16.341072,-71.583083,AQP,Arequipa Rodriguez Ballon,PE +aqs,,,y,y,n,n,,fj,austl,itc3,fj169,179.4,-16.75,-16.75,179.4,AQS,Saqani,FJ +aqy,,,y,y,n,n,ak,us,namer,itc1,us111,-149.1,60.97,60.97,-149.1,AQY,Alyeska,US +ara,,,y,y,n,n,la,us,namer,itc1,us107,-91.82,30,30,-91.82,ARA,New Iberia Acadiana Rgnl,US +arb,,,y,y,n,n,mi,us,namer,itc1,us105,-83.73,42.23,-28.741039,32.092111,ARB,Ann Arbor Municipal,US +arc,,,y,y,n,y,ak,us,namer,itc1,us111,-145.6,68.12,68.12,-145.6,ARC,Arctic Village,US +ard,,,y,y,n,n,,id,seasi,itc3,id062,124.8,-8.25,-8.25,124.8,ARD,Alor Island,ID +are,,,y,y,n,n,,pr,carib,itc1,pr,-66.67,18.45,18.45,-66.67,ARE,Arecibo,PR +arf,,,y,y,n,n,,co,samer,itc1,co132,-70.13,0.5333,0.5333,-70.13,ARF,Acaricuara,CO +arg,,,y,y,n,n,ar,us,namer,itc1,us107,-90.92,36.12,36.12,-90.92,ARG,Walnut Ridge,US +arh,,,y,y,n,y,,ru,euras,itc2,ru130,40.53,64.57,64.600281,40.716667,ARH,Arkhangelsk,RU +ari,,,y,y,n,y,,cl,samer,itc1,cl033,-70.32,-18.5,-18.348531,-70.338742,ARI,Arica Chacalluta,CL 
+arj,,,y,y,n,n,,id,seasi,itc3,id063,140.8,-2.933,-2.933,140.8,ARJ,Arso,ID +ark,,,y,y,n,y,,tz,afric,itc2,tz,36.68,-3.367,-3.367794,36.633333,ARK,Arusha,TZ +arl,,,y,y,n,n,,bf,afric,itc2,bf,1.25,11.58,11.58,1.25,ARL,Arly,BF +arm,,,y,y,n,y,ns,au,austl,itc3,au007,151.6,-30.53,-30.528056,151.617222,ARM,Armidale,AU +arn,sto,,y,n,n,y,,se,europ,itc2,se095,17.92,59.65,59.651944,17.918611,STO,Stockholm,SE +aro,,,y,y,n,n,,co,samer,itc1,co132,-76.43,8.867,8.867,-76.43,ARO,Arboletas,CO +arp,,,y,y,n,n,,pg,austl,itc3,pg,149.5,-9.9,-9.9,149.5,ARP,Aragip,PG +arq,,,y,y,n,n,,co,samer,itc1,co132,-71.43,7.033,7.033,-71.43,ARQ,Arauquita,CO +arr,,,y,y,n,n,cb,ar,samer,itc1,ar003,-71,-45.17,-45.17,-71,ARR,Alto Rio Senguerr,AR +ars,,,y,y,n,n,go,br,samer,itc1,br015,-52.25,-15.92,-15.92,-52.25,ARS,Aragarcas,BR +art,,,y,y,n,y,ny,us,namer,itc1,us105,-76.02,43.98,43.991922,-76.021739,ART,Watertown,US +aru,,,y,y,n,y,sp,br,samer,itc1,br015,-50.42,-21.18,-21.141342,-50.424722,ARU,Aracatuba,BR +arv,,,y,y,n,y,wi,us,namer,itc1,us107,-89.73,45.92,45.92,-89.73,ARV,Minocqua Noble F Lee,US +arw,,,y,y,n,y,,ro,eeuro,itc2,ro094,21.25,46.17,46.17655,21.262022,ARW,Arad,RO +arx,,,y,y,n,n,nj,us,namer,itc1,us105,-74.12,40.18,40.18,-74.12,ARX,Asbury Park,US +ary,,,y,y,n,n,vi,au,austl,itc3,au007,142.9,-37.28,-37.309444,142.988611,ARY,Ararat,AU +arz,,,y,y,n,n,,ao,afric,itc2,ao,12.9,-7.233,-7.233,12.9,ARZ,N'zeto,AO +asa,,,y,y,n,y,,er,afric,itc2,er157,42.65,13.05,13.071783,42.645006,ASA,Assab,ER +asb,,,y,y,n,y,,tm,euras,itc3,tm137,58.38,37.95,-29.689333,17.939611,ASB,Ashgabad Ashgabad,TM +asc,,,y,y,n,y,,bo,samer,itc1,bo,-63.07,-15.72,-15.72,-63.07,ASC,Ascension,BO +asd,,,y,y,n,y,,bs,carib,itc1,bs021,-77.7,24.6,24.698283,-77.795611,ASD,Andros Town,BS +ase,,,y,y,n,y,co,us,namer,itc1,us108,-106.9,39.22,39.22,-106.9,ASE,Aspen,US +asf,,,y,y,n,y,,ru,euras,itc2,ru130,48.05,46.35,46.283333,48.006278,ASF,Astrakhan,RU +asg,,,y,y,n,n,,nz,austl,itc3,nz084,171.8,-43.9,-43.9,171.8,ASG,Ashburton,NZ 
+ash,,,y,y,n,n,nh,us,namer,itc1,us105,-71.52,42.78,42.78,-71.52,ASH,Nashua Boire Field,US +asi,,,y,y,n,y,,sh,atlan,itc2,sh,-14.42,-7.933,-7.933,-14.42,ASI,Georgetown Wideawake Field,SH +asj,,,y,y,n,y,,jp,asia,itc3,jp,129.7,28.42,28.430633,129.712542,ASJ,Amami O Shima,JP +ask,,,y,y,n,n,,ci,afric,itc2,ci,-5.367,6.9,6.903167,-5.365581,ASK,Yamoussoukro,CI +asl,,,y,y,n,n,tx,us,namer,itc1,us107,-94.38,32.55,32.55,-94.38,ASL,Marshall Harrison County,US +asm,,,y,y,n,y,,er,afric,itc2,er157,38.92,15.3,15.291853,38.910667,ASM,Asmara Asmara International,ER +asn,,,y,y,n,n,al,us,namer,itc1,us107,-86.1,33.43,33.43,-86.1,ASN,Talladega,US +aso,,,y,y,n,y,,et,afric,itc2,et,34.55,10.07,10.01855,34.586253,ASO,Asosa,ET +asp,,,y,y,n,y,nt,au,austl,itc3,au010,133.9,-23.8,-23.806667,133.902222,ASP,Alice Springs,AU +asq,,,y,y,n,n,nv,us,namer,itc1,us110,-117.1,39.5,39.5,-117.1,ASQ,Austin,US +asr,,,y,y,n,y,,tr,euras,itc2,tr101,35.5,38.7,38.770386,35.495428,ASR,Kayseri,TR +ast,,,y,y,n,n,or,us,namer,itc1,us110,-123.9,46.17,46.17,-123.9,AST,Astoria,US +asu,,,y,y,n,y,,py,samer,itc1,py093,-57.42,-25.17,-25.23985,-57.519133,ASU,Asuncion Silvio Pettirossi,PY +asv,,,y,y,n,y,,ke,afric,itc2,ke,37.25,-2.633,-2.633,37.25,ASV,Amboseli,KE +asw,,,y,y,n,y,,eg,afric,itc2,eg044,32.78,23.97,23.964356,32.819975,ASW,Aswan,EG +asx,,,y,y,n,n,wi,us,namer,itc1,us107,-90.92,46.55,46.55,-90.92,ASX,Ashland,US +asy,,,y,y,n,n,nd,us,namer,itc1,us107,-99.37,46.03,46.03,-99.37,ASY,Ashley,US +asz,,,y,y,n,n,,pg,austl,itc3,pg,150.4,-5.867,-5.867,150.4,ASZ,Asirim,PG +ata,,,y,y,n,n,,pe,samer,itc1,pe155,-72.13,-13.48,-9.347444,-77.598392,ATA,Anta,PE +atb,,,y,y,n,y,,sd,afric,itc2,sd172,33.98,17.7,17.7,33.98,ATB,Atbara,SD +atc,,,y,y,n,y,,bs,carib,itc1,bs021,-75.75,24.67,24.629417,-75.673775,ATC,Arthur's Town,BS +atd,,,y,y,n,y,,sb,austl,itc3,sb,161,-8.867,-8.867,161,ATD,Atoifi,SB +ate,,,y,y,n,n,ok,us,namer,itc1,us107,-95.62,34.23,34.23,-95.62,ATE,Antlers,US 
+atf,,,y,y,n,n,,ec,samer,itc1,ec042,-78.57,-1.2,-1.212067,-78.574636,ATF,Ambato Chachoan,EC +atg,,,y,y,n,n,,pk,asia,itc3,pk163,72.25,33.9,33.9,72.25,ATG,Attock,PK +ath,,,y,y,n,y,,gr,europ,itc2,gr058,23.94,37.93,37.936358,23.944467,ATH,Athens E. Venizelos,GR +ati,,,y,y,n,n,,uy,samer,itc1,uy114,-56.52,-30.42,-30.42,-56.52,ATI,Artigas,UY +atj,,,y,y,n,n,,mg,afric,itc2,mg,47.07,-19.08,-19.08,47.07,ATJ,Antsirabe,MG +atk,,,y,y,n,y,ak,us,namer,itc1,us111,-157.3,70.47,70.47,-157.3,ATK,Atqasuk,US +atl,,,y,y,n,y,ga,us,namer,itc1,us105,-84.43,33.65,33.636719,-84.428067,ATL,Atlanta Hartsfield Jacks,US +atm,,,y,y,n,y,pa,br,samer,itc1,br016,-52.22,-3.2,-3.253906,-52.253978,ATM,Altamira,BR +atn,,,y,y,n,y,,pg,austl,itc3,pg,152.5,-3.667,-3.667,152.5,ATN,Namatanai,PG +ato,,,y,y,n,n,oh,us,namer,itc1,us105,-82.1,39.33,39.33,-82.1,ATO,Athens Ohio University,US +atp,,,y,y,n,n,,pg,austl,itc3,pg,142.4,-3.133,-3.133,142.4,ATP,Aitape Airstrip,PG +atq,,,y,y,n,y,,in,asia,itc3,in,74.8,31.7,31.709594,74.797264,ATQ,Amritsar Raja Sansi,IN +atr,,,y,y,n,n,,mr,afric,itc2,mr,-13.05,20.5,20.506828,-13.043194,ATR,Atar Mouakchott,MR +ats,,,y,y,n,n,nm,us,namer,itc1,us108,-104.5,32.85,32.85,-104.5,ATS,Artesia,US +att,,,y,y,n,y,ak,us,namer,itc1,us111,-162.8,61,61,-162.8,ATT,Atmautluak,US +atu,,,y,y,n,n,ak,us,namer,itc1,us112,173.2,52.83,52.83,173.2,ATU,Attu Island Casco Cove,US +atv,,,y,y,n,n,,td,afric,itc2,td,18.32,13.23,13.23,18.32,ATV,Ati,TD +atw,,,y,y,n,y,wi,us,namer,itc1,us107,-88.52,44.27,44.27,-88.52,ATW,Appleton Outagamie Cty,US +atx,,,y,y,n,n,,kz,euras,itc3,kz164,68.4,51.88,51.88,68.4,ATX,Atbasar,KZ +aty,,,y,y,n,y,sd,us,namer,itc1,us107,-97.15,44.92,44.913981,-97.154719,ATY,Watertown,US +atz,,,y,y,n,y,,eg,afric,itc2,eg044,31.02,27.05,27.05,31.02,ATZ,Assiut,EG +aua,,,y,y,n,y,,aw,carib,itc1,aw,-70.02,12.5,12.501389,-70.015221,AUA,Aruba Reina Beatrix,AW +aub,,,y,y,n,n,mt,br,samer,itc1,br018,-54.62,-20.45,-20.45,-54.62,AUB,Itauba,BR 
+auc,,,y,y,n,y,,co,samer,itc1,co132,-70.73,7.067,7.068881,-70.736925,AUC,Arauca,CO +aud,,,y,y,n,n,ql,au,austl,itc3,au131,139.9,-18.67,-18.67,139.9,AUD,Augustus Downs,AU +aue,,,y,y,n,n,,eg,afric,itc2,eg044,33.18,28.97,28.97,33.18,AUE,Abu Rudeis,EG +auf,,,y,y,n,n,,fr,europ,itc2,fr052,3.5,47.85,47.850193,3.497111,AUF,Auxerre Auxerre Branches,FR +aug,,,y,y,n,y,me,us,namer,itc1,us105,-69.8,44.32,44.32,-69.8,AUG,Augusta,US +auh,,,y,y,n,y,,ae,meast,itc2,ae,54.37,24.47,24.432972,54.651138,AUH,Abu Dhabi Abu Dhabi Intl,AE +aui,,,y,y,n,n,,pg,austl,itc3,pg,143.1,-1.45,-1.45,143.1,AUI,Aua Island,PG +auj,,,y,y,n,n,,pg,austl,itc3,pg,142.9,-4.25,-4.25,142.9,AUJ,Ambunti,PG +auk,,,y,y,n,y,ak,us,namer,itc1,us111,-164.6,62.68,62.68,-164.6,AUK,Alakanuk,US +aul,,,y,y,n,n,,mh,pacif,itc3,mh075,171.2,8.133,8.133,171.2,AUL,Aur Island,MH +aum,,,y,y,n,n,mn,us,namer,itc1,us107,-92.93,43.67,43.67,-92.93,AUM,Austin,US +aun,,,y,y,n,n,ca,us,namer,itc1,us110,-121.1,38.9,38.9,-121.1,AUN,Auburn,US +auo,,,y,y,n,n,al,us,namer,itc1,us107,-85.43,32.62,32.62,-85.43,AUO,Auburn Auburn Opelika,US +aup,,,y,y,n,n,,pg,austl,itc3,pg,149.3,-9.917,-9.917,149.3,AUP,Agaun,PG +auq,,,y,y,n,y,,pf,pacif,itc3,pf086,-139,-9.8,-9.8,-139,AUQ,Atuona,PF +aur,,,y,y,n,y,,fr,europ,itc2,fr052,2.433,44.93,44.891388,2.421944,AUR,Aurillac,FR +aus,,,y,y,n,y,tx,us,namer,itc1,us107,-97.7,30.3,30.194528,-97.669889,AUS,Austin Bergstrom Intl,US +aut,,,y,y,n,n,,id,seasi,itc3,id062,125.7,-8,-8,125.7,AUT,Atauro,ID +auu,,,y,y,n,y,ql,au,austl,itc3,au131,141.7,-13.35,-13.353889,141.720833,AUU,Aurukun Mission,AU +auv,,,y,y,n,n,,pg,austl,itc3,pg,148.4,-5.717,-5.717,148.4,AUV,Aumo,PG +auw,,,y,y,n,n,wi,us,namer,itc1,us107,-89.63,44.93,44.93,-89.63,AUW,Wausau Wausau Mnpl,US +aux,,,y,y,n,y,to,br,samer,itc1,br016,-48.2,-7.2,-7.2,-48.2,AUX,Araguaina,BR +auy,,,y,y,n,y,,vu,austl,itc3,vu115,169.7,-20.33,-20.33,169.7,AUY,Aneityum,VU +auz,,,y,y,n,n,il,us,namer,itc1,us107,-88.32,41.77,41.77,-88.32,AUZ,Aurora Municipal Airport,US 
+ava,,,y,y,n,n,,cn,asia,itc3,cn035,105.9,26.25,40.05,116.6,AVA,An Shun Huang Guo Shu,CN +avb,,,y,y,n,n,,it,europ,itc2,it067,12.6,46.07,46.031889,12.596472,AVB,Aviano,IT +avf,,,y,y,n,n,,fr,europ,itc2,fr052,6.5,46.42,46.42,6.5,AVF,Avoriaz,FR +avg,,,y,y,n,n,nt,au,austl,itc3,au010,129.9,-15.75,-15.75,129.9,AVG,Auvergne,AU +avi,,,y,y,n,n,,cu,carib,itc1,cu038,-78.78,22.02,22.027053,-78.789617,AVI,Ciego De Avila Maximo Gomez,CU +avk,,,y,y,n,n,,mn,asia,itc3,mn175,102.8,46.25,46.25,102.8,AVK,Arvaikheer,MN +avl,,,y,y,n,y,nc,us,namer,itc1,us105,-82.53,35.43,35.436194,-82.541806,AVL,Asheville Regional Aptc,US +avn,,,y,y,n,y,,fr,europ,itc2,fr052,4.9,43.9,43.9073,4.901831,AVN,Avignon Avignon Caumont,FR +avo,,,y,y,n,n,fl,us,namer,itc1,us105,-81.52,27.6,27.6,-81.52,AVO,Avon Park Municipal,US +avp,,,y,y,n,y,pa,us,namer,itc1,us105,-75.72,41.33,41.338478,-75.723403,AVP,Scranton Scranton Intl,US +avu,,,y,y,n,y,,sb,austl,itc3,sb,160.4,-9.85,-9.85,160.4,AVU,Avu Avu,SB +avv,mel,,y,n,n,y,vi,au,austl,itc3,au007,144.5,-38.03,-38.039444,144.469444,MEL,Melbourne Tullamarine,AU +avw,tus,,y,n,n,n,az,us,namer,itc1,us109,-111.2,32.4,32.4,-111.2,TUS,Tucson Tucson Intl,US +avx,,,y,y,n,n,ca,us,namer,itc1,us110,-118.4,33.33,33.33,-118.4,AVX,Catalina Isla Avalon Bay,US +awa,,,y,y,n,n,,et,afric,itc2,et,38.4,6.967,6.967,38.4,AWA,Awassa,ET +awb,,,y,y,n,y,,pg,austl,itc3,pg,144.9,-4.117,-4.117,144.9,AWB,Awaba,PG +awd,,,y,y,n,y,,vu,austl,itc3,vu115,169.6,-19.28,-19.28,169.6,AWD,Aniwa,VU +awe,,,y,y,n,n,,ga,afric,itc2,ga,9.45,-0.75,-0.75,9.45,AWE,Alowe,GA +awh,,,y,y,n,n,,et,afric,itc2,et,44.18,8.267,8.267,44.18,AWH,Awareh,ET +awk,,,y,y,n,n,,um,pacif,itc3,um104,166.7,19.28,19.282067,166.636444,AWK,Wake Island,UM +awm,,,y,y,n,n,ar,us,namer,itc1,us107,-90.18,35.13,35.13,-90.18,AWM,West Memphis Municipal,US +awn,,,y,y,n,n,sa,au,austl,itc3,au009,139.3,-26.48,-26.48,139.3,AWN,Alton Downs,AU +awp,,,y,y,n,n,nt,au,austl,itc3,au010,137.3,-20.33,-20.33,137.3,AWP,Austral Downs,AU 
+awr,,,y,y,n,n,,pg,austl,itc3,pg,144.9,-4.117,-4.117,144.9,AWR,Awar,PG +awz,,,y,y,n,y,,ir,meast,itc2,ir120,48.67,31.22,31.337431,48.76195,AWZ,Ahwaz,IR +axa,,,y,y,n,y,,ai,carib,itc1,ai,-63.03,18.25,18.204834,-63.055084,AXA,Anguilla Wallblake,AI +axb,,,y,y,n,n,ny,us,namer,itc1,us105,-75.9,44.25,44.25,-75.9,AXB,Alexandria Bay,US +axc,,,y,y,n,n,ql,au,austl,itc3,au131,145.3,-23.08,-23.08,145.3,AXC,Aramac,AU +axd,,,y,y,n,y,,gr,europ,itc2,gr058,25.95,40.85,40.855869,25.956264,AXD,Alexandroupolis Demokritos,GR +axe,,,y,y,n,n,sc,br,samer,itc1,br015,-52.38,-26.88,-26.88,-52.38,AXE,Xanxere Xanxere,BR +axg,,,y,y,n,n,ia,us,namer,itc1,us107,-94.23,43.07,43.07,-94.23,AXG,Algona,US +axk,,,y,y,n,n,,ye,meast,itc2,ye,46.82,14.53,14.551322,46.826183,AXK,Ataq,YE +axl,,,y,y,n,n,nt,au,austl,itc3,au010,136.8,-19.12,-19.12,136.8,AXL,Alexandria,AU +axm,,,y,y,n,y,,co,samer,itc1,co132,-75.67,4.517,4.452775,-75.766447,AXM,Armenia El Eden,CO +axn,,,y,y,n,n,mn,us,namer,itc1,us107,-95.27,45.78,45.866297,-95.394669,AXN,Alexandria,US +axp,,,y,y,n,y,,bs,carib,itc1,bs021,-73.97,22.45,22.441828,-73.970858,AXP,Spring Point Springpoint,BS +axr,,,y,y,n,y,,pf,pacif,itc3,pf087,-146.7,-15.35,-15.248289,-146.616708,AXR,Arutua,PF +axs,lts,,y,n,n,n,ok,us,namer,itc1,us107,-99.33,34.7,34.7,-99.33,LTS,Altus Altus Afb,US +axt,,,y,y,n,y,,jp,asia,itc3,jp,140.1,39.7,39.615556,140.218611,AXT,Akita,JP +axu,,,y,y,n,y,,et,afric,itc2,et,38.7,14.12,14.14675,38.772833,AXU,Axum,ET +axv,,,y,y,n,n,oh,us,namer,itc1,us105,-84.18,40.57,40.57,-84.18,AXV,Wapakoneta Neil Armstron,US +axx,,,y,y,n,n,nm,us,namer,itc1,us108,-105.3,36.42,36.42,-105.3,AXX,Angel Fire,US +aya,,,y,y,n,n,,co,samer,itc1,co132,-75.15,8.3,8.3,-75.15,AYA,Ayapel,CO +ayc,,,y,y,n,n,,co,samer,itc1,co132,-73.62,8.6,8.6,-73.62,AYC,Ayacucho,CO +ayd,,,y,y,n,n,nt,au,austl,itc3,au010,136,-19.3,-19.3,136,AYD,Alroy Downs,AU +aye,,,y,y,n,n,ma,us,namer,itc1,us105,-118.5,33.98,33.98,-118.5,AYE,Fort Devens Aaf Hpt,US 
+ayg,,,y,y,n,n,,co,samer,itc1,co132,-75.52,2.667,2.667,-75.52,AYG,Yaguara,CO +ayh,,,y,y,n,n,,gb,europ,itc2,gb053,-0.2667,52.37,52.37,-0.2667,AYH,Alconbury Raf Station,GB +ayi,,,y,y,n,n,,co,samer,itc1,co132,-72.27,-0.3833,-0.3833,-72.27,AYI,Yari,CO +ayk,,,y,y,n,n,,kz,euras,itc3,kz164,66.97,50.32,50.32,66.97,AYK,Arkalyk,KZ +ayl,,,y,y,n,n,nt,au,austl,itc3,au010,135.6,-18.02,-18.02,135.6,AYL,Anthony Lagoon,AU +ayn,,,y,y,n,n,,cn,asia,itc3,cn035,114.4,36.1,36.1,114.4,AYN,Anyang,CN +ayo,,,y,y,n,n,,py,samer,itc1,py093,-56.85,-27.37,-27.37065,-56.853944,AYO,Ayolas,PY +ayp,,,y,y,n,y,,pe,samer,itc1,pe155,-74.23,-13.18,-13.154819,-74.204417,AYP,Ayacucho Yanamilla,PE +ayq,,,y,y,n,y,nt,au,austl,itc3,au010,131,-25.33,-25.186111,130.975556,AYQ,Ayers Rock Connellan,AU +ayr,,,y,y,n,n,ql,au,austl,itc3,au131,147.4,-19.67,-19.584444,147.329167,AYR,Ayr,AU +ays,,,y,y,n,n,ga,us,namer,itc1,us105,-82.4,31.25,31.25,-82.4,AYS,Waycross Ware County,US +ayt,,,y,y,n,y,,tr,euras,itc2,tr101,30.8,36.92,36.898731,30.800461,AYT,Antalya,TR +ayu,,,y,y,n,n,,pg,austl,itc3,pg,145.9,-6.333,-6.333,145.9,AYU,Aiyura,PG +ayw,,,y,y,n,n,,id,seasi,itc3,id063,132.5,-1.2,-1.2,132.5,AYW,Ayawasi,ID +ayz,,,y,y,n,n,ny,us,namer,itc1,us105,-73.42,40.67,40.67,-73.42,AYZ,Amityville Zahns,US +aza,phx,,y,n,n,y,az,us,namer,itc1,us109,-111.9,33.32,33.32,-111.9,PHX,Phoenix Sky Harbor Intl,NA +azb,,,y,y,n,n,,pg,austl,itc3,pg,149.3,-10.3,-10.3,149.3,AZB,Amazon Bay,PG +azd,,,y,y,n,y,,ir,meast,itc2,ir120,54.28,31.9,31.904908,54.276503,AZD,Yazd,IR +azg,,,y,y,n,n,,mx,namer,itc1,mx168,-102.4,19.08,19.08,-102.4,AZG,Apatzingan,MX +azi,auh,,y,n,n,n,,ae,meast,itc2,ae,54.37,24.47,24.428333,54.458084,AUH,Abu Dhabi Abu Dhabi Intl,AE +azl,,,y,y,n,n,mt,br,samer,itc1,br018,-58.85,-13.47,-13.47,-58.85,AZL,NA,NA +azn,,,y,y,n,y,,uz,euras,itc3,uz138,72.3,40.73,40.73,72.3,AZN,Andizhan,UZ +azo,,,y,y,n,y,mi,us,namer,itc1,us105,-85.55,42.25,42.234875,-85.552058,AZO,Kalamazoo Battle Creek,US 
+azp,mex,,y,n,n,n,,mx,namer,itc1,mx168,-99.08,19.43,19.43,-99.08,MEX,Mexico City Juarez Intl,MX +azr,,,y,y,n,y,,dz,afric,itc2,dz,-0.2833,27.88,27.837589,-0.186414,AZR,Adrar,DZ +azs,,,y,y,n,y,,do,carib,itc1,do174,-69.74,19.27,19.27,-69.74,AZS,NA,NA +azt,,,y,y,n,n,,co,samer,itc1,co132,-73.28,6.817,6.817,-73.28,AZT,Zapatoca,CO +azz,,,y,y,n,n,,ao,afric,itc2,ao,13.15,-7.883,-7.883,13.15,AZZ,Ambriz,AO +baa,,,y,y,n,n,,pg,austl,itc3,pg,151,-5.317,-5.317,151,BAA,Bialla,PG +bab,myv,,y,n,n,n,ca,us,namer,itc1,us110,-124.1,40.98,39.136089,-121.436567,MYV,Marysville Yuba County,US +bac,,,y,y,n,n,,co,samer,itc1,co132,-72.97,4.583,4.583,-72.97,BAC,Barranca De Upia,CO +bad,shv,,y,n,n,n,la,us,namer,itc1,us107,-93.83,32.45,32.50182,-93.662674,SHV,Shreveport Regional,US +bae,,,y,y,n,n,,fr,europ,itc2,fr052,6.633,44.38,44.38,6.633,BAE,Barcelonnette,FR +baf,,,y,y,n,n,ma,us,namer,itc1,us105,-72.6,42.12,42.157776,-72.715553,BAF,Westfield Barnes,US +bag,,,y,y,n,y,,ph,pacif,itc3,ph,120.6,16.37,16.375103,120.619636,BAG,Baguio Loakan,PH +bah,,,y,y,n,y,,bh,meast,itc2,bh,50.5,26.2,26.270834,50.63361,BAH,Bahrain Bahrain Intl,BH +bai,,,y,y,n,n,,cr,camer,itc1,cr036,-83.33,9.167,9.167,-83.33,BAI,Buenos Aires,CR +baj,,,y,y,n,n,,pg,austl,itc3,pg,149.1,-4.883,-4.883,149.1,BAJ,Bali,PG +bak,,,n,y,n,y,,az,euras,itc2,az150,49.88,40.38,40.4675,50.046667,BAK,Baku,AZ +bal,,,y,y,n,y,,tr,euras,itc2,tr101,41.12,37.87,37.928969,41.116583,BAL,Batman,TR +bam,,,y,y,n,n,nv,us,namer,itc1,us110,-116.9,40.63,40.63,-116.9,BAM,Battle Mountain Lander C,US +ban,,,y,y,n,n,,cd,afric,itc2,cd167,20.33,-4.167,-4.167,20.33,BAN,Basongo,CD +bao,,,y,y,n,n,,th,seasi,itc3,th,102.8,17.38,17.38,102.8,BAO,Ban Mak Khaen Udorn,TH +bap,,,y,y,n,n,,pg,austl,itc3,pg,149.6,-10.33,-10.33,149.6,BAP,Baibara,PG +baq,,,y,y,n,y,,co,samer,itc1,co132,-74.78,10.88,10.889589,-74.780819,BAQ,Barranquilla E Cortissoz,CO +bar,,,y,y,n,n,ak,us,namer,itc1,us111,-170.2,57.17,57.17,-170.2,BAR,Baker Island Baker Aaf,US 
+bas,,,y,y,n,y,,sb,austl,itc3,sb,155.9,-7.983,-7.983,155.9,BAS,Balalae,SB +bat,,,y,y,n,n,sp,br,samer,itc1,br015,-48.58,-20.67,-20.584547,-48.594086,BAT,Barretos,BR +bau,,,y,y,n,y,sp,br,samer,itc1,br015,-49.05,-22.33,-22.345042,-49.0538,BAU,Bauru,BR +bav,,,y,y,n,y,,cn,asia,itc3,cn035,110,40.67,40.67,110,BAV,Baotou,CN +baw,,,y,y,n,n,,ga,afric,itc2,ga,9.45,-0.6667,-0.6667,9.45,BAW,Biawonque,GA +bax,,,y,y,n,y,,ru,euras,itc2,ru142,83.75,53.37,53.363775,83.538533,BAX,Barnaul,RU +bay,,,y,y,n,y,,ro,eeuro,itc2,ro094,23.52,47.63,47.658389,23.470022,BAY,Baia Mare,RO +baz,,,y,y,n,n,am,br,samer,itc1,br018,-62.93,-0.9667,-0.9667,-62.93,BAZ,Barbelos,BR +bba,,,y,y,n,y,,cl,samer,itc1,cl033,-71.7,-45.92,-45.916058,-71.689475,BBA,Balmaceda Balmaceda,CL +bbb,,,y,y,n,y,mn,us,namer,itc1,us107,-95.6,45.32,45.32,-95.6,BBB,Benson Municipal,US +bbc,,,y,y,n,y,tx,us,namer,itc1,us107,-95.97,28.98,28.98,-95.97,BBC,Bay City,US +bbd,,,y,y,n,n,tx,us,namer,itc1,us107,-99.33,31.13,31.13,-99.33,BBD,Brady Curtis Field,US +bbe,,,y,y,n,n,wa,au,austl,itc3,au011,117.7,-27.42,-27.42,117.7,BBE,Big Bell,AU +bbf,,,y,y,n,n,ma,us,namer,itc1,us105,-71.18,42.5,42.5,-71.18,BBF,Burlington,US +bbg,,,y,y,n,n,,ki,austl,itc3,ki069,172.8,3.083,3.083,172.8,BBG,Butaritari,KI +bbh,,,y,y,n,n,,de,europ,itc2,de040,12.7,54.33,54.33754,12.699705,BBH,Barth,DE +bbi,,,y,y,n,y,,in,asia,itc3,in,85.82,20.23,20.244364,85.817781,BBI,Bhubaneswar,IN +bbj,,,y,y,n,n,,de,europ,itc2,de040,6.533,49.97,51.193531,14.519747,BBJ,Bitburg Bitburg Air Base,DE +bbk,,,y,y,n,y,,bw,afric,itc2,bw,25.17,-17.78,-17.832875,25.1624,BBK,Kasane,BW +bbl,,,y,y,n,n,,ir,meast,itc2,ir120,52.67,36.65,36.65,52.67,BBL,Babolsar,IR +bbm,,,y,y,n,n,,kh,seasi,itc3,kh,103.2,13.08,13.095564,103.224408,BBM,Battambang,KH +bbn,,,y,y,n,y,,my,seasi,itc3,my,115.5,3.683,3.683,115.5,BBN,Bario,MY +bbo,,,y,y,n,y,,so,afric,itc2,so,45.02,10.42,10.389167,44.941106,BBO,Berbera,SO +bbp,,,y,y,n,n,,gb,europ,itc2,gb053,-1.083,50.68,50.68,-1.083,BBP,Bembridge,GB 
+bbq,,,y,y,n,n,,ag,carib,itc1,ag,-61.78,17.6,17.6,-61.78,BBQ,Barbuda,AG +bbr,,,y,y,n,n,,gp,carib,itc1,gp,-61.75,16.17,16.17,-61.75,BBR,Basse Terre Baillif,GP +bbs,,,y,y,n,n,,gb,europ,itc2,gb053,-0.8333,51.32,51.323889,-0.8475,BBS,Blackbush,GB +bbt,,,y,y,n,n,,cf,afric,itc2,cf,15.78,4.217,4.221583,15.786369,BBT,Berberati,CF +bbu,buh,,y,n,n,y,,ro,eeuro,itc2,ro094,26.08,44.5,44.503194,26.102111,BUH,Bucharest,RO +bbv,,,y,y,n,n,,ci,afric,itc2,ci,-6.95,4.667,4.667,-6.95,BBV,Bereby,CI +bbw,,,y,y,n,n,ne,us,namer,itc1,us107,-99.63,41.4,41.4,-99.63,BBW,Broken Bow,US +bbx,phl,,y,n,n,n,pa,us,namer,itc1,us105,-75.27,40.15,40.15,-75.27,PHL,Philadelphia Intl,US +bby,,,y,y,n,n,,cf,afric,itc2,cf,20.63,5.85,5.85,20.63,BBY,Bambari,CF +bbz,,,y,y,n,n,,zm,afric,itc2,zm,23.12,-13.53,-13.53,23.12,BBZ,Zambezi,ZM +bca,,,y,y,n,y,,cu,carib,itc1,cu038,-74.35,20.25,20.365317,-74.506206,BCA,Baracoa,CU +bcb,,,y,y,n,n,va,us,namer,itc1,us105,-80.42,37.23,37.23,-80.42,BCB,Blacksburg Virginia Tech,US +bcc,,,y,y,n,n,ak,us,namer,itc1,us111,-155.9,57.62,57.62,-155.9,BCC,Bear Creek,US +bcd,,,y,y,n,y,,ph,pacif,itc3,ph,122.9,10.63,10.642511,122.929617,BCD,Bacolod,PH +bce,,,y,y,n,n,ut,us,namer,itc1,us108,-112.2,37.7,37.7,-112.2,BCE,Bryce,US +bcf,,,y,y,n,n,,cf,afric,itc2,cf,18.28,6.5,6.5,18.28,BCF,Bouca,CF +bcg,,,y,y,n,n,,gy,samer,itc1,gy,-59.17,7.7,7.7,-59.17,BCG,Bemichi,GY +bch,,,y,y,n,n,,tl,asia,itc3,tl,126.4,-8.483,-8.485547,126.399389,BCH,Baucau English Madeira,TL +bci,,,y,y,n,y,ql,au,austl,itc3,au131,145.3,-23.67,-23.565278,145.306667,BCI,Barcaldine,AU +bcj,,,y,y,n,n,co,us,namer,itc1,us108,-105.8,37.97,37.97,-105.8,BCJ,Baca Grande,US +bck,,,y,y,n,n,ql,au,austl,itc3,au131,144.2,-17.4,-17.4,144.2,BCK,Bolwarra,AU +bcl,,,y,y,n,y,,cr,camer,itc1,cr036,-83.58,10.78,10.78,-83.58,BCL,Barra Colorado,CR +bcm,,,y,y,n,y,,ro,eeuro,itc2,ro094,26.88,46.6,46.521946,26.910278,BCM,Bacau,RO +bcn,,,y,y,n,y,,es,europ,itc2,es045,2.083,41.3,41.297078,2.078464,BCN,Barcelona,ES 
+bco,,,y,y,n,n,,et,afric,itc2,et,36.55,5.783,5.783,36.55,BCO,Jinka,ET +bcp,,,y,y,n,n,,pg,austl,itc3,pg,146.5,-5.85,-5.85,146.5,BCP,Bambu,PG +bcq,,,y,y,n,n,,ly,afric,itc2,ly122,14.22,27.68,27.68,14.22,BCQ,Brack,LY +bcr,,,y,y,n,n,am,br,samer,itc1,br018,-67.42,-8.633,-8.633,-67.42,BCR,Boca Do Acre,BR +bcs,,,y,y,n,n,la,us,namer,itc1,us107,-90.02,29.87,29.87,-90.02,BCS,Belle Chasse S Seaplane,US +bct,,,y,y,n,n,fl,us,namer,itc1,us105,-80.12,26.37,26.37,-80.12,BCT,Boca Pub. Non Commercial,US +bcu,,,y,y,n,n,,ng,afric,itc2,ng,9.833,10.32,10.32,9.833,BCU,Bauchi,NG +bcv,,,y,y,n,n,,bz,camer,itc1,bz,-88.78,17.27,61.416489,-149.50735,BCV,Belmopan,BZ +bcw,,,y,y,n,n,,mz,afric,itc2,mz,40.35,-11.35,-11.35,40.35,BCW,Benguera Island,MZ +bcx,,,y,y,n,n,,ru,euras,itc2,ru141,58.33,53.93,53.93,58.33,BCX,Beloreck,RU +bcy,,,y,y,n,n,,et,afric,itc2,et,36.67,6.217,6.217,36.67,BCY,Bulchi,ET +bcz,,,y,y,n,n,nt,au,austl,itc3,au010,137.8,-13.78,-13.78,137.8,BCZ,Bickerton Island,AU +bda,,,y,y,n,y,,bm,atlan,itc1,bm014,-64.68,32.37,32.364042,-64.678703,BDA,Bermuda Bermuda Intl,BM +bdb,,,y,y,n,y,ql,au,austl,itc3,au131,152.3,-24.9,-24.903889,152.318611,BDB,Bundaberg,AU +bdc,,,y,y,n,n,ma,br,samer,itc1,br016,-45.27,-5.467,-5.467,-45.27,BDC,Barra Do Corda,BR +bdd,,,y,y,n,y,ql,au,austl,itc3,au131,142.1,-10.12,-10.12,142.1,BDD,Badu Island,AU +bde,,,y,y,n,n,mn,us,namer,itc1,us107,-94.6,48.72,48.728444,-94.612222,BDE,Baudette,US +bdf,,,y,y,n,n,il,us,namer,itc1,us107,-89.65,41.18,41.18,-89.65,BDF,Bradford Rinkenberger,US +bdg,,,y,y,n,n,ut,us,namer,itc1,us108,-109.5,37.58,37.58,-109.5,BDG,Blanding,US +bdh,,,y,y,n,y,,ir,meast,itc2,ir120,54.88,26.55,26.532,54.824847,BDH,Bandar Lengeh,IR +bdi,,,y,y,n,n,,sc,iocea,itc2,sc,55.22,-3.717,-3.717,55.22,BDI,Bird Island,SC +bdj,,,y,y,n,y,,id,seasi,itc3,id062,114.7,-3.283,-3.283,114.7,BDJ,Banjarmasin Sjamsudin Noor,ID +bdk,,,y,y,n,n,,ci,afric,itc2,ci,-2.8,8.033,8.033,-2.8,BDK,Bondoukou,CI +bdl,hfd,,y,n,n,y,ct,us,namer,itc1,us105,-72.68,41.93,41.938889,-72.683222,HFD,Hartford 
Brainard,US +bdm,,,y,y,n,n,,tr,euras,itc2,tr101,27.98,40.32,40.317972,27.977694,BDM,Bandirma,TR +bdn,,,y,y,n,n,,pk,asia,itc3,pk163,68.83,24.65,24.841519,68.838408,BDN,Badin Talhar,PK +bdo,,,y,y,n,y,,id,seasi,itc3,id061,107.6,-6.9,-6.9,107.6,BDO,Bandung Husein Sastranegara,ID +bdp,,,y,y,n,y,,np,asia,itc3,np,88.08,26.57,26.57,88.08,BDP,Bhadrapur,NP +bdq,,,y,y,n,y,,in,asia,itc3,in,73.27,22.32,22.336164,73.226289,BDQ,Vadodara,IN +bdr,,,y,y,n,y,ct,us,namer,itc1,us105,-73.13,41.17,41.163472,-73.126167,BDR,Bridgeport I Sikorsky,US +bds,,,y,y,n,y,,it,europ,itc2,it067,17.95,40.65,40.657633,17.947033,BDS,Brindisi Papola Casale,IT +bdt,,,y,y,n,n,,cd,afric,itc2,cd166,19.05,0.8167,4.253206,20.975283,BDT,Gbadolite,CD +bdu,,,y,y,n,y,,no,europ,itc2,no083,18.53,69.05,69.055758,18.540356,BDU,Bardufoss,NO +bdv,,,y,y,n,n,,cd,afric,itc2,cd167,29.73,-7.083,-7.083,29.73,BDV,Moba,CD +bdw,,,y,y,n,n,wa,au,austl,itc3,au011,127.5,-17.3,-17.3,127.5,BDW,Bedford Downs,AU +bdx,,,y,y,n,n,mt,us,namer,itc1,us108,-105.4,45.45,45.45,-105.4,BDX,Broadus,US +bdy,,,y,y,n,n,or,us,namer,itc1,us110,-124.4,43.12,43.12,-124.4,BDY,Bandon State,US +bdz,,,y,y,n,n,,pg,austl,itc3,pg,147,-6.333,-6.333,147,BDZ,Baindoung,PG +bea,,,y,y,n,n,,pg,austl,itc3,pg,146.5,-8.65,-8.65,146.5,BEA,Bereina,PG +beb,,,y,y,n,y,,gb,europ,itc2,gb053,-7.367,57.48,57.481111,-7.362778,BEB,Benbecula,GB +bec,ict,,y,n,n,n,ks,us,namer,itc1,us107,-97.21,37.69,37.69,-97.21,ICT,Wichita Mid Continent,US +bed,,,y,y,n,n,ma,us,namer,itc1,us105,-71.28,42.47,42.469953,-71.289031,BED,Bedford Ha Hanscom Field,US +bee,,,y,y,n,n,wa,au,austl,itc3,au011,141.6,-12.98,-12.98,141.6,BEE,Beagle Bay,AU +bef,,,y,y,n,n,,ni,camer,itc1,ni193,-83.98,12.03,11.990961,-83.774086,BEF,Bluefields,NI +beg,,,y,y,n,y,,rs,europ,itc2,rs189,20.5,44.83,44.818444,20.309139,BEG,Belgrade Beograd,CS +beh,,,y,y,n,n,mi,us,namer,itc1,us105,-86.43,42.13,42.13,-86.43,BEH,Benton Harbor Ross Field,US +bei,,,y,y,n,n,,et,afric,itc2,et,34.53,9.383,9.383,34.53,BEI,Beica,ET 
+bej,,,y,y,n,n,,id,seasi,itc3,id062,117.7,2.167,2.167,117.7,BEJ,Berau,ID +bek,,,y,y,n,n,,in,asia,itc3,in,81.23,26.22,26.22,81.23,BEK,Bareli,IN +bel,,,y,y,n,y,pa,br,samer,itc1,br016,-48.48,-1.383,-1.37925,-48.476292,BEL,Belem Val De Cans,BR +bem,,,y,y,n,n,,cf,afric,itc2,cf,17.65,5.267,5.267,17.65,BEM,Bossembele,CF +ben,,,y,y,n,y,,ly,afric,itc2,ly122,20.28,32.1,32.096786,20.269472,BEN,Benghazi Benina Intl,LY +beo,ntl,,y,n,n,n,ns,au,austl,itc3,au007,151.7,-33.03,-33.03,151.7,NTL,Newcastle Williamtown,AU +bep,,,y,y,n,y,,in,asia,itc3,in,76.9,15.18,15.162783,76.882775,BEP,Bellary,IN +beq,,,y,y,n,n,,gb,europ,itc2,gb053,0.7667,52.35,52.342611,0.772939,BEQ,Bury St Edmunds Honington,GB +ber,,,n,y,n,n,,de,europ,itc2,de040,13.3,52.57,52.516,13.3769,BER,Berlin,DE +bes,,,y,y,n,y,,fr,europ,itc2,fr052,-4.417,48.45,48.447911,-4.418539,BES,Brest Guipavas,FR +bet,,,y,y,n,y,ak,us,namer,itc1,us111,-161.8,60.78,60.779778,-161.838,BET,Bethel Bethel Airport,US +beu,,,y,y,n,y,ql,au,austl,itc3,au131,139.4,-24.33,-24.346111,139.460278,BEU,Bedourie,AU +bev,,,y,y,n,n,,il,meast,itc2,il065,34.8,31.25,31.287003,34.722953,BEV,Beer Sheba,IL +bew,,,y,y,n,y,,mz,afric,itc2,mz,34.9,-19.8,-19.796419,34.907556,BEW,Beira,MZ +bex,,,y,y,n,n,,gb,europ,itc2,gb053,-0.25,51.65,51.616389,-1.095833,BEX,Benson Raf Station,GB +bey,,,y,y,n,y,,lb,meast,itc2,lb072,35.48,33.83,33.820931,35.488389,BEY,Beirut International,LB +bez,,,y,y,n,n,,ki,austl,itc3,ki069,176,-1.333,-1.333,176,BEZ,Beru,KI +bfa,,,y,y,n,n,,py,samer,itc1,py093,-58.17,-20.23,-20.23,-58.17,BFA,Bahia Negra Bahia Negra,PY +bfb,,,y,y,n,n,ak,us,namer,itc1,us111,-152.1,58.18,58.18,-152.1,BFB,Blue Fox Bay,US +bfc,,,y,y,n,n,ql,au,austl,itc3,au131,145.3,-15.9,-15.9,145.3,BFC,Bloomfield,AU +bfd,,,y,y,n,y,pa,us,namer,itc1,us105,-78.63,41.8,41.803067,-78.640122,BFD,Bradford,US +bfe,,,y,y,n,y,,de,europ,itc2,de040,8.533,52.03,52.03,8.533,BFE,Bielefeld Bielefeld,DE +bff,,,y,y,n,y,ne,us,namer,itc1,us108,-103.6,41.87,41.874028,-103.595639,BFF,Scottsbluff Scotts 
Bluff,US +bfg,,,y,y,n,n,ut,us,namer,itc1,us108,-110.7,37.55,37.55,-110.7,BFG,Bullfrog Basin,US +bfh,cwb,,y,n,n,n,pr,br,samer,itc1,br015,-49.23,-25.4,-25.405078,-49.232036,CWB,Curitiba Afonso Pena,BR +bfi,sea,,y,n,n,y,wa,us,namer,itc1,us110,-122.3,47.53,47.53,-122.301947,SEA,Seattle Seattle Tacoma,US +bfj,,,y,y,n,n,,fj,austl,itc3,fj169,177.7,-17.57,-17.57,177.7,BFJ,Ba Ba,FJ +bfk,den,,y,n,n,n,co,us,namer,itc1,us108,-104.9,39.77,39.77,-104.9,DEN,Denver Denver Intl,US +bfl,,,y,y,n,y,ca,us,namer,itc1,us110,-119,35.43,35.433598,-119.05677,BFL,Bakersfield Meadows Fld,US +bfm,mob,,y,n,n,n,al,us,namer,itc1,us107,-88.08,30.67,30.626783,-88.068092,MOB,Mobile Mobile Mnpl,US +bfn,,,y,y,n,y,,za,afric,itc2,za,26.18,-29.1,-29.092722,26.302444,BFN,Bloemfontein Intl,ZA +bfo,,,y,y,n,n,,zw,afric,itc2,zw,31.58,-21.02,-21.008083,31.57855,BFO,Buffalo Range,ZW +bfp,,,y,y,n,n,pa,us,namer,itc1,us105,-80.4,40.77,40.77,-80.4,BFP,Beaver Falls,US +bfq,,,y,y,n,n,,pa,camer,itc1,pa,-79.43,9.083,9.083,-79.43,BFQ,Bahia Pinas,PA +bfr,,,y,y,n,n,in,us,namer,itc1,us105,-86.48,38.85,38.85,-86.48,BFR,Bedford Virgil I Grissom,US +bfs,,,y,y,n,y,,gb,europ,itc2,gb053,-6.233,54.65,54.6575,-6.215833,BFS,Belfast Intl,GB +bft,,,y,y,n,n,sc,us,namer,itc1,us105,-80.63,32.42,32.42,-80.63,BFT,Beaufort County,US +bfu,,,y,y,n,n,,cn,asia,itc3,cn035,117.3,32.95,32.95,117.3,BFU,Bengbu,CN +bfv,,,y,y,n,y,,th,seasi,itc3,th,103.3,15.23,15.229539,103.253231,BFV,Buri Ram,TH +bfw,,,y,y,n,n,,dz,afric,itc2,dz,-0.5894,35.17,35.17,-0.5894,BFW,Sidi Belabbes Sidi Belabbes,DZ +bfx,,,y,y,n,n,,cm,afric,itc2,cm,10.4,5.483,5.536919,10.354583,BFX,Bafoussam,CM +bga,,,y,y,n,y,,co,samer,itc1,co132,-73.15,7.1,7.1265,-73.184778,BGA,Bucaramanga Palo Negro,CO +bgb,,,y,y,n,n,,ga,afric,itc2,ga,11.93,-0.1,-0.1,11.93,BGB,Booue,GA +bgc,,,y,y,n,n,,pt,europ,itc2,pt092,-6.75,41.82,41.8578,-6.707125,BGC,Braganca,PT +bgd,,,y,y,n,n,tx,us,namer,itc1,us107,-101.4,35.65,35.65,-101.4,BGD,Borger,US 
+bge,,,y,y,n,n,ga,us,namer,itc1,us105,-84.6,30.9,30.9,-84.6,BGE,Bainbridge Decatur Cnty,US +bgf,,,y,y,n,y,,cf,afric,itc2,cf,18.57,4.383,4.398475,18.518786,BGF,Bangui,CF +bgg,,,y,y,n,n,,ci,afric,itc2,ci,-4.2,6.65,6.65,-4.2,BGG,Bongouanou,CI +bgh,,,y,y,n,n,,mr,afric,itc2,mr,-14.2,16.63,16.63,-14.2,BGH,Boghe Abbaye,MR +bgi,,,y,y,n,y,,bb,carib,itc1,bb,-59.48,13.07,13.074603,-59.492456,BGI,Bridgetown Grantley Adams,BB +bgj,,,y,y,n,n,,is,europ,itc2,is,-21.98,64.47,64.47,-21.98,BGJ,Borgarfjordur Eystri,IS +bgk,,,y,y,n,n,,bz,camer,itc1,bz,-88.42,16.52,16.52,-88.42,BGK,Big Creek,BZ +bgl,,,y,y,n,n,,np,asia,itc3,np,83.67,28.22,28.22,83.67,BGL,Baglung,NP +bgm,,,y,y,n,y,ny,us,namer,itc1,us105,-75.98,42.22,42.208689,-75.979839,BGM,Binghamton,US +bgn,,,y,y,n,n,,de,europ,itc2,de040,6.133,51.2,51.2,6.133,BGN,Brueggen R A F,DE +bgo,,,y,y,n,y,,no,europ,itc2,no083,5.217,60.3,60.293386,5.218142,BGO,Bergen Flesland,NO +bgp,,,y,y,n,n,,ga,afric,itc2,ga,10.22,-2.167,-2.167,10.22,BGP,Bongo,GA +bgq,,,y,y,n,n,ak,us,namer,itc1,us111,-149.8,61.53,61.53,-149.8,BGQ,Big Lake,US +bgr,,,y,y,n,y,me,us,namer,itc1,us105,-68.82,44.8,44.807444,-68.828139,BGR,Bangor International,US +bgs,hca,,y,n,n,n,tx,us,namer,itc1,us107,-101.4,32.3,32.3,-101.4,HCA,Big Spring Howard County,US +bgt,,,y,y,n,n,az,us,namer,itc1,us109,-113.2,34.57,34.57,-113.2,BGT,Bagdad,US +bgu,,,y,y,n,n,,cf,afric,itc2,cf,22.8,4.783,4.783,22.8,BGU,Bangassou,CF +bgv,,,y,y,n,n,rs,br,samer,itc1,br015,-51.52,-29.17,-29.17,-51.52,BGV,Bento Goncalves,BR +bgw,,,y,y,n,y,,iq,meast,itc2,iq066,44.5,33.28,33.28,44.5,BGW,Baghdad Al Muthana,IQ +bgx,,,y,y,n,n,rs,br,samer,itc1,br015,-54.12,-31.38,-31.390528,-54.112244,BGX,Bage,BR +bgy,mil,,y,n,n,y,,it,europ,itc2,it067,9.7,45.67,45.673889,9.704166,MIL,Milan,IT +bgz,,,y,y,n,n,,pt,europ,itc2,pt092,-8.45,... [truncated message content] |