From: Christian P. <cp...@us...> - 2005-05-06 15:32:24
|
Update of /cvsroot/pclasses/pclasses2/src/Unicode In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv12388/src/Unicode Modified Files: Char.cpp Makefile.am String.cpp uctype.cpp unicodedata.cpp unicodedata.h ustring.cpp Added Files: genunicodedata.cpp Log Message: - Added c++ tool for generating unicode database. - More work on Unicode support. --- NEW FILE: genunicodedata.cpp --- /*************************************************************************** * Copyright (C) 2005 by Christian Prochnow * * cp...@se... * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU Library General Public License as * * published by the Free Software Foundation; either version 2 of the * * License, or (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU Library General Public * * License along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #include <iostream> #include <fstream> #include <sstream> #include <string> #include <vector> #include <map> #include "unicodedata.h" using namespace std; using namespace P::Unicode; unsigned int line = 1; unsigned int letterCasemapNum = 0; unsigned int numberMappingNum = 0; unsigned int decompCharsNum = 0; map<string, Category> categoryMap; map<string, BidiClass> bidiClassMap; map<string, Decomposition> decompMap; std::string mirrored_to_bool(const std::string& m) { if(m == "Y") return "true"; else if(m == "N") return "false"; cerr << "Line " << line << ": Warning unknown mirrored property: " << m << endl; return "false"; } Category string_to_category(const std::string& cat) { map<string, Category>::const_iterator i = categoryMap.find(cat); if(i != categoryMap.end()) return i->second; cerr << "Line " << line << ": Warning unknown category property: " << cat << endl; return Other_NotAssigned; } BidiClass string_to_bidiClass(const std::string& bidi) { map<string, BidiClass>::const_iterator i = bidiClassMap.find(bidi); if(i != bidiClassMap.end()) return i->second; cerr << "Line " << line << ": Warning unknown bidirectional property: " << bidi << endl; return OtherNeutrals; } Decomposition string_to_decompTag(const std::string& decomp) { if(decomp.empty()) return Decomp_None; if(decomp.at(0) == '<') { string dt = decomp.substr(0, decomp.find('>')+1); map<string, Decomposition>::const_iterator i = decompMap.find(dt); if(i != decompMap.end()) return i->second; cerr << "Line " << line << ": Warning unknown decomposition tag property: " << dt << endl; } return Decomp_Standard; } vector<string> string_to_decompChars(const string& decomp) { istringstream is(decomp); string ch; vector<string> ret; while(is >> ch) { if(ch.at(0) == '<') continue; ret.push_back(ch); } return ret; } int main(int argc, char* argv[]) { ifstream dataFile(argv[1]); ofstream unicodeDataDb("unicodedata_db.cpp"); unicodeDataDb << "#include \"unicodedata.h\"" << endl; unicodeDataDb << "namespace P { namespace Unicode {" << endl; ofstream unicodeDataCasemapDb("unicodedata_casemap_db.cpp"); unicodeDataCasemapDb << "#include \"unicodedata.h\"" << endl; unicodeDataCasemapDb << "namespace P { namespace Unicode {" << endl; ofstream unicodeDataNumberDb("unicodedata_number_db.cpp"); unicodeDataNumberDb << "#include \"unicodedata.h\"" << endl; unicodeDataNumberDb << "namespace P { namespace Unicode {" << endl; ofstream unicodeDataDecompDb("unicodedata_decomp_db.cpp"); unicodeDataDecompDb << "#include \"unicodedata.h\"" << endl; unicodeDataDecompDb << "namespace P { namespace Unicode {" << endl; categoryMap.insert(make_pair("Mn", Mark_NonSpacing)); categoryMap.insert(make_pair("Mc", Mark_SpacingCombining)); categoryMap.insert(make_pair("Me", Mark_Enclosing)); categoryMap.insert(make_pair("Nd", Number_DecimalDigit)); categoryMap.insert(make_pair("Nl", Number_Letter)); categoryMap.insert(make_pair("No", Number_Other)); categoryMap.insert(make_pair("Zs", Separator_Space)); categoryMap.insert(make_pair("Zl", Separator_Line)); categoryMap.insert(make_pair("Zp", Separator_Paragraph)); categoryMap.insert(make_pair("Cc", Other_Control)); categoryMap.insert(make_pair("Cf", Other_Format)); categoryMap.insert(make_pair("Cs", Other_Surrogate)); categoryMap.insert(make_pair("Co", Other_PrivateUse)); categoryMap.insert(make_pair("Cn", Other_NotAssigned)); categoryMap.insert(make_pair("Lu", Letter_Uppercase)); categoryMap.insert(make_pair("Ll", Letter_Lowercase)); categoryMap.insert(make_pair("Lt", Letter_Titlecase)); categoryMap.insert(make_pair("Lm", Letter_Modifier)); categoryMap.insert(make_pair("Lo", Letter_Other)); categoryMap.insert(make_pair("Pc", Punctuation_Connector)); categoryMap.insert(make_pair("Pd", Punctuation_Dash)); categoryMap.insert(make_pair("Ps", Punctuation_Open)); categoryMap.insert(make_pair("Pe", Punctuation_Close)); categoryMap.insert(make_pair("Pi", Punctuation_InitialQuote)); categoryMap.insert(make_pair("Pf", Punctuation_FinalQuote)); categoryMap.insert(make_pair("Po", Punctuation_Other)); categoryMap.insert(make_pair("Sm", Symbol_Math)); categoryMap.insert(make_pair("Sc", Symbol_Currency)); categoryMap.insert(make_pair("Sk", Symbol_Modifier)); categoryMap.insert(make_pair("So", Symbol_Other)); bidiClassMap.insert(make_pair("L", LeftToRight)); bidiClassMap.insert(make_pair("LRE", LeftToRightEmbedding)); bidiClassMap.insert(make_pair("LRO", LeftToRightOverride)); bidiClassMap.insert(make_pair("R", RightToLeft)); bidiClassMap.insert(make_pair("AL", RightToLeftArabic)); bidiClassMap.insert(make_pair("RLE", RightToLeftEmbedding)); bidiClassMap.insert(make_pair("RLO", RightToLeftOverride)); bidiClassMap.insert(make_pair("PDF", PopDirectionalFormat)); bidiClassMap.insert(make_pair("EN", EuropeanNumber)); bidiClassMap.insert(make_pair("ES", EuropeanNumberSeparator)); bidiClassMap.insert(make_pair("ET", EuropeanNumberTerminator)); bidiClassMap.insert(make_pair("AN", ArabicNumber)); bidiClassMap.insert(make_pair("CS", CommonNumberSeparator)); bidiClassMap.insert(make_pair("NSM", NonSpacingMark)); bidiClassMap.insert(make_pair("BN", BoundaryNeutral)); bidiClassMap.insert(make_pair("B", ParagraphSeparator)); bidiClassMap.insert(make_pair("S", SegmentSeparator)); bidiClassMap.insert(make_pair("WS", Whitespace)); bidiClassMap.insert(make_pair("ON", OtherNeutrals)); decompMap.insert(make_pair("<font>", Decomp_Font)); decompMap.insert(make_pair("<noBreak>", Decomp_NoBreak)); decompMap.insert(make_pair("<initial>", Decomp_Initial)); decompMap.insert(make_pair("<medial>", Decomp_Medial)); decompMap.insert(make_pair("<final>", Decomp_Final)); decompMap.insert(make_pair("<isolated>", Decomp_Isolated)); decompMap.insert(make_pair("<circle>", Decomp_Encircled)); decompMap.insert(make_pair("<super>", Decomp_Superscript)); decompMap.insert(make_pair("<sub>", Decomp_Subscript)); decompMap.insert(make_pair("<vertical>", Decomp_Vertical)); decompMap.insert(make_pair("<wide>", Decomp_Wide)); decompMap.insert(make_pair("<narrow>", Decomp_Narrow)); decompMap.insert(make_pair("<small>", Decomp_Small)); decompMap.insert(make_pair("<square>", Decomp_Square)); decompMap.insert(make_pair("<fraction>", Decomp_Fraction)); decompMap.insert(make_pair("<compat>", Decomp_Compat)); unicodeDataDb << "extern letterCasemapData caseMappings[];" << endl; unicodeDataDb << "extern numberMappingData numberMappings[];" << endl; unicodeDataDb << "extern uchar_t decompChars[];" << endl; unicodeDataDb << "codePointData codePoints[] = {" << endl; unicodeDataCasemapDb << "letterCasemapData caseMappings[] = {" << endl; unicodeDataNumberDb << "numberMappingData numberMappings[] = {" << endl; unicodeDataDecompDb << "uchar_t decompChars[] = {" << endl; ostringstream tmpos; while(dataFile.good()) { string codePoint, charName, category, combining, bidi, decomp, num1, num2, num3, mirrored, description, upcasemap, lowcasemap, titcasemap, extraIndex, decompIndex, tmp; if(!getline(dataFile, codePoint, ';')) break; getline(dataFile, charName, ';'); getline(dataFile, category, ';'); getline(dataFile, combining, ';'); getline(dataFile, bidi, ';'); getline(dataFile, decomp, ';'); getline(dataFile, num1, ';'); getline(dataFile, num2, ';'); getline(dataFile, num3, ';'); getline(dataFile, mirrored, ';'); getline(dataFile, description, ';'); getline(dataFile, tmp, ';'); // ?? getline(dataFile, upcasemap, ';'); getline(dataFile, lowcasemap, ';'); getline(dataFile, titcasemap); extraIndex = "(uint16_t)-1"; decompIndex = "(uint16_t)-1"; // generate extra data for letters ... if(category.at(0) == 'L' && !(upcasemap.empty() && lowcasemap.empty() && titcasemap.empty())) { tmpos.str(""); tmpos << letterCasemapNum << " /*letter*/"; extraIndex = tmpos.str(); if(upcasemap.empty()) upcasemap = "0"; if(lowcasemap.empty()) lowcasemap = "0"; if(titcasemap.empty()) titcasemap = "0"; unicodeDataCasemapDb << " { 0x" << upcasemap <<", 0x" << lowcasemap << ", 0x" << titcasemap << " }," << endl; ++letterCasemapNum; } else if(category.at(0) == 'N' && !num3.empty()) { tmpos.str(""); tmpos << numberMappingNum << " /*number*/"; extraIndex = tmpos.str(); unicodeDataNumberDb << " { " << num3 << " }," << endl; ++numberMappingNum; } Decomposition decompTag = string_to_decompTag(decomp); if(decompTag != Decomp_None) { tmpos.str(""); tmpos << decompCharsNum; decompIndex = tmpos.str(); vector<string> decompCharV = string_to_decompChars(decomp); unicodeDataDecompDb << "/* codepoint " << codePoint << " begin */" << endl; vector<string>::const_iterator i = decompCharV.begin(); while(i != decompCharV.end()) { unicodeDataDecompDb << "0x" << *i << ", "; ++decompCharsNum; ++i; } unicodeDataDecompDb << "(uchar_t)-1, " << endl; unicodeDataDecompDb << "/* codepoint " << codePoint << " end */" << endl; ++decompCharsNum; } unicodeDataDb << "{ 0x" << codePoint << ", " << string_to_category(category) << ", " << combining << ", " << string_to_bidiClass(bidi) << ", " << decompTag << ", " << decompIndex << ", " << mirrored_to_bool(mirrored) << ", " << extraIndex << " }," << endl; ++line; } unicodeDataDb << "{ (uchar_t)-1, 0, 0, 0, 0, false, 0 }" << endl; unicodeDataDb << "};" << endl; unicodeDataDb << "} }" << endl; unicodeDataCasemapDb << "{ 0, 0, 0 } };" << endl; unicodeDataCasemapDb << "} }" << endl; unicodeDataNumberDb << "{ 0 } };" << endl; unicodeDataNumberDb << "} }" << endl; unicodeDataDecompDb << "(uchar_t)-1 };" << endl; unicodeDataDecompDb << "} }" << endl; dataFile.close(); } Index: unicodedata.cpp =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/unicodedata.cpp,v retrieving revision 1.1 retrieving revision 1.2 diff -u -d -r1.1 -r1.2 --- unicodedata.cpp 14 Jan 2005 14:46:02 -0000 1.1 +++ unicodedata.cpp 6 May 2005 15:32:11 -0000 1.2 @@ -24,8 +24,9 @@ namespace Unicode { -#include "unicodedata_extra_db.h" -#include "unicodedata_db.h" +extern codePointData codePoints[]; +extern letterCasemapData caseMappings[]; +extern numberMappingData numberMappings[]; const codePointData* lookupCodePoint(uchar_t codePoint) { @@ -40,6 +41,48 @@ return 0; } +const letterCasemapData* lookupLetterCasemapData(const codePointData* codePoint) +{ + switch(codePoint->category) + { + case Letter_Uppercase: + case Letter_Lowercase: + case Letter_Titlecase: + case Letter_Modifier: + case Letter_Other: + if(codePoint->extraIndex != (uint16_t)-1) + return &caseMappings[codePoint->extraIndex]; + break; + default: + break; + } + + return 0; +} + +const numberMappingData* lookupNumberMappingData(const codePointData* codePoint) +{ + if((codePoint->category == Number_DecimalDigit + || codePoint->category == Number_Letter) + && codePoint->extraIndex != (uint16_t)-1) + { + return &numberMappings[codePoint->extraIndex]; + } + + return 0; +} + +/* tests if we can simply ignore an unknown codepoint */ +bool isIgnorableCodePoint(uchar_t ch) +{ + if((ch >= 0x2060 && ch <= 0x206F) + || (ch >= 0xFFF0 && ch <= 0xFFFB) + || (ch >= 0xE0000 && ch <= 0xE0FFF)) + return true; + + return false; +} + Category category(uchar_t ch) { const codePointData* data = lookupCodePoint(ch); Index: Char.cpp =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/Char.cpp,v retrieving revision 1.5 retrieving revision 1.6 diff -u -d -r1.5 -r1.6 --- Char.cpp 28 Apr 2005 10:21:14 -0000 1.5 +++ Char.cpp 6 May 2005 15:32:11 -0000 1.6 @@ -100,30 +100,16 @@ return isualpha(_char) == 1 ? true : false; } -int Char::toNumber() const +float Char::toNumber() const { - int ret = 0; + float ret = 0.0f; const codePointData* data = lookupCodePoint(_char); if(data) { - switch(category()) - { - case Number_DecimalDigit: - case Number_Letter: - { - const decimalDigitExtraData* extraData = - (const decimalDigitExtraData*)data->extra; - ret = extraData->num; - } - break; - - case Number_Other: - break; - - default: - break; - } + const numberMappingData* numberData = lookupNumberMappingData(data); + if(numberData) + ret = numberData->num; } return ret; @@ -152,7 +138,7 @@ bool Char::isMirrored() const { const codePointData* data = lookupCodePoint(_char); - return data->mirrored == 1 ? true : false; + return data->mirrored; } Char::Category Char::category() const Index: uctype.cpp =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/uctype.cpp,v retrieving revision 1.2 retrieving revision 1.3 diff -u -d -r1.2 -r1.3 --- uctype.cpp 26 Apr 2005 12:16:02 -0000 1.2 +++ uctype.cpp 6 May 2005 15:32:11 -0000 1.3 @@ -153,10 +153,11 @@ uchar_t toulower(uchar_t c) { const codePointData* data = lookupCodePoint(c); - if(data && data->extra) + if(data) { - const letterExtraData* extraData = (const letterExtraData*)data->extra; - return extraData->lower; + const letterCasemapData* casemapData = lookupLetterCasemapData(data); + if(casemapData) + return casemapData->lower; } return c; @@ -165,10 +166,11 @@ uchar_t touupper(uchar_t c) { const codePointData* data = lookupCodePoint(c); - if(data && data->extra) + if(data) { - const letterExtraData* extraData = (const letterExtraData*)data->extra; - return extraData->upper; + const letterCasemapData* casemapData = lookupLetterCasemapData(data); + if(casemapData) + return casemapData->upper; } return c; Index: ustring.cpp =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/ustring.cpp,v retrieving revision 1.2 retrieving revision 1.3 diff -u -d -r1.2 -r1.3 --- ustring.cpp 26 Apr 2005 12:16:02 -0000 1.2 +++ ustring.cpp 6 May 2005 15:32:11 -0000 1.3 @@ -32,10 +32,7 @@ while(n-- > 0) { if(*s1 != *s2) - { - //@@fixme - return 1; - } + return (*s1 < *s2) ? -1 : +1; ++s1; ++s2; Index: unicodedata.h =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/unicodedata.h,v retrieving revision 1.1 retrieving revision 1.2 diff -u -d -r1.1 -r1.2 --- unicodedata.h 14 Jan 2005 14:46:02 -0000 1.1 +++ unicodedata.h 6 May 2005 15:32:11 -0000 1.2 @@ -93,23 +93,24 @@ //! Character Decomposition Tag enum Decomposition { - NoDecomposition, - Font, // <font> - NoBreak, // <noBreak> - Initial, // <initial> - Medial, // <medial> - Final, // <final> - Isolated, // <isolated> - Encircled, // <circle> - Superscript, // <super> - Subscript, // <sub> - Vertical, // <vertical> - Wide, // <wide> - Narrow, // <narrow> - Small, // <small> - Square, // <square> - Fraction, // <fraction> - Compat // <compat> + Decomp_None, + Decomp_Standard, + Decomp_Font, // <font> + Decomp_NoBreak, // <noBreak> + Decomp_Initial, // <initial> + Decomp_Medial, // <medial> + Decomp_Final, // <final> + Decomp_Isolated, // <isolated> + Decomp_Encircled, // <circle> + Decomp_Superscript, // <super> + Decomp_Subscript, // <sub> + Decomp_Vertical, // <vertical> + Decomp_Wide, // <wide> + Decomp_Narrow, // <narrow> + Decomp_Small, // <small> + Decomp_Square, // <square> + Decomp_Fraction, // <fraction> + Decomp_Compat // <compat> }; //! Canonical Combining Class @@ -148,22 +149,28 @@ char combining; char bidi; char decomp; - char mirrored; - void* extra; + uint16_t decompIndex; + bool mirrored; + uint16_t extraIndex; }; -struct letterExtraData { +struct letterCasemapData { uchar_t upper; uchar_t lower; uchar_t title; }; -struct decimalDigitExtraData { - int num; +struct numberMappingData { + float num; }; const codePointData* lookupCodePoint(uchar_t codePoint); +const letterCasemapData* lookupLetterCasemapData(const codePointData* codePoint); +const numberMappingData* lookupNumberMappingData(const codePointData* codePoint); + +bool isIgnorableCodePoint(uchar_t ch); + Category category(uchar_t ch); BidiClass bidiClass(uchar_t ch); Decomposition decompTag(uchar_t ch); Index: String.cpp =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/String.cpp,v retrieving revision 1.5 retrieving revision 1.6 diff -u -d -r1.5 -r1.6 --- String.cpp 28 Apr 2005 10:15:02 -0000 1.5 +++ String.cpp 6 May 2005 15:32:11 -0000 1.6 @@ -245,14 +245,12 @@ bool String::operator<(const String& str) const throw() { - //@todo String::operator< - return false; + return _str < str._str; } bool String::operator>(const String& str) const throw() { - //@todo String::operator> - return false; + return _str > str._str; } bool String::operator<=(const String& str) const throw() Index: Makefile.am =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/Makefile.am,v retrieving revision 1.8 retrieving revision 1.9 diff -u -d -r1.8 -r1.9 --- Makefile.am 26 Apr 2005 12:16:02 -0000 1.8 +++ Makefile.am 6 May 2005 15:32:11 -0000 1.9 @@ -1,15 +1,10 @@ -noinst_HEADERS = unicodedata.h unicodedata_db.h unicodedata_extra_db.h - -unicodedata_db: unicodedata_db.h unicodedata_extra_db.h - -unicodedata_db-clean: - rm -f unicodedata_db.h - rm -f unicodedata_extra_db.h +noinst_HEADERS = unicodedata.h -unicodedata_db.h unicodedata_extra_db.h: +UnicodeData.txt: wget --passive-ftp http://www.unicode.org/Public/UNIDATA/UnicodeData.txt - awk -f $(top_srcdir)/src/Unicode/unicodedata.awk UnicodeData.txt >unicodedata_db.h - rm -f UnicodeData.txt + +unicodedata_db.cpp unicodedata_casemap_db.cpp unicodedata_number_db.cpp: genunicodedata$(EXEEXT) UnicodeData.txt + $(top_builddir)/src/Unicode/genunicodedata$(EXEEXT) UnicodeData.txt INCLUDES = -I$(top_srcdir)/include -I$(top_builddir)/include -I$(top_builddir)/src/Unicode $(all_includes) METASOURCES = AUTO @@ -18,12 +13,15 @@ CPPFLAGS = -DPUNICODE_BUILD -libpclasses_unicode_la_SOURCES = unicodedata.cpp uctype.cpp ustring.cpp Char.cpp String.cpp TextStream.cpp +libpclasses_unicode_la_SOURCES = unicodedata.cpp unicodedata_db.cpp unicodedata_casemap_db.cpp \ + unicodedata_number_db.cpp unicodedata_decomp_db.cpp uctype.cpp ustring.cpp Char.cpp String.cpp TextStream.cpp libpclasses_unicode_la_LDFLAGS = -no-undefined libpclasses_unicode_la_LIBADD = $(top_builddir)/src/libpclasses.la $(LIBICONV) -all: unicodedata_db +noinst_PROGRAMS = genunicodedata +genunicodedata_SOURCES = genunicodedata.cpp -clean: unicodedata_db-clean +CLEAN_FILES = UnicodeData.txt unicodedata_db.cpp unicodedata_casemap_db.cpp unicodedata_number_db.cpp \ + unicodedata_decomp_db.cpp |