|
From: Christian P. <cp...@us...> - 2005-01-14 14:46:14
|
Update of /cvsroot/pclasses/pclasses2/src/Unicode In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv19335/src/Unicode Modified Files: Char.cpp Makefile.am unicodedata.awk Added Files: uctype.cpp unicodedata.cpp unicodedata.h ustring.cpp Log Message: Unicode re-work. Lets get compatible to std::basic_string<>. Unicode::Char, Unicode::String will be obsoleted. --- NEW FILE: unicodedata.cpp --- /*************************************************************************** * Copyright (C) 2004 by Christian Prochnow * * cp...@se... * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU Library General Public License as * * published by the Free Software Foundation; either version 2 of the * * License, or (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU Library General Public * * License along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #include "unicodedata.h" namespace P { namespace Unicode { #include "unicodedata_extra_db.h" #include "unicodedata_db.h" const codePointData* lookupCodePoint(uchar_t codePoint) { unsigned int i = 0; while(codePoints[i].codePoint != (uchar_t)-1) { if(codePoints[i].codePoint == codePoint) return &codePoints[i]; ++i; } return 0; } Category category(uchar_t ch) { const codePointData* data = lookupCodePoint(ch); return (Category)data->category; } BidiClass bidiClass(uchar_t ch) { const codePointData* data = lookupCodePoint(ch); return (BidiClass)data->bidi; } Decomposition decompTag(uchar_t ch) { const codePointData* data = lookupCodePoint(ch); return (Decomposition)data->decomp; } CombiningClass combiningClass(uchar_t ch) { const codePointData* data = lookupCodePoint(ch); return (CombiningClass)data->combining; } } // !namespace Unicode } // !namespace P Index: Char.cpp =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/Char.cpp,v retrieving revision 1.2 retrieving revision 1.3 diff -u -d -r1.2 -r1.3 --- Char.cpp 23 Dec 2004 05:45:58 -0000 1.2 +++ Char.cpp 14 Jan 2005 14:46:02 -0000 1.3 @@ -19,27 +19,12 @@ */ #include "pclasses/Unicode/Char.h" - -#include "unicodedata_extra.h" // UNICODE extra character data -#include "unicodedata.h" // UNICODE character data +#include "unicodedata.h" namespace P { namespace Unicode { -const codePointData* lookupCodePoint(uint32_t codePoint) -{ - unsigned int i = 0; - while(codePoints[i].codePoint != (uint32_t)-1) - { - if(codePoints[i].codePoint == codePoint) - return &codePoints[i]; - ++i; - } - - return 0; -} - Char::Char(uint32_t ch) : _char(ch) { } Index: Makefile.am =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/Makefile.am,v retrieving revision 1.3 retrieving revision 1.4 diff -u -d -r1.3 -r1.4 --- Makefile.am 11 Jan 2005 14:57:33 -0000 1.3 +++ Makefile.am 14 Jan 2005 14:46:02 -0000 1.4 @@ -1,21 +1,19 @@ -noinst_HEADERS = unicodedata.h unicodedata_extra.h +noinst_HEADERS = unicodedata.h unicodedata_db.h unicodedata_extra_db.h unicodedata-clean: - rm -f unicodedata.h - rm -f unicodedata_extra.h - -unicodedata: unicodedata-clean unicodedata.h unicodedata_extra.h + rm -f unicodedata_db.h + rm -f unicodedata_extra_db.h -unicodedata.h unicodedata_extra.h: +unicodedata: unicodedata-clean wget --passive-ftp http://www.unicode.org/Public/UNIDATA/UnicodeData.txt - awk -f $(top_srcdir)/src/Unicode/unicodedata.awk UnicodeData.txt >unicodedata.h + awk -f $(top_srcdir)/src/Unicode/unicodedata.awk UnicodeData.txt >unicodedata_db.h rm -f UnicodeData.txt INCLUDES = -I$(top_srcdir)/include -I$(top_builddir)/include -I$(top_builddir)/src/Unicode $(all_includes) METASOURCES = AUTO lib_LTLIBRARIES = libpclasses_unicode.la -libpclasses_unicode_la_SOURCES = Char.cpp String.cpp TextStream.cpp +libpclasses_unicode_la_SOURCES = unicodedata.cpp uctype.cpp ustring.cpp Char.cpp String.cpp TextStream.cpp libpclasses_unicode_la_LDFLAGS = -no-undefined --- NEW FILE: uctype.cpp --- /*************************************************************************** * Copyright (C) 2004 by Christian Prochnow * * cp...@se... * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU Library General Public License as * * published by the Free Software Foundation; either version 2 of the * * License, or (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU Library General Public * * License along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #include "pclasses/Unicode/uctype.h" #include "unicodedata.h" namespace P { namespace Unicode { int isualnum(uchar_t c) { int ret = 0; switch(category(c)) { case Letter_Uppercase: case Letter_Lowercase: case Letter_Titlecase: case Letter_Modifier: case Letter_Other: case Number_DecimalDigit: case Number_Letter: case Number_Other: ret = 1; default: break; } return ret; } int isualpha(uchar_t c) { int ret = 0; switch(category(c)) { case Letter_Uppercase: case Letter_Lowercase: case Letter_Titlecase: case Letter_Modifier: case Letter_Other: ret = 1; default: break; } return ret; } int isucntrl(uchar_t c) { int ret = 0; switch(category(c)) { case Mark_NonSpacing: case Mark_SpacingCombining: case Mark_Enclosing: case Other_Control: case Other_Format: case Other_Surrogate: case Other_PrivateUse: case Other_NotAssigned: ret = 1; default: break; } return ret; } int isudigit(uchar_t c) { int ret = 0; switch(category(c)) { case Number_DecimalDigit: case Number_Letter: case Number_Other: ret = 1; default: break; } return ret; } int isugraph(uchar_t c) { //@@fixme return 0; } int isulower(uchar_t c) { return category(c) == Letter_Lowercase ? 1 : 0; } int isuprint(uchar_t c) { //@@fixme return 0; } int isupunct(uchar_t c) { //@@fixme return 0; } int isuspace(uchar_t c) { int ret = 0; switch(category(c)) { case Separator_Space: case Separator_Line: case Separator_Paragraph: ret = 1; default: break; } return ret; } int isuupper(uchar_t c) { return category(c) == Letter_Uppercase ? 1 : 0; } uchar_t toulower(uchar_t c) { const codePointData* data = lookupCodePoint(c); if(data && data->extra) { const letterExtraData* extraData = (const letterExtraData*)data->extra; return extraData->lower; } return c; } uchar_t touupper(uchar_t c) { const codePointData* data = lookupCodePoint(c); if(data && data->extra) { const letterExtraData* extraData = (const letterExtraData*)data->extra; return extraData->upper; } return c; } uchar_t touchar(char c) { if(c < 0x7f) return c; return '?'; } } // !namespace Unicode } // !namespace P --- NEW FILE: unicodedata.h --- /*************************************************************************** * Copyright (C) 2004 by Christian Prochnow * * cp...@se... * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU Library General Public License as * * published by the Free Software Foundation; either version 2 of the * * License, or (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU Library General Public * * License along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #ifndef P_Unicode_unicodedata_h #define P_Unicode_unicodedata_h #include "pclasses/BasicTypes.h" #include "pclasses/Unicode/uctype.h" namespace P { namespace Unicode { //! General category enum Category { Mark_NonSpacing, // Mn Mark_SpacingCombining, // Mc Mark_Enclosing, // Me Number_DecimalDigit, // Nd Number_Letter, // Nl Number_Other, // No Separator_Space, // Zs Separator_Line, // Zl Separator_Paragraph, // Zp Other_Control, // Cc Other_Format, // Cf Other_Surrogate, // Cs Other_PrivateUse, // Co Other_NotAssigned, // Cn Letter_Uppercase, // Lu Letter_Lowercase, // Ll Letter_Titlecase, // Lt Letter_Modifier, // Lm Letter_Other, // Lo Punctuation_Connector, // Pc Punctuation_Dash, // Pd Punctuation_Open, // Ps Punctuation_Close, // Pe Punctuation_InitialQuote, // Pi Punctuation_FinalQuote, // Pf Punctuation_Other, // Po Symbol_Math, // Sm Symbol_Currency, // Sc Symbol_Modifier, // Sk Symbol_Other // So }; //! Bidirectional Class enum BidiClass { LeftToRight, // L LeftToRightEmbedding, // LRE LeftToRightOverride, // LRO RightToLeft, // R RightToLeftArabic, // AL RightToLeftEmbedding, // RLE RightToLeftOverride, // RLO PopDirectionalFormat, // PDF EuropeanNumber, // EN EuropeanNumberSeparator, // ES EuropeanNumberTerminator, // ET ArabicNumber, // AN CommonNumberSeparator, // CS NonSpacingMark, // NSM BoundaryNeutral, // BN ParagraphSeparator, // B SegmentSeparator, // S Whitespace, // WS OtherNeutrals // ON }; //! Character Decomposition Tag enum Decomposition { NoDecomposition, Font, // <font> NoBreak, // <noBreak> Initial, // <initial> Medial, // <medial> Final, // <final> Isolated, // <isolated> Encircled, // <circle> Superscript, // <super> Subscript, // <sub> Vertical, // <vertical> Wide, // <wide> Narrow, // <narrow> Small, // <small> Square, // <square> Fraction, // <fraction> Compat // <compat> }; //! Canonical Combining Class enum CombiningClass { Combining_Spacing = 0, Combining_Overlays = 1, Combining_Nuktas = 7, Combining_VoicingMarks = 8, Combining_Viramas = 9, Combining_FixedStart = 10, Combining_FixedEnd = 199, Combining_BelowLeftAttached = 200, Combining_BelowAttached = 202, Combining_BelowRightAttached = 204, Combining_LeftAttached = 208, Combining_RightAttached = 210, Combining_AboveLeftAttached = 212, Combining_AboveAttached = 214, Combining_AboveRightAttached = 216, Combining_BelowLeft = 218, Combining_Below = 220, Combining_BelowRight = 222, Combining_Left = 224, Combining_Right = 226, Combining_AboveLeft = 228, Combining_Above = 230, Combining_AboveRight = 232, Combining_DoubleBelow = 233, Combining_DoubleAbove = 234, Combining_IotaSubscript = 240 }; struct codePointData { uchar_t codePoint; char category; char combining; char bidi; char decomp; char mirrored; void* extra; }; struct letterExtraData { uchar_t upper; uchar_t lower; uchar_t title; }; struct decimalDigitExtraData { int num; }; const codePointData* lookupCodePoint(uchar_t codePoint); Category category(uchar_t ch); BidiClass bidiClass(uchar_t ch); Decomposition decompTag(uchar_t ch); CombiningClass combiningClass(uchar_t ch); } // !namespace Unicode } // !namespace P #endif --- NEW FILE: ustring.cpp --- /*************************************************************************** * Copyright (C) 2004 by Christian Prochnow * * cp...@se... * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU Library General Public License as * * published by the Free Software Foundation; either version 2 of the * * License, or (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU Library General Public * * License along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * ***************************************************************************/ #include "pclasses/Unicode/ustring.h" #include <string.h> namespace P { namespace Unicode { int umemcmp(const uchar_t* s1, const uchar_t* s2, size_t n) { while(n-- > 0) { if(*s1 != *s2) { //@@fixme return 1; } ++s1; ++s2; } return 0; } size_t ucslen(const uchar_t* s) { size_t n = 0; while(*(s++) != 0) ++n; return n; } const uchar_t* umemchr(const uchar_t* s, size_t n, uchar_t a) { while(n-- > 0) { if(*(s++) == a) return s; } return 0; } uchar_t* umemmove(uchar_t* s1, const uchar_t* s2, size_t n) { return (uchar_t*)std::memmove(s1, s2, n * sizeof(uchar_t)); } uchar_t* umemcpy(uchar_t* s1, const uchar_t* s2, size_t n) { return (uchar_t*)std::memcpy(s1, s2, n * sizeof(uchar_t)); } uchar_t* umemset(uchar_t* s, size_t n, uchar_t a) { while(n-- > 0) *(s++) = a; return s; } ustring str(const char* str) { size_t len = strlen(str); ustring ret; ret.reserve(len); size_t i = 0; while(len-- > 0) ret[i++] = touchar(*(str++)); return ret; } } // !namespace Unicode } // !namespace P Index: unicodedata.awk =================================================================== RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/unicodedata.awk,v retrieving revision 1.2 retrieving revision 1.3 diff -u -d -r1.2 -r1.3 --- unicodedata.awk 23 Dec 2004 04:32:18 -0000 1.2 +++ unicodedata.awk 14 Jan 2005 14:46:02 -0000 1.3 @@ -1,23 +1,5 @@ BEGIN { FS=";" - - print "/* Automatically generated by P::Classes unicodedata.awk */" > "unicodedata_extra.h" - print "namespace P { namespace Unicode {" >> "unicodedata_extra.h" - print "struct letterExtraData { uint32_t upper, lower, title; };" >> "unicodedata_extra.h" - print "struct decimalDigitExtraData { int num; };" >> "unicodedata_extra.h" - - print "/* Automatically generated by P::Classes unicodedata.awk */" - print "namespace P { namespace Unicode {" - print "struct codePointData {" - print " uint32_t codePoint;" - print " char category;" - print " char combining;" - print " char bidi;" - print " char decomp;" - print " char mirrored;" - print " void* extra;" - print "};" - print "codePointData codePoints[] = {" extranum = 0; } @@ -223,16 +205,14 @@ if(extra != "") { - print extra >> "unicodedata_extra.h" + print extra >> "unicodedata_extra_db.h" extraval = "(void*)&" extraname; } - printf " { 0x%s, Char::%s, %s, Char::%s, Char::%s, %s, %s ", codepoint, categoryEnum, combining, bidiEnum, decompEnum, mirroredVal, extraval + printf " { 0x%s, %s, %s, %s, %s, %s, %s ", codepoint, categoryEnum, combining, bidiEnum, decompEnum, mirroredVal, extraval printf "},\n" } END { print " { (uint32_t)-1, 0, }" print "};" - print "} }" - print "} }" >> "unicodedata_extra.h" } |