Thread: [pclasses-cvs] pclasses2/src/Unicode uctype.cpp,NONE,1.1 unicodedata.cpp,NONE,1.1 unicodedata.h,NONE

SourceForge Headquarters 1320 Columbia Street Suite 310 San Diego, CA 92101 +1 (858) 422-6466

Update of /cvsroot/pclasses/pclasses2/src/Unicode
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv19335/src/Unicode

Modified Files:
	Char.cpp Makefile.am unicodedata.awk 
Added Files:
	uctype.cpp unicodedata.cpp unicodedata.h ustring.cpp 
Log Message:
Unicode re-work. Lets get compatible to std::basic_string<>.
Unicode::Char, Unicode::String will be obsoleted.

--- NEW FILE: unicodedata.cpp ---
/***************************************************************************
 *   Copyright (C) 2004 by Christian Prochnow                              *
 *   cp...@se...                                                   *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU Library General Public License as       *
 *   published by the Free Software Foundation; either version 2 of the    *
 *   License, or (at your option) any later version.                       *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU Library General Public     *
 *   License along with this program; if not, write to the                 *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/

#include "unicodedata.h"

namespace P {

namespace Unicode {

#include "unicodedata_extra_db.h"
#include "unicodedata_db.h"

const codePointData* lookupCodePoint(uchar_t codePoint)
{
	unsigned int i = 0;
	while(codePoints[i].codePoint != (uchar_t)-1)
	{
		if(codePoints[i].codePoint == codePoint)
			return &codePoints[i];
		++i;
	}

	return 0;
}

Category category(uchar_t ch)
{
	const codePointData* data = lookupCodePoint(ch);
	return (Category)data->category;
}

BidiClass bidiClass(uchar_t ch)
{
	const codePointData* data = lookupCodePoint(ch);
	return (BidiClass)data->bidi;
}

Decomposition decompTag(uchar_t ch)
{
	const codePointData* data = lookupCodePoint(ch);
	return (Decomposition)data->decomp;
}

CombiningClass combiningClass(uchar_t ch) 
{
	const codePointData* data = lookupCodePoint(ch);
	return (CombiningClass)data->combining;
}


} // !namespace Unicode

} // !namespace P

Index: Char.cpp
===================================================================
RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/Char.cpp,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- Char.cpp	23 Dec 2004 05:45:58 -0000	1.2
+++ Char.cpp	14 Jan 2005 14:46:02 -0000	1.3
@@ -19,27 +19,12 @@
  */
 
 #include "pclasses/Unicode/Char.h"
-
-#include "unicodedata_extra.h"    // UNICODE extra character data
-#include "unicodedata.h"    // UNICODE character data
+#include "unicodedata.h"
 
 namespace P {
 
 namespace Unicode {
 
-const codePointData* lookupCodePoint(uint32_t codePoint)
-{
-	unsigned int i = 0;
-	while(codePoints[i].codePoint != (uint32_t)-1)
-	{
-		if(codePoints[i].codePoint == codePoint)
-			return &codePoints[i];
-		++i;
-	}
-
-	return 0;
-}
-
 Char::Char(uint32_t ch)
 : _char(ch)
 { }

Index: Makefile.am
===================================================================
RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/Makefile.am,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- Makefile.am	11 Jan 2005 14:57:33 -0000	1.3
+++ Makefile.am	14 Jan 2005 14:46:02 -0000	1.4
@@ -1,21 +1,19 @@
-noinst_HEADERS = unicodedata.h unicodedata_extra.h
+noinst_HEADERS = unicodedata.h unicodedata_db.h unicodedata_extra_db.h
 
 unicodedata-clean:
-	rm -f unicodedata.h
-	rm -f unicodedata_extra.h
-
-unicodedata: unicodedata-clean unicodedata.h unicodedata_extra.h
+	rm -f unicodedata_db.h
+	rm -f unicodedata_extra_db.h
 
-unicodedata.h unicodedata_extra.h:
+unicodedata: unicodedata-clean
 	wget --passive-ftp http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
-	awk -f $(top_srcdir)/src/Unicode/unicodedata.awk UnicodeData.txt >unicodedata.h
+	awk -f $(top_srcdir)/src/Unicode/unicodedata.awk UnicodeData.txt >unicodedata_db.h
 	rm -f UnicodeData.txt
 
 INCLUDES = -I$(top_srcdir)/include -I$(top_builddir)/include -I$(top_builddir)/src/Unicode $(all_includes)
 METASOURCES = AUTO
 
 lib_LTLIBRARIES = libpclasses_unicode.la
-libpclasses_unicode_la_SOURCES = Char.cpp String.cpp TextStream.cpp
+libpclasses_unicode_la_SOURCES = unicodedata.cpp uctype.cpp ustring.cpp Char.cpp String.cpp TextStream.cpp
 
 libpclasses_unicode_la_LDFLAGS = -no-undefined
 

--- NEW FILE: uctype.cpp ---
/***************************************************************************
 *   Copyright (C) 2004 by Christian Prochnow                              *
 *   cp...@se...                                                   *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU Library General Public License as       *
 *   published by the Free Software Foundation; either version 2 of the    *
 *   License, or (at your option) any later version.                       *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU Library General Public     *
 *   License along with this program; if not, write to the                 *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/

#include "pclasses/Unicode/uctype.h"
#include "unicodedata.h"

namespace P {

namespace Unicode {

int isualnum(uchar_t c)
{
	int ret = 0;

	switch(category(c))
	{
		case Letter_Uppercase:
		case Letter_Lowercase:
		case Letter_Titlecase:
		case Letter_Modifier:
		case Letter_Other:
		case Number_DecimalDigit:
		case Number_Letter:
		case Number_Other:
			ret = 1;
		default:
			break;
	}

	return ret;
}

int isualpha(uchar_t c)
{
	int ret = 0;

	switch(category(c))
	{
		case Letter_Uppercase:
		case Letter_Lowercase:
		case Letter_Titlecase:
		case Letter_Modifier:
		case Letter_Other:
			ret = 1;
		default:
			break;
	}

	return ret;
}

int isucntrl(uchar_t c)
{
	int ret = 0;

	switch(category(c))
	{
		case Mark_NonSpacing:
		case Mark_SpacingCombining:
		case Mark_Enclosing:
		case Other_Control:
		case Other_Format:
		case Other_Surrogate:
		case Other_PrivateUse:
		case Other_NotAssigned:
			ret = 1;
		default:
			break;
	}

	return ret;
}

int isudigit(uchar_t c)
{
	int ret = 0;

	switch(category(c))
	{
		case Number_DecimalDigit:
		case Number_Letter:
		case Number_Other:
			ret = 1;
		default:
			break;
	}

	return ret;
}

int isugraph(uchar_t c)
{
	//@@fixme
	return 0;
}

int isulower(uchar_t c)
{
	return category(c) == Letter_Lowercase ? 1 : 0;
}

int isuprint(uchar_t c)
{
	//@@fixme
	return 0;
}

int isupunct(uchar_t c)
{
	//@@fixme
	return 0;
}

int isuspace(uchar_t c)
{
	int ret = 0;
	
	switch(category(c))
	{
		case Separator_Space:
		case Separator_Line:
		case Separator_Paragraph:
			ret = 1;
		default:
			break;
	}

	return ret;
}

int isuupper(uchar_t c)
{
	return category(c) == Letter_Uppercase ? 1 : 0;
}

uchar_t toulower(uchar_t c)
{
	const codePointData* data = lookupCodePoint(c);
	if(data && data->extra)
	{
		const letterExtraData* extraData = (const letterExtraData*)data->extra;
		return extraData->lower;
	}

	return c;
}

uchar_t touupper(uchar_t c)
{
	const codePointData* data = lookupCodePoint(c);
	if(data && data->extra)
	{
		const letterExtraData* extraData = (const letterExtraData*)data->extra;
		return extraData->upper;
	}

	return c;
}

uchar_t touchar(char c)
{
	if(c < 0x7f)
		return c;

	return '?';
}

} // !namespace Unicode
 
} // !namespace P

--- NEW FILE: unicodedata.h ---
/***************************************************************************
 *   Copyright (C) 2004 by Christian Prochnow                              *
 *   cp...@se...                                                   *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU Library General Public License as       *
 *   published by the Free Software Foundation; either version 2 of the    *
 *   License, or (at your option) any later version.                       *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU Library General Public     *
 *   License along with this program; if not, write to the                 *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/

#ifndef P_Unicode_unicodedata_h
#define P_Unicode_unicodedata_h

#include "pclasses/BasicTypes.h"
#include "pclasses/Unicode/uctype.h"

namespace P { 

namespace Unicode {

//! General category
enum Category {
	Mark_NonSpacing,          // Mn
	Mark_SpacingCombining,    // Mc
	Mark_Enclosing,           // Me

	Number_DecimalDigit,      // Nd
	Number_Letter,            // Nl
	Number_Other,             // No

	Separator_Space,          // Zs
	Separator_Line,           // Zl
	Separator_Paragraph,      // Zp

	Other_Control,            // Cc
	Other_Format,             // Cf
	Other_Surrogate,          // Cs
	Other_PrivateUse,         // Co
	Other_NotAssigned,        // Cn

	Letter_Uppercase,         // Lu
	Letter_Lowercase,         // Ll
	Letter_Titlecase,         // Lt
	Letter_Modifier,          // Lm
	Letter_Other,             // Lo

	Punctuation_Connector,    // Pc
	Punctuation_Dash,         // Pd
	Punctuation_Open,         // Ps
	Punctuation_Close,        // Pe
	Punctuation_InitialQuote, // Pi
	Punctuation_FinalQuote,   // Pf
	Punctuation_Other,        // Po

	Symbol_Math,              // Sm
	Symbol_Currency,          // Sc
	Symbol_Modifier,          // Sk
	Symbol_Other              // So
};

//! Bidirectional Class
enum BidiClass {
	LeftToRight,              // L
	LeftToRightEmbedding,     // LRE
	LeftToRightOverride,      // LRO
	RightToLeft,              // R
	RightToLeftArabic,        // AL
	RightToLeftEmbedding,     // RLE
	RightToLeftOverride,      // RLO
	PopDirectionalFormat,     // PDF
	EuropeanNumber,           // EN
	EuropeanNumberSeparator,  // ES
	EuropeanNumberTerminator, // ET
	ArabicNumber,             // AN
	CommonNumberSeparator,    // CS
	NonSpacingMark,           // NSM
	BoundaryNeutral,          // BN
	ParagraphSeparator,       // B
	SegmentSeparator,         // S
	Whitespace,               // WS
	OtherNeutrals             // ON
};

//! Character Decomposition Tag
enum Decomposition {
	NoDecomposition,
	Font,               // <font>
	NoBreak,            // <noBreak>
	Initial,            // <initial>
	Medial,             // <medial>
	Final,              // <final>
	Isolated,           // <isolated>
	Encircled,          // <circle>
	Superscript,        // <super>
	Subscript,          // <sub>
	Vertical,           // <vertical>
	Wide,               // <wide>
	Narrow,             // <narrow>
	Small,              // <small>
	Square,             // <square>
	Fraction,           // <fraction>
	Compat              // <compat>
};

//! Canonical Combining Class
enum CombiningClass {
	Combining_Spacing             = 0,
	Combining_Overlays            = 1,
	Combining_Nuktas              = 7,
	Combining_VoicingMarks        = 8,
	Combining_Viramas             = 9,
	Combining_FixedStart          = 10,
	Combining_FixedEnd            = 199,
	Combining_BelowLeftAttached   = 200,
	Combining_BelowAttached       = 202,
	Combining_BelowRightAttached  = 204,
	Combining_LeftAttached        = 208,
	Combining_RightAttached       = 210,
	Combining_AboveLeftAttached   = 212,
	Combining_AboveAttached       = 214,
	Combining_AboveRightAttached  = 216,
	Combining_BelowLeft           = 218,
	Combining_Below               = 220,
	Combining_BelowRight          = 222,
	Combining_Left                = 224,
	Combining_Right               = 226,
	Combining_AboveLeft           = 228,
	Combining_Above               = 230,
	Combining_AboveRight          = 232,
	Combining_DoubleBelow         = 233,
	Combining_DoubleAbove         = 234,
	Combining_IotaSubscript       = 240
};

struct codePointData {
	uchar_t codePoint;
	char category;
	char combining;
	char bidi;
	char decomp;
	char mirrored;
	void* extra;
};

struct letterExtraData { 
	uchar_t upper;
	uchar_t lower;
	uchar_t title;
};

struct decimalDigitExtraData { 
	int num; 
};

const codePointData* lookupCodePoint(uchar_t codePoint);

Category category(uchar_t ch);
BidiClass bidiClass(uchar_t ch);
Decomposition decompTag(uchar_t ch);
CombiningClass combiningClass(uchar_t ch);

} // !namespace Unicode

} // !namespace P

#endif

--- NEW FILE: ustring.cpp ---
/***************************************************************************
 *   Copyright (C) 2004 by Christian Prochnow                              *
 *   cp...@se...                                                   *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU Library General Public License as       *
 *   published by the Free Software Foundation; either version 2 of the    *
 *   License, or (at your option) any later version.                       *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU Library General Public     *
 *   License along with this program; if not, write to the                 *
 *   Free Software Foundation, Inc.,                                       *
 *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
 ***************************************************************************/

#include "pclasses/Unicode/ustring.h"
#include <string.h>

namespace P {

namespace Unicode {

int umemcmp(const uchar_t* s1, const uchar_t* s2, size_t n)
{
	while(n-- > 0)
	{
		if(*s1 != *s2)
		{
			//@@fixme
			return 1;
		}

		++s1;
		++s2;
	}

	return 0;
}

size_t ucslen(const uchar_t* s)
{
	size_t n = 0;
	while(*(s++) != 0)
		++n;

	return n;
}

const uchar_t* umemchr(const uchar_t* s, size_t n, uchar_t a)
{
	while(n-- > 0)
	{
		if(*(s++) == a)
			return s;
	}

	return 0;
}

uchar_t* umemmove(uchar_t* s1, const uchar_t* s2, size_t n)
{
	return (uchar_t*)std::memmove(s1, s2, n * sizeof(uchar_t));
}

uchar_t* umemcpy(uchar_t* s1, const uchar_t* s2, size_t n)
{
	return (uchar_t*)std::memcpy(s1, s2, n * sizeof(uchar_t));
}

uchar_t* umemset(uchar_t* s, size_t n, uchar_t a)
{
	while(n-- > 0)
		*(s++) = a;

	return s;
}

ustring str(const char* str)
{
	size_t len = strlen(str);
	ustring ret;
	ret.reserve(len);

	size_t i = 0;
	while(len-- > 0)
		ret[i++] = touchar(*(str++));

	return ret;
}

} // !namespace Unicode

} // !namespace P

Index: unicodedata.awk
===================================================================
RCS file: /cvsroot/pclasses/pclasses2/src/Unicode/unicodedata.awk,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- unicodedata.awk	23 Dec 2004 04:32:18 -0000	1.2
+++ unicodedata.awk	14 Jan 2005 14:46:02 -0000	1.3
@@ -1,23 +1,5 @@
 BEGIN {
   FS=";"
-
-  print "/* Automatically generated by P::Classes unicodedata.awk */" > "unicodedata_extra.h"
-  print "namespace P { namespace Unicode {" >> "unicodedata_extra.h"
-  print "struct letterExtraData { uint32_t upper, lower, title; };" >> "unicodedata_extra.h"
-  print "struct decimalDigitExtraData { int num; };" >> "unicodedata_extra.h"
-
-  print "/* Automatically generated by P::Classes unicodedata.awk */"
-  print "namespace P { namespace Unicode {"
-  print "struct codePointData {"
-  print "  uint32_t codePoint;"
-  print "  char category;"
-  print "  char combining;"
-  print "  char bidi;"
-  print "  char decomp;"
-  print "  char mirrored;"
-  print "  void* extra;"
-  print "};"
-
   print "codePointData codePoints[] = {"
   extranum = 0;
 }
@@ -223,16 +205,14 @@
 
   if(extra != "")
   {
-    print extra >> "unicodedata_extra.h"
+    print extra >> "unicodedata_extra_db.h"
     extraval = "(void*)&" extraname;
   }
 
-  printf "  { 0x%s, Char::%s, %s, Char::%s, Char::%s, %s, %s ", codepoint, categoryEnum, combining, bidiEnum, decompEnum, mirroredVal, extraval
+  printf "  { 0x%s, %s, %s, %s, %s, %s, %s ", codepoint, categoryEnum, combining, bidiEnum, decompEnum, mirroredVal, extraval
   printf "},\n"
 }
 END {
   print "  { (uint32_t)-1, 0,  }"
   print "};"
-  print "} }"
-  print "} }" >> "unicodedata_extra.h"
 }