From: <tho...@us...> - 2008-01-31 18:10:41
|
Revision: 2606 http://clucene.svn.sourceforge.net/clucene/?rev=2606&view=rev Author: thomas_busch Date: 2008-01-31 10:10:35 -0800 (Thu, 31 Jan 2008) Log Message: ----------- added CJKAnalyzer Modified Paths: -------------- trunk/contributions/src/CLucene/analysis/cjk/CJKAnalyzer.cpp trunk/contributions/src/CLucene/analysis/cjk/CJKAnalyzer.h Modified: trunk/contributions/src/CLucene/analysis/cjk/CJKAnalyzer.cpp =================================================================== --- trunk/contributions/src/CLucene/analysis/cjk/CJKAnalyzer.cpp 2007-09-27 18:12:16 UTC (rev 2605) +++ trunk/contributions/src/CLucene/analysis/cjk/CJKAnalyzer.cpp 2008-01-31 18:10:35 UTC (rev 2606) @@ -1,4 +1,5 @@ #include "CLucene/StdHeader.h" +#include "CLucene/analysis/Analyzers.h" #include "CJKAnalyzer.h" CL_NS_DEF2(analysis,cjk) @@ -180,4 +181,34 @@ return true; } +CJKAnalyzer::CJKAnalyzer() { + StopFilter::fillStopTable( &stopTable, CJKAnalyzer::STOP_WORDS); +} + +CJKAnalyzer::CJKAnalyzer(const TCHAR** stopWords) { + StopFilter::fillStopTable(&stopTable, stopWords); +} + +CJKAnalyzer::~CJKAnalyzer() { +} + +const TCHAR* CJKAnalyzer::STOP_WORDS[] = { + _T("a"), _T("and"), _T("are"), _T("as"), _T("at"), _T("be"), + _T("but"), _T("by"), _T("for"), _T("if"), _T("in"), + _T("into"), _T("is"), _T("it"), _T("no"), _T("not"), + _T("of"), _T("on"), _T("or"), _T("s"), _T("such"), _T("t"), + _T("that"), _T("the"), _T("their"), _T("then"), + _T("there"), _T("these"), _T("they"), _T("this"), + _T("to"), _T("was"), _T("will"), _T("with"), + _T("www"), + NULL +}; + + +TokenStream* CJKAnalyzer::tokenStream(const TCHAR* fieldName, Reader* reader) { + TokenStream* ret = _CLNEW CJKTokenizer(reader); + ret = _CLNEW StopFilter(ret,true, &stopTable); + return ret; +} + CL_NS_END2 Modified: trunk/contributions/src/CLucene/analysis/cjk/CJKAnalyzer.h =================================================================== --- trunk/contributions/src/CLucene/analysis/cjk/CJKAnalyzer.h 2007-09-27 18:12:16 UTC (rev 2605) +++ trunk/contributions/src/CLucene/analysis/cjk/CJKAnalyzer.h 2008-01-31 18:10:35 UTC (rev 2606) @@ -102,6 +102,33 @@ }; +/** Represents a CJK analyzer. + * + * Filters CJKTokenizer with StopFilter. + * + * @author Che, Dong + */ + +class CJKAnalyzer : public CL_NS(analysis)::Analyzer +{ + private: + CL_NS(util)::CLSetList<const TCHAR*> stopTable; + public: + /** Builds an analyzer.*/ + CJKAnalyzer(); + /** Builds an analyzer with the given stop words. */ + CJKAnalyzer(const TCHAR** stopWords); + + ~CJKAnalyzer(); + + /** get token stream from input **/ + CL_NS(analysis)::TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader); + + /** An array containing some common English words that are not usually useful for + searching and some double-byte interpunctions. */ + static const TCHAR* STOP_WORDS[]; +}; + CL_NS_END2 #endif This was sent by the SourceForge.net collaborative development platform, the world's largest Open Source development site. |